diff --git a/Day03/book.py b/Day03/book.py
index 1a69d46ff0d138af22827fcaf9e104d8128b5612..6bb748466560214d77d9062b211bc150e3170b88 100644
--- a/Day03/book.py
+++ b/Day03/book.py
@@ -2,6 +2,7 @@ import requests
 
 # 导入文件操作库
 import codecs
+import os
 from bs4 import BeautifulSoup
 import sys
 import importlib
@@ -10,19 +11,20 @@ importlib.reload(sys)
 # 给请求指定一个请求头来模拟chrome浏览器
 global headers
 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
-server = 'http://www.biquge.cm'
+server = 'http://www.xbiquge.la/'
 # 星辰变地址
-book = 'http://www.biquge.cm/2/2042/'
+book = 'http://www.xbiquge.la/5/5623/'
 # 定义存储位置
 global save_path
 save_path = 'G:/星辰变'
-
+if os.path.exists(save_path) is False:
+    os.makedirs(save_path)
 
 # 获取章节内容
 def get_contents(chapter):
     req = requests.get(url=chapter)
     html = req.content
-    html_doc = str(html, 'gbk')
+    html_doc = str(html, 'utf8')
     bf = BeautifulSoup(html_doc, 'html.parser')
     texts = bf.find_all('div', id="content")
     # 获取div标签id属性content的内容 \xa0 是不间断空白符  
@@ -40,7 +42,7 @@ def write_txt(chapter, content, code):
 def main():
     res = requests.get(book, headers=headers)
     html = res.content
-    html_doc = str(html, 'gbk')
+    html_doc = str(html, 'utf8')
     # 使用自带的html.parser解析
     soup = BeautifulSoup(html_doc, 'html.parser')
     # 获取所有的章节
@@ -50,7 +52,7 @@ def main():
         try:
             chapter = server + each.get('href')
             content = get_contents(chapter)
-            chapter = save_path + "/" + each.string + ".txt"
+            chapter = save_path + "/" + each.string.replace("?", "") + ".txt"
             write_txt(chapter, content, 'utf8')
         except Exception as e:
             print(e)