From cf45d76b8f8f140a96ba72147dd53804128b4be6 Mon Sep 17 00:00:00 2001 From: xieshike Date: Tue, 30 Mar 2021 15:37:39 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=A4=BA=E4=BE=8B=E7=BD=91?= =?UTF-8?q?=E5=9D=80=EF=BC=8C=E9=83=A8=E5=88=86=E7=BC=96=E7=A0=81=EF=BC=8C?= =?UTF-8?q?=E5=8F=8A=EF=BC=9F=E5=8F=B7=E6=96=87=E4=BB=B6=E5=90=8D=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Day03/book.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Day03/book.py b/Day03/book.py index 1a69d46..6bb7484 100644 --- a/Day03/book.py +++ b/Day03/book.py @@ -2,6 +2,7 @@ import requests # 导入文件操作库 import codecs +import os from bs4 import BeautifulSoup import sys import importlib @@ -10,19 +11,20 @@ importlib.reload(sys) # 给请求指定一个请求头来模拟chrome浏览器 global headers headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'} -server = 'http://www.biquge.cm' +server = 'http://www.xbiquge.la/' # 星辰变地址 -book = 'http://www.biquge.cm/2/2042/' +book = 'http://www.xbiquge.la/5/5623/' # 定义存储位置 global save_path save_path = 'G:/星辰变' - +if os.path.exists(save_path) is False: + os.makedirs(save_path) # 获取章节内容 def get_contents(chapter): req = requests.get(url=chapter) html = req.content - html_doc = str(html, 'gbk') + html_doc = str(html, 'utf8') bf = BeautifulSoup(html_doc, 'html.parser') texts = bf.find_all('div', id="content") # 获取div标签id属性content的内容 \xa0 是不间断空白符   @@ -40,7 +42,7 @@ def write_txt(chapter, content, code): def main(): res = requests.get(book, headers=headers) html = res.content - html_doc = str(html, 'gbk') + html_doc = str(html, 'utf8') # 使用自带的html.parser解析 soup = BeautifulSoup(html_doc, 'html.parser') # 获取所有的章节 @@ -50,7 +52,7 @@ def main(): try: chapter = server + each.get('href') content = get_contents(chapter) - chapter = save_path + "/" + each.string + ".txt" + chapter = save_path + "/" + each.string.replace("?", "") + ".txt" write_txt(chapter, content, 'utf8') except Exception as e: print(e) -- Gitee