代码拉取完成,页面将自动刷新
同步操作将从 YwY/html_to_md 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
import json
import os
import time
from tkinter import *
import requests
from bs4 import BeautifulSoup
from lxml.html import etree
# 总窗口
root = Tk()
# 设置标题
root.title('博客园随笔转换')
# 设置大小
def count_url_lis_to_dict(func_1):
def wrapper(*args, **kwargs):
dic = dict()
lis = func_1(*args, **kwargs)
count = lis[0]
url_lis = lis[1]
dic['count'] = count
name_xpath = '//*[@id="cb_post_title_url"]/text()'
for url in url_lis:
try:
response = requests.get(url)
response = response.text
response_html = etree.HTML(response)
name = response_html.xpath(name_xpath)[0]
dic[name] = url
show_windos(f'已经获取标题为{name},url为{url}')
except:
show_windos(f'链接{url}获取随笔失败')
continue
show_windos('全部读取完毕')
show_windos(f'随笔总数{dic["count"]}')
show_windos(f'保存的json文件中的内容{dic}')
return dic
return wrapper
@count_url_lis_to_dict
def get_count_url_lis(url):
lis = []
count = 1
url = f'https://www.cnblogs.com/{url}'
while True:
count_1 = len(lis)
response = requests.get(f'{url}default.html?page={count}')
response = response.text
data_1 = re.findall(' href="(.*?)"', response, re.S)
for a in data_1: # type:str
if a.startswith('http'):
if a.endswith('html'):
if 'archive' not in a:
lis.append(a)
count += 1
lis = set(lis)
lis = list(lis)
count_2 = len(lis)
if count_1 == count_2:
return count_2, lis # 博客的数据量,博客里面随笔的url
@count_url_lis_to_dict
def get_category_url_lis(url_):
"""获取某个博客下的所有博客链接"""
url = f'https://www.cnblogs.com/{url_}'
response = requests.get(url)
data = response.text
category_url_lis = re.findall('class="entrylistItemTitle" href="(.*?)"', data)
return len(category_url_lis), category_url_lis
# 保存爬取的字典成 json格式
def dump_json(dict, url):
file_name = url.split('/')[-2]
if not os.path.exists('博客园随笔'): os.mkdir('博客园随笔') # 创建个放随笔的文件夹
file_path = os.path.join('博客园随笔', f'{file_name}.json')
with open(file_path, 'w', encoding='utf8') as fw:
json.dump(dict, fw)
show_windos('字典保存json文件成功')
# 读取字典且保存去除字典中count 这一栏
def read_dic(url):
file_name = url.split('/')[-2]
file_path = os.path.join('博客园随笔', f'{file_name}.json')
try:
with open(file_path, 'r', encoding='utf8') as fr:
dic = json.load(fr)
# 删除计数这一栏
del dic["count"]
# 去掉头
dic_str = json.dumps(dic)
dic_str = re.sub(r'https://www.cnblogs.com/', '', dic_str)
# 变回字典
dic = json.loads(dic_str)
return dic
except:
show_windos('没有爬取下来链接')
return False
# 输入随笔的url转md格式文件
def url_to_md_txt(url):
try:
url = f'https://www.cnblogs.com/{url}'
response = requests.get(url)
# print(response.text)
a = re.findall(
'<div id="cnblogs_post_body" class="blogpost-body.*?">(.*?)<div id="MySignature"></div>',
response.text, re.S)
if not a:
response_dome = BeautifulSoup(response.text, 'html.parser')
response_dome_str = str(response_dome.div)
a = re.findall('<div class="postBody">(.*?)<div id="MySignature"></div>', response_dome_str, re.S)
a = a[0]
#去除a头尾的空格
a = a.strip()
#去除末尾的div
a = a[:-6]
#再去除一次宫格
a = a.strip()
# 标题
a = re.sub('<h1>.*?\d*\. (?P<name>.*?)</h1>', '<h1>\g<name>\n\n</h1>', a)
a = re.sub('<h1.*?>', '# ', a)
a = re.sub('<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', '<h2>\g<name>\n\n</h2>', a)
a = re.sub('<h2.*?>', '## ', a)
a = re.sub('<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', '<h3>\g<name>\n\n</h3>', a)
a = re.sub('<h3.*?>', '### ', a)
a = re.sub('<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', '<h4>\g<name>\n\n</h4>', a)
a = re.sub('<h4.*?>', '#### ', a)
a = re.sub('<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', '<h5>\g<name>\n\n</h5>', a)
a = re.sub('<h5.*?>', '##### ', a)
a = re.sub('<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', '<h6>\g<name>\n\n</h6>', a)
a = re.sub('<h6.*?>', '###### ', a)
a = re.sub('</h1>|</h2>|</h3>|</h4>|</h5>|</h6>|', "", a)
# print(a)
# 三个点
if '<pre class=' in a:
a = re.sub('<pre class="', '```', a)
a = re.sub('"><code>', '\n', a)
a = re.sub('<pre><code.*?>','```\n',a)
a = re.sub('</code></pre>', '\n```', a)
#另外一个写法的a
a = re.sub('<div class="cnblogs_code".*?>', '```python', a)
a = re.sub('</div>', '```', a)
#一个点
a = re.sub('<code.*?>|</code>', '`', a)
# 标签
# 去掉开头的div标签
a = re.sub('<div.*?>', '', a)
# em标签
a = re.sub('<em.*?>|</em>', ' ', a)
# strong标签加粗
a = re.sub('<strong>|</strong>', '**', a)
# span标签
a = re.sub('<span.*?>|</span>', '', a)
# pre标签
a = re.sub('<pre.*?>|</pre>', '', a)
# p标签
a = re.sub('<p.*?>|</p>', '', a)
# br标签
a = re.sub('<br/>', '\n', a)
# 里面内容特殊变化
# 双引号
a = re.sub('"', '"', a)
# 单引号
a = re.sub(''', "'", a)
# >符号
a = re.sub('>', '>', a)
# 符号
a = re.sub('<', '<', a)
#ul与li
a = re.sub('<ul.*?>|</ul>|</li>','',a)
a = re.sub('<li.*?>','- ',a)
#html标签修正
print(a)
a = re.sub('<;', '<', a)
a = re.sub('>;', '>', a)
a = re.sub(';/', '/', a)
# 上面全是转md
return a
# 可能博客不一样会存在见状性没有用我匹配的格式找到内容
except:
return False
# 保存文档
def text_to_file(name, text):
# 创建文件夹
if not os.path.exists('博客园随笔md格式'): os.mkdir('博客园随笔md格式')
# 创建文件路径
file_path = os.path.join('博客园随笔md格式', f'{name}.md')
# 保存
try:
with open(file_path, 'w', encoding='utf8') as fw:
fw.write(text)
show_windos(f'{name}的markdown文件保存成功')
except:
show_windos(f'{name}的markdown文件保存失败')
# 展示窗口
windos_1 = Listbox(root)
def show_windos(*args, start_index='ywy'):
if start_index == 'ywy':
for data in args:
windos_1.insert(END, data)
elif start_index == 0:
windos_1.delete(0, END)
for data in args:
windos_1.insert(END, data)
# 功能选择输入框标题
count_1 = Label(root, text='批量爬取博客园首页的所有随笔字典并保存json文件,且随笔全部转成md格式文件')
count_2 = Label(root, text='输入指定随笔url把随笔内容转成md并且保存')
count_3 = Label(root, text='爬取某个分类下的所有博客')
# 功能输入框输入框
# 存储内容
entry_1_data = StringVar()
entry_2_data = StringVar()
entry_3_data = StringVar()
# 输入框
entry_1 = Entry(root, state='normal', textvariable=entry_1_data)
entry_2 = Entry(root, state='normal', textvariable=entry_2_data)
entry_3 = Entry(root, state='normal', textvariable=entry_3_data)
# 按键传参设置
# 按键1
def b_1_dump(enter=entry_1):
url = enter.get()
show_windos(f'查找的URL:https://www.cnblogs.com/{url}',start_index=0)
try:
# 读取字典
show_windos('爬取中请等待')
dic = get_count_url_lis(url)
# 保存字典
dump_json(dic, url)
# 读取字典title和url
dic = read_dic(url) # type:dict
for name, name_url in dic.items():
show_windos(name_url)
text = url_to_md_txt(name_url)
if text:
text_to_file(name, text)
else:
show_windos(f'{name}随笔转md失败')
show_windos('全部md转换完成')
except:
show_windos('输入url有误或者没有随笔')
# 记录文章默认名字序号
action_2_num = 0
def b_2_dump(enter=entry_2):
url = enter.get()
show_windos(f'查找的URL:https://www.cnblogs.com/{url}',start_index=0)
global action_2_num
name = f'第{action_2_num}篇'
action_2_num += 1
try:
text = url_to_md_txt(url)
text_to_file(name, text)
except:
show_windos('输入url有误或者没有随笔')
def b_3_dump(enter=entry_3):
url = enter.get()
show_windos(f'查找的URL:https://www.cnblogs.com/{url}',start_index=0)
try:
# 读取字典
show_windos('爬取中请等待')
dic = get_category_url_lis(url)
# 保存字典
dump_json(dic, url)
# 读取字典title和url
dic = read_dic(url) # type:dict
for name, name_url in dic.items():
show_windos(name_url)
text = url_to_md_txt(name_url)
if text:
text_to_file(name, text)
else:
show_windos(f'{name}随笔转md失败')
show_windos('全部md转换完成')
except:
show_windos('输入url有误或者没有随笔')
# 功能对应3个按钮
b_1 = Button(root, text='确定输入', command=b_1_dump)
b_1.grid(row=0, column=2)
b_2 = Button(root, text='确定输入', command=b_2_dump)
b_2.grid(row=1, column=2)
b_3 = Button(root, text='确定输入', command=b_3_dump)
b_3.grid(row=2, column=2)
# 位置排版
# 功能一
count_1.grid(row=0, column=0)
entry_1.grid(row=0, column=1)
# 功能二
count_2.grid(row=1, column=0)
entry_2.grid(row=1, column=1)
# 功能三
count_3.grid(row=2, column=0)
entry_3.grid(row=2, column=1)
# 默认展示窗口
msg = '''
功能一输入首页url例如
https://www.cnblogs.com/pythonywy/
你只要输入pythonywy/即可
功能二输入首页url例如
https://www.cnblogs.com/pythonywy/p/11123051.html
你只要输入pythonywy/p/11123051.html即可
功能三输入首页url例如
例如 https://www.cnblogs.com/nickchen121/category/1379216.html
你只要输入nickchen121/category/1379216.html即可
消息展示面板'''
# 提示界面
msg_Label = Label(root, text=msg)
msg_Label.grid(row=3, column=0, sticky=N + S + E + W)
# 实时消息界面
windos_1.grid(row=4, column=0, sticky=N + S + W + E)
time.sleep(1)
# 展示
root.mainloop()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。