From a51aafbd05ce6e2e8da990a9d49ef7294187fcb5 Mon Sep 17 00:00:00 2001 From: general Date: Thu, 18 Apr 2019 01:18:44 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0re=E6=AD=A3=E5=88=99=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- page_parser.py | 12 ++++-------- transform.py | 33 +++++++++++++++++---------------- utils.py | 10 ++++++---- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/page_parser.py b/page_parser.py index a56f126..d9dac86 100644 --- a/page_parser.py +++ b/page_parser.py @@ -4,7 +4,7 @@ from urllib.parse import urljoin, urlparse, urldefrag from pyquery import PyQuery -from utils import empty_link_pattern, url_filter +from utils import charset_pattern, empty_link_pattern, css_url_pattern, url_filter from transform import trans_to_local_link logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ def get_page_charset(page_content): meta1 = pq('meta[http-equiv]').attr('content') meta2 = pq('meta[charset]').attr('charset') if meta1 is not None: - res = re.findall(r'charset\s*=\s*(\S*)\s*;?', meta1) + res = re.findall(charset_pattern, meta1) if len(res) != 0: charset = res[0] if meta2 is not None: charset = meta2 return charset @@ -113,16 +113,12 @@ def parse_css_file(content, task, config, callback = None): ## 格式可能为url('./bg.jpg'), url("./bg.jpg"), url(bg.jpg) ## 如下, import_list可能是[('', '', 'bg.jpg'), ('', '', 'logo.png')] ## 元组中前两个空格表示匹配到的都是url(bg.jpg)这种形式的属性 - import_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)' - match_list = re.findall(import_pattern, content) + match_list = re.findall(css_url_pattern, content) for match_item in match_list: for match_url in match_item: ## url属性的匹配模式有3种, 只有一种会被匹配上, 另外两种就是空 ## 如果为空, 或是引入了base64数据, 就跳过不进行处理 - if match_url == '' \ - or match_url.startswith('data') \ - or re.search(empty_link_pattern, match_url): - continue + if re.search(empty_link_pattern, match_url): continue full_url = urljoin(task['url'], match_url) ## 如果不满足过滤规则则跳过 diff --git a/transform.py b/transform.py index f029dfc..2450993 100644 --- a/transform.py +++ b/transform.py @@ -1,22 +1,27 @@ import os +import re from urllib.parse import urlparse, unquote -from utils import special_chars +from utils import special_chars, html_pattern + +def trans_query_for_local_link(local_link, query_str): + ''' + 将url中query部分中的特殊字符替换掉, 防止在写入本地文件时文件名非法. + ''' + for k, v in special_chars.items(): + if k in query_str: query_str = query_str.replace(k, v) + local_link = local_link + special_chars['?'] + query_str + return local_link def trans_to_local_link_for_page(urlObj): origin_path = urlObj.path origin_query = urlObj.query local_link = origin_path - if local_link == "": local_link = 'index.html' + if local_link == '': local_link = 'index.html' if local_link.endswith('/'): local_link += 'index.html' - if origin_query != '': - query_str = origin_query - for k, v in special_chars.items(): - if k in query_str: query_str = query_str.replace(k, v) - local_link = local_link + special_chars['?'] + query_str - if not local_link.endswith('.html') and not local_link.endswith('.htm'): - local_link += '.html' + if origin_query != '': local_link = trans_query_for_local_link(local_link, origin_query) + if not re.search(html_pattern, local_link): local_link += '.html' return local_link def trans_to_local_link_for_asset(urlObj): @@ -24,13 +29,9 @@ def trans_to_local_link_for_asset(urlObj): origin_query = urlObj.query local_link = origin_path - if local_link == "": local_link = 'index' + if local_link == '': local_link = 'index' if local_link.endswith('/'): local_link += 'index' - if origin_query != '': - query_str = origin_query - for k, v in special_chars.items(): - if k in query_str: query_str = query_str.replace(k, v) - local_link = local_link + special_chars['?'] + query_str + if origin_query != '': local_link = trans_query_for_local_link(local_link, origin_query) return local_link def trans_to_local_link(url, url_type, main_site): @@ -65,7 +66,7 @@ def trans_to_local_link(url, url_type, main_site): def trans_to_local_path(url, url_type, main_site): ''' @return - file_path: 目标文件的存储目录, 相对路径(不以/开头), 为""时, 表示当前目录 + file_path: 目标文件的存储目录, 相对路径(不以/开头), 为''时, 表示当前目录 file_name: 目标文件名称 ''' local_link = trans_to_local_link(url, url_type, main_site) diff --git a/utils.py b/utils.py index 05ac5b4..5bee6cb 100644 --- a/utils.py +++ b/utils.py @@ -7,8 +7,6 @@ import requests logger = logging.getLogger(__name__) -empty_link_pattern = r'about:blank|javascript:(void\(0\))?' - special_chars = { '\\': 'xg', ':': 'mh', @@ -20,8 +18,12 @@ special_chars = { ' ': 'kg' } -image_pattern = '\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$' -font_pattern = '\.((ttf)|(woff)|(woff2)|(otf)|(eot))$' +charset_pattern = r'charset\s*=\s*(\S*)\s*;?' +empty_link_pattern = r'(^$)|(^data:)|(about:blank)|(javascript:)' +css_url_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)' +html_pattern = r'\.((html)|(htm)|(xhtml)|(xml))$' +image_pattern = r'\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$' +font_pattern = r'\.((ttf)|(woff)|(woff2)|(otf)|(eot))$' def request_get_async(task, config): ''' -- Gitee