From a51aafbd05ce6e2e8da990a9d49ef7294187fcb5 Mon Sep 17 00:00:00 2001
From: general <generals.space@gmail.com>
Date: Thu, 18 Apr 2019 01:18:44 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0re=E6=AD=A3=E5=88=99=E6=A8=A1?=
 =?UTF-8?q?=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

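---
Reviewer notes (English; this section is not part of the commit message):

Subject, decoded: "Update re regex patterns".

* page_parser.py: the inline charset and CSS url() regexes are replaced by
  charset_pattern and css_url_pattern imported from utils.py; the pattern
  text itself is unchanged.
* utils.py: empty_link_pattern now also matches the empty string and a
  leading "data:", replacing the explicit checks in parse_css_file.
  Behaviour change: the old check skipped every URL that merely started
  with "data" (for example a relative path such as data/bg.png); the new
  pattern only skips real data: URIs.
* transform.py: the duplicated query-escaping loop is extracted into
  trans_query_for_local_link() and shared by the page and asset helpers.
  Behaviour change: page links ending in .xhtml or .xml no longer get an
  extra ".html" suffix, because html_pattern accepts those extensions too.
* utils.py: image_pattern and font_pattern become raw strings, so "\."
  is no longer an invalid escape sequence in a normal string literal.
* NOTE(review): charset_pattern is moved as-is; its greedy (\S*) group can
  capture a trailing ";" (e.g. "charset=utf-8;") - confirm whether callers
  tolerate that or tighten the group in a follow-up.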
 page_parser.py | 12 ++++--------
 transform.py   | 33 +++++++++++++++++----------------
 utils.py       | 10 ++++++----
 3 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/page_parser.py b/page_parser.py
index a56f126..d9dac86 100644
--- a/page_parser.py
+++ b/page_parser.py
@@ -4,7 +4,7 @@ from urllib.parse import urljoin, urlparse, urldefrag
 
 from pyquery import PyQuery
 
-from utils import empty_link_pattern, url_filter
+from utils import charset_pattern, empty_link_pattern, css_url_pattern, url_filter
 from transform import trans_to_local_link
 
 logger = logging.getLogger(__name__)
@@ -18,7 +18,7 @@ def get_page_charset(page_content):
     meta1 = pq('meta[http-equiv]').attr('content')
     meta2 = pq('meta[charset]').attr('charset')
     if meta1 is not None:
-        res = re.findall(r'charset\s*=\s*(\S*)\s*;?', meta1)
+        res = re.findall(charset_pattern, meta1)
         if len(res) != 0: charset = res[0]
     if meta2 is not None: charset = meta2
     return charset
@@ -113,16 +113,12 @@ def parse_css_file(content, task, config, callback = None):
     ## 格式可能为url('./bg.jpg'), url("./bg.jpg"), url(bg.jpg)
     ## 如下， import_list可能是[('', '', 'bg.jpg'), ('', '', 'logo.png')]
     ## 元组中前两个空格表示匹配到的都是url(bg.jpg)这种形式的属性
-    import_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)'
-    match_list = re.findall(import_pattern, content)
+    match_list = re.findall(css_url_pattern, content)
     for match_item in match_list:
         for match_url in match_item:
             ## url属性的匹配模式有3种, 只有一种会被匹配上, 另外两种就是空
             ## 如果为空, 或是引入了base64数据, 就跳过不进行处理
-            if match_url == '' \
-                or match_url.startswith('data') \
-                or re.search(empty_link_pattern, match_url): 
-                continue
+            if re.search(empty_link_pattern, match_url): continue
 
             full_url = urljoin(task['url'], match_url)
             ## 如果不满足过滤规则则跳过
diff --git a/transform.py b/transform.py
index f029dfc..2450993 100644
--- a/transform.py
+++ b/transform.py
@@ -1,22 +1,27 @@
 import os
+import re
 from urllib.parse import urlparse, unquote
 
-from utils import special_chars
+from utils import special_chars, html_pattern
+
+def trans_query_for_local_link(local_link, query_str):
+    '''
+    将url中query部分中的特殊字符替换掉, 防止在写入本地文件时文件名非法.
+    '''
+    for k, v in special_chars.items():
+        if k in query_str: query_str = query_str.replace(k, v)
+    local_link = local_link + special_chars['?'] + query_str
+    return local_link
 
 def trans_to_local_link_for_page(urlObj):
     origin_path = urlObj.path
     origin_query = urlObj.query
 
     local_link = origin_path
-    if local_link == "": local_link = 'index.html'
+    if local_link == '': local_link = 'index.html'
     if local_link.endswith('/'): local_link += 'index.html'
-    if origin_query != '': 
-        query_str = origin_query
-        for k, v in special_chars.items():
-            if k in query_str: query_str = query_str.replace(k, v)
-        local_link = local_link + special_chars['?'] + query_str
-    if not local_link.endswith('.html') and not local_link.endswith('.htm'):
-        local_link += '.html'
+    if origin_query != '': local_link = trans_query_for_local_link(local_link, origin_query)
+    if not re.search(html_pattern, local_link): local_link += '.html'
     return local_link
 
 def trans_to_local_link_for_asset(urlObj):
@@ -24,13 +29,9 @@ def trans_to_local_link_for_asset(urlObj):
     origin_query = urlObj.query
 
     local_link = origin_path
-    if local_link == "": local_link = 'index'
+    if local_link == '': local_link = 'index'
     if local_link.endswith('/'): local_link += 'index'
-    if origin_query != '': 
-        query_str = origin_query
-        for k, v in special_chars.items():
-            if k in query_str: query_str = query_str.replace(k, v)
-        local_link = local_link + special_chars['?'] + query_str
+    if origin_query != '': local_link = trans_query_for_local_link(local_link, origin_query)
     return local_link
 
 def trans_to_local_link(url, url_type, main_site):
@@ -65,7 +66,7 @@ def trans_to_local_link(url, url_type, main_site):
 def trans_to_local_path(url, url_type, main_site):
     '''
     @return
-        file_path: 目标文件的存储目录, 相对路径(不以/开头), 为""时, 表示当前目录
+        file_path: 目标文件的存储目录, 相对路径(不以/开头), 为''时, 表示当前目录
         file_name: 目标文件名称
     '''
     local_link = trans_to_local_link(url, url_type, main_site)
diff --git a/utils.py b/utils.py
index 05ac5b4..5bee6cb 100644
--- a/utils.py
+++ b/utils.py
@@ -7,8 +7,6 @@ import requests
 
 logger = logging.getLogger(__name__)
 
-empty_link_pattern = r'about:blank|javascript:(void\(0\))?'
-
 special_chars = {
     '\\': 'xg',
     ':': 'mh',
@@ -20,8 +18,12 @@ special_chars = {
     ' ': 'kg'
 }
 
-image_pattern = '\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$'
-font_pattern = '\.((ttf)|(woff)|(woff2)|(otf)|(eot))$'
+charset_pattern = r'charset\s*=\s*(\S*)\s*;?'
+empty_link_pattern = r'(^$)|(^data:)|(about:blank)|(javascript:)'
+css_url_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)'
+html_pattern = r'\.((html)|(htm)|(xhtml)|(xml))$'
+image_pattern = r'\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$'
+font_pattern = r'\.((ttf)|(woff)|(woff2)|(otf)|(eot))$'
 
 def request_get_async(task, config):
     '''
-- 
Gitee