From 546605405a5d2320c1875c876b64485a08299d2b Mon Sep 17 00:00:00 2001
From: general
Date: Sat, 30 Mar 2019 20:35:21 +0800
Subject: [PATCH] Add crawl rule configuration and a blacklist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crawler.py     |  4 +--
 page_parser.py | 30 ++++++++++-----------
 settings.py    | 22 +++++++++------
 utils.py       | 73 +++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 91 insertions(+), 38 deletions(-)

diff --git a/crawler.py b/crawler.py
index dca15a4..3d19b69 100644
--- a/crawler.py
+++ b/crawler.py
@@ -8,9 +8,9 @@ from urllib.parse import urlparse, urljoin

 from pyquery import PyQuery

-from settings import outsite_asset, doc_pool_max, res_pool_max, main_url, max_depth, max_retry_times, empty_link_pattern, site_db
+from settings import outsite_asset, doc_pool_max, res_pool_max, main_url, max_depth, max_retry_times, site_db
 from page_parser import get_page_charset, parse_linking_pages, parse_linking_assets, parse_css_file
-from utils import logger, request_get_async, save_file_async, trans_to_local_link, trans_to_local_path
+from utils import logger, empty_link_pattern, request_get_async, save_file_async, trans_to_local_link, trans_to_local_path
 from worker_pool import WorkerPool
 from db import init_db, query_url_record, add_url_record, query_page_tasks, query_asset_tasks, save_page_task, save_asset_task, update_record_to_success
 from cache_queue import CacheQueue
diff --git a/page_parser.py b/page_parser.py
index 0502408..764e7b9 100644
--- a/page_parser.py
+++ b/page_parser.py
@@ -3,8 +3,8 @@ from urllib.parse import urljoin, urlparse, urldefrag

 from pyquery import PyQuery

-from settings import empty_link_pattern, outsite_asset
-from utils import logger, get_main_site, trans_to_local_link
+from settings import outsite_asset
+from utils import logger, empty_link_pattern, get_main_site, trans_to_local_link, url_filter

 def get_page_charset(page_content):
     '''
@@ -40,15 +40,14 @@ def _parse_linking_pages(element_list, origin_url, attr_name, depth, callback =
         url_attr = PyQuery(li).attr(attr_name)
         if url_attr is None or re.search(empty_link_pattern, url_attr):
             continue
+        ## Join the url and drop the fragment (#)
         full_url = urljoin(origin_url, url_attr)
-        ## Drop the fragment (#) from the url
         full_url = urldefrag(full_url).url

-        ## Off-site pages are never crawled; off-site assets, however, can be downloaded
-        if urlparse(full_url).netloc != main_site:
-            logger.info('Skipping off-site page: %s' % full_url)
-            continue
-        local_link = trans_to_local_link(full_url, True)
+        ## Skip urls that do not pass the filter rules
+        if not url_filter(full_url, url_type='page'): continue
+        ## Rewrite the link to a local path
+        local_link = trans_to_local_link(full_url, True)
         PyQuery(li).attr(attr_name, local_link)
         if callback:
             callback(full_url, origin_url, depth)
@@ -75,18 +74,15 @@ def _parse_linking_assets(element_list, origin_url, attr_name, depth, callback):
         if url_attr is None or re.search(empty_link_pattern, url_attr):
             continue
+        ## Join the url and drop the fragment (#)
         full_url = urljoin(origin_url, url_attr)
-        ## Drop the fragment (#) from the url
         full_url = urldefrag(full_url).url
-        host = urlparse(full_url).netloc
-        if host != main_site and not outsite_asset:
-            logger.info('Skipping off-site asset: %s' % full_url)
-            continue
+        ## Skip urls that do not pass the filter rules
+        if not url_filter(full_url, url_type='asset'): continue

-        local_link = trans_to_local_link(full_url, False)
         ## Rewrite the link to a local path
+        local_link = trans_to_local_link(full_url, False)
         PyQuery(li).attr(attr_name, local_link)
-        ## Try to enqueue
         if callback:
             callback(full_url, origin_url, depth)

 def parse_css_file(content, origin_url, depth, callback = None):
@@ -111,8 +107,10 @@ def parse_css_file(content, origin_url, depth, callback = None):
             continue
         full_url = urljoin(origin_url, match_url)
+        ## Skip urls that do not pass the filter rules
+        if not url_filter(full_url, url_type='asset'): continue
         local_link = trans_to_local_link(full_url, False)
-        ## Try to enqueue
+
         if callback:
             callback(full_url, origin_url, depth)
         content = content.replace(match_url, local_link)
     return content.encode('utf-8')
diff --git a/settings.py b/settings.py
index 8c0efc4..b925bb2 100644
--- a/settings.py
+++ b/settings.py
@@ -1,19 +1,19 @@
 import logging

 # URL of the site to crawl; must start with http(s)
-main_url = 'http://97daimeng.com/'
+main_url = 'https://m.heiyeba.com'

 # Proxy settings
-proxies = {}
 # Proxy format:
 # {
 #     "http": "127.0.0.1:1080",
 #     "https": "127.0.0.1:1080",
 # }
+proxies = {}

 # HTTP request headers
 headers = {
-    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
+    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
 }

 # Output path for the mirrored site files; must end with '/'
@@ -32,18 +32,24 @@ res_pool_max = 20
 # Random delay between requests, in seconds, [min, max]
 wait_time = [1, 3]

-# Whether to crawl static assets (not pages) hosted outside this site
-outsite_asset = True
-
 # Crawl depth for pages, counted from 1; crawling stops at level N.
 # 1 means only the entry page, 0 means no limit
 max_depth = 2

 # Maximum number of retries when a request fails (timeouts count as failures)
 max_retry_times = 5
-empty_link_pattern = r'about:blank|javascript:(void\(0\))?'
-
 logging_config = {
     'level': logging.DEBUG,
     'format': '%(asctime)s %(levelname)s - %(name)s - %(filename)s - %(message)s',
 }
+############################################################
+## Crawl rules
+
+## Whether to crawl static assets (not pages) hosted outside this site
+outsite_asset = True
+no_js = True
+no_css = False
+no_images = True
+no_fonts = False
+## Blacklist, a list. Each rule is a regular expression; empty by default.
+black_list = []
diff --git a/utils.py b/utils.py
index 8c07755..157149a 100644
--- a/utils.py
+++ b/utils.py
@@ -6,12 +6,29 @@ from urllib.parse import urlparse, unquote

 import requests

-from settings import main_url, headers, proxies, output_path, logging_config
+from settings import main_url, headers, proxies, output_path, logging_config, outsite_asset
+from settings import no_js, no_css, no_images, no_fonts, black_list

 logging.basicConfig(**logging_config)
 logger = logging.getLogger(__name__)
 ## logger.setLevel(logging.DEBUG)

+empty_link_pattern = r'about:blank|javascript:(void\(0\))?'
+
+special_chars = {
+    '\\': 'xg',
+    ':': 'mh',
+    '*': 'xh',
+    '?': 'wh',
+    '<': 'xy',
+    '>': 'dy',
+    '|': 'sx',
+    ' ': 'kg'
+}
+
+image_pattern = '\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$'
+font_pattern = '\.((ttf)|(woff)|(woff2)|(otf)|(eot))$'
+
 main_site = ''
 def get_main_site():
     global main_site
@@ -53,17 +70,6 @@ def save_file_async(file_path, file_name, byte_content):
         logger.error('Save Error: %s, path: %s, name: %s' % (err, path, file_name))
         return (0, err)

-special_chars = {
-    '\\': 'xg',
-    ':': 'mh',
-    '*': 'xh',
-    '?': 'wh',
-    '<': 'xy',
-    '>': 'dy',
-    '|': 'sx',
-    ' ': 'kg'
-}
-
 def trans_to_local_link(url, is_page = True):
     '''
     @param
@@ -120,3 +126,46 @@ def trans_to_local_path(url, is_page = True):

     file_name = os.path.basename(local_link)
     return file_dir, file_name
+
+def url_filter(url, url_type = 'page'):
+    '''
+    @function Check the url against all configured crawl rules and decide whether it should be fetched.
+    @param: url_type url type: page/asset
+    @return: True: crawl it, False: skip it
+    '''
+    main_site = get_main_site()
+    ## Off-site pages are never crawled; off-site assets, however, can be downloaded
+    if url_type == 'page' and urlparse(url).netloc != main_site:
+        logger.info('Skipping off-site page: %s' % url)
+        return False
+
+    urlObj = urlparse(url)
+    host = urlObj.netloc
+    if url_type == 'asset' and host != main_site and not outsite_asset:
+        logger.info('Skipping off-site asset: %s' % url)
+        return False
+
+    path = urlObj.path
+    if url_type == 'asset' and path.endswith('.js') and no_js:
+        logger.info('Skipping js asset: %s' % url)
+        return False
+
+    if url_type == 'asset' and path.endswith('.css') and no_css:
+        logger.info('Skipping css asset: %s' % url)
+        return False
+
+    if url_type == 'asset' and re.search(image_pattern, url) and no_images:
+        logger.info('Skipping image asset: %s' % url)
+        return False
+
+    if url_type == 'asset' and re.search(font_pattern, url) and no_fonts:
+        logger.info('Skipping font asset: %s' % url)
+        return False
+
+    ## Skip urls on the blacklist
+    for pattern in black_list:
+        if re.search(pattern, url):
+            logger.info('Skipping blacklisted url: %s' % url)
+            return False
+
+    return True
-- 
Gitee
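
A minimal sketch of how the new url_filter() is expected to behave with the settings shown in this patch (outsite_asset = True, no_js = True, no_images = True, no_css = False, no_fonts = False, black_list = []); the sample urls below are made up for illustration:

    from utils import url_filter

    # Pages: only same-site pages pass (main_url = 'https://m.heiyeba.com')
    url_filter('https://m.heiyeba.com/list/1.html', url_type='page')     # True
    url_filter('https://other.example.com/page.html', url_type='page')   # False, off-site page

    # Assets: off-site assets are allowed (outsite_asset = True),
    # but js and image files are skipped (no_js = True, no_images = True)
    url_filter('https://cdn.example.com/style.css', url_type='asset')    # True
    url_filter('https://m.heiyeba.com/static/app.js', url_type='asset')  # False, no_js
    url_filter('https://m.heiyeba.com/img/logo.png', url_type='asset')   # False, no_images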
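
And a sketch of how the new black_list option could be filled in; the patterns below are hypothetical examples, not part of this patch. Each entry is a regular expression that url_filter() matches against the full url with re.search:

    # settings.py -- illustrative values only
    black_list = [
        r'/logout',               # skip any url containing /logout
        r'\.php\?action=report',  # skip report endpoints
        r'^https?://ads\.',       # skip urls on ad hosts
    ]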