From e8efebcb8e1ae385b402493600cf788d6adf5d34 Mon Sep 17 00:00:00 2001 From: general Date: Wed, 17 Apr 2019 02:04:58 +0800 Subject: [PATCH] =?UTF-8?q?=E9=85=8D=E7=BD=AE=E5=AF=B9=E8=B1=A1=E6=94=B9?= =?UTF-8?q?=E9=80=A0,=20=E5=90=8C=E6=97=B6=E4=BF=AE=E6=AD=A3=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 44 ++++++++++++++ crawler.py | 63 +++++++++++++------- db.py | 10 +++- "doc/logging\344\275\277\347\224\250.md" | 76 ++++++++++++++++++++++++ main.py | 22 ++++++- page_parser.py | 43 +++++++------- settings.py | 55 ----------------- transform.py | 9 ++- utils.py | 36 ++++------- worker_pool.py | 10 ++-- 10 files changed, 232 insertions(+), 136 deletions(-) create mode 100644 config.py create mode 100644 "doc/logging\344\275\277\347\224\250.md" delete mode 100644 settings.py diff --git a/config.py b/config.py new file mode 100644 index 0000000..e2be942 --- /dev/null +++ b/config.py @@ -0,0 +1,44 @@ +import logging + +default_config = { + ## 起始页面, 要爬取的网站url, 需要以http(s)开头 + 'main_url': '', + ## proxies: request代理, 代理格式: + ## { + ## "http": "127.0.0.1:1080", + ## "https": "127.0.0.1:1080", + ## } + 'proxies': {}, + ## HTTP请求的header + 'headers': {}, + ## request请求超时时间 + 'request_timeout': 30, + + ## 站点保存路径 + 'site_path': './sites/', + ## 抓取记录存储文件 + 'site_db': 'site.db', + ## 页面抓取协程池中协程的数量 + 'page_pool_size': 20, + ## 静态资源抓取协程池中协程的数量 + 'asset_pool_size': 20, + ## 爬取页面的深度, 从1开始计, 爬到第N层为止. + ## 1表示只抓取单页, 0表示无限制 + 'max_depth': 0, + ## 请求出错最大重试次数 + 'max_retry_times': 5, + + ## 是否爬取该站以外的静态资源(不是页面) + 'outsite_asset': True, + 'no_js': True, + 'no_css': False, + 'no_images': False, + 'no_fonts': False, + ## 黑名单, 列表类型. 规则格式为正则, 默认为空. + 'black_list': [], + + 'logging_config': { + 'level': logging.INFO, + 'format': '%(asctime)s %(levelname)s - %(name)s - %(filename)s - %(message)s', + } +} \ No newline at end of file diff --git a/crawler.py b/crawler.py index 1e5d416..f3382b2 100644 --- a/crawler.py +++ b/crawler.py @@ -4,30 +4,35 @@ import requests import time import sqlite3 import copy +import logging from urllib.parse import urlparse, urljoin from pyquery import PyQuery -from settings import outsite_asset, doc_pool_max, res_pool_max, main_url, max_depth, max_retry_times, site_db from page_parser import get_page_charset, parse_linking_pages, parse_linking_assets, parse_css_file -from utils import logger, empty_link_pattern, request_get_async, save_file_async +from utils import empty_link_pattern, request_get_async, save_file_async from transform import trans_to_local_path from worker_pool import WorkerPool from db import init_db, query_url_record, add_url_record, query_page_tasks, query_asset_tasks, save_page_task, save_asset_task, update_record_status from cache_queue import CacheQueue +logger = logging.getLogger(__name__) + class Crawler: - def __init__(self): + def __init__(self, config): self.page_queue = CacheQueue() self.asset_queue = CacheQueue() self.page_counter = 0 self.asset_counter = 0 + self.config = config + + self.main_site = urlparse(self.config['main_url']).netloc ## 初始化数据文件, 创建表 - self.db_conn = init_db(site_db) + self.db_conn = init_db(self.config['site_db']) self.load_queue() main_task = { - 'url': main_url, + 'url': self.config['main_url'], 'url_type': 'page', 'refer': '', 'depth': 1, @@ -35,12 +40,22 @@ class Crawler: } self.enqueue_page(main_task) - self.page_worker_pool = WorkerPool(self.page_queue, self.get_html_page, doc_pool_max, worker_type = 'page') - 
self.asset_worker_pool = WorkerPool(self.asset_queue, self.get_static_asset, res_pool_max, worker_type = 'asset') + page_worker_pool_args = { + 'func': self.get_html_page, + 'pool_size': self.config['page_pool_size'], + 'worker_type': 'page', + } + self.page_worker_pool = WorkerPool(self.page_queue, **page_worker_pool_args) + asset_worker_pool_args = { + 'func': self.get_static_asset, + 'pool_size': self.config['asset_pool_size'], + 'worker_type': 'asset', + } + self.asset_worker_pool = WorkerPool(self.asset_queue, **asset_worker_pool_args) def start(self): - self.page_worker_pool.start() logger.info('页面工作池启动') + self.page_worker_pool.start() def get_html_page(self, task): ''' @@ -48,15 +63,15 @@ class Crawler: ''' msg = 'get_static_asset(): task: {task:s}' logger.debug(msg.format(task = str(task))) - if 0 < max_depth and max_depth < task['depth']: + if 0 < self.config['max_depth'] < task['depth']: msg = '已超过最大深度: task: {task:s}' logger.warning(msg.format(task = str(task))) return - if task['failed_times'] > max_retry_times: + if task['failed_times'] > self.config['max_retry_times']: msg = '失败次数过多, 不再重试: task: {task:s}' logger.warning(msg.format(task = str(task))) return - code, resp = request_get_async(task) + code, resp = request_get_async(task, self.config) if not code: msg = '请求页面失败, 重新入队列: task: {task:s}, err: {err:s}' logger.error(msg.format(task = str(task), err = resp)) @@ -75,18 +90,18 @@ class Crawler: ## 超过最大深度的页面不再抓取, 在入队列前就先判断. ## 但超过静态文件无所谓深度, 所以还是要抓取的. - if 0 < max_depth and max_depth < task['depth'] + 1: + if 0 < self.config['max_depth'] < task['depth'] + 1: msg = '当前页面已达到最大深度, 不再抓取新页面: task {task:s}' logger.warning(msg.format(task = str(task))) else: - parse_linking_pages(pq_selector, task, callback = self.enqueue_page) - parse_linking_assets(pq_selector, task, callback = self.enqueue_asset) + parse_linking_pages(pq_selector, task, self.config, callback = self.enqueue_page) + parse_linking_assets(pq_selector, task, self.config, callback = self.enqueue_asset) ## 抓取此页面上的静态文件 self.asset_worker_pool.start(task) byte_content = pq_selector.outer_html().encode('utf-8') - file_path, file_name = trans_to_local_path(task['url'], 'page') - code, data = save_file_async(file_path, file_name, byte_content) + file_path, file_name = trans_to_local_path(task['url'], 'page', self.main_site) + code, data = save_file_async(self.config['site_path'], file_path, file_name, byte_content) if code: update_record_status(self.db_conn, task['url'], 'success') except Exception as err: msg = '保存页面文件失败: task: {task:s}, err: {err:s}' @@ -99,9 +114,9 @@ class Crawler: msg = 'get_static_asset(): task: {task:s}' logger.debug(msg.format(task = str(task))) ## 如果该链接已经超过了最大尝试次数, 则放弃 - if task['failed_times'] > max_retry_times: return + if task['failed_times'] > self.config['max_retry_times']: return - code, resp = request_get_async(task) + code, resp = request_get_async(task, self.config) if not code: msg = '请求静态资源失败, 重新入队列: task: {task:s}, err: {err:s}' logger.error(msg.format(task = str(task), err = resp)) @@ -115,9 +130,9 @@ class Crawler: try: content = resp.content if 'content-type' in resp.headers and 'text/css' in resp.headers['content-type']: - content = parse_css_file(resp.text, task, callback = self.enqueue_asset) - file_path, file_name = trans_to_local_path(task['url'], 'asset') - code, data = save_file_async(file_path, file_name, content) + content = parse_css_file(resp.text, task, self.config, callback = self.enqueue_asset) + file_path, file_name = trans_to_local_path(task['url'], 'asset', 
self.main_site) + code, data = save_file_async(self.config['site_path'], file_path, file_name, content) if code: update_record_status(self.db_conn, task['url'], 'success') except Exception as err: msg = '保存静态文件失败: task: {task:s}, err: {err:s}' @@ -170,12 +185,14 @@ class Crawler: while True: if _tmp_page_queue.empty(): break task = _tmp_page_queue.pop() - page_tasks.append(task) + values = (task['url'], task['refer'], task['depth'], task['failed_times']) + page_tasks.append(values) while True: if _tmp_asset_queue.empty(): break task = _tmp_asset_queue.pop() - asset_tasks.append(task) + values = (task['url'], task['refer'], task['depth'], task['failed_times']) + asset_tasks.append(values) if len(page_tasks) > 0: save_page_task(self.db_conn, page_tasks) diff --git a/db.py b/db.py index e30dea0..9478a65 100644 --- a/db.py +++ b/db.py @@ -76,7 +76,8 @@ def add_url_record(db_conn, task): return last_id def query_tasks(db_conn, table_name): - sql_str = 'select url, refer, depth, failed_times from %s' % table_name + sql_str = 'select url, refer, depth, failed_times from {:s}' + sql_str = sql_str.format(table_name) cursor = db_conn.cursor() cursor.execute(sql_str) rows = cursor.fetchall() @@ -102,11 +103,14 @@ def update_record_status(db_conn, url, status): cursor.close() def save_task(db_conn, table_name, value_list): - sql_str = 'delete from %s' % table_name + sql_str = 'delete from {:s}'.format(table_name) cursor = db_conn.cursor() cursor.execute(sql_str) - sql_str = 'insert into %s(url, refer, depth, failed_times) values(?, ?, ?, ?)' % table_name + + sql_str = 'insert into {:s} (url, refer, depth, failed_times) values(?, ?, ?, ?)' + sql_str = sql_str.format(table_name) cursor.executemany(sql_str, value_list) + db_conn.commit() cursor.close() diff --git "a/doc/logging\344\275\277\347\224\250.md" "b/doc/logging\344\275\277\347\224\250.md" new file mode 100644 index 0000000..31594ae --- /dev/null +++ "b/doc/logging\344\275\277\347\224\250.md" @@ -0,0 +1,76 @@ +# logging使用 + +参考文章 + +1. [python logging模块](http://www.cnblogs.com/dahu-daqing/p/7040764.html) + +最初logging困扰我最大的问题是, 生成的logger对象是否为单例对象, 如何保证不同源文件中的logger的输出互不干扰. + +实验后发现其实logging似乎并不在意是不是单例的问题, ta只是简单的print而已(按照指定的格式). + +``` +. +├── sublib +│   └── sub.py +└── main.py +``` + +`main.py`文件 + +```py +import logging + +from sublib.sub import sub_show +logging_config = { + 'level': logging.DEBUG, + 'format': '%(asctime)s %(levelname)-7s %(name)s - %(filename)s:%(lineno)d %(message)s', +} + +logging.basicConfig(**logging_config) +logger = logging.getLogger(__name__) +## logger.setLevel(logging.DEBUG) + +def show(): + logger.info("Start print log") + logger.debug("Do something") + logger.warning("Something maybe fail.") + logger.error("Something error.") + logger.info("Finish") + +show() +sub_show() +``` + +`sublib/sub.py`文件 + +```py +import logging + +logger = logging.getLogger(__name__) +## logger.setLevel(logging.DEBUG) + +def sub_show(): + logger.info("Start print log") + logger.debug("Do something") + logger.warning("Something maybe fail.") + logger.error("Something error.") + logger.info("Finish") +``` + +------ + +输出 + +``` +$ python .\main.py +2019-04-17 02:03:27,353 INFO __main__ - main.py:14 Start print log +2019-04-17 02:03:27,353 DEBUG __main__ - main.py:15 Do something +2019-04-17 02:03:27,354 WARNING __main__ - main.py:16 Something maybe fail. +2019-04-17 02:03:27,354 ERROR __main__ - main.py:17 Something error. 
+2019-04-17 02:03:27,354 INFO __main__ - main.py:18 Finish +2019-04-17 02:03:27,355 INFO sublib.sub - sub.py:7 Start print log +2019-04-17 02:03:27,355 DEBUG sublib.sub - sub.py:8 Do something +2019-04-17 02:03:27,355 WARNING sublib.sub - sub.py:9 Something maybe fail. +2019-04-17 02:03:27,356 ERROR sublib.sub - sub.py:10 Something error. +2019-04-17 02:03:27,356 INFO sublib.sub - sub.py:11 Finish +``` diff --git a/main.py b/main.py index db728ae..815ad96 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,30 @@ -# coding=utf-8 import gevent.monkey gevent.monkey.patch_all(thread=False) +import logging + from crawler import Crawler +from config import default_config if __name__ == '__main__': + config = { + 'main_url': 'https://m.xieeda.com/', + 'headers': { + 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1' + }, + 'max_depth': 1, + 'logging_config': { + 'level': logging.DEBUG, + ## %(name)s表示模块路径(其实是__name__的值) + 'format': '%(asctime)s %(levelname)-7s %(name)s - %(filename)s:%(lineno)d %(message)s', + } + } + config = dict(default_config, **config) + + logging.basicConfig(**config['logging_config']) + ## logger.setLevel(logging.DEBUG) try: - c = Crawler() + c = Crawler(config) c.start() except KeyboardInterrupt: c.stop() diff --git a/page_parser.py b/page_parser.py index cb5fd39..a56f126 100644 --- a/page_parser.py +++ b/page_parser.py @@ -1,12 +1,14 @@ import re +import logging from urllib.parse import urljoin, urlparse, urldefrag from pyquery import PyQuery -from settings import outsite_asset -from utils import logger, empty_link_pattern, get_main_site, url_filter +from utils import empty_link_pattern, url_filter from transform import trans_to_local_link +logger = logging.getLogger(__name__) + def get_page_charset(page_content): ''' 从页面内容中获取编码类型, 默认为utf-8 @@ -21,7 +23,7 @@ def get_page_charset(page_content): if meta2 is not None: charset = meta2 return charset -def parse_linking_pages(pq_selector, task, callback = None): +def parse_linking_pages(pq_selector, task, config, callback = None): ''' 分别解析页面中的a, iframe等元素的链接属性, 得到http(s)://式的url, 并调用callback入队列. @@ -29,26 +31,26 @@ def parse_linking_pages(pq_selector, task, callback = None): 不可以直接传递string类型的页面内容. ''' a_list = pq_selector('a') - _parse_linking_pages(a_list, 'href', task, callback=callback) + _parse_linking_pages(a_list, 'href', task, config, callback=callback) -def _parse_linking_pages(node_list, attr_name, task, callback = None): +def _parse_linking_pages(node_list, attr_name, task, config, callback = None): ''' 处理页面中a标签, 将页面本身的url与a标签中的地址计算得到实际可访问的url, 然后加入队列. 同时修改原页面内容中a标签的链接属性值, 使得这些链接可指向下载到本地的html文件. 
''' - main_site = get_main_site() + main_site = urlparse(config['main_url']).netloc for node_item in node_list: url_attr = PyQuery(node_item).attr(attr_name) if url_attr is None or re.search(empty_link_pattern, url_attr): continue ## 拼接url并忽略url中的井号 - full_url = urljoin(origin_url, url_attr) + full_url = urljoin(task['url'], url_attr) full_url = urldefrag(full_url).url ## 如果不满足过滤规则则跳过 - if not url_filter(full_url, url_type='page'): continue + if not url_filter(full_url, 'page', config): continue ## 重设链接地址为本地路径 - local_link = trans_to_local_link(full_url, 'page') + local_link = trans_to_local_link(full_url, 'page', main_site) PyQuery(node_item).attr(attr_name, local_link) new_task = { 'url': full_url, @@ -59,7 +61,7 @@ def _parse_linking_pages(node_list, attr_name, task, callback = None): } if callback: callback(new_task) -def parse_linking_assets(pq_selector, task, callback = None): +def parse_linking_assets(pq_selector, task, config, callback = None): ''' 分别解析页面中的link, script, img等元素的链接属性, 得到http(s)://式的url, 并调用callback入队列. @@ -67,16 +69,16 @@ def parse_linking_assets(pq_selector, task, callback = None): 不可以直接传递string类型的页面内容. ''' link_list = pq_selector('link') - _parse_linking_assets(link_list, 'href', task, callback) + _parse_linking_assets(link_list, 'href', task, config, callback) script_list = pq_selector('script') - _parse_linking_assets(script_list, 'src', task, callback) + _parse_linking_assets(script_list, 'src', task, config, callback) img_list = pq_selector('img') - _parse_linking_assets(img_list, 'src', task, callback) + _parse_linking_assets(img_list, 'src', task, config, callback) -def _parse_linking_assets(node_list, attr_name, task, callback): - main_site = get_main_site() +def _parse_linking_assets(node_list, attr_name, task, config, callback): + main_site = urlparse(config['main_url']).netloc for node_item in node_list: url_attr = PyQuery(node_item).attr(attr_name) if url_attr is None or re.search(empty_link_pattern, url_attr): @@ -86,10 +88,10 @@ def _parse_linking_assets(node_list, attr_name, task, callback): full_url = urljoin(task['url'], url_attr) full_url = urldefrag(full_url).url ## 如果不满足过滤规则则跳过 - if not url_filter(full_url, url_type='asset'): continue + if not url_filter(full_url, 'asset', config): continue ## 重设链接地址为本地路径 - local_link = trans_to_local_link(full_url, 'asset') + local_link = trans_to_local_link(full_url, 'asset', main_site) PyQuery(node_item).attr(attr_name, local_link) new_task = { 'url': full_url, @@ -100,12 +102,13 @@ def _parse_linking_assets(node_list, attr_name, task, callback): } if callback: callback(new_task) -def parse_css_file(content, task, callback = None): +def parse_css_file(content, task, config, callback = None): ''' 处理css文件中对静态资源的引用, 将引用的静态资源加入队列, 再转换为本地地址后返回css文件内容(byte类型) ''' + main_site = urlparse(config['main_url']).netloc ## css中可能包含url属性,或者是background-image属性的引用路径, ## 格式可能为url('./bg.jpg'), url("./bg.jpg"), url(bg.jpg) ## 如下, import_list可能是[('', '', 'bg.jpg'), ('', '', 'logo.png')] @@ -123,8 +126,8 @@ def parse_css_file(content, task, callback = None): full_url = urljoin(task['url'], match_url) ## 如果不满足过滤规则则跳过 - if not url_filter(full_url, url_type='asset'): continue - local_link = trans_to_local_link(full_url, 'asset') + if not url_filter(full_url, 'asset', config): continue + local_link = trans_to_local_link(full_url, 'asset', main_site) new_task = { 'url': full_url, 'url_type': 'asset', diff --git a/settings.py b/settings.py deleted file mode 100644 index b3c1a32..0000000 --- a/settings.py +++ /dev/null @@ -1,55 +0,0 @@ 
-import logging - -# 要爬取的网站url, 需要以http(s)开头 -main_url = 'https://m.xieeda.com/' - -# 设置代理 -# 代理格式: -# { -# "http": "127.0.0.1:1080", -# "https": "127.0.0.1:1080", -# } -proxies = {} - -# HTTP请求的header -headers = { - 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1' -} - -# 输出站点文件的路径,最后要加 '/' -site_path = './sites/' - -site_db = 'site.db' -# 每次请求的最大超时时间 -request_timeout = 30 - -# 爬取页面的协程数 -doc_pool_max = 20 - -# 爬取资源文件的协程数 -res_pool_max = 20 - -# 每次请求随机延迟的时间,单位s,[最大值,最小值] -wait_time = [1, 3] - -# 爬取页面的深度, 从1开始计, 爬到第N层为止. -# 1表示只抓取单页, 0表示无限制 -max_depth = 1 -# 请求出错最大重试次数(超时也算出错) -max_retry_times = 5 - -logging_config = { - 'level': logging.DEBUG, - 'format': '%(asctime)s %(levelname)s - %(name)s - %(filename)s - %(message)s', -} -############################################################ -## 抓取规则 - -## 是否爬取该站以外的静态资源(不是页面) -outsite_asset = True -no_js = True -no_css = False -no_images = False -no_fonts = False -## 黑名单, 列表类型. 规则格式为正则, 默认为空. -black_list = [] diff --git a/transform.py b/transform.py index 57654aa..f029dfc 100644 --- a/transform.py +++ b/transform.py @@ -1,7 +1,7 @@ import os from urllib.parse import urlparse, unquote -from utils import get_main_site, special_chars +from utils import special_chars def trans_to_local_link_for_page(urlObj): origin_path = urlObj.path @@ -33,7 +33,7 @@ def trans_to_local_link_for_asset(urlObj): local_link = local_link + special_chars['?'] + query_str return local_link -def trans_to_local_link(url, url_type = 'page'): +def trans_to_local_link(url, url_type, main_site): ''' @param url: 待处理的url, 有时url为动态链接, 包含&, ?等特殊字符, 这种情况下需要对其进行编码. @@ -42,7 +42,6 @@ def trans_to_local_link(url, url_type = 'page'): local_link: 本地文件存储路径, 用于写入本地html文档中的link/script/img/a等标签的链接属性 ''' ## 对于域名为host的url, 资源存放目录为output根目录, 而不是域名文件夹. 默认不设置主host - main_site = get_main_site() urlObj = urlparse(url) origin_host = urlObj.netloc @@ -63,13 +62,13 @@ def trans_to_local_link(url, url_type = 'page'): local_link = unquote(local_link) return local_link -def trans_to_local_path(url, url_type = 'page'): +def trans_to_local_path(url, url_type, main_site): ''' @return file_path: 目标文件的存储目录, 相对路径(不以/开头), 为""时, 表示当前目录 file_name: 目标文件名称 ''' - local_link = trans_to_local_link(url, url_type) + local_link = trans_to_local_link(url, url_type, main_site) ## 如果是站外资源, local_link可能为/www.xxx.com/static/x.jpg, ## 但我们需要的存储目录是相对路径, 所以需要事先将链接起始的/移除 if local_link.startswith('/'): local_link = local_link[1:] diff --git a/utils.py b/utils.py index 59ae1f2..05ac5b4 100644 --- a/utils.py +++ b/utils.py @@ -5,12 +5,7 @@ from urllib.parse import urlparse import requests -from settings import main_url, headers, proxies, site_path, logging_config, outsite_asset -from settings import no_js, no_css, no_images, no_fonts, black_list - -logging.basicConfig(**logging_config) logger = logging.getLogger(__name__) -## logger.setLevel(logging.DEBUG) empty_link_pattern = r'about:blank|javascript:(void\(0\))?' 
@@ -28,26 +23,19 @@ special_chars = { image_pattern = '\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$' font_pattern = '\.((ttf)|(woff)|(woff2)|(otf)|(eot))$' -main_site = '' -def get_main_site(): - global main_site - if main_site == '': - main_site = urlparse(main_url).netloc - return main_site - -def request_get_async(task): +def request_get_async(task, config): ''' 协程形式发起get请求 return: requests.get()的结果 ''' try: - _headers = headers.copy() + _headers = config['headers'].copy() _headers['Referer'] = task['refer'].encode('utf-8') request_options = { 'url': task['url'], 'verify': True, 'headers': _headers, - 'proxies': proxies, + 'proxies': config['proxies'], } resp = requests.get(**request_options) return (1, resp) @@ -60,7 +48,7 @@ def request_get_async(task): logger.error(msg.format(task = str(task), err = err)) return (0, err) -def save_file_async(file_path, file_name, byte_content): +def save_file_async(site_path, file_path, file_name, byte_content): ''' 写入文件, 事先创建目标目录 ''' @@ -78,13 +66,13 @@ def save_file_async(file_path, file_name, byte_content): logger.error(msg.format(path = path, file = file_name, err = err)) return (0, err) -def url_filter(url, url_type = 'page'): +def url_filter(url, url_type, config): ''' @function 这个函数对url比对所有设置的规则, 判断目标url是否可以抓取. @param: url_type url类型: page/asset @return: True: 可以抓取, False: 不可以抓取 ''' - main_site = get_main_site() + main_site = urlparse(config['main_url']).netloc ## 站外的页面绝对不会抓取, 倒是站外的资源可以下载下来 if url_type == 'page' and urlparse(url).netloc != main_site: logger.info('不抓取站外页面: %s' % url) @@ -92,29 +80,29 @@ def url_filter(url, url_type = 'page'): urlObj = urlparse(url) host = urlObj.netloc - if url_type == 'asset' and host != main_site and not outsite_asset: + if url_type == 'asset' and host != main_site and not config['outsite_asset']: logger.info('不抓取站外资源: %s' % url) return False path = urlObj.path - if url_type == 'asset' and path.endswith('.js') and no_js: + if url_type == 'asset' and path.endswith('.js') and config['no_js']: logger.info('不抓取js资源: %s' % url) return False - if url_type == 'asset' and path.endswith('.css') and no_css: + if url_type == 'asset' and path.endswith('.css') and config['no_css']: logger.info('不抓取css资源: %s' % url) return False - if url_type == 'asset' and re.search(image_pattern, url) and no_images: + if url_type == 'asset' and re.search(image_pattern, url) and config['no_images']: logger.info('不抓取图片资源: %s' % url) return False - if url_type == 'asset' and re.search(font_pattern, url) and no_fonts: + if url_type == 'asset' and re.search(font_pattern, url) and config['no_fonts']: logger.info('不抓取字体资源: %s' % url) return False ## 不抓取黑名单中的url - for pattern in black_list: + for pattern in config['black_list']: if re.search(pattern, url): logger.info('不抓取黑名单中的url: %s' % url) return False diff --git a/worker_pool.py b/worker_pool.py index c1310b0..d52edd0 100644 --- a/worker_pool.py +++ b/worker_pool.py @@ -1,19 +1,21 @@ ''' gevent类型的协程工作池, 以while循环不断执行指定的worker方法 ''' +import logging + from gevent import sleep from gevent.pool import Pool -from utils import logger +logger = logging.getLogger(__name__) class WorkerPool: - def __init__(self, queue, func = None, pool_max = 100, worker_type = 'page'): + def __init__(self, queue, func = None, pool_size = 100, worker_type = 'page'): self.queue = queue self.worker = func self.exit_signal = False - self.pool_max = pool_max + self.pool_size = pool_size ## Pool类基于gevent.pool.Group类 - self.pool = Pool(pool_max) + self.pool = Pool(pool_size) self.worker_type = worker_type def start(self, 
page_task = None): -- Gitee
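
A note on the `config = dict(default_config, **config)` merge introduced in main.py: this is a shallow merge, so a nested dict supplied by the caller (such as `headers` or `logging_config`) replaces the corresponding default wholesale instead of being merged key by key. A minimal sketch of that behaviour, with the override values invented for illustration:

```py
import logging

## abridged defaults, mirroring config.py
default_config = {
    'request_timeout': 30,
    'logging_config': {
        'level': logging.INFO,
        'format': '%(asctime)s %(levelname)s - %(name)s - %(filename)s - %(message)s',
    },
}

## hypothetical caller override in the style of main.py; note it omits 'format'
override = {
    'logging_config': {'level': logging.DEBUG},
}

config = dict(default_config, **override)
print(config['request_timeout'])   # 30 -- inherited from the defaults
print(config['logging_config'])    # {'level': 10} -- the nested dict was replaced, 'format' is gone
logging.basicConfig(**config['logging_config'])  # still valid; logging falls back to its own default format
```

main.py currently supplies a complete `logging_config`, so nothing is lost there, but an override that only sets part of a nested dict should repeat the remaining keys.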
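
On the save_task() rewrite in db.py: SQLite placeholders (`?`) can only bind values, not identifiers, which is why the table name is still spliced in with `str.format` while the per-task tuples built in crawler.save_queue() go through `executemany`. A rough self-contained sketch of the same pattern, assuming a hypothetical `page_tasks` table and an in-memory database purely for illustration:

```py
import sqlite3

db_conn = sqlite3.connect(':memory:')
db_conn.execute('create table page_tasks (url text, refer text, depth integer, failed_times integer)')

def save_task(db_conn, table_name, value_list):
    cursor = db_conn.cursor()
    ## identifiers cannot be bound as parameters, so the table name goes through str.format
    cursor.execute('delete from {:s}'.format(table_name))
    sql_str = 'insert into {:s} (url, refer, depth, failed_times) values(?, ?, ?, ?)'.format(table_name)
    ## value_list holds (url, refer, depth, failed_times) tuples, as crawler.save_queue() now builds them
    cursor.executemany(sql_str, value_list)
    db_conn.commit()
    cursor.close()

save_task(db_conn, 'page_tasks', [('https://m.xieeda.com/', '', 1, 0)])
print(db_conn.execute('select count(*) from page_tasks').fetchall())  # [(1,)]
```

The explicit `db_conn.commit()` added in the patch is what actually persists the saved queue; without it the delete/insert pair would be rolled back when the connection is closed.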