From 1dab5cb51f5d66c888920e465d8b9856e9ee3eb8 Mon Sep 17 00:00:00 2001
From: 4班助教 | 康康
Date: Mon, 8 Mar 2021 17:02:51 +0800
Subject: [PATCH] Revert 'Pull Request !741: 第十周_第二节 第一个 scrapy 爬虫项目'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../.keep               |   0
 .../items.py            |  18 ---
 .../jd_search_spider.py |  52 ---------
 .../middlewares.py      | 110 ------------------
 .../pipelines.py        |  32 -----
 .../proxymiddleware.txt |  25 ----
 .../run.py              |   4 -
 .../settings.py         | 103 ----------------
 .../.keep               |   0
 .../Scrapy.md           |  83 -------------
 .../quotes_spider.py    |  20 ----
 11 files changed, 447 deletions(-)
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/.keep
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/items.py
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/jd_search_spider.py
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/middlewares.py
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/pipelines.py
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/proxymiddleware.txt
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/run.py
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第三节/settings.py
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第二节/.keep
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第二节/Scrapy.md
 delete mode 100644 第二期训练营/2班/2班_chaos/第十周_第二节/quotes_spider.py

diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/.keep b/第二期训练营/2班/2班_chaos/第十周_第三节/.keep
deleted file mode 100644
index e69de29b..00000000
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/items.py b/第二期训练营/2班/2班_chaos/第十周_第三节/items.py
deleted file mode 100644
index b72f7a4c..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/items.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class JdSearchItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    sku_id = scrapy.Field()
-    img = scrapy.Field()
-    price = scrapy.Field()
-    title = scrapy.Field()
-    shop = scrapy.Field()
-    icons = scrapy.Field()
-
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/jd_search_spider.py b/第二期训练营/2班/2班_chaos/第十周_第三节/jd_search_spider.py
deleted file mode 100644
index 87b1a945..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/jd_search_spider.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import scrapy
-from bs4 import BeautifulSoup
-import json
-from W10_L3.jd_search.jd_search.items import JdSearchItem
-
-
-class JdSearchSpider(scrapy.Spider):
-    name = "jd_search"
-
-    def start_requests(self):
-        search_array = ["手机", "电脑", "显卡", "内存"]
-        for keyword in search_array:
-            for page in range(1, 4):
-                url = f'https://search.jd.com/Search?keyword={keyword}&page={page}'
-
-                yield scrapy.FormRequest(
-                    url=url,
-                    method='GET',
-                    callback=self.parse_search
-                )
-
-
-    def parse_search(self, response):
-        html = response.text
-        soup = BeautifulSoup(html, 'lxml')
-        content = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']")
-        for item in content:
-            try:
-                sku_id = item.attrs["data-sku"]
-                img = item.select("img[data-img='1']")
-                price = item.select("div[class='p-price']")
-                title = item.select("div[class='p-name p-name-type-2'] em")
-                shop = item.select("div[class='p-shop']")
-                icons = item.select("div[class='p-icons']")
-
-                img = img[0].attrs['data-lazy-img'] if img else ""
-                price = price[0].strong.i.text.strip() if price else ""
-                title = title[0].text.strip() if title else ""
-                shop = shop[0].text.strip() if shop else ""
-                icons = json.dumps([ele.text.strip() for ele in icons[0].select('i')]) if icons else '[]'
-
-                items = JdSearchItem()
-                items["sku_id"] = sku_id
-                items["img"] = img
-                items["price"] = price
-                items["title"] = title
-                items["shop"] = shop
-                items["icons"] = icons
-                yield items
-
-            except Exception as e:
-                print(e.args)
\ No newline at end of file
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/middlewares.py b/第二期训练营/2班/2班_chaos/第十周_第三节/middlewares.py
deleted file mode 100644
index 1c8b8fdc..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/middlewares.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-# useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
-
-
-class JdSearchSpiderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, or item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Request or item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
-
-
-class JdSearchDownloaderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_request(self, request, spider):
-        # Called for each request that goes through the downloader
-        # middleware.
-
-        # Must either:
-        # - return None: continue processing this request
-        # - or return a Response object
-        # - or return a Request object
-        # - or raise IgnoreRequest: process_exception() methods of
-        #   installed downloader middleware will be called
-        return None
-
-    def process_response(self, request, response, spider):
-        # Called with the response returned from the downloader.
-
-        # Must either;
-        # - return a Response object
-        # - return a Request object
-        # - or raise IgnoreRequest
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Called when a download handler or a process_request()
-        # (from other downloader middleware) raises an exception.
-
-        # Must either:
-        # - return None: continue processing this exception
-        # - return a Response object: stops process_exception() chain
-        # - return a Request object: stops process_exception() chain
-        pass
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
-
-
-class JdSearchUAMiddleware:
-
-    def process_request(self, request, spider):
-        # This method is used by Scrapy to add user agent headers.
-        request.headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/pipelines.py b/第二期训练营/2班/2班_chaos/第十周_第三节/pipelines.py
deleted file mode 100644
index 79f7f7c3..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/pipelines.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-# useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
-import pymysql
-from W10_L3.jd_search.jd_search.items import JdSearchItem
-
-
-class JdSearchPipeline:
-
-    def __init__(self):
-        self.mysql_con = None
-
-    def process_item(self, item, spider):
-        if not self.mysql_con:
-            self.mysql_con = pymysql.connect(**spider.settings['MYSQL_CONF'])
-
-        if isinstance(item, JdSearchItem):
-            cursor = self.mysql_con.cursor()
-            SQL = """INSERT INTO jd_search(sku_id, img, price, title, shop, icons)
-            VALUES ('{}', '{}', '{}', '{}', '{}', '{}')""".format(
-                item['sku_id'], item['img'], item['price'], item['title'], item['shop'], item['icons']
-            )
-            cursor.execute(SQL)
-            self.mysql_con.commit()
-            cursor.close()
-
-        return item
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/proxymiddleware.txt b/第二期训练营/2班/2班_chaos/第十周_第三节/proxymiddleware.txt
deleted file mode 100644
index 3d030072..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/proxymiddleware.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-
-use HttpProxyMiddleware
-"""
-This middleware sets the HTTP proxy to use for requests, by setting the proxy meta value for Request objects.
-You can also set the meta key proxy per-request,
-a value like http://some_proxy_server:port or http://username:password@some_proxy_server:port
-"""
-
-1. enable HttpProxyMiddleware in settings.py
-
-DOWNLOADER_MIDDLEWARES = {
-    #'jd_search.middlewares.JdSearchDownloaderMiddleware': 543,
-    'jd_search.middlewares.JdSearchUAMiddleware': 100,
-    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100
-}
-
-2. add meta in request
-yield scrapy.FormRequest(
-    url=url,
-    method='GET',
-    meta={
-        'proxy': 'http://proxy_ip:port'
-    },
-    callback=self.parse_search
-)
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/run.py b/第二期训练营/2班/2班_chaos/第十周_第三节/run.py
deleted file mode 100644
index bb7a2068..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/run.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from scrapy import cmdline as cmd
-
-command = "scrapy crawl jd_search".split()
-cmd.execute(command)
\ No newline at end of file
diff --git a/第二期训练营/2班/2班_chaos/第十周_第三节/settings.py b/第二期训练营/2班/2班_chaos/第十周_第三节/settings.py
deleted file mode 100644
index 00c25bf3..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第三节/settings.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Scrapy settings for jd_search project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = 'jd_search'
-
-SPIDER_MODULES = ['jd_search.spiders']
-NEWSPIDER_MODULE = 'jd_search.spiders'
-
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'jd_search (+http://www.yourdomain.com)'
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = False
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-#}
-
-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
-#    'jd_search.middlewares.JdSearchSpiderMiddleware': 543,
-#}
-
-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-DOWNLOADER_MIDDLEWARES = {
-    #'jd_search.middlewares.JdSearchDownloaderMiddleware': 543,
-    'jd_search.middlewares.JdSearchUAMiddleware': 100,
-    #'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100
-}
-
-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
-#    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    'jd_search.pipelines.JdSearchPipeline': 300,
-}
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-
-# MYSQL CONF
-MYSQL_CONF = {
-    'host': '127.0.0.1',
-    'port': 3306,
-    'user': 'root',
-    'password': 'wxx33043',
-    'db': 'jd_search'
-}
-
-# Log setting
-LOG_FILE = "C:\\Users\\chaos\\Desktop\\scrapy_log\\execution.log"
-LOG_LEVEL = "DEBUG"
\ No newline at end of file
diff --git a/第二期训练营/2班/2班_chaos/第十周_第二节/.keep b/第二期训练营/2班/2班_chaos/第十周_第二节/.keep
deleted file mode 100644
index e69de29b..00000000
diff --git a/第二期训练营/2班/2班_chaos/第十周_第二节/Scrapy.md b/第二期训练营/2班/2班_chaos/第十周_第二节/Scrapy.md
deleted file mode 100644
index ea17cf2b..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第二节/Scrapy.md
+++ /dev/null
@@ -1,83 +0,0 @@
-# Scrapy
-
-## Architecture
-
-### Data flow
-
-![](https://docs.scrapy.org/en/latest/_images/scrapy_architecture_02.png)
-
-The data flow in Scrapy is controlled by the execution engine and goes like this:
-
-1. The `Engine` gets the initial requests to crawl from the `Spider`
-2. The `Engine` schedules the requests in the `Scheduler` and asks for the next requests to crawl
-3. The `Scheduler` returns the next requests to the `Engine`
-4. The `Engine` sends the requests to the `Downloader`, passing through the `Downloader Middleware`
-5. Once the page finishes downloading, the `Downloader` returns a response to the `Engine`, passing through the `Downloader Middleware`
-6. The `Engine` receives the response and sends it to the `Spider` for parsing, passing through the `Spider Middleware`
-7. After the `Spider` has parsed the response, it returns scraped items and a new request to the `Engine`, passing through the `Spider Middleware`
-8. The `Engine` sends the scraped items to the `Item Pipelines`, then sends that new request to the `Scheduler` and asks for the next requests to crawl
-9. The process repeats until there are no more requests from the `Scheduler`
-
-### Event-driven networking framework
-
-Scrapy is written with Twisted, a popular event-driven networking framework for Python. Thus, it's implemented using non-blocking (aka **`asynchronous`**) code for concurrency.
-
-## Scrapy Tutorial
-
-### Creating a project
-
-```
-scrapy startproject
-```
-
-Directory structure
-
-```
-tutorial/
-|___scrapy.cfg           # deployment configuration file
-|___tutorial/
-    |___ __init__.py
-    |___ items.py        # item definitions
-    |___ middleware.py   # middleware definitions
-    |___ pipelines.py    # pipeline definitions
-    |___ settings.py     # project settings
-    |___ spiders/        # spiders directory
-        |___ __init__.py
-```
-
-### First Spider
-
-Under the spiders directory, create a new quotes_spider.py:
-
-```python
-import scrapy
-
-
-class QuotesSpider(scrapy.Spider):
-    name = "quotes"
-
-    def start_requests(self):
-        urls = [
-            'http://quotes.toscrape.com/page/1/',
-            'http://quotes.toscrape.com/page/2/',
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
-
-    def parse(self, response):
-        page = response.url.split("/")[-2]
-        filename = f'quotes-{page}.html'
-        with open(filename, 'wb') as f:
-            f.write(response.body)
-        self.log(f'Saved file {filename}')
-```
-
-### Run our spider
-
-```
-# run the command below from the top-level directory of the project
-scrapy crawl
-```
-
-### Extracting data
-
diff --git a/第二期训练营/2班/2班_chaos/第十周_第二节/quotes_spider.py b/第二期训练营/2班/2班_chaos/第十周_第二节/quotes_spider.py
deleted file mode 100644
index 2d058723..00000000
--- a/第二期训练营/2班/2班_chaos/第十周_第二节/quotes_spider.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import scrapy
-
-
-class QuoteSpider(scrapy.Spider):
-    name = "quotes"
-
-    def start_requests(self):
-        urls = [
-            'http://quotes.toscrape.com/page/1/',
-            'http://quotes.toscrape.com/page/2/',
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
-
-    def parse(self, response):
-        page = response.split("/")[-2]
-        filename = f'quote_{page}.html'
-        with open(filename, 'wb') as f:
-            f.write(response.body)
-        self.log(f'Saved file {filename}')
--
Gitee