From 8451fa31a236df6ed249d4a55aa0097f52295a30 Mon Sep 17 00:00:00 2001 From: Suyongzhi1997 <1125699801@qq.com> Date: Wed, 10 Mar 2021 11:42:34 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E6=96=B0=E7=8E=AF=E5=A2=83=E6=B5=8B?= =?UTF-8?q?=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" | 1 + 1 file changed, 1 insertion(+) diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" index 5b716544..119f5513 100644 --- "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" @@ -6,6 +6,7 @@ @Desc : """ import scrapy +import requests class JdSpider(scrapy.Spider): -- Gitee From 57cd2cbdb6fbbcae62628f68864bcf39144d3919 Mon Sep 17 00:00:00 2001 From: Suyongzhi1997 <1125699801@qq.com> Date: Wed, 10 Mar 2021 12:13:12 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E7=AC=AC=E5=8D=81=E5=91=A8-=E7=AC=AC?= =?UTF-8?q?=E4=B8=89=E8=8A=82-=E4=BD=9C=E4=B8=9A=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../week10/jd_crawlers/jd_crawlers/items.py" | 7 +- .../jd_crawlers/jd_crawlers/middlewares.py" | 6 ++ .../jd_crawlers/jd_crawlers/pipelines.py" | 19 +++++- .../jd_crawlers/jd_crawlers/settings.py" | 67 ++++++++++--------- .../jd_crawlers/spiders/jd_spider.py" | 33 ++++++++- 5 files changed, 98 insertions(+), 34 deletions(-) diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py" index 67c1e691..687a2819 100644 --- "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py" +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py" @@ -9,4 +9,9 @@ import scrapy class JdCrawlersItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() - pass + sku_id = scrapy.Field() + img = scrapy.Field() + price = scrapy.Field() + title = scrapy.Field() + shop = scrapy.Field() + icons = scrapy.Field() diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py" index e8457d0c..059b562b 100644 --- "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py" +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py" @@ -101,3 +101,9 @@ class JdCrawlersDownloaderMiddleware: def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) + + +class UAMiddleware: + def process_request(self, request, spider): + request.headers[ + 'user-agent'] = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)' diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py" index ccc9b0a4..ac58d99d 100644 --- "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py" +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py" @@ -5,9 +5,26 @@ # useful for handling different item types with a single interface +import pymysql from itemadapter import ItemAdapter +from jd_crawlers.items import JdCrawlersItem + class JdCrawlersPipeline: + def __init__(self): + self.mysql_con = None + def process_item(self, item, spider): - return item + if not self.mysql_con: + self.mysql_con = pymysql.connect(**spider.settings['MYSQL_CONF']) + + if isinstance(item, JdCrawlersItem): + cursor = self.mysql_con.cursor() + SQL = """INSERT INTO jd_search(sku_id, img, price, title, shop, icons) + VALUES ('{}', '{}', '{}', '{}', '{}', '{}')""".format( + item['sku_id'], item['img'], item['price'], item['title'], item['shop'], item['icons'] + ) + cursor.execute(SQL) + self.mysql_con.commit() + cursor.close() diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/settings.py" index 846c0a1d..008286e2 100644 --- "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/settings.py" +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/settings.py" @@ -12,77 +12,84 @@ BOT_NAME = 'jd_crawlers' SPIDER_MODULES = ['jd_crawlers.spiders'] NEWSPIDER_MODULE = 'jd_crawlers.spiders' - # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'jd_crawlers (+http://www.yourdomain.com)' +# USER_AGENT = 'jd_crawlers (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'jd_crawlers.middlewares.JdCrawlersSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'jd_crawlers.middlewares.JdCrawlersDownloaderMiddleware': 543, -#} +DOWNLOADER_MIDDLEWARES = { + 'jd_crawlers.middlewares.JdCrawlersDownloaderMiddleware': 543, +} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'jd_crawlers.pipelines.JdCrawlersPipeline': 300, -#} +ITEM_PIPELINES = { + 'jd_crawlers.pipelines.JdCrawlersPipeline': 300, +} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +MYSQL_CONF = { + 'host': '127.0.0.1', + 'user': 'root', + 'password': '123456', + 'db': 'test' +} + diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" index 119f5513..43b6966d 100644 --- "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" @@ -6,7 +6,11 @@ @Desc : """ import scrapy -import requests +import json + +from bs4 import BeautifulSoup + +from jd_crawlers.items import JdCrawlersItem class JdSpider(scrapy.Spider): @@ -23,4 +27,29 @@ class JdSpider(scrapy.Spider): ) def parse_jd(self, response): - print(response) + + soup = BeautifulSoup(response.text, "lxml") + item_array = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']") + for item in item_array: + sku_id = item.attrs['data-sku'] + img = item.select("img[data-img='1']") + price = item.select("div[class='p-price']") + title = item.select("div[class='p-name p-name-type-2']") + shop = item.select("div[class='p-shop']") + icons = item.select("div[class='p-icons']") + + img = img[0].attrs['data-lazy-img'] if img else "" + price = price[0].strong.i.text if price else "" + title = title[0].text.strip() if title else "" + shop = shop[0].span.a.attrs['title'] if shop[0].text.strip() else "" + icons = json.dumps([tag_ele.text for tag_ele in icons[0].select('i')]) if icons else '[]' + + items = JdCrawlersItem() + items['sku_id'] = sku_id + items['img'] = img + items['price'] = price + items['title'] = title + items['shop'] = shop + items['icons'] = icons + + yield item -- Gitee From a06b0867ee8a14eef57ccaabf2c693703bd70cbd Mon Sep 17 00:00:00 2001 From: Suyongzhi1997 <1125699801@qq.com> Date: Tue, 16 Mar 2021 19:23:17 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E7=AC=AC=E5=8D=81=E4=B8=80=E5=91=A8-?= =?UTF-8?q?=E4=B8=80=E4=BA=8C=E4=B8=89=E8=8A=82-=E4=BD=9C=E4=B8=9A?= =?UTF-8?q?=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jd_crawlers/jd_crawlers/__init__.py" | 0 .../week11/jd_crawlers/jd_crawlers/items.py" | 17 ++ .../jd_crawlers/jd_crawlers/middlewares.py" | 169 ++++++++++++++++++ .../jd_crawlers/jd_crawlers/pipelines.py" | 30 ++++ .../jd_crawlers/jd_crawlers/settings.py" | 98 ++++++++++ .../jd_crawlers/spiders/__init__.py" | 4 + .../jd_crawlers/spiders/jd_spider.py" | 62 +++++++ .../week11/jd_crawlers/scrapy.cfg" | 11 ++ .../week11/redis_test.py" | 71 ++++++++ 9 files changed, 462 insertions(+) create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/__init__.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/items.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/middlewares.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/pipelines.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/settings.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/__init__.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/jd_spider.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/scrapy.cfg" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/redis_test.py" diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/__init__.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/__init__.py" new file mode 100644 index 00000000..e69de29b diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/items.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/items.py" new file mode 100644 index 00000000..687a2819 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/items.py" @@ -0,0 +1,17 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class JdCrawlersItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + sku_id = scrapy.Field() + img = scrapy.Field() + price = scrapy.Field() + title = scrapy.Field() + shop = scrapy.Field() + icons = scrapy.Field() diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/middlewares.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/middlewares.py" new file mode 100644 index 00000000..d03640fa --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/middlewares.py" @@ -0,0 +1,169 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + +from scrapy.downloadermiddlewares.retry import RetryMiddleware +from scrapy.utils.response import response_status_message + +from scrapy.dupefilters import RFPDupeFilter +from w3lib.url import canonicalize_url +from scrapy.utils.python import to_bytes +import hashlib +import weakref + + +class JdCrawlersSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class JdCrawlersDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class UAMiddleware: + def process_request(self, request, spider): + request.headers[ + 'user-agent'] = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)' + + +class ProxyMiddleware: + def __init__(self): + self.proxies = {} + + def process_request(self, request, spider): + if 'proxy' in request.meta: + if request.meta['proxy'] is None: + return + request.meta['proxy'] = 'http://110.52.224.130:4246' + elif not self.proxies: + return + + def process_exception(self, request, exception, spider): + if isinstance(exception, IndexError): + retry_times = request.meta.get('retry_times', 1) + request.meta['retry_times'] = retry_times - 1 + return request + + +class MyRetryMiddleware(RetryMiddleware): + """ + 重试中间件 + """ + + def process_response(self, request, response, spider): + if request.meta.get('dont_retry', False): + return response + if '验证码' in response.text: + reason = response_status_message(response.status) + return self._retry(request, reason, spider) or response + return response + + +class MyRFPDupeFilter(RFPDupeFilter): + """ + 过滤器中间件 + """ + + def request_fingerprint(self, request, include_headers=None, keep_fragments=False): + cache = _fingerprint_cache.setdefault(request, {}) + cache_key = (include_headers, keep_fragments) + if cache_key not in cache: + fp = hashlib.sha1() + fp.update(to_bytes(request.method)) + fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments))) + fp.update(request.body or b'') + fp.update(request.meta.get("batch_no", "").encode("utf-8")) + cache[cache_key] = fp.hexdigest() + return cache[cache_key] diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/pipelines.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/pipelines.py" new file mode 100644 index 00000000..ac58d99d --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/pipelines.py" @@ -0,0 +1,30 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +import pymysql +from itemadapter import ItemAdapter + +from jd_crawlers.items import JdCrawlersItem + + +class JdCrawlersPipeline: + def __init__(self): + self.mysql_con = None + + def process_item(self, item, spider): + if not self.mysql_con: + self.mysql_con = pymysql.connect(**spider.settings['MYSQL_CONF']) + + if isinstance(item, JdCrawlersItem): + cursor = self.mysql_con.cursor() + SQL = """INSERT INTO jd_search(sku_id, img, price, title, shop, icons) + VALUES ('{}', '{}', '{}', '{}', '{}', '{}')""".format( + item['sku_id'], item['img'], item['price'], item['title'], item['shop'], item['icons'] + ) + cursor.execute(SQL) + self.mysql_con.commit() + cursor.close() diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/settings.py" new file mode 100644 index 00000000..c60b2bd6 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/settings.py" @@ -0,0 +1,98 @@ +# Scrapy settings for jd_crawlers project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'jd_crawlers' + +SPIDER_MODULES = ['jd_crawlers.spiders'] +NEWSPIDER_MODULE = 'jd_crawlers.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = 'jd_crawlers (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'jd_crawlers.middlewares.JdCrawlersSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { + 'jd_crawlers.middlewares.ProxyMiddleware': 543, + 'jd_crawlers.middlewares.UAMiddleware': 544, + 'jd_crawlers.middlewares.MyRetryMiddleware': 545, + 'jd_crawlers.middlewares.MyRFPDupeFilter': 546, + 'jd_crawlers.middlewares.JdCrawlersDownloaderMiddleware': 547, +} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'jd_crawlers.pipelines.JdCrawlersPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +MYSQL_CONF = { + 'host': '127.0.0.1', + 'user': 'root', + 'password': '123456', + 'db': 'test' +} diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/__init__.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/__init__.py" new file mode 100644 index 00000000..ebd689ac --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/__init__.py" @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/jd_spider.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/jd_spider.py" new file mode 100644 index 00000000..82a43693 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/jd_crawlers/spiders/jd_spider.py" @@ -0,0 +1,62 @@ +# -*- coding: UTF-8 -*- +""" +@File :jd_spider.py +@Author :Super +@Date :2021/3/4 +@Desc : +""" +import scrapy +import json + +from bs4 import BeautifulSoup + +from jd_crawlers.items import JdCrawlersItem + + +class JdSpider(scrapy.Spider): + name = 'jd_spider' + + def start_requests(self): + for item in ["鼠标", "键盘", "显卡", "耳机"]: + for page in range(1, 10): + url = f'https://search.jd.com/Search?keyword={item}&wq={item}&page={page}' + yield scrapy.FormRequest( + url=url, + method='GET', + callback=self.parse_jd, # 指定回调函数处理response对象 + errback=self.process_error + ) + + def parse_jd(self, response): + + soup = BeautifulSoup(response.text, "lxml") + item_array = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']") + for item in item_array: + sku_id = item.attrs['data-sku'] + img = item.select("img[data-img='1']") + price = item.select("div[class='p-price']") + title = item.select("div[class='p-name p-name-type-2']") + shop = item.select("div[class='p-shop']") + icons = item.select("div[class='p-icons']") + + img = img[0].attrs['data-lazy-img'] if img else "" + price = price[0].strong.i.text if price else "" + title = title[0].text.strip() if title else "" + shop = shop[0].span.a.attrs['title'] if shop[0].text.strip() else "" + icons = json.dumps([tag_ele.text for tag_ele in icons[0].select('i')]) if icons else '[]' + + items = JdCrawlersItem() + items['sku_id'] = sku_id + items['img'] = img + items['price'] = price + items['title'] = title + items['shop'] = shop + items['icons'] = icons + + yield item + + def process_error(self, failure): + print(failure) + # 记录异常 + # 发送通知 + # 任务重做 diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/scrapy.cfg" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/scrapy.cfg" new file mode 100644 index 00000000..0efdfa54 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/jd_crawlers/scrapy.cfg" @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = jd_crawlers.settings + +[deploy] +#url = http://localhost:6800/ +project = jd_crawlers diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/redis_test.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/redis_test.py" new file mode 100644 index 00000000..2c176523 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week11/redis_test.py" @@ -0,0 +1,71 @@ +# -*- coding: UTF-8 -*- +""" +@File : redis_test.py +@Author : SuYongZhi +@Date : 2021/3/16 +@Desc : +""" + +import redis + +r = redis.Redis(host='127.0.0.1', port=6379, db=2) + +# String 操作 +r.set('name', 'suyongzhi') # 插入一条数据 +r.mset({"name1": 'zhangsan', "name2": 'lisi'}) # 插入多条数据 + +print(r.get('name')) # 获取一个值 +print(r.mget("name1", "name2")) # 获取多个值 + +# 设置新值,打印原值 +print(r.getset("name1", "wangwu")) # 输出:zhangsan +print(r.get("name1")) # 输出:wangwu + +# Hash操作 + +# name对应的hash中设置一个键值对(不存在,则创建,否则,修改) +r.hset("dic_name", "a1", "aa") +# 在name对应的hash中根据key获取value +print(r.hget("dic_name", "a1")) # 输出:aa + +# 获取name对应hash的所有键值 +print(r.hgetall("dic_name")) + +# 在name对应的hash中批量设置键值对,mapping:字典 +dic = {"a1": "aa", "b1": "bb"} +r.hmset("dic_name", dic) +print(r.hget("dic_name", "b1")) # 输出:bb + +# List操作 +# 在name对应的list中添加元素,每个新的元素都添加到列表的最左边 +# r.lpush("list_name", 2) +# r.lpush("list_name", 3, 4, 5) # 保存在列表中的顺序为5,4,3,2 +# name对应的list元素的个数 +print(r.llen("list_name")) + +# 在name对应的列表的某一个值前或后插入一个新值 +r.linsert("list_name", "BEFORE", "2", "SS") # 在列表内找到第一个元素2,在它前面插入SS + +# 对list中的某一个索引位置重新赋值 +r.lset("list_name", 0, "bbb") +# 删除name对应的list中的指定值 +# r.lrem("list_name", "SS", value=0) + +# Set操作 +# 给name对应的集合中添加元素 +r.sadd("set_name", "aa") +r.sadd("set_name", "aa", "bb") + +# 在第一个name对应的集合中且不在其他name对应的集合的元素集合 +r.sadd("set_name", "aa", "bb") +r.sadd("set_name1", "bb", "cc") +r.sadd("set_name2", "bb", "cc", "dd") + +print(r.sdiff("set_name", "set_name1", "set_name2")) # 输出:{aa} + +# 获取多个name对应集合的并集 +r.sadd("set_name", "aa", "bb") +r.sadd("set_name1", "bb", "cc") +r.sadd("set_name2", "bb", "cc", "dd") + +print(r.sinter("set_name", "set_name1", "set_name2")) # 输出:{bb} -- Gitee