From 76c9608da8c61590630bc769d3ee083d35b0e533 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A6=8F=E6=9D=A5Lee?=
Date: Mon, 22 Mar 2021 07:33:47 +0800
Subject: [PATCH] =?UTF-8?q?add=20=E7=AC=AC=E4=BA=8C=E6=9C=9F=E8=AE=AD?=
 =?UTF-8?q?=E7=BB=83=E8=90=A5/3=E7=8F=AD/3=E7=8F=AD=5F=E6=9D=8E=E6=B3=8A/?=
 =?UTF-8?q?=E7=AC=AC=E5=8D=81=E4=BA=8C=E5=91=A8/=E7=AC=AC=E4=B8=80?=
 =?UTF-8?q?=E8=8A=82/jd=5Fsearch.py.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../jd_search.py" | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/3\347\217\255/3\347\217\255_\346\235\216\346\263\212/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/jd_search.py"

diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/3\347\217\255/3\347\217\255_\346\235\216\346\263\212/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/jd_search.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/3\347\217\255/3\347\217\255_\346\235\216\346\263\212/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/jd_search.py"
new file mode 100644
index 00000000..ad95b488
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/3\347\217\255/3\347\217\255_\346\235\216\346\263\212/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/jd_search.py"
@@ -0,0 +1,64 @@
+import scrapy
+import json
+from bs4 import BeautifulSoup
+from jd_crawler_scrapy.items import JdCrawlerScrapyItem
+import time
+from scrapy.exceptions import CloseSpider
+from scrapy_redis.spiders import RedisSpider
+from scrapy import Spider
+
+class JdSearch(RedisSpider):
+    name = "jd_search"
+    redis_key = f"{name}:start_urls"
+
+    def make_request_from_data(self, data):
+        task = json.loads(data.decode("utf-8"))  # each Redis queue entry is a JSON-encoded task
+        return scrapy.http.FormRequest(url=task['url'],
+                                       formdata=json.loads(task['body']) if task['body'] else None,
+                                       method=task['method'],
+                                       meta=task['meta'],
+                                       dont_filter=False,
+                                       callback=self.parse_search,
+                                       errback=self.process_error
+                                       )
+
+    def parse_search(self, response):
+        print(response)
+        soup = BeautifulSoup(response.text, "lxml")
+        item_array = soup.select("ul[class='gl-warp clearfix'] li[class=gl-item]")
+        for item in item_array:
+            try:
+                sku_id = item.attrs["data-sku"]
+                img = item.select("img[data-img='1']")
+                price = item.select("div[class='p-price']")
+                title = item.select("div[class='p-name p-name-type-2']")
+                shop = item.select("div[class='p-shop']")
+                icons = item.select("div[class='p-icons']")
+
+                img = img[0].attrs['data-lazy-img'] if img else ""
+                price = price[0].strong.i.text if price else ""
+                title = title[0].text.strip() if title else ""
+                shop = shop[0].span.a.attrs['title'] if shop and shop[0].text.strip() else ""
+                icons = json.dumps([tag_ele.text for tag_ele in icons[0].select("i")]) if icons else '[]'
+
+                item = JdCrawlerScrapyItem()
+                item["sku_id"] = sku_id
+                item["img"] = img
+                item["price"] = price
+                item["title"] = title
+                item["shop"] = shop
+                item["icons"] = icons
+                item["sta_date"] = response.meta["sta_date"]
+                item["keyword"] = response.meta["keyword"]
+                yield item
+
+            except Exception as e:
+                print(e.args)
+
+    def process_error(self, failure):
+        print(failure)
+        if "身份已过期" in str(failure.value):  # error text means "identity/session expired"
+            raise CloseSpider("identity expired")  # stop the spider once the login session is no longer valid
+        # record the exception
+        # send a notification
+        # redo the task
--
Gitee
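
For context when running this spider: it pulls its work from the "jd_search:start_urls" Redis list, and make_request_from_data expects each entry to be a JSON object with url, method, body and meta fields, where meta must carry sta_date and keyword for parse_search. The snippet below is a minimal seeding sketch, not part of the patch; it assumes a local Redis instance, and the url and keyword values are purely illustrative.

    import json
    import redis

    # Assumed local Redis; adjust host/port to match the project's scrapy-redis settings.
    r = redis.Redis(host="localhost", port=6379)

    task = {
        "url": "https://search.jd.com/Search",                     # illustrative search URL
        "method": "GET",
        "body": "",                                                 # empty -> formdata becomes None in make_request_from_data
        "meta": {"sta_date": "2021-03-22", "keyword": "手机"},      # fields read back in parse_search
    }

    # Push one task onto the spider's redis_key; the spider decodes and
    # json.loads() it in make_request_from_data.
    r.lpush("jd_search:start_urls", json.dumps(task))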