diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/scripts/jd_producer.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/scripts/jd_producer.py" new file mode 100644 index 0000000000000000000000000000000000000000..81c81f2c46eb01afbcebc8dccc2296f482da0542 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/scripts/jd_producer.py" @@ -0,0 +1,28 @@ +import redis +import time +import json + +redis_con = redis.Redis(host='localhost', port=6379, db=5) + + +def search_producer(): + for keyword in ["鼠标", "键盘", "显卡", "耳机"]: + for page_num in range(1, 11): + url = f"https://search.jd.com/Search?keyword={keyword}&page={page_num}" + meta = { + "sta_date": time.strftime("%Y-%m-%d"), + "keyword": keyword, + "page_num": page_num + } + + task = json.dumps({ + "url": url, + "body": '', + "method": "GET", + "meta": meta + }) + redis_con.lpush("jd_search:start_urls", task) + + +if __name__ == "__main__": + search_producer() \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/settings.py" new file mode 100644 index 0000000000000000000000000000000000000000..a055fd330267254e7cf8414017bf938d0e82213e --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/settings.py" @@ -0,0 +1,125 @@ +# Scrapy settings for jd_crawler_scrapy project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/settings.py"
new file mode 100644
index 0000000000000000000000000000000000000000..a055fd330267254e7cf8414017bf938d0e82213e
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/settings.py"
@@ -0,0 +1,125 @@
+# Scrapy settings for jd_crawler_scrapy project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'jd_crawler_scrapy'
+
+SPIDER_MODULES = ['jd_crawler_scrapy.spiders']
+NEWSPIDER_MODULE = 'jd_crawler_scrapy.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'jd_crawler_scrapy (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Disable redirects
+REDIRECT_ENABLED = False
+
+# Disable retries
+RETRY_ENABLED = False
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
+
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 1
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'jd_crawler_scrapy.middlewares.JdCrawlerScrapySpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    # 'jd_crawler_scrapy.middlewares.JdCrawlerScrapyDownloaderMiddleware': 543,
+    'jd_crawler_scrapy.middlewares.UAMiddleware': 100,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'jd_crawler_scrapy.pipelines.JdCrawlerScrapyPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+
+# MYSQL CONF
+MYSQL_CONF = {
+    "host": "127.0.0.1",
+    "user": "root",
+    "password": "0000",
+    "db": "tunan_class_4"
+}
+
+
+# LOG
+# LOG_FILE = "F:/log/jd_search.log"
+LOG_LEVEL = "DEBUG"
+
+# Scrapy-redis settings
+SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
+SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
+DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
+
+SCHEDULER_PERSIST = True
+
+# Redis settings
+REDIS_HOST = 'localhost'
+REDIS_PORT = 6379
+# Redis connection parameters (db 5 matches the producer script)
+REDIS_PARAMS = {"db": 5}
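Note: DOWNLOADER_MIDDLEWARES enables 'jd_crawler_scrapy.middlewares.UAMiddleware', but middlewares.py is not included in this diff. A minimal sketch of what such a middleware could look like, assuming it only sets the User-Agent header (the class body and the user-agent pool are illustrative guesses, not the author's code):

    import random

    # Illustrative pool; the real middleware may source user agents differently.
    USER_AGENT_POOL = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    ]


    class UAMiddleware:
        def process_request(self, request, spider):
            # Runs for every outgoing request; returning None lets processing continue.
            request.headers["User-Agent"] = random.choice(USER_AGENT_POOL)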
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/spiders/jd_search.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/spiders/jd_search.py"
new file mode 100644
index 0000000000000000000000000000000000000000..ebf853e9f257876bbc3822d8905008d367ece170
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\270\200\350\212\202/spiders/jd_search.py"
@@ -0,0 +1,68 @@
+import scrapy
+from bs4 import BeautifulSoup
+import json
+from ..items import JdCrawlerScrapyItem
+from twisted.internet.error import TimeoutError
+from scrapy.spidermiddlewares.httperror import HttpError
+from scrapy.exceptions import CloseSpider
+from scrapy_redis.spiders import RedisSpider
+
+
+class JdSearchSpider(RedisSpider):
+    name = "jd_search"
+    redis_key = f"{name}:start_urls"
+
+    def make_request_from_data(self, data):
+        # Each Redis entry is the JSON task pushed by scripts/jd_producer.py.
+        task = json.loads(data.decode("utf-8"))
+        return scrapy.http.FormRequest(url=task['url'],
+                                       formdata=json.loads(task['body']) if task['body'] else None,
+                                       method=task['method'],
+                                       meta=task['meta'],
+                                       dont_filter=False,
+                                       callback=self.parse_search,
+                                       errback=self.process_error)
+
+    def parse_search(self, response):
+        html = response.text
+        soup = BeautifulSoup(html, 'lxml')
+        content = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']")
+        for item in content:
+            try:
+                sku_id = item.attrs["data-sku"]
+                img = item.select("img[data-img='1']")
+                price = item.select("div[class='p-price']")
+                title = item.select("div[class='p-name p-name-type-2'] em")
+                shop = item.select("div[class='p-shop']")
+                icons = item.select("div[class='p-icons']")
+
+                img = img[0].attrs['data-lazy-img'] if img else ""
+                price = price[0].strong.i.text.strip() if price else ""
+                title = title[0].text.strip() if title else ""
+                shop = shop[0].text.strip() if shop else ""
+                icons = json.dumps([ele.text.strip() for ele in icons[0].select('i')]) if icons else '[]'
+
+                items = JdCrawlerScrapyItem()
+                items["sku_id"] = sku_id
+                items["img"] = img
+                items["price"] = price
+                items["title"] = title
+                items["shop"] = shop
+                items["icons"] = icons
+                yield items
+
+            except Exception as e:
+                print(e.args)
+
+    def err_back(self, failure):
+        # Alternative errback that distinguishes timeouts from HTTP errors (not registered above).
+        if failure.check(TimeoutError):
+            print('Timeout error on %s' % failure.request.url)
+        elif failure.check(HttpError):
+            response = failure.value.response
+            print('HttpError on %s' % response.url)
+
+    def process_error(self, failure):
+        # Errback registered in make_request_from_data; stop the crawl if the session has expired.
+        print(failure)
+        if "身份已过期" in str(failure.value):
+            raise CloseSpider("identity expired")
\ No newline at end of file
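Note: the spider imports JdCrawlerScrapyItem from ..items, and items.py is also absent from this diff. Assuming it simply declares the six fields assigned in parse_search, a sketch would be:

    import scrapy


    class JdCrawlerScrapyItem(scrapy.Item):
        # One Field per value populated in parse_search.
        sku_id = scrapy.Field()
        img = scrapy.Field()
        price = scrapy.Field()
        title = scrapy.Field()
        shop = scrapy.Field()
        icons = scrapy.Field()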
"b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\272\214\344\270\211\350\212\202/jd_search.py" new file mode 100644 index 0000000000000000000000000000000000000000..1472bf5dc92876e660d197900f591635fe614e04 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/4\347\217\255/4\347\217\255_LiPing/\347\254\254\345\215\201\344\272\214\345\221\250/\347\254\254\344\272\214\344\270\211\350\212\202/jd_search.py" @@ -0,0 +1,57 @@ +from selenium import webdriver +from jd_crawler.jd_parser import search +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +import time + + +driver_path = r"F:\PyWorkSpace\chromedriver\chromedriver.exe" + +class JdCrawler: + def __init__(self, proxy=None, ua=None, headless=False, user_dir=None): + chrome_option = webdriver.ChromeOptions() + chrome_option.add_argument("disable-blink-features=AutomationControlled") + if proxy: + chrome_option.add_argument(f"--proxy-server={proxy}") + if ua: + chrome_option.add_argument(f"--user-agent={ua}") + if headless: + chrome_option.add_argument("--headless") + if user_dir: + chrome_option.add_argument(f"--user-data-dir={user_dir}") + + self.browser = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_option) + + self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocumet", { + "source": """ + Object.defineProperty(navigator, 'webdriver', { + get: ()=> 'my_webdriver' + }) + """ + }) + + + def sim_search(self, keyword, url): + self.browser.get(url) + search_input = self.browser.find_element_by_css_selector('input[aria-label="搜索"]') + search_input.send_keys(keyword) + seach_button = self.browser.find_element_by_css_selector('button[aria-label="搜索"]') + seach_button.click() + + def main(self, keyword, url): + self.sim_search(keyword, url) + time.sleep(3) + self.browser.execute_script("window.open('http://baidu.com')") + item_array = search.parser_jd_item(self.browser.page_source) + print(item_array) + self.browser.close() + +if __name__ == "__main__": + jd_url = "https://www.jd.com/" + proxy = "127.0.0.1:8888" + # proxy = None + ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36" + user_dir = r"F:\PyWorkSpace\user_data\tmp" + jd_search = JdCrawler(proxy=proxy, ua=ua, headless=False, user_dir=user_dir) + jd_search.main("鼠标", jd_url)