diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/Scrapy\344\270\255\351\227\264\344\273\266\345\217\212debug-10-3noteyun.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/Scrapy\344\270\255\351\227\264\344\273\266\345\217\212debug-10-3noteyun.md" new file mode 100644 index 0000000000000000000000000000000000000000..4364a5132190977964187bcd2b540b46d6af7d6a --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/Scrapy\344\270\255\351\227\264\344\273\266\345\217\212debug-10-3noteyun.md" @@ -0,0 +1,84 @@ +10-3noteyun- + +# Scrapy的启动和debug + +- 命令行 + + ```python + scrapy crawl jd_search #在命令行中 + ``` + +- 启动脚本 + + ```python + # 新建run.py脚本 + + from scrapy import cmdline + + command = "scrapy crawl jd_search".split() + cmdline.execute(command) + ``` + +# Scrapy Item + +只是对解析的结构化结果进行一个约束, 在到达pipeline前就可以检查出数据错误. + +# Scrapy的设置 + +- ROBOTTEXT_OBEY + + 获取对方网站是否允许爬虫获取数据的信息. + +- 设置中间件 + + 数字越小, 离`ENGINE`越近,越先执行; + + ```python + DOWNLOADER_MIDDLEWARES = { + # 'jd_crawler_scrapy.middlewares.JdCrawlerScrapyDownloaderMiddleware': 543, + 'jd_crawler_scrapy.middlewares.UAMiddleware': 100, + } + ``` + + ![image-20210312174120669](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210312174120669.png) + +- 设置PIPELINE + + ```python + ITEM_PIPELINES = { + 'jd_crawler_scrapy.pipelines.JdCrawlerScrapyPipeline': 300, + } + ``` + +- LOG 打印日志 + + - LOG_ENABLE + + 默认为`True`, 是否使用log 默认,使用log + + - LOG_FILE + + 设置保存的log文件目录 + + - LOG_LEVEL(按严重程序排序) + + - CRITICAL (critical) + - ERROR (error) + - WARNING (warning)警告 + - INFO (info)信息 + - DEBUG (debug) 打印全面的信息 + +# Scrapy的中间件 + +- 请求头中间件 + + ```python + class UAMiddleware: + def process_request(self, request, spider): + request.headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36" + ``` + +# 课后作业 + +- 将jd_crawler_scrapy完善. +- 完成代理中间件的编写(查阅文档). \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/Scrapy\347\210\254\350\231\253\346\241\206\346\236\266\344\273\213\347\273\215-10-2noteyun-.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/Scrapy\347\210\254\350\231\253\346\241\206\346\236\266\344\273\213\347\273\215-10-2noteyun-.md" new file mode 100644 index 0000000000000000000000000000000000000000..38b37e6c3573448a94aca50dc90f5318555c349a --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/Scrapy\347\210\254\350\231\253\346\241\206\346\236\266\344\273\213\347\273\215-10-2noteyun-.md" @@ -0,0 +1,145 @@ +10-2noteyun + +# Scrapy爬虫框架介绍 + +- 文档 + + - [英文文档](https://docs.scrapy.org/en/latest/) + - [中文文档](https://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html) + +- 什么是scrapy + + 基于`twisted`搭建的异步爬虫框架. + + scrapy爬虫框架根据组件化设计理念和丰富的中间件, 使其成为了一个兼具高性能和高扩展的框架 + +- scrapy提供的主要功能 + + - **具有优先级功能的调度器** + - **去重功能** + - 失败后的重试机制 + - 并发限制 + - ip使用次数限制 + - .... + +- scrapy的使用场景 + + - 不适合scrapy项目的场景 + - 业务非常简单, 对性能要求也没有那么高, 那么我们写多进程, 多线程, 异步脚本即可. + - 业务非常复杂, 请求之间有顺序和失效时间的限制. 
+ - **如果你不遵守框架的主要设计理念, 那就不要使用框架** + - 适合使用scrapy项目 + - 数据量大, 对性能有一定要求, 又需要用到**去重功能**和**优先级功能的调度器** + +- **scrapy组件** + + ![img](https://docs.scrapy.org/en/latest/_images/scrapy_architecture_02.png) + + - `ENGINE`从`SPIDERS`中获取初始请求任务`Requests` + + - `ENGINE`得到`Requests`之后发送给`SCHEDULER`, `SCHEDULER`对请求进行调度后产出任务. + + - `Scheduler`返回下一个请求任务给`ENGINE` + + - `ENGINE`将请求任务交给`DOWNLOADER`去完成下载任务, 途径下载器中间件. + + - 一旦下载器完成请求任务, 将产生一个`Response`对象给`ENGINE`, 途径下载器中间件 + + - `ENGINE`收到`Response`对象后, 将该对象发送给`SPIDERS`去解析和处理, 途径爬虫中间件 + + - ```python + SPIDER + ``` + + 解析返回结果 + + - 将解析结果`ITEMS`发送给`ENGINE` + - 生成一个新的`REQUESTS`任务发送给`ENGINE`(如详情页中还有新的链接) + + - 如果`ENGINE`拿到的是`ITEMS`, 那么就会发送给`ITEM PIPELINES`做数据处理, 如果是`REQUESTS`则发送给`SCHEDULER` + + - 周而复始, 直到没有任务产出 + +# Scrapy教程 + +- 安装 + + ```python + pip install scrapy + ``` + +- 创建项目 + + ```python + scrapy startproject jd_crawler_scrapy + ``` + +- 目录结构 + + - spiders(目录) + + 存放`SPIDERS`项目文件, 一个scrapy项目下可以有多个爬虫实例 + + 多个爬虫实例,共用一套中间件、管道、设置文件等等 + + - items + + 解析后的结构化结果. + + - middlewares + + 下载器中间件和爬虫中间件的地方 + + - piplines + + 处理items的组件, 一般都在pipelines中完成items插入数据表的操作(数据库操作) + + - settings + + 统一化的全局爬虫配置文件 + + - scrapy.cfg + + 项目配置文件 + + + + + + scrapy爬虫demo + + ```python + import scrapy + + + class JdSearch(scrapy.Spider): + name = "jd_search" + + def start_requests(self): + for keyword in ["鼠标", "键盘", "显卡", "耳机"]: + for page_num in range(1, 11):#10页 + url = f"https://search.jd.com/Search?keyword={keyword}&page={page_num}" + + # 选用FormRequest是因为它既可以发送GET请求, 又可以发送POST请求 + yield scrapy.FormRequest( + url=url, + method='GET', + # formdata=data, # 如果是post请求, 携带数据使用formdata参数 + callback=self.parse_search # 指定回调函数处理response对象 + ) + + + def parse_search(self, response): + print(response) + ``` + +- 启动爬虫 + + ```python + scrapy crawl spider_name + ``` + +# 课后作业 + +- 背诵`scrapy`组件流程(必考) +- 完成scrapy项目的demo \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/__init__.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/__init__.py" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/items.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/items.py" new file mode 100644 index 0000000000000000000000000000000000000000..cd9f2584bf386aac18cbd6833fb5a5780d071899 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/items.py" @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class JdCrawlerScrapyItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + sku_id=scrapy.Field() + img=scrapy.Field() + price=scrapy.Field() + title=scrapy.Field() + shop=scrapy.Field() + icons=scrapy.Field() + + + # name=scrapy.Field() + # tel=scrapy.Field() + # address=scrapy.Field() + diff --git 
"a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/middlewares.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/middlewares.py" new file mode 100644 index 0000000000000000000000000000000000000000..cd2c42d3e7e8d80bd753f4fb14403f069806e3a1 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/middlewares.py" @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# 下载器中间件、爬虫中间件 +# Define here the models for your spider middleware +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals +#自己写的类 +class UAMiddleware: + def process_request(self,request,spider): + """ + 在正式请求前,为当前请求添加headers + :param request: + :param spider: + :return: + """ + request.headers["user-agent"]="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36" + +class JdCrawlerScrapySpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/pipelines.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/pipelines.py" new file mode 100644 index 0000000000000000000000000000000000000000..db82e4222720cc90aeb148f55c1a4ec0f6293f97 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/pipelines.py" @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +import pymysql + +from jd_crawler_scrapy.items import JdCrawlerScrapyItem + + + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html + + +class JdCrawlerScrapyPipeline(object): + def __init__(self): + self.mysql_con=None + + + def process_item(self, item, spider): + if not self.mysql_con: #若为空 + self.mysql_con=pymysql.connect(**spider.settings["MYSQL_CONF"]) + # mysql_con = pymysql.connect(**MYSQL_CONF) + + if isinstance(item,JdCrawlerScrapyItem): + cursor=self.mysql_con.cursor() + print(type(cursor)) + SQL="""INSERT INTO jd_search(sku_id,img,price, title, shop, icons) + VALUES ('{}', '{}', '{}', '{}', '{}', '{}')""".format( + item['sku_id'],item['img'],item['price'],item['title'],item['shop'],item['icons'] + ) + cursor.execute(SQL) + self.mysql_con.commit() # 提交、入库 + cursor.close() # 关闭游标 + return item diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/settings.py" new file mode 100644 index 0000000000000000000000000000000000000000..e7bc1ee8c873dd0d60f7cb8a32dc52b888c0b884 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/settings.py" @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +# 统一化的全局爬虫配置文件 +# Scrapy settings for jd_crawler_scrapy project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# http://doc.scrapy.org/en/latest/topics/settings.html +# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'jd_crawler_scrapy' + +SPIDER_MODULES = ['jd_crawler_scrapy.spiders'] +NEWSPIDER_MODULE = 'jd_crawler_scrapy.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'jd_crawler_scrapy (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'jd_crawler_scrapy.middlewares.JdCrawlerScrapySpiderMiddleware': 543, +#} + +# 下载器中间件--改造 +# Enable or disable downloader middlewares +# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { + # 'jd_crawler_scrapy.middlewares.MyCustomDownloaderMiddleware': 543, + 'jd_crawler_scrapy.middlewares.UAMiddleware': 100, +} + +# Enable or disable extensions +# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'jd_crawler_scrapy.pipelines.JdCrawlerScrapyPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See http://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +# mysql # 配置 +MYSQL_CONF = { + "host": "127.0.0.1", + "user": "root", + "password": "123456", + "db": "py_class" +} + +LOG_FILE="E:/PycharmProjects/log/jd_search.log" + +# LOG_LEVEL='DEBUG' #非常全面的信息 +LOG_LEVEL='ERROR' #一行信息 \ No newline at end of file diff --git 
"a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/__init__.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/__init__.py" new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/__init__.py" @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/jd_search.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/jd_search.py" new file mode 100644 index 0000000000000000000000000000000000000000..f420d2aeca9085233d7ae2e570dfec0dc7a83095 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/jd_search.py" @@ -0,0 +1,61 @@ +import scrapy +from bs4 import BeautifulSoup +import json +from jd_crawler_scrapy.items import JdCrawlerScrapyItem + + +class JdSearch(scrapy.Spider): + name="jd_search"#指定运行的爬虫文件 + + def start_requests(self):#任务生产者 + for keyword in ["鼠标","键盘","显卡","耳机"]: + for page_num in range(1,11): + url=f"https://search.jd.com/Search?keyword={keyword}&page={page_num}" + #选用FormRequest:get和post + yield scrapy.FormRequest( + url=url, + method="GET", + # formdata=data, #如果是post请求,携带数据,使用formdata参数 + callback=self.parse_search #指定回调函数,处理response对象 + )#支持get和post方法 + #yield惰性返回数据(生成器) + break + + def parse_search(self,response): #解析函数 + print(response) + + soup = BeautifulSoup(response.text, "lxml") + item_array = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']") + # $("ul[class='gl-warp clearfix'] li[class='gl-item']") + for item in item_array: + try: + sku_id = item.attrs["data-sku"] + img = item.select("img[data-img='1']") + price = item.select("div[class='p-price']") + title = item.select("div[class='p-name p-name-type-2']") + shop = item.select("div[class='p-shop']") + icons = item.select("div[class='p-icons']") + # print(img) + + img = img[0].attrs['data-lazy-img'] if img else "" + price = price[0].strong.i.text if price else "" + title = title[0].text.strip() if title else "" + shop = shop[0].a.attrs['title'] if shop[0].text.strip() else "" + icons = json.dumps([tag_ele.text for tag_ele in icons[0].select("i")]) if icons else '[]' + + # result.append((sku_id, img, price, title, shop, icons)) # 收集结果 + #item 约束 + item=JdCrawlerScrapyItem() + item["sku_id"]=sku_id + item["img"]=img + item["price"]=price + item["title"]=title + item["shop"]=shop + item["icons"]=icons + yield item #异步框架 + + except Exception as e: + print(e.args) + + + diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/run.py" 
"b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/run.py" new file mode 100644 index 0000000000000000000000000000000000000000..288d5060bddffbe5e8d3b60d6ca86b7fcc6805cf --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/jd_crawler_scrapy/spiders/run.py" @@ -0,0 +1,5 @@ +from scrapy import cmdline + +command="scrapy crawl jd_search".split() +print(command) +cmdline.execute(command) \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/scrapy.cfg" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/scrapy.cfg" new file mode 100644 index 0000000000000000000000000000000000000000..fa0c0f404b8118ee48947a4e0f6446d0a7e89e82 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/jd_crawler_scrapy/scrapy.cfg" @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.org/en/latest/deploy.html + +[settings] +default = jd_crawler_scrapy.settings + +[deploy] +#url = http://localhost:6800/ +project = jd_crawler_scrapy diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/\350\267\257\345\276\204\347\256\241\347\220\20610-1noteyun-0311.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/\350\267\257\345\276\204\347\256\241\347\220\20610-1noteyun-0311.md" new file mode 100644 index 0000000000000000000000000000000000000000..06659a6c552eab3f1dacb55d831672116f13acaf --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_10/\350\267\257\345\276\204\347\256\241\347\220\20610-1noteyun-0311.md" @@ -0,0 +1,153 @@ +10-1noteyun + +# 路径管理 + +## 路径 + +- 绝对路径 + + 总是从根目录开始 + + ```python + H:\PyCharmProjects\tutorials_2\jd_crawler\main.py + ``` + +- 相对路径 + + ```python + jd_crawler\main.py + ``` + + - `.`和`..` + + `.`代表当前目录, `..`代表父目录 + +- 工作目录 + + 当前执行命令所在的目录 + + ```python + # 将工作目录添加进当前的路径列表 + sys.path.append(os.getcwd()) + ``` + +# 路径列表 + +- 查看当前路径列表 + + 只有在路径列表当中的包和模块才可以导入和调用 + + ```python + import sys + print(sys.path) + ['E:\\PycharmProjects\\train002_1231\\second-python-bootcamp\\第二期训练营\\5班\\5班_云\\第9周0222--0228\\9-3\\jd_crawler', + 'E:\\PycharmProjects\\train002_1231\\second-python-bootcamp', + 'E:\\PycharmProjects\\train002_1231\\second-python-bootcamp\\第二期训练营\\5班\\5班_云\\第9周0222--0228\\9-3\\jd_crawler', + 'E:\\PycharmProjects\\train002_1231\\second-python-bootcamp\\第二期训练营\\5班\\5班_云\\第9周0222--0228\\9-3', + 'D:\\bsoft\\ananaconda\\python37.zip', + 'D:\\bsoft\\ananaconda\\DLLs', + 'D:\\bsoft\\ananaconda\\lib', + 'D:\\bsoft\\ananaconda', + 'D:\\bsoft\\ananaconda\\lib\\site-packages', 'D:\\bsoft\\ananaconda\\lib\\site-packages\\win32', 'D:\\bsoft\\ananaconda\\lib\\site-packages\\win32\\lib', + 'D:\\bsoft\\ananaconda\\lib\\site-packages\\Pythonwin'] + + ``` + + 命令行中: + + ```python + python main.py + python jd_crawler main.py + + ``` + + + +- 路径搜索顺序 + + - 
当前脚本路径, 也就是执行文件的目录 + + - `PYTHONPATH`路径 + + - 虚拟环境路径 + + - ```python + site-packages + ``` + + - 安装的第三方库所在路径 + +- 可以向路径列表添加路径 + + ```python + sys.path.append(r"H:\PyCharmProjects\tutorials_2") + #不灵活 + ``` + +# 常见报错 + +image-20210310200003109 + +(1)ModuleNotFoundError: No module named 'xxxx'` + +- 为什么在pycharm中不报错, 在命令行当中报错 + + ```python + Pycharm会自动将当前项目的根目录添加到路径列表当中 + ``` + +(2) `ModuleNotFoundError: No module named 'parser.search'; 'parser' is not a package` + +- 自定义包和内置包名有冲突 + + 修改包名即可 + +- 导入的不是一个包 + +![image-20210310202625722](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210310202625722.png) + +(3) `ModuleNotFoundError: No module named '__main__.jd_parser'; '__main__' is no t a package` + +- **入口程序不可以使用相对路径** + +- `__main__` + + 主程序模块名会被修改为`__main__` + + ```python + + if __name__ == "__main__":#入口 + # 用来代替生产者 + mysql_con = pymysql.connect(**MYSQL_CONF) + ``` + + + +(4) `ValueError: attempted relative import beyond top-level package` + +当前访问路径已经超过了python已知的最大路径 + +```python +from tutorial_2.jd_crawler.jd_parser.search import parse_jd_item + +top-level package 指的是上述from导入命令中的首路径tutorial_2, 而不是根据目录结构 +``` + +- 把工作目录加入到路径列表当中 +- 进入到项目根目录下执行命令 +- 上述两个操作相当于将项目根目录加入到路径列表当中 + +image-20210310203436498 + +# 注意事项 + +- 确定入口程序, 没有一个锚定的路径就没有办法做相对路径的管理 +- 将项目根目录加入到入口程序当中 +- 进入到项目根目录下执行命令 +- 项目目录结构不要嵌套的太深 +- 脚本文件或者临时运行单个模块中的方法, 可以将根目录临时添加到**路径列表**当中 + +# 课后作业 + +- 用命令行启动`jd_crawler` +- 在`/test`目录中增加`parser_test.py`模块做解析测试. \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/CSS-BeautifulSoup\345\205\203\347\264\240\345\256\232\344\275\215-9-2noteyun -0309\350\241\245.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/CSS-BeautifulSoup\345\205\203\347\264\240\345\256\232\344\275\215-9-2noteyun -0309\350\241\245.md" new file mode 100644 index 0000000000000000000000000000000000000000..e5b75e91ba6c6f6880e3ba5ad765e1eaa8dcac6e --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/CSS-BeautifulSoup\345\205\203\347\264\240\345\256\232\344\275\215-9-2noteyun -0309\350\241\245.md" @@ -0,0 +1,251 @@ +9-2noteyun + +# css-selector + +> 尽量避免解析路径中包含位置信息 + +> chrome页面中内置了Jquery环境, 用$符号来表示 + +## 直接定位元素 + +- 通过id进行定位 + + ```python + $("#id值") + ``` + +- 通过class进行定位 + + ```python + $(".class值") + ``` + +- **通过属性名进行定位** + + ```python + $("标签名[属性名='属性值']") + + $("ul[class='gl-warp clearfix']") + ``` + +## 获取兄弟节点 + +- 获取当前节点的下一个节点 + + - dom提供的接口, 不属于css-selector语法 + + ```python + tmp = $("li[data-sku='6039832']")[0] + tmp.nextElementSibling#获取兄弟节点;右边一个商品 + ``` + + - 通过css-selector(不建议) + + ```python + $("ul[class='gl-warp clearfix'] li:first-child + li") + ``` + +- 获取当前节点的上一个节点 + + - dom提供的接口, 不属于css-selector语法 + + ```python + tmp = $("li[data-sku='2136538']")[0] + tmp.previousElementSibling #获取兄弟节点;左边一个商品 + ``` + +## 获取父子节点 + +- 获取父节点 + + - dom提供的接口, 不属于css-selector语法 + + ```python + tmp.parentElement + ``` + +- 获取子节点 + + - 获取所有子节点 + + - **遍历**所有符合条件的元素 + + ```python + $("ul[class='gl-warp clearfix'] div[class='gl-i-wrap']") + + ``` + + $("ul[class='gl-warp clearfix'] li[class='gl-item']")[0] + + ``` + + - dom提供的接口, 不属于css-selector语法 + + ​```python + $("ul[class='gl-warp clearfix']")[0].children + ``` + + 
![image-20210309165518896](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309165518896.png) + + - 获取第一个子节点 + + ```python + :fist-child + $("ul[class='gl-warp clearfix'] li:first-child")[0] + ``` + + - 获取最后一个子节点 + + ```python + :last-child + $("ul[class='gl-warp clearfix'] li:last-child")[0] + ``` + + - 获取第N个子节点 + + ```python + #:nth-child(索引) 获取第五个 + $("ul[class='gl-warp clearfix'] li:nth-child(5)")[0] + ``` + + ![image-20210309170300638](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309170300638.png) + +# 模糊匹配 + +- 匹配开头 + + `^` + + ```python + # 匹配data-sku属性值为2开头的元素 + $("li[data-sku^='2']") + ``` + + ![image-20210309173231429](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309173231429.png) + +- 匹配结尾 + + `$` + + ```python + $("li[data-sku$='2']") + ``` + +- 匹配子集 + + `*` + + ```python + $("li[data-sku*='2']") + ``` + + ![image-20210309173254084](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309173254084.png) + +- 获取文本值 + + ```python + $("li[data-sku='6039832'] div[class='p-name p-name-type-2'] em")[0].innerText + ``` + + ![image-20210309173518534](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309173518534.png) + + ![image-20210309174512487](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309174512487.png) + + + +- 获取属性值 + + ```python + $("ul[class='gl-warp clearfix'] li")[0].getAttribute("data-sku") + ``` + + ![image-20210309174838554](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309174838554.png) + +# BeautifulSoup + +- 安装 + + ```python + pip install bs4 + pip install lxml + ``` + +- 使用BeautifulSoup + + ```python + from bs4 import BeautifulSoup + + + def jd_search_parse(html): + soup = BeautifulSoup(html, "lxml") + item = soup.select("li[data-sku='6039832']")[0] + ``` + +- 直接定位元素 + + 略 + +- 去除空白字符 + + ```python + html = html.replace('\r\n', "").replace("\n", "").replace("\t", "") + ``` + + ![image-20210309181256218](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210309181256218.png) + +- 获取兄弟节点 + + - 获取上一个节点 + + ```python + tmp_ele.previous_sibling + ``` + + - 获取下一个节点 + + ```python + tmp_ele.next_sibling + ``` + +- 获取父子节点 + + - 获取父节点 + + ```python + tmp_ele.parent + ``` + + - 获取子节点 + + ```python + tmp_ele.children + ``` + +- 模糊匹配 + + 略 + +- 获取文本值 + + ```python + content = tmp_ele.text.strip() + ``` + +- 获取属性值 + + ```python + value = tmp_ele.attrs["data-sku"] + ``` + + ![image-20210310092223685](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210310092223685.png) + + ![image-20210310092449257](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210310092449257.png) + + ![image-20210310093322969](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210310093322969.png) + + debug:50分钟之后; + +# 课后作业 + +- 练习css-selector +- 练习用beautifulsoup进行页面解析 \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/001.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/001.py" new file mode 100644 index 0000000000000000000000000000000000000000..229d72edceb621b08c99bb6919500be82b3d178a --- /dev/null +++ 
"b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/001.py" @@ -0,0 +1,6 @@ +import sys +print(sys.path)#打印路径列表 +# ['E:\\PycharmProjects\\train002_1231\\second-python-bootcamp\\第二期训练营\\5班\\5班_云\\第9周0222--0228\\9-3\\jd_crawler', 'E:\\PycharmProjects\\train002_1231\\second-python-bootcamp', 'E:\\PycharmProjects\\train002_1231\\second-python-bootcamp\\第二期训练营\\5班\\5班_云\\第9周0222--0228\\9-3\\jd_crawler', 'E:\\PycharmProjects\\train002_1231\\second-python-bootcamp\\第二期训练营\\5班\\5班_云\\第9周0222--0228\\9-3', 'D:\\bsoft\\ananaconda\\python37.zip', 'D:\\bsoft\\ananaconda\\DLLs', 'D:\\bsoft\\ananaconda\\lib', 'D:\\bsoft\\ananaconda', 'D:\\bsoft\\ananaconda\\lib\\site-packages', 'D:\\bsoft\\ananaconda\\lib\\site-packages\\win32', 'D:\\bsoft\\ananaconda\\lib\\site-packages\\win32\\lib', 'D:\\bsoft\\ananaconda\\lib\\site-packages\\Pythonwin'] +# sys.path.append(r"H:\PyCharmProjects\tutorials_2")#不灵活 +print("001",__name__) +# sys.path.append(r"E:\PycharmProjects\Spider_Code\9-3") diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/jd_parser/detail.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/jd_parser/detail.py" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/jd_parser/search.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/jd_parser/search.py" new file mode 100644 index 0000000000000000000000000000000000000000..1d1c7de8d90ceee46722cdda79b5159b969ed987 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/jd_parser/search.py" @@ -0,0 +1,39 @@ +from bs4 import BeautifulSoup +import json + +def parse_jd_item(html):#解析器 + result = []#列表,收集,解析的结果 + + soup = BeautifulSoup(html, "lxml") + print(soup) + item_array = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']") + # $("ul[class='gl-warp clearfix'] li[class='gl-item']") + for item in item_array: + try: + sku_id = item.attrs["data-sku"] + img = item.select("img[data-img='1']") + price = item.select("div[class='p-price']") + title = item.select("div[class='p-name p-name-type-2']") + shop = item.select("div[class='p-shop']") + icons = item.select("div[class='p-icons']") + # print(img) + + + img = img[0].attrs['data-lazy-img'] if img else "" + price = price[0].strong.i.text if price else "" + title = title[0].text.strip() if title else "" + shop = shop[0].a.attrs['title'] if shop[0].text.strip() else "" + icons = json.dumps([tag_ele.text for tag_ele in icons[0].select("i")]) if icons else '[]' + + result.append((sku_id, img, price, title, shop, icons))#收集结果 + except Exception as e: + print(e.args) + return result + +# +# if __name__ == "__main__": +# with open(r"..\\test\\search.html", "r", encoding="utf-8") as f: +# +# html = f.read() +# result = parse_jd_item(html) +# print(result) \ No newline at end of file diff --git 
"a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/main.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/main.py" new file mode 100644 index 0000000000000000000000000000000000000000..b3056f5fb2dfbd7e29986db1f876daf392bd6cba --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/main.py" @@ -0,0 +1,61 @@ +import random +import pymysql +import requests +import sys +sys.path.append(r"E:\PycharmProjects\Spider_Code\week9_3") +sys.path.append(r"E:\PycharmProjects\Spider_Code\week9_3\jd_crawler") +print(sys.path) + +from jd_crawler.jd_parser.search import parse_jd_item +from jd_crawler.settings import MYSQL_CONF, HEADERS +import sys +print(sys.path) + + +def saver(item_array):#保存到sql数据库 + """ + 持久化爬取结果 + :param item_array: + :return: + """ + cursor = mysql_con.cursor()#每次建立一个游标 + SQL = """INSERT INTO jd_search(sku_id,img,price, title, shop, icons) + VALUES ( %s,%s, %s, %s, %s, %s)""" + cursor.executemany(SQL, item_array) + mysql_con.commit()#提交、入库 + cursor.close()#关闭游标 + +def donwloader(task):#下载器 + """ + 下载器 + 请求目标网址的组件 + :param task: + :return: + """ + url = "https://search.jd.com/Search" + params = { + "keyword": task #关键词的列表 + } + res = requests.get(url=url, params=params, headers=HEADERS, timeout=10, + # proxies={"https": f"https:144.255.48.62","http": f"http:144.255.48.62"} + ) + return res + + +def main(task_array):#main函数,调度器 + """ + 爬虫任务的调度 + :return: + """ + for task in task_array: + result = donwloader(task)#下载器 + item_array = parse_jd_item(result.text)#解析器 + print("GET ITEMS", item_array)#打印语句,检查执行的正确与否,及其过程 + saver(item_array) + + +if __name__ == "__main__":#入口 + # 用来代替生产者 + mysql_con = pymysql.connect(**MYSQL_CONF) + task_array = ["鼠标", "键盘", "显卡", "耳机"] + main(task_array) \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/settings.py" new file mode 100644 index 0000000000000000000000000000000000000000..d260ebbe81a4237833c331fedfe1fe8d34055935 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/settings.py" @@ -0,0 +1,14 @@ +# 设置文件 +# 请求头 +HEADERS = { + "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36", + # "upgrade-insecure-requests": "1" +} + +# 配置 +MYSQL_CONF = { + "host": "127.0.0.1", + "user": "root", + "password": "123456", + "db": "py_class" +} \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/test/parser_test.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/test/parser_test.py" new file mode 100644 index 
0000000000000000000000000000000000000000..779883b2bb5919b18abb6846b2bc2e9791fef706 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/test/parser_test.py" @@ -0,0 +1,12 @@ +import sys +sys.path.append(r"E:\PycharmProjects\Spider_Code\week9_3\jd_crawler") +print(sys.path) + +from jd_crawler.jd_parser.search import parse_jd_item + +with open(r"..\\test\\search.html", "r", encoding="utf-8") as f: + html = f.read() + result = parse_jd_item(html) + print(result) + + # with open(r"..\\test\\search.html", "r", encoding="utf-8") as f: \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/test/search.html" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/test/search.html" new file mode 100644 index 0000000000000000000000000000000000000000..0f67859bf30f108ca01618bdfc3a29b171484be0 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/jd_crawler/test/search.html" @@ -0,0 +1,6615 @@ + + + + + + + + + + + + + + + + + + + + +鼠标 - 商品搜索 - 京东 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ <!-- search.html: snapshot of the JD search-results page for the keyword "鼠标", added as the parser test fixture. -->
+ <!-- Only page-chrome text (site navigation, brand filter A–Z, sort/pagination controls, footer links) survived extraction here; the markup body of the fixture is omitted. -->
    + + + + + diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/\347\254\254\344\270\200\344\270\252\347\210\254\350\231\253\351\241\271\347\233\256-9-3noteyun-0310\350\241\245-.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/\347\254\254\344\270\200\344\270\252\347\210\254\350\231\253\351\241\271\347\233\256-9-3noteyun-0310\350\241\245-.md" new file mode 100644 index 0000000000000000000000000000000000000000..81848acb4b2a03460112b49187defa70b92a3a19 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/\347\254\2549\345\221\2500222--0228/\347\254\254\344\270\200\344\270\252\347\210\254\350\231\253\351\241\271\347\233\256-9-3noteyun-0310\350\241\245-.md" @@ -0,0 +1,85 @@ +9-3noteyun- + +# 一个小又全的爬虫项目 + +- 任务生成者 + + 生成爬虫任务的组件, 最大的作用就是建立生产消费者模型, 将生产者和消费者剥离, 可以达到程序暂停重启的功能. + +- 配置文件 + + 当前爬虫项目的基础配置信息, 目的就是统一化配置, 避免重复修改. + +- 主函数/调度器 + + 以逻辑控制流协同各个组件, 完成爬取工作, 具有一定的调度功能 + +- 下载器 + + 用来和目标服务器进行交互, 获取数据的组件 + +- 解析器 + + 用来解析非结构化的页面内容, 获取想要的数据. + +- 存储器 + + 用来持久化解析后的数据 + + - 数据库 + - 存为本地文件, 比较推荐的格式为json, 结构严谨的可以保存为csv + +# 课后作业 + +- 搭建第一个爬虫项目 +- 为当前爬虫项目添加代理 +- 扩展: 将当前项目改造成多线程 +- 扩展2: 将当前项目改造成多进程 +- 扩展3: 通过aiohttp, aiomysql将项目改造成协程. + +# 第8、9周答疑 + +**上周优秀学员** + +https://docs.qq.com/sheet/DTUpURGx2akN3eW1j + +## 8-1作业 + +![img](https://docimg2.docs.qq.com/image/sBQuFj2a610V_-eCnylD5g?w=474&h=137) + +优秀作业 + +[第八周第一节](https://gitee.com/mayugu/second-python-bootcamp/blob/master/第二期训练营/2班/2班_chaos/第八周_第一节/Mysql(四).md) + +第八周第 2 节作业 + +![img](https://docimg2.docs.qq.com/image/f0dlTUOYvf0YB6FSwk4_BQ?w=476&h=163) + +优秀作业 + +[第八周第二节](https://gitee.com/mayugu/second-python-bootcamp/blob/master/第二期训练营/2班/2班_chaos/第八周_第二节/Mysql(五).md) + +## 8-3作业 + +![img](https://docimg2.docs.qq.com/image/0NRGOkmgEEkxhrRf2JWwAg?w=621&h=170) + +## 9-1作业 + +![img](https://docimg6.docs.qq.com/image/VNjNY9pxW2BG-V7n0JMHkw?w=665&h=254) + +## 9-2作业 + +![img](https://docimg7.docs.qq.com/image/lLIogjEZCBB4YGRwfbUkDA?w=344&h=128) + +## 9-3作业 + +![img](https://docimg3.docs.qq.com/image/e5Ky-nFwLXlpg-soABMWPg?w=448&h=226) + +pyquery学习:https://www.cnblogs.com/progor/p/8536444.html + +## 加餐时间 + +我是如何利用技术赚到第一个 1000 元 + +在快手、头条、抖音,搬运小视频;2017年; +