diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\270\211\350\212\202/.keep" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\270\211\350\212\202/.keep" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\270\211\350\212\202/Redis.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\270\211\350\212\202/Redis.md" new file mode 100644 index 0000000000000000000000000000000000000000..396c3eb71aaa8a76c61bfaacfb7dcf068fc88d25 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\270\211\350\212\202/Redis.md" @@ -0,0 +1,264 @@ +# Redis + +# Install + +### Install from source code on linux + +- Download + - use browser + + - ``` + wget http://download.redis.io/releases/redis-6.2.1.tar.gz + ``` + +- Unzip + + - ``` + tar -zxvf redis-6.2.1.tar.gz + ``` + +- Build + + - ``` + cd redis-6.2.1 + make + ``` + + - If you get any error in running make command, run the following command + + - Centos + + ``` + sudo yum groupinstall ‘Development Tools’ + sudo yum install gcc make + ``` + + - Ubuntu + + ``` + sudo apt-get install build-essential + sudo apt-get gcc make + ``` + +- Run `make test` to check if everything is ok and the `make test install` + +- Configure Redis server + + ``` + cd utils/ + ./install_server.sh (Press Enter for default setting) + ``` + +- Start Redis service + + ``` + # select redis port for this redis server instance. 
+## Redis data types
+
+### Keys
+
+```
+127.0.0.1:6379> keys *
+(empty array)
+127.0.0.1:6379> set name chaos
+OK
+127.0.0.1:6379> keys *
+1) "name"
+127.0.0.1:6379> exists name
+(integer) 1
+127.0.0.1:6379> exists age
+(integer) 0
+127.0.0.1:6379> type name
+string
+127.0.0.1:6379> del name
+(integer) 1
+127.0.0.1:6379> keys *
+(empty array)
+```
+
+### String
+
+- Use cases: counters, follower counts, caching serialized objects
+
+```
+127.0.0.1:6379> set name chaos
+OK
+127.0.0.1:6379> get name
+"chaos"
+127.0.0.1:6379> append name wang
+(integer) 9
+127.0.0.1:6379> get name
+"chaoswang"
+127.0.0.1:6379> strlen name
+(integer) 9
+127.0.0.1:6379> del name
+(integer) 1
+127.0.0.1:6379> set name chaoswang
+OK
+127.0.0.1:6379> setrange name 2 c
+(integer) 9
+127.0.0.1:6379> get name
+"chcoswang"
+```
+
+### List
+
+- A list can be used as a stack, a queue, a blocking queue, and so on
+- A list is implemented as a linked list, so elements can be pushed and popped at both ends
+
+```
+127.0.0.1:6379> lpush list one
+(integer) 1
+127.0.0.1:6379> lpush list two
+(integer) 2
+127.0.0.1:6379> lpush list three
+(integer) 3
+127.0.0.1:6379> lrange list 0 2
+1) "three"
+2) "two"
+3) "one"
+127.0.0.1:6379> lpop list
+"three"
+127.0.0.1:6379> llen list
+(integer) 2
+127.0.0.1:6379> lrem list 2 two
+(integer) 1
+127.0.0.1:6379> lrange list 0 -1
+1) "one"
+127.0.0.1:6379> lset list 0 hello
+OK
+127.0.0.1:6379> lrange list 0 -1
+1) "hello"
+127.0.0.1:6379> linsert list after "hello" world
+```
+
+### Set
+
+```
+127.0.0.1:6379> sadd myset hello
+(integer) 1
+127.0.0.1:6379> sadd myset world
+(integer) 1
+127.0.0.1:6379> smembers myset
+1) "world"
+2) "hello"
+127.0.0.1:6379> sismember myset python
+(integer) 0
+127.0.0.1:6379> srem myset world
+(integer) 1
+127.0.0.1:6379> smembers myset
+1) "hello"
+127.0.0.1:6379> sadd myset python
+(integer) 1
+127.0.0.1:6379> smembers myset
+1) "python"
+2) "hello"
+127.0.0.1:6379> srandmember myset
+"hello"
+127.0.0.1:6379> srandmember myset
+"hello"
+127.0.0.1:6379> srandmember myset 2
+1) "python"
+2) "hello"
+127.0.0.1:6379> spop myset
+"hello"
+127.0.0.1:6379> smembers myset
+1) "python"
+127.0.0.1:6379>
+```
+
+### Zset
+
+```
+127.0.0.1:6379> zadd myzset 1 one
+(integer) 1
+127.0.0.1:6379> zadd myzset 2 two 3 three
+(integer) 2
+127.0.0.1:6379> zrange myzset 0 -1
+1) "one"
+2) "two"
+3) "three"
+127.0.0.1:6379> zadd myzset 5 five
+(integer) 1
+127.0.0.1:6379> zadd myzset 4 four
+(integer) 1
+127.0.0.1:6379> zrange myzset 0 -1
+1) "one"
+2) "two"
+3) "three"
+4) "four"
+5) "five"
+127.0.0.1:6379> zrem myzset one
+(integer) 1
+127.0.0.1:6379> zrange myzset 0 -1
+1) "two"
+2) "three"
+3) "four"
+4) "five"
+127.0.0.1:6379> zcount myzset 2 4
+(integer) 3
+127.0.0.1:6379>
+```
+
+### Hash
+
+A Hash is well suited to storing object fields that change frequently; a String is better suited to storing plain string values
+
+```
+127.0.0.1:6379> hset myhash field hello
+(integer) 1
+127.0.0.1:6379> hget myhash field
+"hello"
+127.0.0.1:6379> hdel myhash field
+(integer) 1
+127.0.0.1:6379> hget myhash field
+(nil)
+127.0.0.1:6379> hset myhash field hello field2 world field3 python
+(integer) 3
+127.0.0.1:6379> hkeys myhash
+1) "field"
+2) "field2"
+3) "field3"
+127.0.0.1:6379> hmget myhash field field1 field3
+1) "hello"
+2) (nil)
+3) "python"
+127.0.0.1:6379> hmget myhash field field2 field3
+1) "hello"
+2) "world"
+3) "python"
+127.0.0.1:6379>
+```
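+
+### The same operations from Python (redis-py)
+
+For reference, the commands above map almost one-to-one onto the `redis-py` client. The snippet below is a minimal sketch added for illustration, not part of the original notes: it assumes a recent `redis-py` (3.5 or later for the `mapping=` form of `hset`, installed with `pip install redis`) and a local server on the default port.
+
+```python
+import redis
+
+r = redis.Redis(host="127.0.0.1", port=6379, decode_responses=True)
+
+# String
+r.set("name", "chaos")
+r.append("name", "wang")
+print(r.get("name"), r.strlen("name"))        # chaoswang 9
+
+# List: lpush pushes to the head, like the CLI example
+r.delete("list")
+r.lpush("list", "one", "two", "three")
+print(r.lrange("list", 0, -1))                # ['three', 'two', 'one']
+
+# Set
+r.sadd("myset", "hello", "world")
+print(r.sismember("myset", "python"))         # False
+
+# Sorted set: scores are passed as a mapping in redis-py 3.x
+r.zadd("myzset", {"one": 1, "two": 2, "three": 3})
+print(r.zrange("myzset", 0, -1))              # ['one', 'two', 'three']
+
+# Hash
+r.hset("myhash", mapping={"field": "hello", "field2": "world"})
+print(r.hmget("myhash", "field", "field2"))   # ['hello', 'world']
+```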
+
+
+
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/.keep" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/.keep"
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/jd_search_spider.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/jd_search_spider.py"
new file mode 100644
index 0000000000000000000000000000000000000000..862964f13f0a234de4e77bc7d891ec4634a2ac71
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/jd_search_spider.py"
@@ -0,0 +1,64 @@
+import scrapy
+from bs4 import BeautifulSoup
+import json
+from W10_L3.jd_search.jd_search.items import JdSearchItem
+from twisted.internet.error import TimeoutError
+from scrapy.spidermiddlewares.httperror import HttpError
+
+
+class JdSearchSpider(scrapy.Spider):
+    name = "jd_search"
+
+    def start_requests(self):
+        # search keywords: phones, laptops, graphics cards, RAM
+        search_array = ["手机", "电脑", "显卡", "内存"]
+        for keyword in search_array:
+            # crawl the first three result pages for each keyword
+            for page in range(1, 4):
+                url = f'https://search.jd.com/Search?keyword={keyword}&page={page}'
+
+                yield scrapy.FormRequest(
+                    dont_filter=False,
+                    url=url,
+                    method='GET',
+                    errback=self.err_back,
+                    callback=self.parse_search
+                )
+
+
+    def parse_search(self, response):
+        html = response.text
+        soup = BeautifulSoup(html, 'lxml')
+        # each <li class="gl-item"> in the result list is one product card
+        content = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']")
+        for item in content:
+            try:
+                sku_id = item.attrs["data-sku"]
+                img = item.select("img[data-img='1']")
+                price = item.select("div[class='p-price']")
+                title = item.select("div[class='p-name p-name-type-2'] em")
+                shop = item.select("div[class='p-shop']")
+                icons = item.select("div[class='p-icons']")
+
+                img = img[0].attrs['data-lazy-img'] if img else ""
+                price = price[0].strong.i.text.strip() if price else ""
+                title = title[0].text.strip() if title else ""
+                shop = shop[0].text.strip() if shop else ""
+                icons = json.dumps([ele.text.strip() for ele in icons[0].select('i')]) if icons else '[]'
+
+                items = JdSearchItem()
+                items["sku_id"] = sku_id
+                items["img"] = img
+                items["price"] = price
+                items["title"] = title
+                items["shop"] = shop
+                items["icons"] = icons
+                yield items
+
+            except Exception as e:
+                print(e.args)
+
+
+    def err_back(self, failure):
+        if failure.check(TimeoutError):
+            print('Timeout error on %s' % failure.request.url)
+        elif failure.check(HttpError):
+            response = failure.value.response
+            print('HttpError on %s' % response.url)
\ No newline at end of file
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/middlewares.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/middlewares.py"
new file mode 100644
index 0000000000000000000000000000000000000000..3b6a2c4adc17a8ef5ebe27fcd44115514c3e2000
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_chaos/\347\254\254\345\215\201\344\270\200\345\221\250_\347\254\254\344\272\214\350\212\202/middlewares.py"
@@ -0,0 +1,180 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+import hashlib
+import weakref
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+from scrapy.downloadermiddlewares.retry import RetryMiddleware
+from scrapy.utils.response import response_status_message
+from scrapy.dupefilters import RFPDupeFilter
+from scrapy.utils.python import to_bytes
+from w3lib.url import canonicalize_url
+
+
+class JdSearchSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class JdSearchDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class JdSearchUAMiddleware:
+
+    def process_request(self, request, spider):
+        # attach a fixed desktop-browser User-Agent to every outgoing request
+        request.headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
+
+
+# W11_L1 retry middleware: re-issue requests that hit a CAPTCHA page or fail while parsing
+class JdSearchRetryMiddleware(RetryMiddleware):
+
+    def process_response(self, request, response, spider):
+        if request.meta.get('dont_retry', False):
+            return response
+        # treat a page containing "验证码" (CAPTCHA / verification) as retryable
+        if "验证码" in response.text:
+            reason = response_status_message(response.status)
+            return self._retry(request, reason, spider) or response
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # retry requests that raised an IndexError instead of dropping them;
+        # bump retry_times and bypass the dupefilter, mirroring RetryMiddleware._retry()
+        if isinstance(exception, IndexError):
+            retry_times = request.meta.get('retry_times', 0)
+            retry_req = request.copy()
+            retry_req.meta['retry_times'] = retry_times + 1
+            retry_req.dont_filter = True
+            return retry_req
+
+
+# W11_L1 dupefilter: include meta["batch_no"] in the fingerprint so the same URL
+# can be crawled again in a different batch
+_fingerprint_cache = weakref.WeakKeyDictionary()
+
+
+class JdSearchRfpDupeMiddleware(RFPDupeFilter):
+    def request_fingerprint(self, request, include_headers=None, keep_fragments=False):
+        cache = _fingerprint_cache.setdefault(request, {})
+        cache_key = (include_headers, keep_fragments)
+        if cache_key not in cache:
+            fp = hashlib.sha1()
+            fp.update(to_bytes(request.method))
+            fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
+            fp.update(request.body or b'')
+            fp.update(request.meta.get("batch_no", "").encode("utf-8"))
+            cache[cache_key] = fp.hexdigest()
+        return cache[cache_key]
+
+
+# W11_L2 exception-handling middleware: handles download exceptions and responses
+# with HTTP status codes such as 403/500
+from twisted.internet import defer
+from twisted.internet.error import DNSLookupError, TimeoutError, \
+    ConnectionLost, ConnectionDone, ConnectError, TCPTimedOutError, \
+    ConnectionRefusedError
+from scrapy.http import HtmlResponse
+from twisted.web.client import ResponseFailed
+from scrapy.core.downloader.handlers.http11 import TunnelError
+
+
+class ProcessAllExceptionMiddleware(object):
+    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
+                      ConnectionRefusedError, ConnectionDone, ConnectError,
+                      ConnectionLost, TCPTimedOutError, ResponseFailed,
+                      IOError, TunnelError)
+
+    def process_response(self, request, response, spider):
+        # replace 4xx / 5xx responses with an empty HtmlResponse so the callback is not parsed
+        if str(response.status).startswith('4') or str(response.status).startswith('5'):
+            return HtmlResponse(url='')
+        return response
+
+    def process_exception(self, request, exception, spider):
+        if isinstance(exception, self.ALL_EXCEPTIONS):
+            print("get exception %s" % exception)
+            response = HtmlResponse(url='exception')
+            return response
+        print("no exception")
\ No newline at end of file
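
The middlewares and the custom dupefilter above only take effect once they are registered in the project's `settings.py`. The snippet below is a sketch of what that registration could look like; the module path `jd_search.jd_search.middlewares` and the priority numbers are assumptions for illustration, not part of the submitted diff, and should be adapted to the actual project layout. Note that `JdSearchRfpDupeMiddleware`, despite its name, is an `RFPDupeFilter` subclass and therefore goes in `DUPEFILTER_CLASS`, not in `DOWNLOADER_MIDDLEWARES`.

```python
# settings.py (sketch; paths and priorities are assumed)

DOWNLOADER_MIDDLEWARES = {
    # fixed User-Agent, CAPTCHA-aware retry, blanket exception handling
    "jd_search.jd_search.middlewares.JdSearchUAMiddleware": 543,
    "jd_search.jd_search.middlewares.JdSearchRetryMiddleware": 550,
    "jd_search.jd_search.middlewares.ProcessAllExceptionMiddleware": 560,
    # disable the built-in retry middleware, since the custom one replaces it
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
}

# custom fingerprint: method + canonical URL + body + meta["batch_no"]
DUPEFILTER_CLASS = "jd_search.jd_search.middlewares.JdSearchRfpDupeMiddleware"

# retry behaviour used by the RetryMiddleware base class
RETRY_TIMES = 3
RETRY_HTTP_CODES = [403, 500, 502, 503, 504]
```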