From 8faef0e085cfde04ea1d71e606db50d1c59611c2 Mon Sep 17 00:00:00 2001 From: giteeyunyunyun <18883994582@163.com> Date: Sun, 21 Mar 2021 16:34:47 +0800 Subject: [PATCH] 12-2-3zuoye --- ...7\347\202\271\345\207\273-12-3noteyun .md" | 280 ++++++++++++++++++ .../week_12/jd_search_selenium.py" | 73 +++++ ...25\344\273\213\347\273\215-12-2noteyun.md" | 174 +++++++++++ 3 files changed, 527 insertions(+) create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/chrome-selenium\345\220\204\347\247\215\345\217\202\346\225\260\351\205\215\347\275\256-\346\250\241\346\213\237\347\202\271\345\207\273-12-3noteyun .md" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/jd_search_selenium.py" create mode 100644 "\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/selenium\347\256\200\345\215\225\344\273\213\347\273\215-12-2noteyun.md" diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/chrome-selenium\345\220\204\347\247\215\345\217\202\346\225\260\351\205\215\347\275\256-\346\250\241\346\213\237\347\202\271\345\207\273-12-3noteyun .md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/chrome-selenium\345\220\204\347\247\215\345\217\202\346\225\260\351\205\215\347\275\256-\346\250\241\346\213\237\347\202\271\345\207\273-12-3noteyun .md" new file mode 100644 index 00000000..5299fa79 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/chrome-selenium\345\220\204\347\247\215\345\217\202\346\225\260\351\205\215\347\275\256-\346\250\241\346\213\237\347\202\271\345\207\273-12-3noteyun .md" @@ -0,0 
+1,280 @@ +12-3noteyun-selenium页面解析、模拟事件 + +# selenium配置 + +- executable_path + + ```python + browser = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options) + ``` + +- chrome_options + + chrome浏览器的配置对象 + + chrome配置(peter) + + https://peter.sh/experiments/chromium-command-line-switches/ + + - 添加代理 + + ```python + proxy = "127.0.0.1:8888"#fiddler代理 + chrome_options.add_argument(f"--proxy-server={proxy}") + ``` + + - 去除`navigator.webdriver`属性 + + ```python + chrome_options.add_argument("disable-blink-features=AutomationControlled") + ``` + + - 设置请求头(ua:user-agent) + + ```python + ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36" + + chrome_options.add_argument(f"--user-agent={ua}") + ``` + + - 无头模式 + + 取消了页面渲染部分, 减少了资源消耗的同时, 增加了被反爬风险. + + ```python + chrome_options.add_argument("--headless") + ``` + + - 加载用户缓存 + + 可以像我们正常使用浏览器一样, 记录使用记录和cookie. + + > 如果不指定用户缓存的路径, 会自动创建临时文件夹. + > + > 如果selenium实例没有正常销毁, 那么当前缓存文件夹不会被删除, 长此以往会占用大量磁盘空间 + + ```python + chrome_options.add_argument(f"--user-data-dir={user_dir}") + + #--user-data-dir ⊗ Directory where the browser stores the user profile. 
↪ + + ``` + + - 加载插件 + + ```python + chrome_options.add_extension(插件路径) + # 插件:注入js + # 插件:帮助解析页面等 + ``` + + + + + + + + ![image-20210321103108662](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321103108662.png) + +![image-20210321102404900](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321102404900.png) + +![image-20210321102421377](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321102421377.png) + +image-20210321103848511 + +![image-20210321110033579](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321110033579.png) + + + + + + + +# 执行JS + +- 执行JS + + ``` + browser.execute_script("return location.href") + ``` + +- 在页面初始化前注入JS + + ```python + browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": """ + Object.defineProperty(navigator, 'webdriver', { + get: ()=> 'my_webdriver' + }) + """ + }) + ``` + +![image-20210321111806579](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321111806579.png) + +![image-20210321113149312](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321113149312.png) + + + +![image-20210321113121730](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321113121730.png) + + + +# 窗口操作 + +- window + + - 窗口的最大化 + + ``` + browser.maximize_window() + ``` + + - 自定义窗口大小 + + ``` + browser.set_window_size(width=1660, height=960) + ``` + + - 新建窗口 + + selenium框架不提供新建窗口的方法, 但是我们可以通过JS来控制新建一个窗口. + + ``` + browser.execute_script("window.open('http://baidu.com')") + ``` + + - **切换窗口** + + 新建和删除窗口不会更改当前窗口的控制权, 需要通过切换窗口来控制. + + ``` + browser.switch_to.window(browser.window_handles[-1]) + ``` + + + + + +- frame + + 窗口中的嵌套document, 需要切换到frame环境下才可以解析 + + ``` + iframe = browser.find_element_by_css_selector(...) 
+ browser.switch_to.frame(iframe) + + # 切回原来的窗口环境 + # 只要切换原来的窗口即可 + browser.switch_to.window(browser.window_handles[index]) + ``` + +![image-20210321120603618](C:\Users\yunqin\AppData\Roaming\Typora\typora-user-images\image-20210321120603618.png) + + + + + +# 页面解析 + +- 通过CSS_SELECTOR + + ``` + browser.find_element_by_css_selector + ``` + +- 通过BeautifulSoup库 + + ``` + soup = BeautifulSoup(browser.page_source, 'lxml') + ``` + +# 等待事件 + +- time.sleep + +- `WebDriverWait` 和 `expected_conditions` + + > https://selenium-python-zh.readthedocs.io/en/latest/api.html#module-selenium.webdriver.support.expected_conditions + + 等待事件本质就是根据`expected_conditions`进行轮询. + + ```python + from selenium.webdriver.common.by import By + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + + # 声明一个事件等待对象,等待10秒 + waiter = WebDriverWait(browser, 10) + ``` + + - 等待某个元素渲染完毕(`.until`方法) + + ``` + waiter.until(EC.presence_of_element_located((By.ID, "myDynamicElement"))) + ``` + + - 等待当前元素在DOM可点击 + + ``` + waiter.until(EC.element_to_be_clickable((By.ID, "myDynamicElement"))) + ``` + +# 模拟事件 + +- (1) ActionChains + + 行为链, 可以模拟连续的用户行为. + + ```python + from selenium.webdriver import ActionChains + + ac = ActionChains(self.browser) + ``` + +- (2)模拟鼠标输入 + + - 鼠标移动, 点击事件 + + ```python + # 将鼠标移动到某个元素上方 + ac.move_to_element(search_button).click().perform() + ``` + + - 根据相对位置移动鼠标 + + ```python + move_by_offset(x, y) + ``` + + - 拖动元素 + + ```python + drag_and_drop + ``` + + - 拖动元素到相对位置 + + ```python + drag_and_drop_by_offset + ``` + +- (3)模拟键盘输入 + + > https://selenium-python-zh.readthedocs.io/en/latest/api.html#module-selenium.webdriver.common.keys + > + > 模拟点击Enter键 + + - 模拟ENTER + + ```python + from selenium.webdriver.common.keys import Keys + + ac = ActionChains(self.browser) + ac.send_keys(Keys.ENTER).perform() + ``` + +# 课后作业 + +- 熟悉课堂上讲到的接口. +- 扩展: 对浏览器配置和模拟事件, 等待事件有具体解决方向. 
\ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/jd_search_selenium.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/jd_search_selenium.py" new file mode 100644 index 00000000..55ca2c9c --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/jd_search_selenium.py" @@ -0,0 +1,73 @@ +from selenium import webdriver +from jd_crawler.jd_parser import search +import time +from selenium.webdriver import ActionChains +from selenium.webdriver.common.keys import Keys + +driver_path = r"D:\bsoft\anchromedriver\chromedriver.exe" + + +class JdSearch: + def __init__(self,proxy=None,ua=None,headless=False,user_dir=None): + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument("disable-blink-features=AutomationControlled") + if proxy: + chrome_options.add_argument(f"--proxy-server={proxy}") + if ua: + chrome_options.add_argument(f"--user-agent={ua}") + if headless: + chrome_options.add_argument("--headless") + if user_dir: + chrome_options.add_argument(f"--user-data-dir={user_dir}") + + self.browser = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options) + + # 窗口大小的设置 + # self.browser.maximize_window() + # self.browser.set_window_size(width=1660,height=960) + #页面初始前注入js,将navigator.webdriver修改为my_navigator + # self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", + # { + # "source":""" + # Object.defineProperty(navigator,'webdriver', + # { + # get:()=>'my_webdriver' + # }) + # """ + # } + # ) + + + #输入关键词并搜索 + def sim_search(self,keyword, url): + self.browser.get(url) + search_input = self.browser.find_element_by_css_selector('input[aria-label="搜索"]') + search_input.send_keys(keyword) + search_button = 
self.browser.find_element_by_css_selector('button[aria-label="搜索"]') + search_button.click() + + #main函数 + def main(self,keyword, url): + self.sim_search(keyword, url) + time.sleep(3) + # self.browser.execute_script("window.open('http://baidu.com')") + # self.browser.switch_to.window(self.browser.window_handles[-1])#最后一个 + + # ac=ActionChains(self.browser) + # ac.move_to_element(search_button).click().perform() + # ac.send_keys(Keys.ENTER).perform() + + + item_array = search.parse_jd_item(self.browser.page_source) + print(item_array) + self.browser.close() + + +if __name__ == "__main__": + jd_url = "https://www.jd.com/" + proxy="127.0.0.1:8888"#代理指向fiddler + ua="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36" + user_dir=r"E:\user_data\tmp" + + jd_search=JdSearch(proxy=proxy,ua=ua,headless=False,user_dir=user_dir) + jd_search.main("鼠标", jd_url) diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/selenium\347\256\200\345\215\225\344\273\213\347\273\215-12-2noteyun.md" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/selenium\347\256\200\345\215\225\344\273\213\347\273\215-12-2noteyun.md" new file mode 100644 index 00000000..03f642f0 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/5\347\217\255/5\347\217\255_\344\272\221/week_12/selenium\347\256\200\345\215\225\344\273\213\347\273\215-12-2noteyun.md" @@ -0,0 +1,174 @@ +12-2noteyun-selenium简单介绍 + +# selenium简单介绍 + +``` +https://selenium-python.readthedocs.io/ +https://python-selenium-zh.readthedocs.io/zh_CN/latest/ +``` + +- 什么是selenium + + 通过**浏览器驱动**来自动化**操纵浏览器**的工具 + + ``` + selenium -> 传输指令 -> webdriver -> 转换指令 -> 浏览器 + + webdriver的出现是为了兼容各语言各版本, 使java或者python都可以通过同一个webdriver达到操纵浏览器的目的. + ``` + +- selenium常用来做什么? 
+ + 本质是模拟用户行为, 可以用于以下领域 + + - 自动化测试 + - 获取网页数据 + - 复杂动态网页 + - 账号比较重要, 保证不被反爬风控 + - 数据量小, 但是验证复杂的网站 + - 获取session + - 用户端无法获取session, 但是selenium可以 + +# Selenium安装 + +- 安装Python版本的selenium + + ``` + pip install selenium + ``` + +- 查看当前浏览器版本 + + ``` + 最新版本为89 + ``` + +- 下载对应浏览器版本的chrome driver + +- http://npm.taobao.org/mirrors/chromedriver/ + + ``` + http://npm.taobao.org/mirrors/chromedriver/ + ``` + +- 下载后解压至特定目录 + + ``` + H:\drivers\chromedriver.exe + ``` + +# 快速启动 + +- 创建一个webdriver驱动实例 + + ```python + from selenium import webdriver + + driver_path = r"H:\drivers\chromedriver.exe" + + ``` + +- 浏览器设置 + + ```python + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument("disable-blink-features=AutomationControlled") + browser = webdriver.Chrome(executable_path=driver_path) + ``` + +- 请求网址 + +- 定位搜索框并模拟键盘输入 + + ```python + search_input = browser.find_element_by_css_selector('input[aria-label="搜索"]') + search_input.send_keys("鼠标") + ``` + +- 定位搜索按钮并模拟鼠标输入 + + ```python + search_button = browser.find_element_by_css_selector('button[aria-label="搜索"]') + search_button.click() + ``` + +- 页面解析 + + ``` + item_array = search.parse_jd_item(browser.page_source) + ``` + +```python +from selenium import webdriver +from jd_crawler.jd_parser import search +import time + +driver_path = r"H:\drivers\chromedriver.exe" + +chrome_options = webdriver.ChromeOptions() +chrome_options.add_argument("disable-blink-features=AutomationControlled") +browser = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options) + + +#输入关键词并搜索 +def sim_search(keyword, url): + browser.get(url) + search_input = browser.find_element_by_css_selector('input[aria-label="搜索"]') + search_input.send_keys(keyword) + search_button = browser.find_element_by_css_selector('button[aria-label="搜索"]') + search_button.click() + +#main函数 +def main(keyword, url): + sim_search(keyword, url) + time.sleep(3) + item_array = search.parse_jd_item(browser.page_source) + print(item_array) + 
browser.close() + + +if __name__ == "__main__": + jd_url = "https://www.jd.com/" + main("鼠标", jd_url) +``` + +# 课后作业 + +- 完成快速启动中的案例 +- 将上述代码通过类的方式重构 + + + +```python +from selenium import webdriver +from jd_crawler.jd_parser import search +import time + +driver_path = r"D:\bsoft\anchromedriver\chromedriver.exe" +chrome_options = webdriver.ChromeOptions() +chrome_options.add_argument("disable-blink-features=AutomationControlled") + +browser = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options) + +#输入关键词并搜索 +def sim_search(keyword, url): + browser.get(url) + search_input = browser.find_element_by_css_selector('input[aria-label="搜索"]') + search_input.send_keys(keyword) + search_button = browser.find_element_by_css_selector('button[aria-label="搜索"]') + search_button.click() + +#main函数 +def main(keyword, url): + sim_search(keyword, url) + time.sleep(3) + item_array = search.parse_jd_item(browser.page_source) + print(item_array) + browser.close() + + +if __name__ == "__main__": + jd_url = "https://www.jd.com/" + main("鼠标", jd_url) +``` + -- Gitee