Ai
1 Star 0 Fork 3

小码编程/nowcoderSpider

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
spider_main.py 4.47 KB
一键复制 编辑 原始数据 按行查看 历史
子沐 提交于 2019-08-19 09:29 +08:00 . 更新 spider_main.py
'''Copyright (c) [2019] [范志俊]
[nowcoderSpider] is licensed under the Mulan PSL v1.
You can use this software according to the terms and conditions of the Mulan PSL v1.
You may obtain a copy of Mulan PSL v1 at:
http://license.coscl.org.cn/MulanPSL
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
PURPOSE.
See the Mulan PSL v1 for more details.'''
import re
import pymysql
from selenium import webdriver
import time
import random
# 登录
def login():
    """Fill in and submit the login form in the module-level ``browser``.

    Loads the module-level ``url``, types the account and password into
    the inputs with ids ``jsEmailIpt`` and ``jsPasswordIpt``, then clicks
    the element with id ``jsLoginBtn``.
    """
    print('登录')
    browser.get(url)
    # Replace the 'XXXXX' placeholders with your own account and password.
    form_values = (
        ('jsEmailIpt', 'XXXXX'),     # account
        ('jsPasswordIpt', 'XXXXX'),  # password
    )
    for field_id, text in form_values:
        field = browser.find_element_by_id(field_id)
        field.send_keys(text)
    submit_button = browser.find_element_by_id('jsLoginBtn')
    submit_button.click()
# 当前页数据采集
def get_page_data(url):
    """Scrape every paper listed on the page currently shown in ``browser``.

    Each element with class ``js-go-summary`` is treated as one paper. The
    paper is opened and walked question by question, and one row per
    question is inserted through the module-level ``cursor``. After each
    paper the browser is sent back to ``url``; the module-level ``conn`` is
    committed after every paper that was processed without error.

    A failure while processing one paper is printed and that paper is
    skipped, so one broken page does not abort the rest of the listing.
    Rows inserted before the failure are not rolled back here.

    Args:
        url: Address of the listing page to return to after each paper.

    Returns:
        None.
    """
    time.sleep(random.randint(1, 3))
    paper_links = browser.find_elements_by_class_name(' js-go-summary')
    print('获取当前页的数据')
    for item_index, _ in enumerate(paper_links):
        print('访问面试题,第{}题'.format(item_index+1))
        try:
            _scrape_paper(item_index)
        except Exception as e:
            # Best effort: report the problem, return to the listing and
            # move on to the next paper.
            print(e)
            browser.get(url)
            continue
        print('*'*100)
        browser.get(url)
        conn.commit()


# Compiled once instead of once per question; the pattern text is unchanged.
_ANSWER_PATTERN = re.compile('正确答案: (.*)?你的答案: ')


def _scrape_paper(item_index):
    """Open the ``item_index``-th listed paper and store each of its questions."""
    browser.execute_script("window.scrollTo(0,{})".format(500 * (item_index // 5)))
    # The list is looked up again on every call rather than reused;
    # presumably earlier element references go stale once the browser
    # has navigated away from the listing.
    browser.find_elements_by_class_name(' js-go-summary')[item_index].click()
    print('提交试卷')
    paper_name = browser.find_element_by_xpath('//span[@class="js-paper-name"]').text
    browser.find_element_by_id("next").click()
    if 'login' in browser.current_url:
        login()
    try:
        browser.find_element_by_id('aheadFinish').click()
        browser.find_element_by_xpath('//div[@class="pop-footer clearfix"]/a[1]').click()
    except Exception:
        # Fall back to the "next" button when the two clicks above fail.
        browser.find_element_by_id('next').click()
    print('遍历题目和答案')
    browser.find_element_by_xpath('//ul[@class="menu clearfix"]/li[2]/a').click()
    question_tabs = browser.find_elements_by_xpath('//ul[@class="subject-num-list"]/li')
    time.sleep(random.randint(2, 4))
    for index, _ in enumerate(question_tabs):
        browser.find_elements_by_xpath('//ul[@class="subject-num-list"]/li')[index].click()
        question = browser.find_element_by_class_name('question-main').text
        answer_result, result = _read_answer()
        tag = [ele.text for ele in browser.find_elements_by_xpath('//a[@class="tag-label"]')]
        cursor.execute('insert into nowcoder value (NULL,%s,%s,%s,%s,%s,%s,%s)',
                       (question, answer_result, '|'.join(result), browser.current_url,
                        '|'.join(tag), paper_name, index))


def _read_answer():
    """Return ``(answer_result, result)`` for the question currently displayed.

    ``answer_result`` is the text captured by ``_ANSWER_PATTERN`` from the
    answer heading, or the text of the ``design-answer-box`` element when
    the pattern does not match, or ``''`` when neither can be read.
    ``result`` is the list of texts of the ``<pre>`` elements under the
    answer block, or ``[]`` when the answer block cannot be read.
    """
    try:
        answer_heading = browser.find_element_by_xpath(
            '//div[@class="result-subject-item result-subject-answer"]/h1').text
        matches = _ANSWER_PATTERN.findall(answer_heading)
        if matches:
            answer_result = matches[0]
        else:
            try:
                # The original called ``.text`` on the list returned by
                # ``find_elements_by_xpath`` with a ``string(...)`` XPath,
                # which always raised and left the answer empty. Read the
                # single element's text instead.
                answer_result = browser.find_element_by_xpath(
                    '//div[@class="design-answer-box"]').text
            except Exception:
                answer_result = ''
        result = [ele.text for ele in browser.find_elements_by_xpath(
            '//div[@class="result-subject-item result-subject-answer"]/div/pre')]
    except Exception:
        answer_result = ''
        result = []
    return answer_result, result
if __name__ == '__main__':
    # Script entry point: log in, then scrape page after page until no
    # '下一页' link can be clicked.
    #
    # `url`, `browser`, `conn` and `cursor` are deliberately module-level
    # names: login() and get_page_data() read them as globals.
    url = 'https://www.nowcoder.com/login?callBack=https%3A%2F%2Fwww.nowcoder.com%2FcontestRoom'
    browser = webdriver.Chrome(executable_path='chromedriver.exe')
    conn = None
    cursor = None
    try:
        # NOTE(review): database host and credentials are hard-coded here;
        # consider loading them from configuration instead.
        conn = pymysql.connect(host='172.16.100.90', user='root', passwd='123456',
                               db='contestRoom', charset='utf8')
        cursor = conn.cursor()
        login()
        while True:
            # Remember the listing page so get_page_data() can return to it.
            url = browser.current_url
            print('正在访问:{}'.format(url))
            get_page_data(url)
            print('点击下一页')
            try:
                browser.find_element_by_link_text('下一页').click()
                print('Next')
            except Exception as e:
                # Failing to click the next-page link ends the crawl.
                print(e)
                break
        print('完成')
    finally:
        # Release the cursor, connection and browser even when the crawl
        # aborts with an exception.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
        browser.quit()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/ismallcode/nowcoderSpider.git
git@gitee.com:ismallcode/nowcoderSpider.git
ismallcode
nowcoderSpider
nowcoderSpider
master

搜索帮助