# 代码拉取完成,页面将自动刷新
'''Copyright (c) [2019] [范志俊]
[nowcoderSpider] is licensed under the Mulan PSL v1.
You can use this software according to the terms and conditions of the Mulan PSL v1.
You may obtain a copy of Mulan PSL v1 at:
http://license.coscl.org.cn/MulanPSL
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
PURPOSE.
See the Mulan PSL v1 for more details.'''
import re
import pymysql
from selenium import webdriver
import time
import random
# 登录
def login():
    """Open the page at the module-level ``url`` and submit the login form.

    Uses the module-level ``browser`` (a Selenium WebDriver): types the
    placeholder credentials into the e-mail and password inputs, then
    clicks the login button.  Returns nothing.
    """
    print('登录')
    browser.get(url)
    # (input element id, text to type) pairs, filled in this order.
    credentials = (
        ('jsEmailIpt', 'XXXXX'),     # replace XXXXX with your account
        ('jsPasswordIpt', 'XXXXX'),  # replace XXXXX with your password
    )
    for field_id, text in credentials:
        browser.find_element_by_id(field_id).send_keys(text)
    browser.find_element_by_id('jsLoginBtn').click()
# 当前页数据采集
def get_page_data(url):
    """Scrape every paper listed on the current list page into the database.

    For each element with class `` js-go-summary`` on the page currently
    loaded in the module-level ``browser``, the paper is opened, submitted,
    and each of its questions is inserted into the ``nowcoder`` table through
    the module-level ``cursor``.  After a paper is processed the browser is
    sent back to ``url`` and the inserts are committed on ``conn``.

    A failure while processing one paper is printed and that paper is
    skipped; the browser is still sent back to ``url`` so the next paper can
    be tried.  (The original code had an ``except`` clause with no matching
    ``try`` for this purpose; the ``try`` is now explicit.)

    :param url: address of the list page to return to after each paper.
    :return: None
    """
    time.sleep(random.randint(1,3))
    item_element = browser.find_elements_by_class_name(' js-go-summary')
    print('获取当前页的数据')
    for item_index, _ in enumerate(item_element):
        try:
            _scrape_paper(item_index)
        except Exception as e:
            # Best effort: report the problem, go back to the list, try the next paper.
            print(e)
            browser.get(url)
            continue
        print('*'*100)
        browser.get(url)
        # Commit once per successfully processed paper.
        conn.commit()


def _scrape_paper(item_index):
    """Open the ``item_index``-th listed paper, submit it and insert its questions."""
    print('访问面试题,第{}题'.format(item_index+1))
    # Scroll down 500px for every five items before clicking the entry.
    browser.execute_script("window.scrollTo(0,{})".format(500 * (item_index // 5)))
    # The list is looked up again on every call rather than reusing earlier
    # element references (presumably they go stale after navigation).
    browser.find_elements_by_class_name(' js-go-summary')[item_index].click()
    print('提交试卷')
    paper_name = browser.find_element_by_xpath('//span[@class="js-paper-name"]').text
    browser.find_element_by_id("next").click()
    if 'login' in browser.current_url:
        login()
    try:
        browser.find_element_by_id('aheadFinish').click()
        browser.find_element_by_xpath('//div[@class="pop-footer clearfix"]/a[1]').click()
    except Exception:
        # Fall back to clicking the "next" element when the early-finish
        # controls cannot be used.
        browser.find_element_by_id('next').click()
    print('遍历题目和答案')
    browser.find_element_by_xpath('//ul[@class="menu clearfix"]/li[2]/a').click()
    li_element = browser.find_elements_by_xpath('//ul[@class="subject-num-list"]/li')
    time.sleep(random.randint(2, 4))
    for index, _ in enumerate(li_element):
        browser.find_elements_by_xpath('//ul[@class="subject-num-list"]/li')[index].click()
        question = browser.find_element_by_class_name('question-main').text
        answer_result, result = _read_answer()
        tag = [ele.text for ele in browser.find_elements_by_xpath('//a[@class="tag-label"]')]
        cursor.execute('insert into nowcoder value (NULL,%s,%s,%s,%s,%s,%s,%s)',
                       (question, answer_result, '|'.join(result), browser.current_url,
                        '|'.join(tag), paper_name, index))


def _read_answer():
    """Return ``(answer_result, result)`` for the question currently displayed.

    ``answer_result`` is the text captured between the "正确答案: " and
    "你的答案: " markers of the answer heading; when the heading does not
    match, the text of the ``design-answer-box`` element is used, or ``''``
    if that lookup fails.  ``result`` is the list of texts of the ``<pre>``
    elements under the answer block.  If anything else fails, ``('', [])``
    is returned.
    """
    try:
        answer_get = browser.find_element_by_xpath(
            '//div[@class="result-subject-item result-subject-answer"]/h1').text
        # NOTE(review): `(.*)?` is a greedy optional group; `(.*?)` may have
        # been intended -- confirm against real page text before changing.
        answer = re.findall('正确答案: (.*)?你的答案: ', answer_get)
        if answer:
            answer_result = answer[0]
        else:
            try:
                # Fixed: the original called `.text` on the list returned by
                # find_elements_by_xpath (with a non-element `string(...)`
                # expression), which always raised and yielded ''.
                answer_result = browser.find_element_by_xpath(
                    '//div[@class="design-answer-box"]').text
            except Exception:
                answer_result = ''
        result = [ele.text for ele in browser.find_elements_by_xpath(
            '//div[@class="result-subject-item result-subject-answer"]/div/pre')]
    except Exception:
        return '', []
    return answer_result, result
if __name__=='__main__':
    # Script entry point: log in, then scrape list pages one after another
    # until the "下一页" (next page) link can no longer be clicked.
    #
    # url, browser, conn and cursor are intentionally module-level names:
    # login() and get_page_data() read them as globals.
    # NOTE(review): database host and credentials are hard-coded here.
    url='https://www.nowcoder.com/login?callBack=https%3A%2F%2Fwww.nowcoder.com%2FcontestRoom'
    browser = webdriver.Chrome(executable_path='chromedriver.exe')
    try:
        conn=pymysql.connect(host='172.16.100.90',user='root',passwd='123456',db='contestRoom',charset='utf8')
        try:
            cursor=conn.cursor()
            try:
                login()
                while True:
                    # Keep the current page address in its own variable so the
                    # module-level `url` (used by login()) keeps pointing at
                    # the login page.
                    page_url=browser.current_url
                    print('正在访问:{}'.format(page_url))
                    get_page_data(page_url)
                    print('点击下一页')
                    try:
                        browser.find_element_by_link_text('下一页').click()
                    except Exception as e:
                        # The next-page link could not be clicked: stop paging.
                        print(e)
                        break
                    print('Next')
                print('完成')
            finally:
                cursor.close()
        finally:
            conn.close()
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        browser.quit()
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。