Ai
1 Star 0 Fork 1

Jason/python-noaa-data-use-selenium

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
noaa.py 3.33 KB
一键复制 编辑 原始数据 按行查看 历史
B.K. 提交于 2019-11-19 13:25 +08:00 . 修改变量大小写,更新注释内容
# coding=utf-8
import re
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Pause between two station requests, passed to time.sleep(); use a larger
# value when the network is slow.
lower_network = 3
# Parameter: the datum type to scrape (also used as the output file name).
# NOTE(review): this name shadows the builtin `type` for the whole module.
type = "MSL"
# units: 0 means Feet, 1 means Meters.
units = 0
# epoch: 0 means Present, 1 means Superseded.
epoch = 0
# Page to crawl; the datum type, units and epoch are sent as query parameters.
main_url = 'https://tidesandcurrents.noaa.gov/datums.html?datum=' + type + '&units=' + str(units) + '&epoch=' + str(epoch)
# Path of the local chromedriver binary.
path = '/usr/bin/chromedriver'
# Create and initialise the webdriver (this starts a Chrome session on import).
# NOTE(review): `executable_path` looks deprecated in newer Selenium
# releases -- confirm against the installed version.
browser = webdriver.Chrome(executable_path=path)
# Explicit-wait helper bound to the browser, with a timeout of 10.
wait = WebDriverWait(browser, 10)
# Accumulators for the final output; filled in parallel, one entry per station.
data_id = []
data_value = []
data_name = []
data_state = []
data_urls = []
# Save the scraped data to a local text file
def text_save(content, filename, mode='w+'):
    """Write station records to ``<filename>.txt``, one record per line.

    Each output line has the form ``id;name;state;value;url`` followed by
    a newline.

    Args:
        content: Sequence of five parallel lists of strings, in the order
            ``[ids, names, values, states, urls]``. Records are paired
            positionally; the lists are expected to have equal length.
        filename: Output path *without* the ``.txt`` extension.
        mode: Mode passed to ``open``. The default ``'w+'`` truncates an
            existing file.
    """
    d_id = content[0]
    d_name = content[1]
    d_value = content[2]
    d_state = content[3]
    d_urls = content[4]
    # The context manager guarantees the handle is closed even if a write
    # fails part-way through.
    with open(filename + '.txt', mode) as file:
        for rec_id, name, state, value, url in zip(d_id, d_name, d_state, d_value, d_urls):
            file.write(';'.join((rec_id, name, state, value, url)) + '\n')
# Fetch the data for every station
def get_data():
    """Scrape the configured datum value for every station on the page.

    Reads the station ``<select>`` element from the page currently loaded
    in the module-level ``browser``, requests each station's datum page
    with ``urllib``, extracts the value of the datum named by the
    module-level ``type``, appends the results to the module-level
    ``data_*`` lists and finally writes them out through ``text_save``.

    Each option value is parsed as ``<id><sep><name>,<state>``: the
    leading number is the station id, the text after the last comma is
    the state, and everything in between is the station name.

    Raises:
        IndexError: If the station selector is not found in the page
            source, or a station page has no row for the datum.
    """
    html = browser.page_source
    selects = re.findall(
        r'<select id="stationselect"(.+?)class="chzn-done" style="display: none;">(.+?)</select>',
        html)
    options = re.findall(r'<option\svalue="(.+?)".*?>(.+?)</option>', selects[0][1])
    # options = options[0:15]  # limit to 15 entries when testing

    # Loop-invariant patterns are built once. The datum name is escaped so
    # that it is always matched literally.
    datum = re.escape(type)
    value_pattern = re.compile(
        '<td><a href="/datum_options.html#' + datum + '">' + datum + '</a></td><td>(.*?)</td>')
    id_pattern = re.compile(r'\d+\.?\d*')

    for option in options:
        start_time = time.time()
        item = option[0]
        # The id must be at the very start of the value, because it is
        # sliced from index 0 below. Entries without one are skipped.
        id_match = id_pattern.match(item)
        if id_match is None:
            continue
        station_id = id_match.group(0)
        # Skip the single separator character after the id, then split the
        # remainder at its last comma into name and state.
        name, _, state = item[len(station_id) + 1:].rpartition(',')
        state = state.strip()
        sub_url = main_url + '&id=' + station_id + '&name=' + name + '&state=' + state
        sub_url = sub_url.replace(' ', '%20')

        # Fetch the station page. The context manager closes the response.
        with urllib.request.urlopen(sub_url) as response:
            station_html = response.read().decode('utf-8')
        # Strip BEFORE comparing: the original called msl[0].strip(), which
        # discarded its result and raised IndexError on an empty value.
        msl = value_pattern.findall(station_html)[0].strip()
        if msl in ('', '-'):
            msl = '0.00'

        data_id.append(station_id)
        data_name.append(name)
        data_value.append(msl)
        data_state.append(state)
        data_urls.append(sub_url)

        elapsed = time.time() - start_time
        print('Records(' + str(len(data_id)) + ')' + station_id + '&name=' + name
              + '&state=' + state + '===>' + msl + '【' + sub_url + '】'
              + 'Running time: %s Seconds' % elapsed)
        # Pause for a while between requests.
        time.sleep(lower_network)

    text_save([data_id, data_name, data_value, data_state, data_urls], type)
# Initialisation: open a known station page so the station selector is available.
init_url = main_url + '&id=1611347&name=PORT+ALLEN%2C+HANAPEPE+BAY%2C+KAUAI+ISLAND&state=HI'
try:
    browser.get(init_url)
    handles = browser.window_handles
    # Wait for the station selector FIRST, then scrape. The original passed
    # get_data() as the second argument of wait.until(), which ran the
    # scrape before the wait started and handed its None result to the
    # `message` parameter.
    wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
    get_data()
finally:
    # Always end the session so the Chrome/chromedriver processes do not
    # outlive the script, even when the scrape fails.
    browser.quit()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/xujason/python-noaa-data-use-selenium.git
git@gitee.com:xujason/python-noaa-data-use-selenium.git
xujason
python-noaa-data-use-selenium
python-noaa-data-use-selenium
master

搜索帮助