master

分支 (1)

管理

管理

master

python-noaa-data-use-selenium
/
noaa.py

# coding=utf-8
import re
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Pause length in seconds; increase it when the network is slow
lower_network = 3
# Parameter: the datum type to scrape (also used as the output file name)
type = "MSL"
# Units (0 means Feet, 1 means Meters)
units = 0
# Epoch (0 means Present, 1 means Superseded)
epoch = 0
# Page to crawl, assembled from the three parameters above
main_url = ''.join([
    'https://tidesandcurrents.noaa.gov/datums.html?datum=', type,
    '&units=', str(units),
    '&epoch=', str(epoch),
])
# Location of the local chromedriver binary
path = '/usr/bin/chromedriver'
# Create and initialise the webdriver
# NOTE(review): recent Selenium releases appear to have dropped the
# `executable_path` keyword in favour of a Service object - confirm against
# the installed Selenium version.
browser = webdriver.Chrome(executable_path=path)
# Explicit-wait helper bound to the browser (10 second timeout)
wait = WebDriverWait(browser, 10)

# Accumulators for the final output; each is its own independent list
data_id, data_value, data_name, data_state, data_urls = [], [], [], [], []


# Save the data to a local file
def text_save(content, filename, mode='w+'):
    """Write the collected records to ``<filename>.txt``, one per line.

    Each line has the form ``id;name;state;value;url``. One line is written
    for every entry of the names list.

    Args:
        content: Indexable holding five parallel sequences of strings, in
            the order ids, names, values, states, URLs.
        filename: Output path without the ``.txt`` extension.
        mode: Mode handed to ``open``; the default ``'w+'`` truncates an
            existing file.

    Raises:
        IndexError: If ``content`` has fewer than five entries or one of the
            other sequences is shorter than the names sequence.
    """
    d_id = content[0]
    d_name = content[1]
    d_value = content[2]
    d_state = content[3]
    d_urls = content[4]
    # `with` guarantees the handle is closed even when a lookup or write
    # fails part-way; the explicit encoding keeps the output independent of
    # the platform's default locale.
    with open(filename + '.txt', mode, encoding='utf-8') as out:
        for i, name in enumerate(d_name):
            record = ';'.join((d_id[i], name, d_state[i], d_value[i], d_urls[i]))
            out.write(record + '\n')


# Fetch the data
def get_data():
    """Scrape the configured datum value for every station on the page.

    Reads the ``stationselect`` drop-down out of ``browser.page_source``,
    requests each station's datum page with ``urllib``, and pulls out the
    table cell for the datum named by the module-level ``type``. Results are
    appended to the module-level ``data_*`` lists and, once every station
    has been processed, written to disk through ``text_save``.

    Empty or ``-`` datum cells are recorded as ``'0.00'``.

    Raises:
        IndexError: If the station drop-down, a station number or the datum
            row cannot be found in the HTML. Errors raised by ``urlopen``
            or by decoding the response also propagate.
    """
    html = browser.page_source
    # Group 2 of the first match is the inner HTML of the station <select>.
    selects = re.findall(
        r'<select id="stationselect"(.+?)class="chzn-done" style="display: none;">(.+?)</select>',
        html)
    options = re.findall(r'<option\svalue="(.+?)".*?>(.+?)</option>', selects[0][1])
    # options = options[0:15]  # uncomment to test with 15 stations only

    # Loop-invariant patterns, built once instead of on every iteration.
    number_pattern = re.compile(r'\d+\.?\d*')
    datum_pattern = re.compile(
        '<td><a href="/datum_options.html#' + type + '">' + type + '</a></td><td>(.*?)</td>')

    for option in options:
        starttime = time.time()
        item = option[0]
        # The option value starts with the station number; the text after
        # the following separator is the name, then a comma, then the state.
        id_length = len(number_pattern.findall(item)[0])
        station_id = item[:id_length]
        name, _, state = item[id_length + 1:].rpartition(',')
        state = state.strip()
        sub_url = main_url + '&id=' + station_id + '&name=' + name + '&state=' + state
        sub_url = sub_url.replace(' ', '%20')

        # Fetch the station page; `with` closes the HTTP response.
        with urllib.request.urlopen(sub_url) as response:
            html1 = response.read().decode('utf-8')

        # Keep the stripped value. The old code called msl[0].strip(), which
        # discarded the result and raised IndexError on an empty cell, so
        # the fallback below could never run.
        msl = datum_pattern.findall(html1)[0].strip()
        if msl in ('', '-'):
            msl = '0.00'

        data_id.append(station_id)
        data_name.append(name)
        data_value.append(msl)
        data_state.append(state)
        data_urls.append(sub_url)
        dtime = time.time() - starttime
        print('Records(' + str(len(data_id)) + ')' + station_id + '&name=' + name + '&state=' + state
              + '===>' + msl + '【' + sub_url + '】' + 'Running time: %s Seconds' % dtime)
        # Pause before the next request
        time.sleep(lower_network)

    text_save([data_id, data_name, data_value, data_state, data_urls], type)


# Initialisation: open a starting page
init_url = main_url + '&id=1611347&name=PORT+ALLEN%2C+HANAPEPE+BAY%2C+KAUAI+ISLAND&state=HI'
browser.get(init_url)
handles = browser.window_handles
# Wait for the station drop-down first, then scrape. get_data() must not be
# written as an argument of until(): arguments are evaluated before the call,
# so the scrape would run before the wait started and its None result would
# be passed as the timeout message.
wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
get_data()