2018年10月15日 星期一

[Python] 從中央氣象局下載地震活動彙整列表

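氣象局網站提供「地震活動彙整」列表。檢視網頁內容後可知, 真正的資料網頁連結為 https://scweb.cwb.gov.tw/Page.aspx/?ItemId=20&loc=tw&adv=1 。因為是 ASP 網頁, 所以需先取出 ASP 的傳遞參數 (__VIEWSTATE、__VIEWSTATEGENERATOR、__VIEWSTATEENCRYPTED、__EVENTVALIDATION 等隱藏欄位), 再連同查詢的年、月一併送出; 資料表格的 class 為 datalist4, 使用 BeautifulSoup 即可取出表格內容。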

# -*- coding: utf-8 -*-
"""
Created on Sun Oct  15 11:53:25 2018
@author: ghosty
@Program: cwbweb_list.py
@Purpose: download quake data from the CWB web source: https://www.cwb.gov.tw/V7/earthquake/rtd_eq.htm
"""

import requests
from bs4 import BeautifulSoup
#import dateutil
  
def _hiddenFieldValue(soup, fieldId):
    """Return the ``value`` of the ``<input>`` with id *fieldId*, or ''."""
    field = soup.find('input', id=fieldId)
    if field is None:
        return ''
    return field.get('value', '')


def downloadCWBweb(year,month,timeout=30):
    """Download the CWB earthquake activity list for one month.

    The page is an ASP form, so the hidden state fields are first read from
    an initial request and then posted back together with the year/month
    selection.

    Args:
        year: Year to query, as an int (e.g. 2018).
        month: Month to query, as an int; sent zero-padded to two digits.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises a timeout error.

    Returns:
        A list with one entry per table row that contains ``<td>`` cells;
        each entry is the list of that row's stripped, non-empty cell texts.
        An empty list is returned when either request fails or the
        ``datalist4`` table is not present in the reply.
    """
    url = 'https://scweb.cwb.gov.tw/Page.aspx/?ItemId=20&loc=tw&adv=1'
    agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    headers = {'Content-type': 'application/x-www-form-urlencoded',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'User-Agent': agent}

    # First round trip: fetch the form so the hidden state fields can be read.
    response1 = requests.post(url, timeout=timeout)
    if response1.status_code != requests.codes.ok:
        print("CWB requesr fail")
        return []

    soup = BeautifulSoup(response1.text, "lxml")

    # Echo the hidden state fields back under their exact names. The original
    # stored the encrypted flag under the misspelled key '__VIEWSTATEENCRYPTE',
    # which left the real '__VIEWSTATEENCRYPTED' field empty.
    stateFields = ('__VIEWSTATE', '__VIEWSTATEGENERATOR',
                   '__VIEWSTATEENCRYPTED', '__EVENTVALIDATION')
    payload = {name: _hiddenFieldValue(soup, name) for name in stateFields}
    # Form controls are posted by name ('$' separator), not by element id.
    payload['ctl03$ddlYear'] = "{:4d}".format(year)
    payload['ctl03$ddlMonth'] = "{:0>2d}".format(month)
    payload['ctl03$btnSearch'] = ''

    # Second round trip: submit the search for the requested month.
    response2 = requests.post(url, data=payload, headers=headers,
                              timeout=timeout)
    if response2.status_code != requests.codes.ok:
        print("CWB requesr fail")
        return []

    soup2 = BeautifulSoup(response2.text, "lxml")
    table = soup2.find('table', attrs={'class':'datalist4'})
    if table is None:
        # Without this guard a changed or error page raised AttributeError.
        print("CWB data table not found")
        return []

    quakeData = []
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if cells:  # skip rows without <td> cells (e.g. header rows)
            quakeData.append([text for text in cells if text])  # drop empty cells

    return quakeData

# Example run: fetch the list for August 2018 and print one row per line.
quakeData = downloadCWBweb(2018,8)
# The download may yield None (e.g. after a failed request); fall back to an
# empty list so the loop does not raise TypeError.
for data in quakeData or []:
    print(data)


沒有留言:

張貼留言