由于《中国地面气象站基本要素观测资料表.pdf》是 PDF 格式,无法直接读入,我们需要借助 Python 的 pdfplumber 库把其中的表格提取出来,保存为 CSV 文件。
# Merge each station's observation files into one txt file per year.

# Imports
import csv
import os
import shutil
import zipfile

import pandas as pd

# Work inside the folder that holds the downloaded archives.
os.chdir(r"D:\中国气象数据")
os.getcwd()
# Unpack every zip archive found in the working directory.
# Match on the file extension so that names which merely contain ".zip"
# (for example "data.zip.bak") are not handed to ZipFile.
files = [file for file in os.listdir() if file.endswith(".zip")]
for file in files:
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(file) as zip_file:
        # Extract all members into the current working directory.
        zip_file.extractall()
def append_station_id(content, station):
    """Return *content* with " <station>" appended to every line.

    Every line of the result ends with a newline, including the last one
    when *content* itself does not end with a newline. Without that, the
    final record of a file would lose its station id and run into the first
    record of the next file in the merged output.

    Args:
        content: Full text of one station file.
        station: Station id to append as the last space-separated field.

    Returns:
        The tagged text; an empty string when *content* is empty.
    """
    lines = content.split("\n")
    # A trailing newline produces one empty final element that is not a record.
    if lines and lines[-1] == "":
        lines.pop()
    return "".join("{} {}\n".format(line, station) for line in lines)


# Write all files of one yearly folder into a single "<folder>.txt".
files = [
    file for file in os.listdir()
    if os.path.isdir(file) and "china" in file
]
for file in files:
    # Open the merged file once per folder instead of once per station file.
    with open(file + ".txt", "w", encoding="utf-8") as merged:
        for subfile in os.listdir(file):
            # NOTE(review): input is still read with the platform default
            # encoding, exactly as before.
            with open(os.path.join(file, subfile), "r") as f:
                content = f.read()
            # The station id is the part of the file name before the first "-".
            station = subfile.split("-")[0]
            merged.write(append_station_id(content, station))
    print("整理完文件夹{}!".format(file))
# Remove the extracted folders; `files` holds the folder names collected above.
for folder in files:
    if not os.path.exists(folder):
        continue
    shutil.rmtree(folder)
# Export the table found on every page of the PDF into one gb18030 CSV file.
# NOTE(review): `pdf` is not defined in the visible code; presumably it is a
# document opened with pdfplumber -- confirm where it is created.
with open("./_数据说明/台站表.csv", "w", newline="", encoding="gb18030") as f:
    writer = csv.writer(f)
    # The original truncation step emitted one empty row; it is kept so the
    # CSV content stays the same for downstream readers.
    writer.writerow("")
    # The file is opened once for all pages rather than once per row.
    for page_number, page in enumerate(pdf.pages, start=1):
        table = page.extract_table()
        # A page without a detectable table yields None; skip it instead of
        # failing when iterating over the rows.
        if table is not None:
            writer.writerows(table)
        # Report 1-based page numbers.
        print("整理完第{}页!".format(page_number))
在完成上述工作后,我们就可以用 Stata 来合并站点 ID 数据和气象数据,并根据年度和省份分组计算「省份年度气象数据」。
**# 1 Station table of the China surface weather station observations
* Read the station list exported to CSV and save it as 台站表.dta with a
* string `station` key, which the yearly observation data are merged on.
* The CSV is written with gb18030 encoding, so the encoding is stated
* explicitly rather than left to the default.
import delimited using "./_数据说明/台站表.csv", encoding("gb18030") clear

* Appending "0" to the station number gives the station ID.
tostring 区站号, replace
replace 区站号 = 区站号 + "0"

rename 省份 province
rename 区站号 station
rename 站名 location
* 纬度 is latitude and 经度 is longitude (both in degrees-minutes); the
* earlier names `altitude` and `latitude` mislabelled these two columns.
rename 纬度度分 latitude
rename 经度度分 longitude
rename 气压传感器拔海高度米 sensor_sea_level
rename 观测场拔海高度米 station_sea_level

save 台站表.dta, replace
**# 2 Build province-level annual weather data
* For every txt file in the working directory: split the single-column
* record into fields, attach the station table, average to station-day,
* average to province, and save one dta per file under ./tmp.
capture mkdir tmp
capture fs *.txt
foreach file in `r(files)' {
    import delimited using "`file'", clear

    * Collapse repeated internal blanks, then split on a single space.
    replace v1 = stritrim(v1)
    split v1, parse(" ")
    drop v1

    * v11-v112 are converted to numbers in record order; v113 is kept as
    * the string station id.
    local fields year month day hour air_temperature dew_point_temperature ///
        sea_level_pressure wind_direction wind_speed cloud_cover ///
        precipitation_one_hour precipitation_six_hour
    local k = 0
    foreach field of local fields {
        local ++k
        generate `field' = real(v1`k')
    }
    generate station = v113
    keep year-station

    * Keep only observations whose station is present in the station table.
    merge m:1 station using 台站表
    keep if _merge == 3
    drop _merge

    * Daily mean for each station
    foreach var of varlist air_temperature-precipitation_six_hour {
        * -9999 is replaced with a missing value before averaging.
        replace `var' = . if `var' == -9999
        bysort year month day station: egen `var'_day = mean(`var')
    }
    bysort year month day station: keep if _n == _N

    * Annual mean for each province
    foreach var of varlist air_temperature_day-precipitation_six_hour_day {
        local varname = subinstr("`var'", "_day", "", .)
        bysort province: egen `varname'_year = mean(`var')
    }
    bysort province: keep if _n == _N

    keep year province air_temperature_year-precipitation_six_hour_year
    order year province air_temperature_year-precipitation_six_hour_year

    * Save under the txt file's base name.
    local dtaname = subinstr("`file'", ".txt", "", .)
    save "./tmp/`dtaname'", replace
}
* Stack the per-file results into one dataset
clear
fs "./tmp/*.dta"
foreach dta in `r(files)' {
    append using "./tmp/`dta'"
}

* Dropped because it has too many missing values
drop precipitation_one_hour_year
sort province year
save 省级气象数据.dta, replace

* Delete the temporary folder
shell rmdir /s/q "./tmp"