Python data cleaning: Haiti earthquake data analysis (marking help requests on a map)
We need to analyze the Haiti earthquake help-request data (from GitHub) and plot the distribution of the help requests.
1. Inspect and clean the data.
2. Build the dummy_frame to prepare the point distribution.
3. Show the help requests on a map.

1. Inspect and clean the data.
import pandas as pd
import numpy as np
%matplotlib inline
%matplotlib notebook
import matplotlib.pyplot as plt
from pandas import DataFrame
data = pd.read_csv('data/haiti/Haiti.csv')
data.info()
#<class 'pandas.core.frame.DataFrame'>
#RangeIndex: 3593 entries, 0 to 3592
#Data columns (total 10 columns):
# #   Column          Non-Null Count  Dtype
#---  ------          --------------  -----
# 0   Serial          3593 non-null   int64
# 1   INCIDENT TITLE  3593 non-null   object
# 2   INCIDENT DATE   3593 non-null   object
# 3   LOCATION        3592 non-null   object
# 4   DESCRIPTION     3593 non-null   object
# 5   CATEGORY        3587 non-null   object
# 6   LATITUDE        3593 non-null   float64
# 7   LONGITUDE       3593 non-null   float64
# 8   APPROVED        3593 non-null   object
# 9   VERIFIED        3593 non-null   object
#dtypes: float64(2), int64(1), object(7)
#memory usage: 182.5+ KB
# Inspect the data
# Check 1: incident dates and coordinates
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]
#      INCIDENT DATE   LATITUDE   LONGITUDE
#0  05/07/2010 17:26  18.233333  -72.533333
#1  28/06/2010 23:06  50.226029    5.729886
#2  24/06/2010 16:21  22.278381  114.174287
#3  20/06/2010 21:59  44.407062    8.933989
#4  18/05/2010 16:26  18.571084  -72.334671
#5  26/04/2010 13:14  18.593707  -72.310079
#6  26/04/2010 14:19  18.482800  -73.638800
#7  26/04/2010 14:27  18.415000  -73.195000
#8  15/03/2010 10:58  18.517443  -72.236841
#9  15/03/2010 11:00  18.547790  -72.410010

# Check 2: CATEGORY holds comma-separated category strings
data['CATEGORY'][:6]
#0          1. Urgences | Emergency, 3. Public Health,
#1    1. Urgences | Emergency, 2. Urgences logistiqu...
#2    2. Urgences logistiques | Vital Lines, 8. Autr...
#3                             1. Urgences | Emergency,
#4                             1. Urgences | Emergency,
#5                        5e. Communication lines down,
#Name: CATEGORY, dtype: object

# Check 3: summary statistics (note the out-of-range coordinate extremes)
data.describe()
#            Serial     LATITUDE    LONGITUDE
#count  3593.000000  3593.000000  3593.000000
#mean   2080.277484    18.611495   -72.322680
#std    1171.100360     0.738572     3.650776
#min       4.000000    18.041313   -74.452757
#25%    1074.000000    18.524070   -72.417500
#50%    2163.000000    18.539269   -72.335000
#75%    3088.000000    18.561820   -72.293570
#max    4052.000000    50.226029   114.174287

# Remove rows with coordinates outside Haiti and rows with a missing category
data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
            (data.LONGITUDE > -75) & (data.LONGITUDE < -70) &
            data.CATEGORY.notnull()]
data.describe()
#            Serial     LATITUDE    LONGITUDE
#count  3569.000000  3569.000000  3569.000000
#mean   2081.498459    18.592503   -72.424994
#std    1170.311824     0.273695     0.291018
#min       4.000000    18.041313   -74.452757
#25%    1074.000000    18.524200   -72.417498
#50%    2166.000000    18.539269   -72.335000
#75%    3089.000000    18.561800   -72.293939
#max    4052.000000    19.940630   -71.099489
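As a quick sanity check (not in the original post; column names follow the dataset above), you can count how many rows the bounding-box filter and the CATEGORY check will drop before applying them:

# Hypothetical sanity check: rows outside Haiti's bounding box or missing CATEGORY
raw = pd.read_csv('data/haiti/Haiti.csv')
outside_box = ~((raw.LATITUDE > 18) & (raw.LATITUDE < 20) &
                (raw.LONGITUDE > -75) & (raw.LONGITUDE < -70))
print('rows with out-of-range coordinates:', outside_box.sum())
print('rows with missing CATEGORY:', raw.CATEGORY.isnull().sum())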
2. Build the dummy_frame to prepare the point distribution.
# Turn a comma-separated category string into a list
def to_cat_list(catstr):
    stripped = (x.strip() for x in catstr.split(','))
    return [x for x in stripped if x]

# Collect the sorted set of all categories appearing in a Series
def get_all_categories(cat_series):
    cat_sets = (set(to_cat_list(x)) for x in cat_series)
    return sorted(set.union(*cat_sets))

# Split a category like '1. Urgences | Emergency' into its code and English name
def get_english(cat):
    code, names = cat.split('.')
    if '|' in names:
        names = names.split('|')[1]
    return code, names.strip()
# Extract all categories and map each code to its English name
all_cats = get_all_categories(data.CATEGORY)
english_mapping = dict(get_english(x) for x in all_cats)
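A small illustrative check (not in the original post) of what these helpers produce; the sample string is the first CATEGORY value from the preview above:

sample = '1. Urgences | Emergency, 3. Public Health,'
print(to_cat_list(sample))
# ['1. Urgences | Emergency', '3. Public Health']
print(get_english('1. Urgences | Emergency'))
# ('1', 'Emergency')
print(english_mapping['1'])
# 'Emergency' (assuming the code-to-name mapping built above)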
# Build the dummy (indicator) table
# Extract the code part; works on both sets and lists of category strings
def get_code(seq):
    return [x.split('.')[0] for x in seq if x]

all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = pd.DataFrame(np.zeros((len(data), len(code_index))),
                           index=data.index, columns=code_index)
print(dummy_frame.iloc[:, :])
#     1   1a   1b   1c   1d    2   2a   2b   2c   2d  ...   7c   7d   7g
#0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0
#4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0
#5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0
#6  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0
#7  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0
# Fill the indicator table: extract each row's category codes and set them to 1
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix('category_'))
data.iloc[:, :]
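With the indicator columns joined in, a quick hedged summary (not in the original post) of how many requests each code received:

# Count requests per category code using the category_ indicator columns
counts = data.filter(like='category_').sum().sort_values(ascending=False)
counts.index = [c.replace('category_', '') for c in counts.index]
print(counts.head(10))

As a design note, pandas' Series.str.get_dummies could build the same indicator table in one call, but the explicit loop above keeps the code-extraction step visible.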
3. Show the help requests on a map.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    # Stereographic projection centred on the bounding box
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2,
                lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m
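A minimal usage sketch (an assumed addition) that plots all cleaned reports on one map before breaking them out by category; note that resolution='f' draws full-resolution coastlines and can be slow, so 'i' (intermediate) may be enough while iterating:

# Hypothetical quick test of the map helper on a single axes
fig, ax = plt.subplots(figsize=(6, 6))
m = basic_haiti_map(ax)
x, y = m(data.LONGITUDE.values, data.LATITUDE.values)
m.plot(x, y, 'k.', alpha=0.3)
ax.set_title('All reports')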
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat = 17.25; urlat = 20.25; lllon = -75; urlon = -71

for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    cat_data = data[data['category_%s' % code] == 1]
    x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)
    m.plot(x, y, 'k.', alpha=0.5)
    ax.set_title('%s: %s' % (code, english_mapping[code]))
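If this runs as a script rather than in a notebook, a hedged addition to render and save the panel figure (the filename is arbitrary):

fig.savefig('haiti_categories.png', dpi=150, bbox_inches='tight')
plt.show()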
PS: Draw a map of China and mark the locations of Zhongshan, Chengdu, Chongqing, and Kunshan.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
lllat = 2     # lower-left corner latitude
lllon = 72    # lower-left corner longitude
urlat = 55    # upper-right corner latitude
urlon = 135   # upper-right corner longitude
code = 'China : ZS,CD,CQ,KS'
LON = [113.38, 104.07, 106.33, 120.98]
LAT = [22.52, 30.67, 29.35, 31.38]

m = basic_haiti_map(ax=axes, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
for i in range(len(LON)):
    x, y = m(LON[i], LAT[i])
    m.scatter(x, y, s=100, marker='o', color='#FF5600')
axes.set_title(code)
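To label each marker with its city name, an assumed extension (city names spelled out from the ZS/CD/CQ/KS abbreviations above):

names = ['Zhongshan', 'Chengdu', 'Chongqing', 'Kunshan']
for name, lon, lat in zip(names, LON, LAT):
    x, y = m(lon, lat)
    # Offset the text a few points so it does not overlap the marker
    axes.annotate(name, xy=(x, y), xytext=(5, 5),
                  textcoords='offset points', fontsize=9)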
Original post: https://blog.csdn.net/m0_46629123/article/details/108876138