变量关系可视化展示
做数据分析之前,先对数据做可视化展示非常重要。虽然可以直接调用绘图库,但很多画图参数仍然需要手动调整,比较麻烦,所以这里总结一下最近用到的模板,以后画图时可以直接调用。其实 matplotlib 和 seaborn 官网的可视化教程做得非常好,有时间一定要亲手敲一遍,链接如下:
CATEGORICAL X CATEGORICAL
CATEGORICAL X CONTINUOUS
- Box plots of continuous for each category
- Violin plots of continuous distribution for each category
- Overlaid histograms (if 3 or less categories)
CONTINUOUS X CONTINUOUS
下面直接上我的实例:
Target: GDP per capita
All of our variables are continuous so this will be a CONTINUOUS X CONTINUOUS analysis.
# Draw the scatter plot directly with matplotlib.
# seasonal_variability: seasonal variability (WRI).
# Alternative through the pandas plotting API:
# recent.drop('gdp_bin', axis=1).astype(float).plot(x='seasonal_variability',y='gdp_per_capita', kind='scatter');
plt.scatter(recent['seasonal_variability'], recent['gdp_per_capita'])
plt.xlabel('Seasonal variability');
plt.ylabel('GDP per capita ($USD/person)');
散点图模板
def plot_scatter(df, x, y, xlabel=None, ylabel=None, title=None,
                 logx=False, logy=False, by=None, ax=None):
    """Draw a scatter plot of column ``y`` against column ``x`` of ``df``.

    Args:
        df: Table holding the data; it is indexed with ``df[x]`` / ``df[y]``
            and, when ``by`` is given, grouped with ``df.groupby(by)``.
        x: Name of the column plotted on the horizontal axis.
        y: Name of the column plotted on the vertical axis.
        xlabel: Horizontal axis label; falls back to ``x`` when not given.
        ylabel: Vertical axis label; falls back to ``y`` when not given.
        title: Optional axes title.
        logx: Use a logarithmic horizontal axis when true.
        logy: Use a logarithmic vertical axis when true.
        by: Optional grouping key; every group is drawn in its own color and
            listed in a legend.
        ax: Axes to draw on. A new 12x10 inch figure is created when omitted.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(12, 10))

    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        for j, (name, group) in enumerate(df.groupby(by)):
            # Wrap around the color cycle: with more groups than colors the
            # colors are reused instead of raising IndexError.
            ax.scatter(group[x], group[y],
                       color=colors[j % len(colors)], label=name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color=colors[0])

    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    if title:
        ax.set_title(title)
    return ax
Joint plot
Now we might also want to understand the relationship between the two variables and the distribution of each individually. For this we can use the joint plot from seaborn.
# Min/max of both variables, reused below as histogram ranges and y-axis limits.
svr = [recent['seasonal_variability'].min(), recent['seasonal_variability'].max()]
gdpr = [recent['gdp_per_capita'].min(), recent['gdp_per_capita'].max()]
# 25 logarithmically spaced values between the smallest and largest GDP per capita.
gdpbins = np.logspace(np.log10(gdpr[0]), np.log10(gdpr[1]), 25)

# seaborn JointGrid: one joint axes plus a marginal axes for each variable.
g = sns.JointGrid(data=recent, x="seasonal_variability", y="gdp_per_capita", ylim=gdpr)
g.ax_marg_x.hist(recent['seasonal_variability'], range=svr)
g.ax_marg_y.hist(recent['gdp_per_capita'], bins=gdpbins, range=gdpr,
                 orientation="horizontal")
g.plot_joint(plt.hexbin, gridsize=25)
ax = g.ax_joint
# ax.set_yscale('log')
g.fig.set_figwidth(9)
g.fig.set_figheight(8)
Correlation
相关性度量的是两个变量之间*线性*关系的强度。我们可以使用相关性来识别变量。
# Correlation of every other variable with GDP per capita.
# Only numeric/bool columns are correlated: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead.
recent_corr = (
    recent.select_dtypes(include=['number', 'bool'])
    .corr()
    .loc['gdp_per_capita']
    .drop(['gdp', 'gdp_per_capita'])
)
# Reusable template: many-to-one correlation coefficients as a bar chart.
def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13, 24),
                    xlabel=None, by=None, ylabel=None, title=None):
    """Draw a horizontal bar chart of ``series`` with optional per-bar colors.

    Args:
        series: Values to plot; ``series.values`` gives the bar lengths and
            ``series.index`` the tick labels, one bar per entry.
        bar_colors: Color or sequence of colors passed to ``barh``. The first
            color of the active property cycle is used when omitted or empty.
        color_labels: Optional mapping of color -> legend text. One square
            marker per entry is shown in a legend in the upper right corner.
        figsize: Figure size in inches.
        xlabel: Horizontal axis label (empty when not given).
        by: Unused; kept so existing callers keep working.
        ylabel: Vertical axis label (empty when not given).
        title: Axes title (empty when not given).

    Returns:
        The figure. It is closed in pyplot before being returned, so pyplot
        does not keep it as the current figure.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Compare against None / length instead of truthiness: an array of colors
    # has no unambiguous truth value and would raise ValueError.
    if bar_colors is None or len(bar_colors) == 0:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]

    positions = range(len(series))
    ax.barh(positions, series.values, color=bar_colors)
    ax.set_xlabel(xlabel if xlabel else '')
    ax.set_ylabel(ylabel if ylabel else '')
    ax.set_yticks(positions)
    ax.set_yticklabels(series.index.tolist())
    ax.set_title(title if title else '')
    ax.set_ylim([-1, len(series)])

    if color_labels:
        # Empty plots act as legend proxies: they draw nothing but provide a
        # square marker in the requested color for each legend entry.
        handles = []
        texts = []
        for col, lab in color_labels.items():
            handles.extend(ax.plot([], linestyle='', marker='s', c=col, label=lab))
            texts.append(lab)
        ax.legend(handles, texts, loc='upper right')

    # Close exactly the figure created here rather than "the current figure".
    plt.close(fig)
    return fig
# Color encodes the sign of the correlation, bar length its magnitude.
color_labels = {'#0055A7':'Negative correlation', '#2C3E4F':'Positive correlation'}
bar_colors = ['#0055A7' if value < 0 else '#2C3E4F' for value in recent_corr.values]
conditional_bar(
    recent_corr.abs(),
    bar_colors,
    color_labels,
    xlabel='|Correlation|',
    title='Magnitude of correlation with GDP per capita, 2013-2017',
)
Target: GDP per capita, binned
While correlation is useful for assessing relationships, it is limited to only linear relationships. As we have seen though, there seem to be many non-linear relationships. We could assess correlation for log transformed variables (see the extras below), but we still may have non-linear relationships. One way to address this is to bin variables into categories and look at the distribution of other variables for each category.
# Histograms of GDP per capita. The second call additionally passes
# logx=True and bins=25.
plot_hist(recent, 'gdp_per_capita',
          title='Distribution of GDP per capita, 2013-2017',
          xlabel='GDP per capita ($)',
          ylabel='Number of countries');
plot_hist(recent, 'gdp_per_capita',
          title='Distribution of log GDP per capita, 2013-2017',
          xlabel='GDP per capita ($)',
          ylabel='Number of countries',
          logx=True, bins=25);
# Box plot of drinking-water access for each GDP-per-capita bin.
recent[['gdp_bin', 'total_pop_access_drinking']].boxplot(by='gdp_bin');
plt.title('Distribution of percent of total population with access to drinking water across gdp per capita categories');
plt.xlabel('GDP per capita quintile');
# The plotted column is the share of the population with access to drinking
# water (see the title), not the country's total population.
plt.ylabel('Population with access to drinking water (% of total)');