您的位置:首页 > 编程语言 > Python开发

《利用Python进行数据分析》笔记---第9章数据聚合与分组运算

2017-08-10 14:34 337 查看

写在前面的话:

实例中的所有数据都是在GitHub上下载的,打包下载即可。

地址是:http://github.com/pydata/pydata-book

还有一定要说明的:

我使用的是 Python 2.7,书中有一部分代码存在错误,下面的代码均已在我自己的 Python 2.7 环境下调试通过。

# coding: utf-8
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# Build a small demo frame: two key columns and two random numeric columns.
df = DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

# Group one column by the values of another column, then aggregate.
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()

# Two keys give a result indexed by (key1, key2); unstack pivots key2 out.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
means.unstack()

# Group keys may be arbitrary arrays of the right length, not only columns.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

# The numeric columns are selected explicitly: 'key2' holds strings and
# cannot be averaged (current pandas raises instead of dropping it).
df.groupby('key1')[['data1', 'data2']].mean()
df.groupby(['key1', 'key2']).mean()
df.groupby(['key1', 'key2']).size()

# Iterating over a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

# With several keys the group key is a tuple of the key values.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Formatting first keeps the output identical under Python 2 and 3.
    print('%s %s' % (k1, k2))
    print(group)

# Materialise the groups as a dict keyed by group name.
pieces = dict(list(df.groupby('key1')))
pieces['b']

df.dtypes
# Group the *columns* by their dtype.
# NOTE(review): axis=1 grouping is reported as deprecated by recent pandas
# releases.  Transposing is not a drop-in substitute here because it would
# discard the per-column dtypes -- confirm against the pandas version in use.
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))

# Indexing a GroupBy object with a column name (or a list of names) restricts
# what is aggregated to those columns.
df.groupby('key1')['data1']
df.groupby('key1')[['data1']]
# Mean of data2 only, for every (key1, key2) combination.
df.groupby(['key1','key2'])[['data2']].mean()
# Same selection, using a bare column name instead of a one-element list.
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
s_grouped.mean()

people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out columns b and c of the row at position 2 ('Wes').  The row is
# chosen by position and the columns by label; this is spelled with .loc
# because the mixed-mode .ix indexer no longer exists in current pandas.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

# Column -> group mapping; 'f' matches no column and is simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
# Group the columns by transposing first and transposing the result back,
# instead of relying on groupby(..., axis=1).
by_column = people.T.groupby(mapping)
by_column.sum().T

# A Series can serve as the mapping just like the dict.
map_series = Series(mapping)
map_series
people.T.groupby(map_series).count().T

# A function can be passed as the group key; here it is len, so rows are
# grouped by the length of their index label (the person's name).
people.groupby(len).sum()
# Functions can be mixed with lists in a single list of keys.
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()

# Two-level column index: country ('cty') and tenor.
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

# Count the columns under each level value.  The column groups are formed on
# the transposed frame and the result is transposed back, which avoids
# groupby(..., axis=1).
hier_df.T.groupby(level='cty').count().T
hier_df.T.groupby(level='tenor').count().T
hier_df.T.groupby(level=['cty', 'tenor']).count().T

df
grouped = df.groupby('key1')
# 90th percentile of data1 within each key1 group.  Written as a plain
# expression: a trailing comma here would wrap the result in a one-element
# tuple.
grouped['data1'].quantile(0.9)
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum minus its minimum.

    ``arr`` may be any object that provides ``max()`` and ``min()`` methods,
    such as a NumPy array or a pandas Series.
    """
    return arr.max() - arr.min()
# Aggregate with a user-defined function.  Only the numeric columns are
# selected because peak_to_peak subtracts values, which the string column
# 'key2' does not support.
grouped[['data1', 'data2']].agg(peak_to_peak)
grouped.describe()
# Raw string: the backslashes of the Windows path must be taken literally
# rather than read as escape sequences.
tips = pd.read_csv(r'D:\Source Code\pydata-book-master\ch08\tips.csv')
# Tip as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
# Several aggregations at once: names of reductions and callables.
grouped_pct.agg(['mean', 'std', peak_to_peak])
# (name, function) pairs choose the labels of the result columns.
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

functions = ['count', 'mean', 'max']
# Several columns are selected with a list; a bare tuple of column names is
# not accepted by current pandas.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
result['tip_pct']

ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped[['tip_pct', 'total_bill']].agg(ftuples)

# A dict maps each column to its own aggregation(s).
grouped.agg({'tip': np.max, 'size': sum})
grouped.agg({'tip': ['min', 'max', 'mean', 'std'], 'size': sum})

# as_index=False is passed so the group keys come back as ordinary columns.
# The mean is restricted to the numeric, non-key columns so that text
# columns cannot break the aggregation.
numeric_cols = [col for col in tips.select_dtypes(include=[np.number]).columns
                if col not in ('sex', 'smoker')]
tips.groupby(['sex', 'smoker'], as_index=False)[numeric_cols].mean()

df
# Per-key1 means of the numeric columns, renamed with a 'mean_' prefix.  The
# columns are listed explicitly because the string column 'key2' cannot be
# averaged.
k1_means = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
k1_means
# Attach the group means to every row by joining on the key1 value.
pd.merge(df, k1_means, left_on='key1', right_index=True)

# Rebuild the demo frame: five people (rows) by five random-valued columns.
people = DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)
people
# One group label per row, in row order.
key = ['one', 'two', 'one', 'two', 'one']
# mean() aggregates each group ...
people.groupby(key).mean()
# ... while transform() is handed np.mean to apply group by group.
people.groupby(key).transform(np.mean)
def demean(arr):
    """Return *arr* with its own mean subtracted from every element."""
    return arr - arr.mean()
# Apply demean to each group, i.e. subtract the group's own mean.
demeaned = people.groupby(key).transform(demean)
demeaned
# Check: group means of the demeaned data (expected to be ~0 up to
# floating-point error).
demeaned.groupby(key).mean()

def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` holding the largest ``column`` values.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Name of the column to rank by.

    Returns
    -------
    DataFrame
        The selected rows in ascending order of ``column`` (largest last).
    """
    # DataFrame.sort_values replaced sort_index(by=...); fall back to the old
    # spelling only on a pandas release that predates it.
    if hasattr(df, 'sort_values'):
        ordered = df.sort_values(by=column)
    else:
        ordered = df.sort_index(by=column)
    # tail(n) rather than [-n:], because [-0:] would return every row.
    return ordered.tail(n)
# The six rows of tips with the highest tip_pct.
top(tips,n = 6)
# Run top separately on each smoker group.
tips.groupby('smoker').apply(top)
# Arguments given after the function are forwarded to it.
tips.groupby(['smoker','day']).apply(top,n = 1,column = 'total_bill')
# Summary statistics of tip_pct per smoker group.
result = tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')
# Calling describe through apply, on one column and on the whole frame.
f = lambda x : x.describe()
tips.groupby('smoker')['tip_pct'].apply(f)
tips.groupby('smoker').apply(f)
# Same as the earlier apply(top) call, but with group_keys switched off.
tips.groupby('smoker',group_keys = False).apply(top)

# Two columns of 1000 random normal draws each.
frame = DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000),
})
frame.head()
# Cut data1 into four bins; every row is labelled with the bin it falls in.
factor = pd.cut(frame['data1'], bins=4)
factor[:10]
def get_stats(group):
    """Summarise *group* as a dict with its min, max, count and mean.

    ``group`` must provide ``min()``, ``max()``, ``count()`` and ``mean()``
    methods, as a pandas Series does.
    """
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
# Statistics of data2 within each bin of data1.
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats)
# unstack moves the statistic names out into columns.
grouped.apply(get_stats).unstack()
# Bin with pd.qcut instead of pd.cut.  The first result is immediately
# replaced by the labels = False variant on the following line.
grouping = pd.qcut(frame.data1,10)
grouping = pd.qcut(frame.data1,10,labels = False)
grouping
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# Demo data for a per-group weighted average.  The weights are drawn from
# [0, 1) so they are never negative; normally distributed weights can be
# negative and push the "average" outside the range of the data.
df = DataFrame({
    'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
df
grouped = df.groupby('category')


def get_wavg(g):
    """Return the average of ``g['data']`` weighted by ``g['weights']``."""
    return np.average(g['data'], weights=g['weights'])


grouped.apply(get_wavg)

# Raw string: the backslashes of the Windows path must be taken literally
# rather than read as escape sequences.
close_px = pd.read_csv(r'D:\Source Code\pydata-book-master\ch09\stock_px.csv',
                       parse_dates=True, index_col=0)
close_px
close_px[-4:]
# Percent change from one row to the next; rows containing NaN are dropped.
rets = close_px.pct_change().dropna()
# Correlation of every column with the 'SPX' column.
spx_corr = lambda x: x.corrwith(x['SPX'])
# Group the rows by the .year attribute of their index value.
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)
# Correlation between two individual columns, group by group.
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
import statsmodels.api as sm
def regress(data, yvax, xvars):
    """Fit an ordinary-least-squares model and return its coefficients.

    Parameters
    ----------
    data : DataFrame
        Observations, one per row.
    yvax : str
        Name of the dependent-variable column.
    xvars : list of str
        Names of the explanatory columns.

    Returns
    -------
    The ``params`` attribute of the fitted ``sm.OLS`` result; it includes an
    entry for the added ``'intercept'`` column.
    """
    Y = data[yvax]
    # Work on an explicit copy so that adding the constant column never
    # writes into (or warns about) a selection taken from the caller's frame.
    X = data[xvars].copy()
    X['intercept'] = 1
    result = sm.OLS(Y, X).fit()
    return result.params
# Run regress on every group of by_year, with 'AAPL' as yvax and ['SPX'] as xvars.
by_year.apply(regress,'AAPL',['SPX'])

# Raw string: the backslashes of the Windows path must be taken literally
# rather than read as escape sequences.
fec = pd.read_csv(r'D:\Source Code\pydata-book-master\ch09\P00000001-ALL.csv')
fec
# One sample record, looked up by index label.  .loc is used because the .ix
# indexer no longer exists in current pandas.
fec.loc[123456]
# Distinct candidate names found in the cand_nm column.
unique_cands = fec.cand_nm.unique()
unique_cands
unique_cands[2]
# Candidate name -> party label.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Obama, Barack': 'Democrat',
    'Paul, Ron': 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Perry, Rick': 'Republican',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Romney, Mitt': 'Republican',
    'Santorum, Rick': 'Republican',
}
# Map a few sample candidate names to parties, then the whole column.
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
# Count positive vs. non-positive receipt amounts, then keep only the
# positive ones.
(fec.contb_receipt_amt > 0).value_counts()
fec = fec[fec.contb_receipt_amt >0]
# Subset for the two named candidates.  It is taken at this point, i.e.
# before the occupation/employer clean-up further down the script.
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec_mrbo
# First ten entries of the occupation value counts.
fec.contbr_occupation.value_counts()[:10]
# Collapse variant spellings of an occupation into a single label.
occ_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATION REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O':'CEO'
}
# Values without an entry in the mapping are returned unchanged.
f = lambda x:occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
# The same normalisation for the employer column.
emp_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED'
}
# f is rebound here so that it looks values up in emp_mapping.
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
# Total contribution amount per occupation (rows) and party (columns).
# pivot_table takes index= / columns=; the older rows= / cols= keywords are
# not accepted by current pandas.
by_occupation = fec.pivot_table('contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party',
                                aggfunc='sum')
by_occupation.head()
# Keep the occupations whose total across parties exceeds 2,000,000.
over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
over_2mm
over_2mm.plot(kind='barh')
def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest contribution totals of ``group`` by ``key``.

    Parameters
    ----------
    group : DataFrame
        Rows to summarise; must contain a ``'contb_receipt_amt'`` column.
    key : str or list
        Column name(s) to group by before summing.
    n : int, default 5
        Number of totals to return.

    Returns
    -------
    Series
        Summed ``'contb_receipt_amt'`` per ``key`` value, largest first,
        truncated to the first ``n`` entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # Series.sort_values replaced Series.order; fall back to the old name only
    # on a pandas release that predates the new one.
    if hasattr(totals, 'sort_values'):
        ranked = totals.sort_values(ascending=False)
    else:
        ranked = totals.order(ascending=False)
    return ranked[:n]
grouped = fec_mrbo.groupby('cand_nm')
# Largest occupation totals per candidate.  Written as a plain expression: a
# trailing ",'\n'" would only wrap the result in a throwaway tuple.
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
# Full table of totals for every (candidate, occupation) pair.
fec_mrbo.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
# Largest employer totals per candidate.
grouped.apply(get_top_amounts, 'contbr_employer', n=10)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 数据分析