-----(四)、透视表和交叉表
from pandas import DataFrame,Series
import numpy as np
import pandas as pd
cd L:\czz
tips=pd.read_csv(r'ch08\tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
根据sex和smoker计算分组平均数
tips.pivot_table(rows=['sex','smoker'])
tips.pivot_table(['tip_pct','size'],rows=['sex','day'],cols='smoker')
添加小计
tips.pivot_table(['tip_pct',cols='smoker',margins=True)
使用其他的聚合函数
tips.pivot_table('tip_pct','smoker'],cols='day',aggfunc=len,margins=True)
填充空值
tips.pivot_table('size',rows=['time','sex',aggfunc='sum',fill_value=0)
tips.pivot_table('size',aggfunc='sum')
-----交叉表:crosstab
data=DataFrame({'Sample':[1,2,3,4,5,6,7,8,9,10],
'Gender':['FeMale','Male','FeMale','FeMale'],
'Handedness':['Right-handed','Left-handed','Right-handed','Right-handed']
})
pd.crosstab(data.Gender,data.Handedness,margins=True)
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
---------------------------------------------------------------------------------------
-----(五)、2012联邦选举委员会数据库
**** fec=pd.read_csv(r'ch09\P00000001-ALL.csv')
fec.ix[123456]
获取全部的候选人名单
unique_cands=fec.cand_nm.unique()
unique_cands
利用字典说明党派关系
**** parties={
'Bachmann,Michelle':'Republican',
'Cain,Herman':'Republican',
'Gingrich,Newt':'Republican',
'Huntsman,Jon':'Republican',
'Johnson,Gary Earl':'Republican',
'McCotter,Thaddeus G':'Republican',
'Obama,Barack':'Democrat',
'Paul,Ron':'Republican',
'Pawlenty,Timothy':'Republican',
'Perry,Rick':'Republican',
"Roemer,Charles E. 'Buddy' III":'Republican',
'Romney,Mitt':'Republican',
'Santorum,
}
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
将其添加为一个新列
**** fec['party']=fec.cand_nm.map(parties)
**** fec['party'].value_counts()
该数据既包括赞助也包括退款(负的出资额)
(fec.contb_receipt_amt>0).value_counts()
限定该数据集只能有正的出资额
**** fec=fec[fec.contb_receipt_amt>0]
只包含Obama,Barack和Romney,Mitt的信息
**** fec_mrbo=fec[fec.cand_nm.isin(['Obama,Barack','Romney,Mitt'])]
-----1.根据职业和雇主统计赞助信息
fec.contbr_occupation.value_counts()[:10]
**** occ_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATION REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O.':'CEO'
}
#如果没有提供相关映射,则返回x
f=lambda x:occ_mapping.get(x,x)
fec.contbr_occupation=fec.contbr_occupation.map(f)
对雇主信息也进行同样的处理
**** emp_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED'
}
f=lambda x:emp_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)
通过pivot_table根据党派和职业对数据进行聚合,然后过滤掉总出资额不足200万美元的数据
**** by_occupation=fec.pivot_table('contb_receipt_amt',
rows='contbr_occupation',
cols='party',
aggfunc='sum'
)
over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm
做成柱状图
**** over_2mm.plot(kind='barh')
-----2.对Obama和Romney总出资额最高的企业和职业
****
def get_top_amounts(group,key,n=5):
totals=group.groupby(key)['contb_receipt_amt'].sum()
return totals.order(ascending=False)[:n]
grouped=fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
grouped.apply(get_top_amounts,'contbr_employer',n=10)
-----3.对出资额进行分组
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
bucket_sums=grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
两位候选人收到的各种捐赠额度的总额比例
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
normed_sums[:-2].plot(kind='barh',stacked=True)
-----4.根据州统计赞助信息
grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])
totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals=totals[totals.sum(1)>100000]
totals[:10]
总赞助额比例
percent=totals.div(totals.sum(1),axis=0)
percent[:10]
在地图上画出来?