-----(3) Group-wise operations and transformations

# df is the key1/key2/data1/data2 frame built earlier in these notes
df
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
# attach the per-group means back onto the original rows
pd.merge(df, k1_means, left_on='key1', right_index=True)
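The df above is whatever frame was built earlier in these notes; a minimal sketch of such a frame (column names and values assumed) so the merge can be run on its own. The numeric columns are selected explicitly, which newer pandas requires for mean():

import numpy as np
import pandas as pd
from pandas import DataFrame

# hypothetical stand-in for the df used earlier in these notes
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})

# group means of the numeric columns, prefixed and merged back onto every row
k1_means = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
pd.merge(df, k1_means, left_on='key1', right_index=True)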
Using the transform method

people = DataFrame([[1, 2, 3, 4, 28],
                    [6, 7, 80, 9, 10],
                    [11, 100, 13, 14, 15],
                    [16, 17, 18, 19, 12],
                    [21, 22, 23, 24, 25]],
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
key = ['one', 'two', 'one', 'two', 'one']   # one group label per row
people.groupby(key).max()
people.groupby(key).transform(np.max)       # same values, broadcast back to the original shape

def demean(arr):
    # despite the name, this subtracts the group maximum, so each group's max becomes 0
    return arr - arr.max()

demeaned = people.groupby(key).transform(demean)
demeaned
demeaned.groupby(key).max()   # verify: all zeros

-----1. apply: a general "split-apply-combine"

Select the top five tip_pct values within each group:

def top(df, n=5, column='tip_pct'):
    return df.sort_index(by=column)[-n:]   # old pandas API; sort_values(by=column) in current versions

top(tips, n=6)   # tips is the restaurant tipping dataset loaded earlier
tips.groupby('smoker').apply(top)
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
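sort_index(by=...) was removed from later pandas releases; a small sketch of the same top-n-per-group idea with the current API, on an assumed stand-in for the tips data (values illustrative only):

from pandas import DataFrame

# assumed stand-in for the tips dataset
tips = DataFrame({'smoker': ['Yes', 'No', 'No', 'Yes', 'No', 'Yes'],
                  'total_bill': [16.99, 10.34, 21.01, 23.68, 24.59, 25.29],
                  'tip_pct': [0.059, 0.160, 0.166, 0.139, 0.146, 0.186]})

def top_n(df, n=2, column='tip_pct'):
    # sort_values(by=...) is the current spelling of sort_index(by=...)
    return df.sort_values(by=column).tail(n)

tips.groupby('smoker').apply(top_n)
# nlargest reaches the same rows in one step (sorted largest-first)
tips.groupby('smoker', group_keys=False).apply(lambda g: g.nlargest(2, 'tip_pct'))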
The describe method

result = tips.groupby('smoker')['tip_pct'].describe()
result.unstack('smoker')
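describe returns a fixed set of summary statistics; roughly the same table can be requested explicitly with agg. A sketch, assuming the same tips frame:

# pick the statistics yourself instead of taking describe's defaults
tips.groupby('smoker')['tip_pct'].agg(['count', 'mean', 'std', 'min', 'max'])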
Suppressing the group keys

tips.groupby('smoker', group_keys=False).apply(top)

-----2. Quantile and bucket analysis

frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)   # four equal-width buckets
factor[:10]

def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

grouping = pd.qcut(frame.data1, 10, labels=False)   # ten equal-size buckets, labelled 0-9
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

-----3. Example: filling missing values with group-specific values

s = Series(np.random.randn(6))
s[::2] = np.nan
s.fillna(s.mean())

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Idaho']] = np.nan
data
data.groupby(group_key).mean()

Fill the NA values with the group means:

fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
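The same fill can be written without apply by letting transform broadcast the group means back to the original index; a sketch using the data and group_key defined above:

# one mean per row, aligned with data's index, then used to patch the holes
group_means = data.groupby(group_key).transform('mean')
data.fillna(group_means)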
Predefine a fill value for each group in the code:

fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])   # inside apply, g.name holds the group key
data.groupby(group_key).apply(fill_func)
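The same effect can be had without apply by mapping each index label to its group's fill value; a sketch that assumes the data, group_key and fill_values objects above:

# one fill value per row, built from the group labels, passed straight to fillna
fills = Series(group_key, index=data.index).map(fill_values)
data.fillna(fills)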
-----4. Example: random sampling and permutation

# Hearts (H), Spades (S), Clubs (C), Diamonds (D)
suits = ['H', 'S', 'C', 'D']
card_val = (range(1, 11) + [10] * 3) * 4            # Python 2 style: range returns a list
base_names = ['A'] + range(2, 11) + ['J', 'Q', 'K']
cards = []
for suit in suits:                                  # all four suits, 52 cards in total
    cards.extend(str(num) + suit for num in base_names)

deck = Series(card_val, index=cards)

def draw(deck, n=5):
    # shuffle the positions and keep the first n
    return deck.take(np.random.permutation(len(deck))[:n])

draw(deck)
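Newer pandas can do the shuffling in one call with Series.sample; a sketch assuming the deck built above:

deck.sample(5)        # draw five cards
deck.sample(frac=1)   # a full random permutation of the deck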
Draw two cards at random from each suit:

get_suit = lambda card: card[-1]   # the last character of the label is the suit
deck.groupby(get_suit).apply(draw, n=2)
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

-----5. Group-wise weighted averages and correlations

df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})   # non-negative weights
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)

--

close_px = pd.read_csv(r'ch09\stock_px.csv', parse_dates=True, index_col=0)
close_px
close_px[-4:]
rets = close_px.pct_change().dropna()       # daily returns
spx_corr = lambda x: x.corrwith(x['SPX'])   # correlation of each column with SPX
by_year = rets.groupby(lambda x: x.year)    # group by the year of each date in the index
by_year.apply(spx_corr)
# annual correlation between Apple and Microsoft
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

-----6. Group-wise linear regression (ordinary least squares)

import statsmodels.api as sm

def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.   # add a constant column for the intercept
    result = sm.OLS(Y, X).fit()
    return result.params

by_year.apply(regress, 'AAPL', ['SPX'])
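A self-contained sketch of the same per-group OLS idea on made-up data, using numpy's least-squares solver so it runs without the stock file or statsmodels; regress_np, demo and all values here are hypothetical illustrations, not the original data:

import numpy as np
from pandas import DataFrame, Series

# hypothetical returns: two "years" of SPX/AAPL pairs with different slopes
np.random.seed(0)
demo = DataFrame({'year': [2010] * 50 + [2011] * 50,
                  'SPX': np.random.randn(100)})
demo['AAPL'] = np.where(demo['year'] == 2010, 1.2, 0.8) * demo['SPX'] + 0.01 \
               + 0.1 * np.random.randn(100)

def regress_np(g, yvar, xvars):
    # least squares with an explicit intercept column, mirroring regress() above
    X = np.column_stack([g[xvars].values, np.ones(len(g))])
    coefs, _, _, _ = np.linalg.lstsq(X, g[yvar].values, rcond=None)
    return Series(coefs, index=list(xvars) + ['intercept'])

demo.groupby('year').apply(regress_np, 'AAPL', ['SPX'])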
Source: https://www.f2er.com/javaschema/284860.html