在熊猫中实施Apriori算法的最佳方法是什么?到目前为止,我一直坚持转换使用for循环提取模式.从for循环开始的一切都不起作用.在熊猫中有没有矢量化的方法呢?
import pandas as pd
import numpy as np
trans=pd.read_table('output.txt',header=None,index_col=0)
def apriori(trans,support=4):
ts=pd.get_dummies(trans.unstack().dropna()).groupby(level=1).sum()
#user input
collen,rowlen =ts.shape
#max length of items
tssum=ts.sum(axis=1)
maxlen=tssum.loc[tssum.idxmax()]
items=list(ts.columns)
results=[]
#loop through items
for c in range(1,maxlen):
#generate patterns
pattern=[]
for n in len(pattern):
#calculate support
pattern=['supp']=pattern.sum/rowlen
#filter by support level
Condit=pattern['supp']> support
pattern=pattern[Condit]
results.append(pattern)
return results
results =apriori(trans)
print results
当我插入支持3时
a b c d e
0
11 1 1 1 0 0
666 1 0 0 1 1
10101 0 1 1 1 0
1010 1 1 1 1 0
414147 0 1 1 0 0
10101 1 1 0 1 0
1242 0 0 0 1 1
101 1 1 1 1 0
411 0 0 1 1 1
444 1 1 1 0 0
它应该输出类似的东西
Pattern support
a 6
b 7
c 7
d 7
e 3
a,b 5
a,c 4
a,d 4
最佳答案
假设我明白你所追求的是什么,也许吧
from itertools import combinations
def get_support(df):
pp = []
for cnum in range(1,len(df.columns)+1):
for cols in combinations(df,cnum):
s = df[list(cols)].all(axis=1).sum()
pp.append([",".join(cols),s])
sdf = pd.DataFrame(pp,columns=["Pattern","Support"])
return sdf
会让你开始:
>>> s = get_support(df)
>>> s[s.Support >= 3]
Pattern Support
0 a 6
1 b 7
2 c 7
3 d 7
4 e 3
5 a,b 5
6 a,c 4
7 a,d 4
9 b,c 6
10 b,d 4
12 c,d 4
14 d,e 3
15 a,b,c 4
16 a,d 3
21 b,c,d 3
[15 rows x 2 columns]