您现在的位置是: 首页


程序员文章站 2024-02-12 12:19:22

本文使用的数据集是 Groceries 数据集,关于该数据集的详细资料可以参考 CSDN 上的介绍。这里上传的是已经处理成 csv 格式的版本(Groceries 数据集下载链接的提取码是 6p8c)。

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori
from tools import deal
from tools import read_csv
import datetime #记录时间

# Load the pre-processed Groceries transactions. The raw string keeps the
# backslashes of the Windows path literal instead of relying on the invalid
# escape sequences "\D" and "\G".
groceries = read_csv(r'D:\DataMining\Groceries.csv')
# Wrap the parsed rows in a DataFrame so they can be cleaned row by row
# before being handed to mlxtend.
gro_df = pd.DataFrame(groceries)

# Apply `deal` to every row and collect one Python list per transaction.
dataset = gro_df.apply(deal, axis=1).tolist()

# Timing starts here, so it covers encoding and mining but not file loading.
starttime = datetime.datetime.now()

# Encode the transactions as one column per item.
te = TransactionEncoder()
groceries_tf = te.fit_transform(dataset)
groce = pd.DataFrame(groceries_tf, columns=te.columns_)

# use_colnames=True reports itemsets by item name; the default (False)
# reports them by column index instead.
frequent_itemsets = apriori(groce, min_support=0.01, use_colnames=True)
# Frequent itemsets can be ordered by support.
frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)
# Number of frequent itemsets. The header is not a DataFrame row, so the row
# count is already the answer and nothing must be subtracted from it.
print(len(frequent_itemsets))

# `metric` accepts several measures; any metric column of the returned table
# can be used as the argument.
association_rule = association_rules(
    frequent_itemsets, metric='confidence', min_threshold=0.5)

# mlxtend on GitHub: https://github.com/rasbt/mlxtend/tree/master/mlxtend
#
# Excerpt from the mlxtend documentation of `association_rules`:
#
# metric : string (default: 'confidence')
#      Metric to evaluate if a rule is of interest.
#      **Automatically set to 'support' if `support_only=True`.**
#      Otherwise, supported metrics are 'support', 'confidence', 'lift',
#      'leverage', and 'conviction'
#      These metrics are computed as follows:
#      - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]
#      - confidence(A->C) = support(A+C) / support(A), range: [0, 1]
#      - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]
#      - leverage(A->C) = support(A->C) - support(A)*support(C),
#        range: [-1, 1]
#      - conviction = [1 - support(C)] / [1 - confidence(A->C)],
#        range: [0, inf]
#
# min_threshold : float (default: 0.8)
#      Minimal threshold for the evaluation metric,
#      via the `metric` parameter,
#      to decide whether a candidate rule is of interest.
# support_only : bool (default: False)
#      Only computes the rule support and fills the other
#      metric columns with NaNs. This is useful if:
#      a) the input DataFrame is incomplete, e.g., does
#      not contain support values for all rule antecedents
#      and consequents
#      b) you simply want to speed up the computation because
#      you don't need the other metrics.

# Association rules can be ordered by leverage.
association_rule.sort_values(by='leverage', ascending=False, inplace=True)
rule = pd.DataFrame(association_rule)
endtime = datetime.datetime.now()
# Elapsed wall-clock time since `starttime`.
print(endtime - starttime)
# The number of rules, if needed, is len(rule); no "- 1" is required there
# either.


def deal(data):
    """Return the non-missing entries of *data* as a plain Python list.

    *data* must provide ``dropna()`` whose result provides ``tolist()``;
    the script passes one DataFrame row at a time via
    ``DataFrame.apply(deal, axis=1)``.
    """
    present = data.dropna()
    return present.tolist()

def read_csv(file_name):
    """Read a comma-separated file into a list of rows.

    Each non-empty line of the file becomes one list of strings, obtained by
    splitting the line on ','. No quoting or escaping is interpreted.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one list of field strings per non-empty line, in file
        order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    final_list = []
    # The context manager guarantees the handle is closed, even on error.
    with open(file_name, 'r') as f:
        for line in f:
            row = line.rstrip('\r\n')
            # Skip blank lines (typically the one after the final newline) so
            # they do not turn into a bogus [''] transaction.
            if not row:
                continue
            final_list.append(row.split(','))
    return final_list
相关标签: 作业存档