2020秋-作业存档-数据挖掘-关联规则挖掘实验
程序员文章站
2024-02-12 12:19:22
...
实验报告被我误删了,存档一下代码。
使用的数据集是Groceries数据集,具体资料可在CSDN上找到。这里使用的是已经处理成csv格式的Groceries数据集(网盘提取码:6p8c;原始下载链接在转载过程中丢失)。
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori
from tools import deal
from tools import read_csv
import datetime  # used to time the mining run

# Load the transactions. tools.read_csv returns one list per CSV line
# with the leading index column already stripped.
# Raw string: a plain 'D:\DataMining\...' relies on \D and \G not being
# escape sequences, which is deprecated and fragile.
groceries = read_csv(r'D:\DataMining\Groceries.csv')
gro_df = pd.DataFrame(groceries)

# Rows of unequal length were padded with NaN by the DataFrame
# constructor; deal() drops the NaNs and returns a plain item list.
dataset = gro_df.apply(deal, axis=1).tolist()

starttime = datetime.datetime.now()

# TransactionEncoder one-hot encodes the transactions into the boolean
# DataFrame that mlxtend's apriori implementation requires (one column
# per distinct item, True where the transaction contains it).
te = TransactionEncoder()
groceries_tf = te.fit_transform(dataset)
groce = pd.DataFrame(groceries_tf, columns=te.columns_)

# Mine frequent itemsets. use_colnames=True keeps item names in the
# result instead of bare column indices.
frequent_itemsets = apriori(groce, min_support=0.01, use_colnames=True)
frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)
# len(df) counts data rows only -- the header is not a row of a
# DataFrame, so the original "size - 1" under-counted by one.
print(len(frequent_itemsets))
frequent_itemsets.to_csv('itemsets.csv')

# Derive association rules from the frequent itemsets.
# metric may be 'support', 'confidence', 'lift', 'leverage' or
# 'conviction'; min_threshold filters candidate rules on that metric.
# Full metric definitions: https://github.com/rasbt/mlxtend
association_rule = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

# leverage(A->C) = support(A->C) - support(A)*support(C); sorting by it
# surfaces rules whose co-occurrence most exceeds independence.
association_rule.sort_values(by='leverage', ascending=False, inplace=True)
rule = pd.DataFrame(association_rule)

endtime = datetime.datetime.now()
print(endtime - starttime)
rule.to_csv('rule.csv')
其中,脚本开头 `from tools import ...` 引用的工具模块(tools.py)代码如下:
def deal(data):
    # Strip the NaN padding from one DataFrame row and return the
    # remaining items as a plain Python list.
    cleaned = data.dropna()
    return list(cleaned)
def read_csv(file_name):
    """Read a transactions CSV and return a list of item lists.

    Each line has the form ``<index>,<item>,<item>,...``; the leading
    index column is dropped. Blank lines (e.g. the trailing newline at
    end of file) are skipped, so no empty transactions are produced.
    """
    final_list = list()
    # Context manager guarantees the handle is closed even on error;
    # the original opened the file and never closed it.
    with open(file_name, 'r') as f:
        for row in f.read().split('\n'):
            if not row:
                # ''.split(',')[1:] would yield a bogus empty transaction.
                continue
            final_list.append(row.split(',')[1:])
    return final_list
上一篇: 统一QML与C++互调方式
下一篇: PHP施用DES进行加密和解密