Machine learning demo

# Dictionary feature extraction with DictVectorizer
from sklearn.feature_extraction import DictVectorizer

def dict_demo():
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
    # Instantiate a transformer (sparse=False returns a dense ndarray)
    transfer = DictVectorizer(sparse=False)
    # Call fit_transform
    data = transfer.fit_transform(data)
    print('Transformed result:\n', data)
    # Note: scikit-learn >= 1.0 renames this to get_feature_names_out()
    print('Feature names:\n', transfer.get_feature_names())
    return None
dict_demo()
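# For reference: with sparse=False the result above is a dense array whose
# columns follow the sorted feature names (city=上海, city=北京, city=深圳,
# temperature). A minimal sketch of the default sparse=True behaviour, which
# is the memory-friendly choice when the one-hot vocabulary is large:
def dict_sparse_demo():
    data = [{'city': '北京', 'temperature': 100},
            {'city': '上海', 'temperature': 60},
            {'city': '深圳', 'temperature': 30}]
    transfer = DictVectorizer()          # sparse=True is the default
    result = transfer.fit_transform(data)
    print(result)                        # coordinate listing of non-zero entries
    print(result.toarray())              # densify only when a full matrix is needed
dict_sparse_demo()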
# Text feature extraction
from sklearn.feature_extraction.text import CountVectorizer

def text_count_demo():
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # Instantiate a transformer
    transfer = CountVectorizer()
    data = transfer.fit_transform(data)
    print("Text feature extraction result:\n", data.toarray())
    # Note: scikit-learn >= 1.0 renames this to get_feature_names_out()
    print("Feature names:\n", transfer.get_feature_names())
    return None
text_count_demo()
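# One detail behind the counts above: CountVectorizer's default token pattern
# only keeps tokens of two or more word characters, so the single-letter "i"
# never enters the vocabulary, while "like" is counted twice in sentence one.
# A small sketch that makes the vocabulary and count matrix explicit:
def vocab_demo():
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    cv = CountVectorizer()
    counts = cv.fit_transform(data)
    # vocabulary_ maps each word to its column index; sorted keys match column order
    print(sorted(cv.vocabulary_.keys()))
    # ['dislike', 'is', 'life', 'like', 'long', 'python', 'short', 'too']
    print(counts.toarray())
    # [[0 1 1 2 0 1 1 0]
    #  [1 1 1 0 1 1 0 1]]
vocab_demo()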
# Feature preprocessing: normalization
# Rescales the raw data into a given range (default [0, 1])
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def minmax_demo():
    data = pd.read_csv("dating.txt")
    print(data)
    transfer = MinMaxScaler(feature_range=(2, 3))    # map the data into [2, 3]
    data = transfer.fit_transform(data[['milage', 'Liters', 'Consumtime']])
    print("Min-max normalization result:\n", data)
    return None
minmax_demo()
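# What MinMaxScaler computes, written out: X' = (X - X_min) / (X_max - X_min),
# then X'' = X' * (mx - mi) + mi for feature_range=(mi, mx). A minimal numpy
# check of that formula against the scaler, using made-up numbers:
import numpy as np
def minmax_formula_demo():
    X = np.array([[90.0], [60.0], [75.0]])   # made-up single-column data
    mi, mx = 2, 3
    scaled = MinMaxScaler(feature_range=(mi, mx)).fit_transform(X)
    manual = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) * (mx - mi) + mi
    print(np.allclose(scaled, manual))       # True: same result by hand
minmax_formula_demo()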
# Feature preprocessing: standardization
# Transforms the raw data so each feature has mean 0 and standard deviation 1
import pandas as pd
from sklearn.preprocessing import StandardScaler

def stand_demo():
    data = pd.read_csv('dating.txt')
    print(data)
    transfer = StandardScaler()
    data = transfer.fit_transform(data[['milage', 'Liters', 'Consumtime']])
    print("Standardization result:\n", data)
    print('Mean of each feature column:\n', transfer.mean_)
    print('Variance of each feature column:\n', transfer.var_)
    return None
stand_demo()
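# Standardization written out: x' = (x - mean) / std per column, using the
# population standard deviation (the same convention as numpy's default).
# A quick sketch confirming it with made-up numbers:
import numpy as np
def standard_formula_demo():
    X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])  # made-up data
    scaled = StandardScaler().fit_transform(X)
    manual = (X - X.mean(axis=0)) / X.std(axis=0)
    print(np.allclose(scaled, manual))  # True: same result by hand
standard_formula_demo()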
# k-nearest neighbors
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# Load the dataset
iris = load_iris()
# x_train, x_test, y_train, y_test: training features, test features, training targets, test targets
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
# Feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# Instantiate the estimator
estimator = KNeighborsClassifier(n_neighbors=9)
estimator.fit(x_train, y_train)
# Model evaluation
# Method 1: compare predictions against the true values
y_predict = estimator.predict(x_test)
print("Predictions:\n", y_predict)
print("Predictions == true values:\n", y_predict == y_test)
# Method 2: compute the accuracy directly
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
# Model selection and tuning
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# 1. Get the dataset
iris = load_iris()
# 2. Split the dataset
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=22)
# 3. Feature engineering: standardization
# Instantiate a transformer
transfer = StandardScaler()
# Call fit_transform
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. KNN estimator pipeline
#     1) Instantiate the estimator
estimator = KNeighborsClassifier()
# 5. Model selection and tuning: grid search with cross-validation
# Hyperparameters to search over
param_dict = {'n_neighbors': [1, 3, 5]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
#     2) Fit the training data
estimator.fit(x_train, y_train)
# 6. Evaluate the model
# Method a: compare predictions against the true values
y_predict = estimator.predict(x_test)
print("Predictions == true values:\n", y_predict == y_test)
# Method b: compute the accuracy directly
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
print("Best cross-validation score:\n", estimator.best_score_)
print("Best estimator:\n", estimator.best_estimator_)
print("Cross-validation results per candidate:\n", estimator.cv_results_)
# Naive Bayes
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Fetch the news data, 20 categories
news = fetch_20newsgroups(subset='all')
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3)

# Extract features from the text data
tf = TfidfVectorizer()
x_train = tf.fit_transform(x_train)
# This prints the list of all distinct words in the training set
# (get_feature_names_out() in scikit-learn >= 1.0)
print(tf.get_feature_names())
# The test set must reuse the vocabulary learned on the training set,
# so call transform here, not fit_transform
x_test = tf.transform(x_test)

# Estimator pipeline
mlb = MultinomialNB(alpha=1.0)
mlb.fit(x_train, y_train)

# Predict
y_predict = mlb.predict(x_test)

print("Predicted category of each article:", y_predict[:100])
print("True categories:", y_test[:100])
print("Accuracy:", mlb.score(x_test, y_test))
# Decision tree
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
x = titanic[['pclass', 'age', 'sex']].copy()  # copy so fillna below does not hit a view
y = titanic['survived']

# Handle missing values, then run dictionary feature extraction on the categorical features
x['age'].fillna(x['age'].mean(), inplace=True)

# Convert x into a list of dicts with x.to_dict(orient="records"), e.g.
# [{"pclass": "1st", "age": 29.00, "sex": "female"}, ...]
dv = DictVectorizer(sparse=False)

x = dv.fit_transform(x.to_dict(orient='records'))
print(dv.get_feature_names())
print(x)

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Build and evaluate the decision tree
# Limit the tree depth to 5
dc = DecisionTreeClassifier(criterion='entropy', max_depth=5)

dc.fit(x_train, y_train)

print("Accuracy:", dc.score(x_test, y_test))

# Visualization: export a dot file
export_graphviz(dc, out_file="./tree.dot",
                feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male'])
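# The dot file is plain text; rendering it to an image needs Graphviz. One way,
# assuming the graphviz Python package and the Graphviz binaries are installed:
import graphviz
with open("./tree.dot") as f:
    graph = graphviz.Source(f.read())
graph.render("tree", format="png")  # writes tree.png next to tree.dot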
# Random forest
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
x = titanic[['pclass', 'age', 'sex']].copy()  # copy so fillna below does not hit a view
y = titanic['survived']

# Handle missing values, then run dictionary feature extraction on the categorical features
x['age'].fillna(x['age'].mean(), inplace=True)

# Convert x into a list of dicts with x.to_dict(orient="records"), e.g.
# [{"pclass": "1st", "age": 29.00, "sex": "female"}, ...]
dv = DictVectorizer(sparse=False)

x = dv.fit_transform(x.to_dict(orient='records'))
print(dv.get_feature_names())
print(x)

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Predict with a random forest
rf = RandomForestClassifier()

# Hyperparameter candidates
param = {"n_estimators": [120, 200, 300, 500, 800, 1200], "max_depth": [5, 8, 15, 25, 30]}
# Tune with grid search
gc = GridSearchCV(rf, param_grid=param, cv=2)
gc.fit(x_train, y_train)
print("Random forest accuracy:", gc.score(x_test, y_test))
# Two flavors of linear regression: the normal equation and gradient descent
# (load_boston was deprecated in scikit-learn 1.0 and removed in 1.2)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.metrics import mean_squared_error

def linear1():
    """
    Boston house-price regression, solving for the model parameters
    directly with the normal equation
    :return: None
    """
    # 1. Get the dataset
    boston = load_boston()
    print("boston:\n", boston.DESCR)
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=6)
    # 3. Feature engineering: standardization
    # 1) Instantiate a transformer
    transfer = StandardScaler()
    # 2) Call fit_transform
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4. Linear regression estimator pipeline
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print("House prices predicted via the normal equation:\n", y_predict)
    # 5. Inspect the model
    print("Coefficients from the normal equation:\n", estimator.coef_)
    print("Intercept from the normal equation:\n", estimator.intercept_)
    # 6. Model evaluation: mean squared error
    error = mean_squared_error(y_test, y_predict)
    print("MSE of the normal equation:\n", error)
    return None


def linear2():
    """
    Boston house-price regression, optimizing the model parameters
    with stochastic gradient descent
    :return: None
    """
    # 1. Get the dataset
    boston = load_boston()
    # print("boston:\n", boston)
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=6)
    # 3. Feature engineering: standardization
    # 1) Instantiate a transformer
    transfer = StandardScaler()
    # 2) Call fit_transform
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4. Linear regression estimator pipeline
    estimator = SGDRegressor()
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print("House prices predicted via gradient descent:\n", y_predict)
    # 5. Inspect the model
    print("Coefficients from gradient descent:\n", estimator.coef_)
    print("Intercept from gradient descent:\n", estimator.intercept_)
    # 6. Model evaluation: mean squared error
    error = mean_squared_error(y_test, y_predict)
    print("MSE of gradient descent:\n", error)
    return None
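# SGDRegressor's defaults work, but the learning-rate schedule is often worth
# controlling; a sketch of commonly tuned parameters (the values here are
# illustrative, not recommendations):
sgd = SGDRegressor(loss="squared_error",      # least-squares loss ("squared_loss" in scikit-learn < 1.0)
                   learning_rate="constant",  # fixed step size; the default "invscaling" decays it
                   eta0=0.01,                 # initial learning rate
                   max_iter=1000)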
# Improving linear regression: ridge regression
def linear3():
    """
    Boston house-price regression with ridge regression
    :return: None
    """
    # 1. Get the dataset
    boston = load_boston()
    # print("boston:\n", boston)
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=6)
    # 3. Feature engineering: standardization
    # 1) Instantiate a transformer
    transfer = StandardScaler()
    # 2) Call fit_transform
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4. Ridge regression estimator pipeline
    estimator = Ridge()
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print("House prices predicted via ridge regression:\n", y_predict)
    # 5. Inspect the model
    print("Coefficients from ridge regression:\n", estimator.coef_)
    print("Intercept from ridge regression:\n", estimator.intercept_)
    # 6. Model evaluation: mean squared error
    error = mean_squared_error(y_test, y_predict)
    print("MSE of ridge regression:\n", error)
    return None
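# Ridge's regularization strength alpha can also be cross-validated
# automatically with RidgeCV instead of being fixed; the candidate values
# below are illustrative:
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0])
# After ridge_cv.fit(x_train, y_train), ridge_cv.alpha_ holds the chosen value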
# Logistic regression and binary classification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def logisticregression():
    """
    Cancer prediction with logistic regression
    :return: None
    """
    # 1. Read the data, handle missing values, and standardize
    column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']

    data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                       names=column_name)

    # Drop rows with missing values ('?' marks a missing entry)
    data = data.replace(to_replace='?', value=np.nan)

    data = data.dropna()

    # Take the feature columns
    x = data[column_name[1:10]]

    y = data[column_name[10]]

    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Standardize
    std = StandardScaler()

    x_train = std.fit_transform(x_train)

    x_test = std.transform(x_test)

    # Fit a logistic regression
    lr = LogisticRegression()

    lr.fit(x_train, y_train)

    print("Learned weights:", lr.coef_)

    # Predict the classes
    print("Predicted classes:", lr.predict(x_test))

    # Accuracy
    print("Accuracy:", lr.score(x_test, y_test))
    return None
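# For a cancer screen, recall on the malignant class matters more than raw
# accuracy. A sketch that would go at the end of logisticregression(), reusing
# its local lr, x_test and y_test (in this dataset class 2 is benign, 4 malignant):
#     from sklearn.metrics import classification_report
#     print(classification_report(y_test, lr.predict(x_test),
#                                 labels=[2, 4],
#                                 target_names=['benign', 'malignant']))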
# Saving and loading models
import joblib  # in scikit-learn < 0.21 this was sklearn.externals.joblib

# Predict with a linear model, solved via the normal equation
# (x_train, y_train, x_test and the target scaler std_y are assumed to come
# from a Boston house-price pipeline like linear1 above, with std_y a
# StandardScaler fitted on the training targets)
lr = LinearRegression()
# Fit the model
lr.fit(x_train, y_train)
# Save the trained model
joblib.dump(lr, "test.pkl")

# Predict house prices with the saved model
model = joblib.load("test.pkl")
print("Prices predicted by the model loaded from file:", std_y.inverse_transform(model.predict(x_test)))