sklearn数据 预处理
程序员文章站
2022-05-05 08:54:03
...
简单易懂的机器学习第三方库,使用前需要安装
pip install sklearn #提前安装,有点大
导入需要的包
#encoding=utf-8
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits,fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import jieba
import numpy as np
# Feature extraction demo.
# Instantiate a CountVectorizer; the actual fit_transform call is kept
# commented out here (the per-topic functions below do the real work).
vector = CountVectorizer()
# res = vector.fit_transform(['life_is_short,i_like_python','life_is'])
#字典向量化
def dictvec():
    """One-hot encode a list of record dicts with DictVectorizer.

    Each categorical field (city, pos) expands into one binary column per
    observed value (1 = present, 0 = absent); numeric fields (temperatue)
    pass through unchanged. Prints the learned feature names, the dense
    matrix, the first four feature names, and the first row decoded back
    into dict form. Returns None.
    """
    # Fixed: the original bound the encoder to the name `dict`, shadowing
    # the builtin. sparse=False yields a dense ndarray instead of a CSR matrix.
    vectorizer = DictVectorizer(sparse=False)
    records = [
        {'city': '北京', 'pos': '北方', 'temperatue': 100},
        {'city': '上海', 'pos': '东方', 'temperatue': 60},
        {'city': '深圳', 'pos': '西方', 'temperatue': 30},
        {'city': '重庆', 'pos': '南方', 'temperatue': 70},
    ]
    data = vectorizer.fit_transform(records)
    # Fixed: fetch the feature names once instead of five separate calls.
    feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print(data)
    # The original printed the first four names individually.
    for name in feature_names[:4]:
        print(name)
    # Decode the first encoded row back into a {feature: value} dict.
    print(vectorizer.inverse_transform(data)[0])
jieba切词与自定义函数
def countvec():
    """Bag-of-words demo: count each word's occurrences per document.

    Fits a CountVectorizer on two short English sentences, then prints the
    vocabulary and the dense count matrix (one row per document, one column
    per word). Returns None.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(
        ['this is a test test test', "we have have a test"])
    print(vectorizer.get_feature_names())
    print(counts.toarray())
    return None
def cutword():
    """Segment three Chinese sentences with jieba.

    Each sentence is tokenized, the first token list is printed, the tokens
    are rejoined with single spaces (CountVectorizer needs space-separated
    input), the first joined string is printed, and all three strings are
    returned as a tuple.
    """
    token_lists = [
        list(jieba.cut("床前明月光,我要学python.")),
        list(jieba.cut("床前明月光,疑是地上霜.")),
        list(jieba.cut("生存或死亡,这是一个问题")),
    ]
    print(token_lists[0])
    # Join each token list into one space-separated string.
    joined = [' '.join(tokens) for tokens in token_lists]
    print(joined[0])
    return joined[0], joined[1], joined[2]
def hanzivec():
    """Chinese text featurization: bag-of-words over jieba-segmented input.

    Obtains three space-joined, pre-segmented strings from cutword(), fits a
    CountVectorizer on them, prints each learned feature name on its own
    line followed by the dense count matrix. Returns None.
    """
    c1, c2, c3 = cutword()
    vectorizer = CountVectorizer()
    print(c1, c2, c3)
    matrix = vectorizer.fit_transform([c1, c2, c3])
    for feature in vectorizer.get_feature_names():
        print(feature)
    print(matrix.toarray())
    return None
归一化处理
def mm():
    """Min-max scale a small matrix into the range (3, 5), column-wise.

    For each column, a value x is mapped to:

        x' = low + (high - low) * (x - col_min) / (col_max - col_min)

    Example with column [90, 60, 75] and feature_range (3, 5):
        90 -> 3 + 2 * (90 - 60) / 30 = 5
        60 -> 3 + 2 * ( 0     ) / 30 = 3
        75 -> 3 + 2 * (75 - 60) / 30 = 4

    Prints the scaled matrix and returns None.
    """
    # Fixed: the original inline comment claimed the range was 2-3, but the
    # code scales into (3, 5) — the comment now matches the code.
    scaler = MinMaxScaler(feature_range=(3, 5))
    data = scaler.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print(data)
    return None


mm()
def stand():
    """Standardize a small matrix (zero mean, unit variance per column) and print it."""
    scaler = StandardScaler()
    result = scaler.fit_transform([[1, -1, 3], [2, 4, 2], [4, 6, -1]])
    print(result)


stand()
上一篇: 如何在MindManager15思维导图中添加批注
下一篇: 网易云音乐怎么玩第五人格测试?