医疗知识图谱笔记(二)
程序员文章站
2022-06-12 18:06:17
...
1.re库
import re
# Demonstrates the four most common helpers of the `re` module.
# Patterns are written as raw strings so that backslash escapes such as \d
# reach the regex engine untouched instead of being parsed as (invalid)
# string escapes.

# Search the string for the first position where the pattern matches.
first_match = re.search(pattern=r'w{2}', string='www.runoob.com')
print(first_match)
# Replace every match of the pattern (here: a trailing "# ..." comment) with "".
without_comment = re.sub(pattern=r'#.*$', repl="", string="2004-959-559 # 这是一个国外电话号码")
print(without_comment)
# Collect all non-overlapping substrings that match the pattern.
numbers = re.findall(pattern=r'\d+', string='runoob 123 google 456')
print(numbers)
# Split the string wherever the pattern matches; leading/trailing matches
# produce empty strings at the ends of the result.
pieces = re.split(pattern=r"\d+", string="12a32bc43jf3")
print(pieces)
# w3cschool regular-expression tutorial:
# https://www.w3cschool.cn/zhengzebiaodashi/
E:\miniconda\python.exe E:/BaiduNetdiskDownload/05NLP项目——医疗知识图谱项目/课时附件资料/课时0/课时0/附件/re.py
<re.Match object; span=(0, 2), match='ww'>
2004-959-559
['123', '456']
['', 'a', 'bc', 'jf', '']
Process finished with exit code 0
re库常用的四个函数: search、sub、findall、split
2.pandas 库
import pandas as pd
import numpy as np
# Six consecutive daily timestamps starting at 2018-03-10, used as the row index.
dates = pd.date_range('20180310', periods=6)
# 6 rows x 4 columns of standard-normal samples. The generator is not seeded,
# so the numbers differ on every run; the recorded outputs below are
# illustrative only.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)  # prints the whole 6x4 table
'''
A B C D
2018-03-10 -0.092889 -0.503172 0.692763 -1.261313
2018-03-11 -0.895628 -2.300249 -1.098069 0.468986
2018-03-12 0.084732 -1.275078 1.638007 -0.291145
2018-03-13 -0.561528 0.431088 0.430414 1.065939
2018-03-14 1.485434 -0.341404 0.267613 -1.493366
2018-03-15 -1.671474 0.110933 1.688264 -0.910599
'''
# Selecting a single column by label yields a Series.
# NOTE: the sample below was captured on a different run than the table
# above, which is why the B values do not match.
print(df['B'])
'''
2018-03-10 -0.927291
2018-03-11 -0.406842
2018-03-12 -0.088316
2018-03-13 -1.631055
2018-03-14 -0.929926
2018-03-15 -0.010904
Freq: D, Name: B, dtype: float64
'''
# Build a DataFrame from explicit per-column data. Scalar entries are repeated
# for every row (see the recorded output below).
df_1 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20180310'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)
print(df_1)
'''
A B C D E F
0 1.0 2018-03-10 1.0 3 test foo
1 1.0 2018-03-10 1.0 3 train foo
2 1.0 2018-03-10 1.0 3 test foo
3 1.0 2018-03-10 1.0 3 train foo
'''
# Per-column dtypes.
print(df_1.dtypes)
'''
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
'''
# Row labels.
print(df_1.index)
# recorded: Int64Index([0, 1, 2, 3], dtype='int64')
# Column labels.
print(df_1.columns)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
'''
# recorded: Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
# The underlying values as an array, one inner row per DataFrame row.
print(df_1.values)
'''
[[1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'train' 'foo']]
'''
# Summary statistics; the recorded output covers columns A, C and D.
print(df_1.describe())
'''
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
'''
# Transpose: rows become columns and vice versa.
print(df_1.transpose())
'''
0 1 2 \
A 1 1 1
B 2018-03-10 00:00:00 2018-03-10 00:00:00 2018-03-10 00:00:00
C 1 1 1
D 3 3 3
E test train test
F foo foo foo
3
A 1
B 2018-03-10 00:00:00
C 1
D 3
E train
F foo
'''
# axis=1 sorts by column label; ascending=False reverses the order (F ... A).
print(df_1.sort_index(axis=1, ascending=False))
'''
F E D C B A
0 foo test 3 1.0 2018-03-10 1.0
1 foo train 3 1.0 2018-03-10 1.0
2 foo test 3 1.0 2018-03-10 1.0
3 foo train 3 1.0 2018-03-10 1.0
'''
# Sort the rows by the values in column E.
print(df_1.sort_values('E'))
'''
A B C D E F
0 1.0 2018-03-10 1.0 3 test foo
2 1.0 2018-03-10 1.0 3 test foo
1 1.0 2018-03-10 1.0 3 train foo
3 1.0 2018-03-10 1.0 3 train foo
'''
3.json
import json
# A plain Python dict; it has the same shape as a JSON object.
data = {
    "第一个key": "第一个value",
    "第二个key": "第二个value",
}
print('原生json数据', data)
# Serialize the dict to JSON text so it can be stored in a file.
# json.dumps escapes non-ASCII characters as \uXXXX by default, which is why
# the printed text shows escape sequences instead of the Chinese characters.
data_str = json.dumps(data)
print('json转成str', data_str)
# Parse the JSON text back into a Python object for use in code.
data_json = json.loads(data_str)
print('从str转成json', data_json)
# Write the object to a file as JSON. The encoding is given explicitly so the
# file does not depend on the platform's default encoding.
with open('json.txt', 'w', encoding='utf-8') as f:
    json.dump(data_json, f)
# Read the file back. json.load returns the parsed object (a dict here),
# not a string.
with open('json.txt', 'r', encoding='utf-8') as f:
    data_json_exchange = json.load(f)
print('从文件中获得json数据', data_json_exchange)
E:\miniconda\python.exe E:/BaiduNetdiskDownload/05NLP项目——医疗知识图谱项目/课时附件资料/课时0/课时0/附件/json.py
原生json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}
json转成str {"\u7b2c\u4e00\u4e2akey": "\u7b2c\u4e00\u4e2avalue", "\u7b2c\u4e8c\u4e2akey": "\u7b2c\u4e8c\u4e2avalue"}
从str转成json {'第一个key': '第一个value', '第二个key': '第二个value'}
从文件中获得json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}
Process finished with exit code 0
4. gensim词向量库
from gensim.models import Word2Vec
from random import choice
# Two already-tokenized sample sentences; words are drawn from them to build
# a synthetic training corpus.
temp = [
    ['用来', '测试', '的', '分词', '之后', '的', '第一', '句', '话'],
    ['我', '随便', '写', '的', '一', '句', '话'],
]
ls_of_words = []  # list of token lists (one inner list per synthetic sentence)
for _ in range(1500):
    ls = choice(temp)
    # range(9, 15) has six elements, so every synthetic sentence has six
    # words sampled (with repetition) from the chosen sentence.
    ls_of_words.append([choice(ls) for _ in range(9, 15)])
# Train the word-vector model; the corpus is the main argument, all other
# parameters are left at their defaults.
model = Word2Vec(ls_of_words)
# Query through model.wv (the KeyedVectors object): the same-named methods on
# the model itself are deprecated.
# Most similar words to the given word.
print(model.wv.similar_by_word('用来'))
# Similarity score between two words.
print(model.wv.similarity('用来', '测试'))
# Word-vector clustering and visualization.
from random import choice

import matplotlib
import matplotlib.pyplot as mp
from gensim.models import Word2Vec
from mpl_toolkits import mplot3d  # kept: importing it registers the '3d' projection on older Matplotlib
from sklearn.cluster import DBSCAN

# Three topical word groups; each synthetic sentence is sampled from one group.
ls_of_ls = [['芝士', '酸奶', '蛋糕', '巧克力', '做', '吃'],
            ['文本', '数据', '挖掘', '分析', '做', '玩'],
            ['佛山', '广州', '南海', '天河', '吃', '玩']]
ls_of_words = []  # list of token lists (stand-in for jieba.lcut output)
for _ in range(2500):
    ls = choice(ls_of_ls)
    # range(9, 15) has six elements, so each sentence holds six sampled words.
    ls_of_words.append([choice(ls) for _ in range(9, 15)])

# Train 3-dimensional vectors so they can be drawn directly in a 3D plot.
# NOTE: `size` and `wv.index2word` are the gensim 3.x names, which this
# snippet targets; gensim 4 renamed them (`vector_size`, `wv.index_to_key`).
model = Word2Vec(ls_of_words, size=3, window=7)

# Density-based clustering of the word vectors. Vectors are read through
# model.wv because indexing the model object directly is deprecated.
vectors = [model.wv[word] for word in model.wv.index2word]
labels = DBSCAN(eps=0.24, min_samples=3).fit(vectors).labels_

# Plot the word vectors.
mp.rcParams['font.sans-serif'] = ['SimHei']  # font with Chinese glyphs
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
fig = mp.figure()
# Create the 3D axes via add_subplot so they are attached to the figure;
# constructing Axes3D(fig) directly no longer adds the axes on recent
# Matplotlib and would show an empty window.
ax = fig.add_subplot(111, projection='3d')
colors = ['red', 'blue', 'green', 'black']
for word, vector, label in zip(model.wv.index2word, vectors, labels):
    # DBSCAN labels noise as -1, which maps to the last colour ('black');
    # the modulo keeps that mapping and avoids an IndexError when more
    # clusters than colours are found.
    colour = colors[label % len(colors)]
    ax.scatter(vector[0], vector[1], vector[2], c=colour, s=500, alpha=0.4)
    ax.text(vector[0], vector[1], vector[2], word, ha='center', va='center')
mp.show()
5.collections
import collections
# Counter: counts how often each element occurs.
print(collections.Counter('abcdeabcdabcaba'))
# deque: a double-ended queue that supports appending at both ends.
q = collections.deque(['a', 'b', 'c'])
q.append('x')
q.appendleft('y')
print(q)
# defaultdict: a missing key is filled with the value returned by the factory.
dic = collections.defaultdict(lambda: 'N/A')
dic['k1'] = 'abc'
print(dic['k1'])  # 'abc'
print(dic['k2'])  # 'N/A'
# Plain dicts compare equal regardless of insertion order ...
print('Normal Dictionary:')
d = {}
d['age'] = 'v2'
d['job'] = 'v3'
d1 = {}
d1['job'] = 'v3'
d1['age'] = 'v2'
print(d == d1)
# ... whereas two OrderedDicts are equal only if their insertion order matches.
# OrderedDict is referenced through the module because only `collections`
# itself is imported; the bare name would raise NameError.
print('OrderedDict:')
d2 = collections.OrderedDict()
d2['age'] = 'v2'
d2['job'] = 'v3'
d3 = collections.OrderedDict()
d3['job'] = 'v3'
d3['age'] = 'v2'
print(d2 == d3)
上一篇: 增删改查,接口范例
下一篇: Hibernate增删改查接口
推荐阅读