欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页

医疗知识图谱笔记(二)

程序员文章站 2022-06-12 18:06:17
...
1.re库
import re
# Demonstrates four common `re` functions: search, sub, findall and split.
# All patterns are raw strings so that backslash sequences such as \d reach
# the regex engine untouched instead of being parsed as (invalid) string escapes.

# Search the string for the first occurrence of the pattern.
search_result = re.search(pattern=r'w{2}', string='www.runoob.com')
print(search_result)
# Replace every match of the pattern (here: "#" up to end of string) with "".
stripped = re.sub(pattern=r'#.*$', repl="", string="2004-959-559 # 这是一个国外电话号码")
print(stripped)
# Collect all non-overlapping substrings that match the pattern.
numbers = re.findall(pattern=r'\d+', string='runoob 123 google 456')
print(numbers)
# Split the string wherever the pattern matches.
pieces = re.split(pattern=r"\d+", string="12a32bc43jf3")
print(pieces)
# Regular-expression tutorial on w3cschool:
# https://www.w3cschool.cn/zhengzebiaodashi/
E:\miniconda\python.exe E:/BaiduNetdiskDownload/05NLP项目——医疗知识图谱项目/课时附件资料/课时0/课时0/附件/re.py
<re.Match object; span=(0, 2), match='ww'>
2004-959-559 
['123', '456']
['', 'a', 'bc', 'jf', '']

Process finished with exit code 0

re库的四个常用函数: search、sub、findall、split

 

2.pandas 库

import pandas as pd
import numpy as np
# Build a DatetimeIndex of six consecutive days starting at 2018-03-10.
dates = pd.date_range('20180310', periods=6)
# 6x4 frame of random normal values, indexed by date, with columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)  # prints the 6-row, 4-column table
# Sample output (values are random, so every run differs):
#                    A         B         C         D
# 2018-03-10 -0.092889 -0.503172  0.692763 -1.261313
# 2018-03-11 -0.895628 -2.300249 -1.098069  0.468986
# 2018-03-12  0.084732 -1.275078  1.638007 -0.291145
# 2018-03-13 -0.561528  0.431088  0.430414  1.065939
# 2018-03-14  1.485434 -0.341404  0.267613 -1.493366
# 2018-03-15 -1.671474  0.110933  1.688264 -0.910599
print(df['B'])  # select column B as a Series
# Sample output (does not match column B of the table above; the data is random):
# 2018-03-10   -0.927291
# 2018-03-11   -0.406842
# 2018-03-12   -0.088316
# 2018-03-13   -1.631055
# 2018-03-14   -0.929926
# 2018-03-15   -0.010904
# Freq: D, Name: B, dtype: float64

# Build a DataFrame from a dict whose values have different types.
# Scalars ('A', 'B', 'F') are broadcast to the length of the array-like columns.
df_1 = pd.DataFrame({'A': 1.,
                     'B': pd.Timestamp('20180310'),
                     'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                     'D': np.array([3] * 4, dtype='int32'),
                     'E': pd.Categorical(["test", "train", "test", "train"]),
                     'F': 'foo'
                     })
print(df_1)
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2018-03-10  1.0  3   test  foo
# 1  1.0 2018-03-10  1.0  3  train  foo
# 2  1.0 2018-03-10  1.0  3   test  foo
# 3  1.0 2018-03-10  1.0  3  train  foo
print(df_1.dtypes)  # dtype of every column
# Sample output:
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object
print(df_1.index)  # row labels
# Sample output:
# Int64Index([0, 1, 2, 3], dtype='int64')
print(df_1.columns)  # column labels
# Sample output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df_1.values)  # the cell values as an array
# Sample output:
# [[1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'train' 'foo']]
print(df_1.describe())  # summary statistics of the numeric columns
# Sample output:
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0
print(df_1.T)  # transpose: rows become columns and vice versa
# Sample output (wrapped by the console):
#                      0                    1                    2
# A                    1                    1                    1
# B  2018-03-10 00:00:00  2018-03-10 00:00:00  2018-03-10 00:00:00
# C                    1                    1                    1
# D                    3                    3                    3
# E                 test                train                 test
# F                  foo                  foo                  foo
#
#                      3
# A                    1
# B  2018-03-10 00:00:00
# C                    1
# D                    3
# E                train
# F                  foo
# axis=1 sorts by the column labels; ascending=False puts them in reverse order.
print(df_1.sort_index(axis=1, ascending=False))
# Sample output:
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2018-03-10  1.0
# 1  foo  train  3  1.0 2018-03-10  1.0
# 2  foo   test  3  1.0 2018-03-10  1.0
# 3  foo  train  3  1.0 2018-03-10  1.0
print(df_1.sort_values(by='E'))  # sort the rows by the values in column E
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2018-03-10  1.0  3   test  foo
# 2  1.0 2018-03-10  1.0  3   test  foo
# 1  1.0 2018-03-10  1.0  3  train  foo
# 3  1.0 2018-03-10  1.0  3  train  foo

3.json

import json
# A JSON-style object; in Python it is simply a dict.
data = {"第一个key": "第一个value", "第二个key": "第二个value"}
print('原生json数据', data)

# Serialise the dict to a JSON string (handy for storing in a file).
data_str = json.dumps(data)
print('json转成str', data_str)

# Parse the JSON string back into a dict for use in Python code.
data_json = json.loads(data_str)
print('从str转成json', data_json)

# Persist the object as JSON text in a file.
with open('json.txt', 'w') as out_file:
    out_file.write(json.dumps(data_json))

# Read the file back and parse its JSON content into a dict.
with open('json.txt', 'r') as in_file:
    data_json_exchange = json.loads(in_file.read())
print('从文件中获得json数据', data_json_exchange)
E:\miniconda\python.exe E:/BaiduNetdiskDownload/05NLP项目——医疗知识图谱项目/课时附件资料/课时0/课时0/附件/json.py
原生json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}
json转成str {"\u7b2c\u4e00\u4e2akey": "\u7b2c\u4e00\u4e2avalue", "\u7b2c\u4e8c\u4e2akey": "\u7b2c\u4e8c\u4e2avalue"}
从str转成json {'第一个key': '第一个value', '第二个key': '第二个value'}
从文件中获得json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}

Process finished with exit code 0

4. gensim词向量库

from gensim.models import Word2Vec
from random import choice
# Two toy, already-tokenised sentences used as sampling pools.
temp = [
    ['用来', '测试', '的', '分词', '之后', '的', '第一', '句', '话'],
    ['我', '随便', '写', '的', '一', '句', '话']
]
ls_of_words = []  # training corpus: a list of token lists
for i in range(1500):
    ls = choice(temp)
    # range(9, 15) yields six values, so each synthetic sentence has 6 tokens
    # drawn at random (with repetition) from the chosen pool.
    ls_of_words.append([choice(ls) for _ in range(9, 15)])
# Train the word-vector model; per the original note, the corpus is the main
# argument and the remaining parameters matter less.
model = Word2Vec(ls_of_words)
# Query through model.wv (the trained KeyedVectors): calling these methods on
# the model object itself is deprecated and removed in gensim 4.
# Words most similar to the given word.
print(model.wv.similar_by_word('用来'))
# Similarity between two words.
print(model.wv.similarity('用来', '测试'))


# Word-vector clustering and 3-D visualisation.
from random import choice
# Three topical word pools; the corpus below is sampled from them.
ls_of_ls = [['芝士', '酸奶', '蛋糕', '巧克力', '做', '吃'],
            ['文本', '数据', '挖掘', '分析', '做', '玩'],
            ['佛山', '广州', '南海', '天河', '吃', '玩']]
ls_of_words = []  # list of token lists (stand-in for jieba.lcut output)
for i in range(2500):
    ls = choice(ls_of_ls)
    # range(9, 15) yields six values, so each synthetic sentence has 6 tokens.
    ls_of_words.append([choice(ls) for _ in range(9, 15)])

# Train the model with 3-dimensional vectors so they can be plotted directly.
from gensim.models import Word2Vec
# NOTE(review): gensim >= 4 renames `size` to `vector_size` and
# `wv.index2word` to `wv.index_to_key` — confirm the installed version.
model = Word2Vec(ls_of_words, size=3, window=7)

# Density-based clustering of the word vectors.
from sklearn.cluster import DBSCAN
# Look vectors up via model.wv; indexing the model object itself is deprecated.
vectors = [model.wv[word] for word in model.wv.index2word]
labels = DBSCAN(eps=0.24, min_samples=3).fit(vectors).labels_

# Visualise the word vectors.
import matplotlib
from mpl_toolkits import mplot3d  # kept: registers the '3d' projection on older matplotlib
import matplotlib.pyplot as mp
mp.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
fig = mp.figure()
# Create the 3-D axes through the figure so they are attached to it; a bare
# Axes3D(fig) is no longer added to the figure on current matplotlib.
ax = fig.add_subplot(111, projection='3d')
# One colour per cluster label; a label of -1 indexes the last entry ('black').
colors = ['red', 'blue', 'green', 'black']
for word, vector, label in zip(model.wv.index2word, vectors, labels):
    ax.scatter(vector[0], vector[1], vector[2], c=colors[label], s=500, alpha=0.4)
    ax.text(vector[0], vector[1], vector[2], word, ha='center', va='center')
mp.show()

5.collections

import collections
# Counter: tally how many times each character occurs.
print(collections.Counter('abcdeabcdabcaba'))
# deque: double-ended queue; append adds on the right, appendleft on the left.
q = collections.deque(['a', 'b', 'c'])
q.append('x')
q.appendleft('y')
print(q)
# defaultdict: a missing key is filled in with the factory's return value.
dic = collections.defaultdict(lambda: 'N/A')
dic['k1'] = 'abc'
print(dic['k1'])  # 'abc'
print(dic['k2'])  # 'N/A'
# Plain dicts compare equal regardless of insertion order.
print('Normal Dictionary:')
d = {}
d['age'] = 'v2'
d['job'] = 'v3'
d1 = {}
d1['job'] = 'v3'
d1['age'] = 'v2'
print(d == d1)
# OrderedDict equality also takes insertion order into account.
# Referenced via the module because only `import collections` is in scope;
# a bare OrderedDict() would raise NameError.
print('OrderedDict:')
d2 = collections.OrderedDict()
d2['age'] = 'v2'
d2['job'] = 'v3'
d3 = collections.OrderedDict()
d3['job'] = 'v3'
d3['age'] = 'v2'
print(d2 == d3)