二、知识图谱的搭建
程序员文章站
2022-06-12 17:13:39
...
先贴代码
__author__ = 'ding'
'''
知识图谱搭建
'''
import string
import xlrd
from tqdm import tqdm
from py2neo import Graph, Node, Relationship, NodeSelector
class movieGraph:
def __init__(self, password="123"):
self.rdb = None
self.graph = Graph("http://localhost:7474/db/data", password=password)
self.selector = NodeSelector(self.graph)
# 电影节点
def add_Movie_Cell(self, label='Movie', name=None, rating="", time='', genres='', content='', actors=''):
assert name is not None
node = self.selector.select('Movie').where(name=name).first()
if node:
node['name'] = name
node['rating'] = rating
node['time'] = time
node['genres'] = genres
node['content'] = content
node['actors'] = actors
self.graph.push(node)
else:
node = Node(label, name=name, rating=rating, time=time, genres=genres, content=content, actors=actors)
self.graph.create(node)
for genre in genres.split('|'):
node_genre = self.selector.select('Genre').where(genre=genre).first()
if node_genre:
g_r_n = Relationship(node, 'is', node_genre)
else:
node_genre = Node('Genre', genre=genre)
self.graph.create(node_genre)
g_r_n = Relationship(node, 'is', node_genre)
self.graph.create(g_r_n)
for actor in actors.split('|'):
node_actor = self.selector.select('Person').where(name=actor).first()
if node_actor:
a_r_n = Relationship(node, 'acting', node_actor)
else:
node_actor = Node('Person', name=actor)
self.graph.create(node_actor)
a_r_n = Relationship(node, 'acting', node_actor)
self.graph.create(a_r_n)
# 电影种类节点
def add_Movie_Genre(self, label='Genre', genre=None, movie=""):
assert genre is not None
node = self.selector.select('Genre').where(genre=genre).first()
if node:
node['genre'] = genre
self.graph.push(node)
else:
node = Node(label, genre=genre)
self.graph.create(node)
node_movie = self.selector.select('Movie').where(name=movie).first()
assert node_movie is not None
node_r_movie = Relationship(node, 'is', node_movie)
self.graph.create(node_r_movie)
# 人物信息节点
def add_Person_Cell(self, label='Person', name=None, borndata='', desc='', movie=''):
assert name is not None
node = self.selector.select('Person').where(name=name).first()
if node:
node['name'] = name
node['borndata'] = borndata
node['desc'] = desc
self.graph.push(node)
else:
node = Node(label, name=name, borndata=borndata, desc=desc)
self.graph.create(node)
node_movie = self.selector.select('Movie').where(name=movie).first()
assert node_movie is not None
node_r_movie = Relationship(node, 'acting', node_movie)
self.graph.create(node_r_movie)
def delete(self, pattern="n", label=None):
"""Batch delete data or subgraph in database.
在数据库中批量删除数据或者子图。
Args:
pattern: Type of subgraph. 子图类型。
label: Label of subgraph. 子图标签。
"""
if pattern == "all":
self.graph.delete_all()
elif pattern == "n":
self.graph.run("MATCH(n:" + label + ") DETACH DELETE n")
elif pattern == "r":
self.graph.run("MATCH (n)-[r:" + label + "]-(m) DETACH DELETE r")
elif pattern == "nr":
self.graph.run("MATCH (n)<-[r:" + label + "]-(m) DETACH DELETE r, n")
elif pattern == "rm":
self.graph.run("MATCH (n)-[r:" + label + "]->(m) DETACH DELETE r, m")
elif pattern == "nrm":
self.graph.run("MATCH (n)-[r:" + label + "]-(m) DETACH DELETE r, n, m")
# 读取excel中的信息
def handle_excel(self, filename=None, custom_sheets=[]):
assert filename is not None
data = xlrd.open_workbook(filename)
data_sheets = data.sheet_names()
if custom_sheets: # 可自定义要导入的子表格
sheet_names = list(set(data_sheets).intersection(set(custom_sheets)))
else:
sheet_names = data_sheets
for sheet_name in sheet_names:
table = data.sheet_by_name(sheet_name)
if table:
col_format = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
try:
nrows = table.nrows
str_upcase = [i for i in string.ascii_uppercase]
i_upcase = range(len(str_upcase))
ncols_dir = dict(zip(str_upcase, i_upcase))
col_index = [ncols_dir.get(i) for i in col_format]
for i in tqdm(range(1, nrows)):
name = table.cell_value(i, col_index[0])
rating = table.cell_value(i, col_index[7])
time = table.cell_value(i, col_index[4])
genres = table.cell_value(i, col_index[3])
# for genre in genres.split('|'):
# self.add_Movie_Genre(genre=genre, movie=name)
content = table.cell_value(i, col_index[6])
actors = table.cell_value(i, col_index[2])
# for actor in actors.split('|'):
# self.add_Person_Cell(name=actor, movie=name)
self.add_Movie_Cell(name=name, rating=rating, time=time, genres=genres,
content=content, actors=actors)
except Exception as error:
print('Error: %s' % error)
return None
此处是搭建了一个简单的知识图谱,电影、演员、电影种类各有自己的标签,电影节点中的属性还包含了许多信息。为查询的时候提供反馈。
代码比较简单就不解释了。本人也才接触没多久,有错误的地方,请不吝赐教