知识图谱与图神经网络
大约 12 分钟 · 约 3678 字
知识图谱与图神经网络
简介
知识图谱以图结构组织实体和关系,图神经网络(GNN)在图结构数据上进行学习。理解知识图谱构建、图嵌入和 GNN 模型(GCN、GAT),有助于构建基于知识的 AI 应用和图结构数据分析系统。
特点
知识图谱构建
实体-关系建模
# 知识图谱三元组:(头实体, 关系, 尾实体)
# 例:(北京, 是首都, 中国), (Python, 是, 编程语言)
class KnowledgeGraph:
    """In-memory triple store with simple graph queries.

    Facts are (head, relation, tail) triples. For every triple an inverse
    edge is also indexed under a "~"-prefixed relation name, so neighbor
    lookups and path searches effectively treat the graph as undirected.
    """

    def __init__(self):
        self.entities = set()
        self.relations = set()
        self.triples = []            # list of (head, relation, tail)
        self.entity_neighbors = {}   # entity -> [(relation, neighbor), ...]

    def add_triple(self, head, relation, tail):
        """Register one (head, relation, tail) fact plus its inverse edge."""
        self.entities.update((head, tail))
        self.relations.add(relation)
        self.triples.append((head, relation, tail))
        self.entity_neighbors.setdefault(head, []).append((relation, tail))
        # Inverse edge, marked with "~" so the direction stays recoverable.
        self.entity_neighbors.setdefault(tail, []).append((f"~{relation}", head))

    def get_neighbors(self, entity):
        """Return [(relation, neighbor), ...]; empty list for unknown entities."""
        return self.entity_neighbors.get(entity, [])

    def multi_hop_query(self, start_entity, max_hops=2):
        """Map hop count -> set of entities first reached at exactly that hop."""
        results = {0: {start_entity}}
        seen = {start_entity}
        for hop in range(1, max_hops + 1):
            frontier = set()
            for node in results[hop - 1]:
                for _, neighbor in self.get_neighbors(node):
                    if neighbor not in seen:
                        seen.add(neighbor)
                        frontier.add(neighbor)
            results[hop] = frontier
        return results

    def shortest_path(self, start, end, max_depth=5):
        """BFS shortest path from start to end as a node list, or None."""
        from collections import deque
        queue = deque([(start, [start])])
        seen = {start}
        while queue:
            node, path = queue.popleft()
            if node == end:
                return path
            if len(path) > max_depth:
                continue  # depth limit reached on this branch
            for _, neighbor in self.get_neighbors(node):
                if neighbor not in seen:
                    seen.add(neighbor)
                    queue.append((neighbor, path + [neighbor]))
        return None
# Build a small demo knowledge graph
kg = KnowledgeGraph()
_demo_triples = [
    ("北京", "是首都", "中国"),
    ("上海", "位于", "中国"),
    ("中国", "属于", "亚洲"),
    ("Python", "是", "编程语言"),
    ("GPT", "基于", "Transformer"),
    ("BERT", "基于", "Transformer"),
    ("Transformer", "发表于", "2017"),
    ("GPT", "用于", "文本生成"),
    ("BERT", "用于", "文本理解"),
]
for _head, _rel, _tail in _demo_triples:
    kg.add_triple(_head, _rel, _tail)
# 图嵌入
TransE 与 Node2Vec
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# TransE — translation-based embedding
# Core idea: h + r ≈ t
# Loss: max(0, ||h+r-t|| - ||h'+r'-t'|| + margin)
class TransE(nn.Module):
    """Translation-based knowledge-graph embedding (Bordes et al.).

    Entities and relations share one embedding space; a triple is scored
    by the L2 distance ||h + r - t||, trained with a margin ranking loss
    against corrupted (negative) triples.
    """

    def __init__(self, num_entities, num_relations, embed_dim=128, margin=1.0):
        super().__init__()
        self.margin = margin
        # Entity and relation embedding tables.
        self.entity_embeddings = nn.Embedding(num_entities, embed_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embed_dim)
        nn.init.xavier_uniform_(self.entity_embeddings.weight)
        nn.init.xavier_uniform_(self.relation_embeddings.weight)

    def _distance(self, triples):
        # L2 norm of h + r - t for a (batch, 3) tensor of id triples.
        heads = self.entity_embeddings(triples[:, 0])
        rels = self.relation_embeddings(triples[:, 1])
        tails = self.entity_embeddings(triples[:, 2])
        return torch.norm(heads + rels - tails, p=2, dim=1)

    def forward(self, pos_triples, neg_triples):
        """Margin ranking loss.

        pos_triples: (batch, 3) — true triples
        neg_triples: (batch, 3) — corrupted triples (head or tail replaced)
        """
        pos_dist = self._distance(pos_triples)
        neg_dist = self._distance(neg_triples)
        # Hinge: push positives at least `margin` closer than negatives.
        return torch.clamp(pos_dist - neg_dist + self.margin, min=0).mean()

    def get_entity_embedding(self, entity_id):
        """Return the L2-normalised embedding of a single entity id."""
        emb = self.entity_embeddings(torch.tensor([entity_id]))
        return F.normalize(emb, dim=-1)
# Node2Vec — random-walk based graph embedding
class Node2Vec:
    """Second-order biased random walks (Grover & Leskovec, 2016).

    p (return parameter): large p discourages immediately revisiting the
    previous node. q (in-out parameter): large q keeps walks local
    (BFS-like), small q pushes them outward (DFS-like). The walks feed a
    skip-gram model elsewhere; this class only generates the sequences.
    """

    def __init__(self, graph, embed_dim=128, walk_length=30, num_walks=200,
                 p=1.0, q=1.0):
        # graph: expects networkx-style .neighbors(node) and .nodes()
        self.graph = graph
        self.embed_dim = embed_dim
        self.walk_length = walk_length
        self.num_walks = num_walks
        self.p = p  # return parameter
        self.q = q  # in-out parameter

    def random_walk(self, start_node):
        """One biased walk of up to walk_length nodes from start_node."""
        walk = [start_node]
        while len(walk) < self.walk_length:
            current = walk[-1]
            neighbors = list(self.graph.neighbors(current))
            if not neighbors:
                break  # dead end — stop this walk early
            if len(walk) == 1:
                # First step: no previous node, sample uniformly.
                next_node = np.random.choice(neighbors)
            else:
                prev = walk[-2]
                # FIX: hoist the previous node's neighborhood into a set.
                # The original re-ran graph.neighbors(prev) for every
                # candidate, making each transition O(deg^2).
                prev_neighbors = set(self.graph.neighbors(prev))
                probs = []
                for neighbor in neighbors:
                    if neighbor == prev:
                        probs.append(1.0 / self.p)  # return to previous node
                    elif neighbor in prev_neighbors:
                        probs.append(1.0)           # stay in the same neighborhood
                    else:
                        probs.append(1.0 / self.q)  # move outward
                probs = np.array(probs) / sum(probs)
                next_node = np.random.choice(neighbors, p=probs)
            walk.append(next_node)
        return walk

    def generate_walks(self):
        """num_walks shuffled passes over all nodes, one walk per start node."""
        walks = []
        nodes = list(self.graph.nodes())
        for _ in range(self.num_walks):
            np.random.shuffle(nodes)
            for node in nodes:
                walks.append(self.random_walk(node))
        return walks
# 图卷积网络
GCN 实现
class GCNLayer(nn.Module):
    """One graph-convolution layer: computes (normalized A) @ (X W)."""

    def __init__(self, in_features, out_features):
        super().__init__()
        # Learnable projection applied before neighborhood aggregation.
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x, adj):
        """Propagate features over the graph.

        x: (num_nodes, in_features) node features
        adj: (num_nodes, num_nodes) normalized adjacency, e.g. D^{-1/2} A D^{-1/2}
        returns: (num_nodes, out_features)
        """
        transformed = self.linear(x)              # X W
        return torch.sparse.mm(adj, transformed)  # aggregate over neighbors
class GCN(nn.Module):
    """Three-layer GCN node classifier returning log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.layers = nn.ModuleList([
            GCNLayer(input_dim, hidden_dim),
            GCNLayer(hidden_dim, hidden_dim),
            GCNLayer(hidden_dim, output_dim),
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, adj):
        """x: (N, input_dim); adj: normalized adjacency. Returns (N, output_dim) log-probs."""
        last = len(self.layers) - 1
        for idx, conv in enumerate(self.layers):
            x = conv(x, adj)
            if idx != last:
                # ReLU + dropout between layers; none after the output layer.
                x = self.dropout(F.relu(x))
        return F.log_softmax(x, dim=1)
# Graph Attention Network (GAT)
class GATLayer(nn.Module):
    """Multi-head graph attention layer (Velickovic et al., 2018).

    For each head: e_ij = LeakyReLU(a^T [W h_i || W h_j]) on edges,
    masked softmax over neighbors, then attention-weighted aggregation.
    The per-head outputs are averaged into (N, out_features).
    """

    def __init__(self, in_features, out_features, num_heads=4, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.out_features = out_features
        # Shared projection W and one attention vector `a` per head.
        self.W = nn.Parameter(torch.Tensor(in_features, num_heads * out_features))
        self.a = nn.Parameter(torch.Tensor(num_heads, 2 * out_features, 1))
        self.dropout = nn.Dropout(dropout)
        nn.init.xavier_uniform_(self.W)
        nn.init.xavier_uniform_(self.a)

    def forward(self, x, adj):
        """
        x: (N, in_features) node features
        adj: (N, N) adjacency matrix (non-zero entry = edge)
        returns: (N, out_features), averaged over heads
        """
        N = x.size(0)
        h = (x @ self.W).view(N, self.num_heads, self.out_features)  # (N, H, F)
        # Pairwise features for e_ij = LeakyReLU(a^T [Wh_i || Wh_j]).
        h_i = h.unsqueeze(2).expand(N, self.num_heads, N, self.out_features)
        # BUG FIX: h.unsqueeze(0) has shape (1, N, H, F), which cannot expand
        # to (N, H, N, F) unless N == H; move the head axis forward first.
        h_j = h.permute(1, 0, 2).unsqueeze(0).expand(
            N, self.num_heads, N, self.out_features)
        pair = torch.cat([h_i, h_j], dim=-1)                     # (N, H, N, 2F)
        e = F.leaky_relu(pair @ self.a, negative_slope=0.2).squeeze(-1)  # (N, H, N)
        # Mask: attention only over actual neighbors.
        mask = (adj == 0).unsqueeze(1).expand_as(e)
        e = e.masked_fill(mask, float('-inf'))
        alpha = F.softmax(e, dim=-1)
        alpha = self.dropout(alpha)
        # BUG FIX: per-head aggregation out[i,hd] = sum_j alpha[i,hd,j] * h[j,hd].
        # The previous bmm/reshape collapsed all nodes into a single row.
        out = torch.einsum('ihj,jhf->ihf', alpha, h)             # (N, H, F)
        return out.mean(dim=1)  # average over heads
# 优点
缺点
总结
知识图谱以三元组(头实体,关系,尾实体)建模实体关系,支持多跳查询和路径推理。图嵌入将实体和关系映射到低维向量空间,TransE 使用翻译模型(h+r≈t)。GCN 通过邻接矩阵的乘法实现图卷积,GAT 引入注意力机制加权邻居聚合。应用场景包括推荐系统(用户-物品图)、知识推理、药物发现和社交网络分析。
关键知识点
- 先分清模型能力边界、数据边界和工程边界。
- 任何 AI 主题都不只看效果,还要看延迟、成本、可解释性和安全性。
- 评估方式和失败样例往往比“换哪个模型”更重要。
- 先把模型结构、训练目标、评价指标和适用场景分开理解。
项目落地视角
- 给数据来源、Prompt 模板、Embedding 版本、评估集和实验结果做版本管理。
- 上线前准备兜底策略,例如拒答、回退、人工审核或缓存降级。
- 观察错误类型时,区分数据问题、召回问题、提示词问题和模型问题。
- 用固定数据集和固定指标比较方案,不要只看主观效果。
常见误区
- 只关注 Demo 效果,不考虑线上稳定性和可复现性。
- 没有评估集就频繁调参,最后无法解释为什么变好或变差。
- 忽略权限、审计、隐私和模型输出的安全边界。
- 只讨论模型结构,不讨论数据质量和标签质量。
进阶路线
- 继续补齐训练、推理、评估、MLOps 和治理链路。
- 把主题放回真实业务流程,思考谁提供数据、谁消费结果、谁负责兜底。
- 把 PoC 逐步升级到可观测、可回滚、可演进的生产方案。
- 继续补齐模型压缩、量化、蒸馏、在线评估和数据闭环。
适用场景
- 当你准备把《知识图谱与图神经网络》真正落到项目里时,最适合先在一个独立模块或最小样例里验证关键路径。
- 适合企业知识问答、内容生成、分类抽取和智能助手等场景。
- 当需求同时关注效果、时延、成本和安全边界时,这类主题最有价值。
落地建议
- 先定义评估集、成功标准和失败样例,再开始调模型或调提示。
- 把数据来源、分块方式、Embedding 版本和 Prompt 模板纳入版本管理。
- 上线前准备兜底策略,例如拒答、回退、人工审核或检索降级。
排错清单
- 先判断问题出在数据、检索、Prompt、模型还是后处理。
- 检查上下文是否过长、分块是否过碎或召回是否偏题。
- 对错误回答做分类,区分幻觉、事实过时、指令误解和格式错误。
复盘问题
- 如果把《知识图谱与图神经网络》放进你的当前项目,最先要验证的输入、输出和失败路径分别是什么?
- 《知识图谱与图神经网络》最容易在什么规模、什么边界条件下暴露问题?你会用什么指标或日志去确认?
- 相比默认实现或替代方案,采用《知识图谱与图神经网络》最大的收益和代价分别是什么?
知识图谱存储与查询(Neo4j)
# 使用 Neo4j 存储和查询知识图谱
from neo4j import GraphDatabase
class KnowledgeGraphStore:
    """Neo4j-backed knowledge-graph store.

    Every relation is stored under the single relationship type RELATION
    with the concrete relation name in the `type` property, because Cypher
    cannot parameterise relationship types. NOTE: labels and depths are
    string-interpolated into queries (Cypher limitation) — only pass
    trusted label/type names; depths are cast to int as a guard.
    """

    def __init__(self, uri: str, user: str, password: str):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Release the underlying driver's connection pool."""
        self.driver.close()

    def add_entity(self, name: str, labels: list[str], properties: dict):
        """Create or update an entity node keyed by `name`."""
        label_str = ":".join(labels)  # labels cannot be query parameters
        query = f"""
        MERGE (n:{label_str} {{name: $name}})
        SET n += $props
        RETURN n
        """
        with self.driver.session() as session:
            session.run(query, name=name, props=properties)

    def add_relation(self, head: str, relation: str, tail: str, properties: dict = None):
        """Link two existing entities; relation name goes into r.type."""
        query = """
        MATCH (h {name: $head})
        MATCH (t {name: $tail})
        MERGE (h)-[r:RELATION {type: $relation}]->(t)
        SET r += $props
        RETURN r
        """
        with self.driver.session() as session:
            session.run(query, head=head, relation=relation,
                        tail=tail, props=properties or {})

    def find_path(self, start: str, end: str, max_depth: int = 5):
        """Shortest path between two entities, or None if unreachable."""
        query = """
        MATCH path = shortestPath(
            (h {name: $start})-[*1..%d]-(t {name: $end})
        )
        RETURN [n in nodes(path) | n.name] AS nodes,
               [r in relationships(path) | r.type] AS relations
        """ % int(max_depth)  # int() guards the interpolated bound
        with self.driver.session() as session:
            result = session.run(query, start=start, end=end)
            for record in result:
                return {
                    "nodes": record["nodes"],
                    "relations": record["relations"]
                }
        return None

    def get_entity_neighbors(self, entity: str, depth: int = 1):
        """Neighbors within `depth` hops, with the last hop's relation name."""
        # FIX: use r[-1].type (the stored relation name). The original
        # type(r[-1]) always returned the literal label "RELATION".
        query = """
        MATCH (n {name: $name})-[r*1..%d]-(m)
        RETURN n.name AS source,
               r[-1].type AS relation,
               m.name AS target,
               labels(m)[0] AS target_type
        """ % int(depth)
        with self.driver.session() as session:
            result = session.run(query, name=entity)
            return [dict(record) for record in result]

    def entity_ranking(self, entity_type: str, limit: int = 10):
        """Top entities of a label ranked by degree (number of relations)."""
        # entity_type is interpolated — pass trusted label names only.
        query = """
        MATCH (n:%s)-[r]-()
        RETURN n.name AS entity, count(r) AS connections
        ORDER BY connections DESC
        LIMIT $limit
        """ % entity_type
        with self.driver.session() as session:
            result = session.run(query, limit=limit)
            return [dict(record) for record in result]
# 使用示例
# store = KnowledgeGraphStore("bolt://localhost:7687", "neo4j", "password")
# store.add_entity("Python", ["ProgrammingLanguage"], {"year": 1991})
# store.add_entity("GPT", ["AIModel"], {"company": "OpenAI"})
# store.add_relation("GPT", "基于", "Transformer")
# path = store.find_path("Python", "Transformer")
GNN 训练与评估
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
def train_gcn(model, x, adj, labels, epochs=200, lr=0.01):
    """Full-batch transductive GCN training with a random 80/20 node split.

    model: callable as model(x, adj) -> (N, C) log-probabilities
    x: (N, F) node features; adj: normalized adjacency; labels: (N,) class ids
    Returns the trained model; prints loss/test accuracy every 20 epochs.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    # Random node-level split: 80% train, 20% test.
    num_nodes = x.size(0)
    indices = torch.randperm(num_nodes)
    train_size = int(0.8 * num_nodes)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[indices[:train_size]] = True
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[indices[train_size:]] = True
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(x, adj)
        loss = F.nll_loss(output[train_mask], labels[train_mask])
        loss.backward()
        optimizer.step()
        # FIX: evaluate on a fresh eval-mode forward pass. The original
        # scored the training-mode `output`, i.e. dropout-perturbed logits.
        model.eval()
        with torch.no_grad():
            logits = model(x, adj)
            pred = logits[test_mask].argmax(dim=1)
            correct = (pred == labels[test_mask]).sum().item()
            accuracy = correct / test_mask.sum().item()
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1:03d}, Loss: {loss.item():.4f}, Test Acc: {accuracy:.4f}")
    return model
# Common GNN training problems
def explain_gnn_challenges():
    """Print practical GNN training recommendations.

    Background summarised from the surrounding article:
    1. Over-smoothing: as GCN depth grows, node representations converge
       (accuracy often drops after 2-3 layers). Remedies: residual
       connections, attention (GAT), keeping to 2-4 layers, JK-Net.
    2. Over-fitting: graph datasets are often tiny (thousands of nodes).
       Remedies: dropout, L2 regularisation, random edge/node dropping,
       GraphSAGE-style sampling.
    3. Large graphs: full-graph training exceeds memory. Remedies:
       GraphSAGE neighbor sampling, Cluster-GCN partitioning, mini-batches.
    4. Normalisation: the adjacency matrix needs D^{-1/2} A D^{-1/2},
       usually with self-loops: A' = A + I.
    """
    recommendations = (
        "GNN 训练建议:",
        " 层数: 2-4 层(避免过平滑)",
        " 归一化: D^{-1/2}(A+I)D^{-1/2}",
        " 正则化: Dropout(0.5) + L2(5e-4)",
        " 大图: 使用 GraphSAGE 采样",
    )
    for line in recommendations:
        print(line)
explain_gnn_challenges()
知识图谱嵌入评估
def evaluate_knowledge_graph_embeddings(model, test_triples, all_entities):
    """Link-prediction evaluation for KG embeddings (raw protocol).

    For each test triple the tail slot is scored against every candidate
    entity via model.score(head, relation, entity_id); the true tail's
    rank (higher score = better) feeds the metrics:
    - MRR: mean reciprocal rank of the true entity
    - Hits@K: fraction of true entities ranked within the top K
    - Mean Rank: average rank of the true entity
    NOTE: this is the *raw* setting — triples already seen in training are
    not filtered from the candidate list (the "Filtered" setting would).
    """
    import numpy as np
    ranks = []
    for head, relation, tail in test_triples:
        # Score every candidate entity in the tail slot.
        scores = np.array([model.score(head, relation, candidate)
                           for candidate in range(len(all_entities))])
        # Rank of the true tail under a descending sort of scores.
        ranks.append(np.sum(scores >= scores[tail]))
    n = len(test_triples)
    hits_at_1 = sum(1 for r in ranks if r <= 1)
    hits_at_3 = sum(1 for r in ranks if r <= 3)
    hits_at_10 = sum(1 for r in ranks if r <= 10)
    mrr = np.mean([1.0 / r for r in ranks])
    print(f"MRR: {mrr:.4f}")
    print(f"Hits@1: {hits_at_1 / n:.4f}")
    print(f"Hits@3: {hits_at_3 / n:.4f}")
    print(f"Hits@10: {hits_at_10 / n:.4f}")
    print(f"Mean Rank: {np.mean(ranks):.2f}")
# Typical baseline results:
# TransE:  MRR ~0.25, Hits@10 ~0.50
# TransH:  MRR ~0.30, Hits@10 ~0.55
# RotatE:  MRR ~0.35, Hits@10 ~0.60
# ComplEx: MRR ~0.34, Hits@10 ~0.58
# 知识图谱与 LLM 结合
def explain_kg_llm_integration():
    """Print recommendations for combining knowledge graphs with LLMs.

    Integration patterns summarised from the surrounding article:
    1. KG-enhanced RAG: use the KG instead of / alongside vector retrieval,
       injecting structured facts into the prompt — good for QA over
       well-defined entity relations.
    2. Graph RAG: find seed entities in the KG, then expand along relation
       edges for context (cf. Microsoft's open-source GraphRAG framework).
    3. KG-augmented generation: let the LLM extract triples from text into
       the KG; at query time fetch structured facts first, then have the
       LLM phrase the natural-language answer.
    4. KGQA pipeline: entity recognition -> relation prediction ->
       subgraph query -> answer generation with the LLM.
    Typical domains: enterprise knowledge management, medical reasoning
    (symptom-disease-drug), financial risk (corporate/guarantee links),
    recommendation (user-item-attribute graphs).
    """
    guidelines = (
        "KG + LLM 结合建议:",
        " 结构化知识: KG 存储和检索",
        " 自然语言理解: LLM 实体识别和关系抽取",
        " 答案生成: LLM 基于图上下文生成回答",
        " 知识更新: LLM 辅助从新文档中抽取三元组",
    )
    for line in guidelines:
        print(line)
explain_kg_llm_integration()
知识图谱构建的工程挑战:
- 实体消歧:同一实体可能有不同表述("Apple" = 公司/水果)
- 关系抽取:从非结构化文本中准确抽取实体关系
- 知识融合:多源知识的冲突消解和对齐
- 质量控制:自动化质检 + 人工审核
- 更新机制:知识过期检测和增量更新