LPMM 知识库删除能力与自检脚本增强(附关键健壮性修复)

为 LPMM 新增安全可控的删除能力:
KGManager.delete_paragraphs 支持按段落/实体哈希删除图节点及关联边,可选清理孤立实体,并从图中重建元数据。
统一删除脚本 scripts/delete_lpmm_items.py,支持按批次(OpenIE 文件)、哈希文件、原始文本段落、关键字搜索进行删除,内置 dry-run 和最大节点数保护。
新增自检与回归脚本:
scripts/inspect_lpmm_batch.py / scripts/inspect_lpmm_global.py 用于批次级和全局状态检查
scripts/test_lpmm_retrieval.py 一键初始化 LPMM 并用固定问题测试检索效果。
健壮性与性能保护:
在 KGManager.kg_search 中对 ent_appear_cnt 缺失增加兜底,避免实体权重计算时的 KeyError。
增加同义实体数量限制与 PPR 节点/关系阈值,必要时自动退回纯向量检索。
文档补充:
docs-src/lpmm_user_guide.md:面向零基础用户的导入 / 删除 / 自检脚本使用指南
docs-src/lpmm_parameters_guide.md:[lpmm_knowledge] 关键参数说明与简单调参建议
This commit is contained in:
陈曦
2025-11-27 13:20:12 +08:00
parent fa4555197d
commit 1383caf249
9 changed files with 1376 additions and 5 deletions

View File

@@ -1,7 +1,8 @@
import json
import os
import time
from typing import Dict, List, Tuple
from typing import Dict, List, Tuple, Set
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
@@ -98,6 +99,28 @@ class KGManager:
# 加载KG
self.graph = di_graph.load_from_file(self.graph_data_path)
def _rebuild_metadata_from_graph(self) -> None:
    """Rebuild ``stored_paragraph_hashes`` and ``ent_appear_cnt`` from the current graph.

    Both attributes are replaced wholesale with values derived from
    ``self.graph``:

    * ``stored_paragraph_hashes`` becomes the set of hashes taken from every
      node id of the form ``paragraph-{hash}``.
    * ``ent_appear_cnt`` maps each entity node id to the summed weight of its
      entity -> paragraph edges; an edge without a ``"weight"`` entry counts
      as 1.0.
    """
    paragraph_prefix = "paragraph-"
    node_ids = self.graph.get_node_list()
    edge_list = self.graph.get_edge_list()

    # Paragraph node ids look like ``paragraph-{hash}``; keep only the hash part.
    self.stored_paragraph_hashes = {
        node_id[len(paragraph_prefix):]
        for node_id in node_ids
        if node_id.startswith(paragraph_prefix)
    }

    # Entity appearance count: accumulate the weights of entity -> paragraph edges.
    appear_counts: Dict[str, float] = {}
    for edge in edge_list:
        source_id, target_id = edge[0], edge[1]
        if not (source_id.startswith("entity") and target_id.startswith("paragraph")):
            continue
        attrs = self.graph[source_id, target_id]
        if "weight" in attrs:
            edge_weight = attrs["weight"]
        else:
            edge_weight = 1.0
        appear_counts[source_id] = appear_counts.get(source_id, 0.0) + float(edge_weight)

    self.ent_appear_cnt = appear_counts
def _build_edges_between_ent(
self,
node_to_node: Dict[Tuple[str, str], float],
@@ -149,6 +172,13 @@ class KGManager:
ent_hash_list.add("entity" + "-" + get_sha256(triple[0]))
ent_hash_list.add("entity" + "-" + get_sha256(triple[2]))
ent_hash_list = list(ent_hash_list)
# 性能保护:限制同义连接的实体数量
max_synonym_entities = global_config.lpmm_knowledge.max_synonym_entities
if max_synonym_entities and len(ent_hash_list) > max_synonym_entities:
logger.warning(
f"同义连接实体数 {len(ent_hash_list)} 超过阈值 {max_synonym_entities},跳过同义边构建以保护性能"
)
return 0
synonym_hash_set = set()
synonym_result = {}
@@ -329,6 +359,14 @@ class KGManager:
embed_manager: EmbeddingManager对象
"""
# 图中存在的节点总集
# 性能保护:超限或关闭时直接返回向量检索结果
if (
not global_config.lpmm_knowledge.enable_ppr
or len(self.graph.get_node_list()) > global_config.lpmm_knowledge.ppr_node_cap
or len(relation_search_result) > global_config.lpmm_knowledge.ppr_relation_cap
):
logger.info("PPR 已禁用或超出阈值,使用纯向量检索结果")
return paragraph_search_result, None
existed_nodes = self.graph.get_node_list()
# 准备PPR使用的数据
@@ -357,7 +395,15 @@ class KGManager:
ent_mean_scores = {} # 记录实体的平均相似度
for ent_hash, scores in ent_sim_scores.items():
# 先对相似度进行累加,然后与实体计数相除获取最终权重
ent_weights[ent_hash] = float(np.sum(scores)) / self.ent_appear_cnt[ent_hash]
# 保护:有些实体在当前图中可能只有实体-实体关系,不会出现在 ent_appear_cnt 中
appear_cnt = self.ent_appear_cnt.get(ent_hash)
if not appear_cnt or appear_cnt <= 0:
logger.debug(
f"实体 {ent_hash} 在 ent_appear_cnt 中不存在或计数为 0"
f"将使用 1.0 作为默认出现次数参与权重计算"
)
appear_cnt = 1.0
ent_weights[ent_hash] = float(np.sum(scores)) / float(appear_cnt)
# 记录实体的平均相似度用于后续的top_k筛选
ent_mean_scores[ent_hash] = float(np.mean(scores))
del ent_sim_scores
@@ -434,3 +480,115 @@ class KGManager:
passage_node_res = sorted(passage_node_res, key=lambda item: item[1], reverse=True)
return passage_node_res, ppr_node_weights
def delete_paragraphs(
    self,
    pg_hashes: List[str],
    ent_hashes: List[str] | None = None,
    remove_orphan_entities: bool = False,
) -> Dict[str, int]:
    """Delete paragraph/entity nodes and their incident edges from the GraphML file.

    The GraphML file at ``self.graph_data_path`` is edited directly, written
    back, reloaded into ``self.graph``, and the derived metadata is rebuilt
    via ``_rebuild_metadata_from_graph``.

    Args:
        pg_hashes: Paragraph hashes; each maps to node id ``paragraph-{hash}``.
        ent_hashes: Optional entity hashes; each maps to node id ``entity-{hash}``.
        remove_orphan_entities: When true, entity nodes left without any edge
            after the deletion are removed as well.

    Returns:
        A dict with ``"deleted"`` (requested nodes that existed),
        ``"skipped"`` (requested nodes not present in the file) and
        ``"orphan_removed"`` (number of orphan entity ids removed).

    Raises:
        FileNotFoundError: The graph file does not exist.
        RuntimeError: The file contains no ``<graph>`` element.
    """
    # Node ids scheduled for deletion.
    nodes_to_delete: Set[str] = {f"paragraph-{h}" for h in pg_hashes}
    if ent_hashes:
        nodes_to_delete.update(f"entity-{h}" for h in ent_hashes)

    if not os.path.exists(self.graph_data_path):
        raise FileNotFoundError(f"KG图文件{self.graph_data_path}不存在")

    tree = ET.parse(self.graph_data_path)
    root = tree.getroot()

    # ElementTree expands a default namespace into "{uri}tag" on parse and,
    # unless the URI is registered, serialises it back with an invented
    # "ns0:" prefix. Register the document's own namespace as the default so
    # the rewritten file keeps plain <graphml>/<node>/<edge> tags.
    # NOTE: register_namespace updates a process-wide registry.
    if root.tag.startswith("{"):
        namespace_uri = root.tag[1:].split("}", 1)[0]
        if namespace_uri:
            ET.register_namespace("", namespace_uri)

    # Tags may carry a namespace prefix, so match on the tag suffix.
    def is_node(elem: ET.Element) -> bool:
        return elem.tag.endswith("node")

    def is_edge(elem: ET.Element) -> bool:
        return elem.tag.endswith("edge")

    graph_elem = next((child for child in root if child.tag.endswith("graph")), None)
    if graph_elem is None:
        raise RuntimeError("GraphML 中未找到 <graph> 节点")

    # Count how many of the requested nodes actually exist in the file.
    existing_nodes: Set[str] = {
        elem.get("id") for elem in graph_elem if is_node(elem) and elem.get("id")
    }
    deleted_nodes = len(nodes_to_delete & existing_nodes)
    skipped_nodes = len(nodes_to_delete - existing_nodes)

    # Single pass: drop the requested nodes and every edge touching them.
    # Iterate over a copy because elements are removed during the loop.
    for elem in list(graph_elem):
        if is_node(elem):
            if elem.get("id") in nodes_to_delete:
                graph_elem.remove(elem)
        elif is_edge(elem):
            if elem.get("source") in nodes_to_delete or elem.get("target") in nodes_to_delete:
                graph_elem.remove(elem)

    orphan_removed = 0
    if remove_orphan_entities:
        # Every node id that is still an endpoint of some edge.
        used_nodes: Set[str] = set()
        for elem in graph_elem:
            if is_edge(elem):
                for endpoint in (elem.get("source"), elem.get("target")):
                    if endpoint:
                        used_nodes.add(endpoint)

        # Entity nodes that no remaining edge references.
        orphan_elems = [
            elem
            for elem in graph_elem
            if is_node(elem)
            and (elem.get("id") or "").startswith("entity")
            and elem.get("id") not in used_nodes
        ]
        orphan_removed = len({elem.get("id") for elem in orphan_elems})
        # Orphans are by construction not an endpoint of any edge, so only
        # the node elements themselves need removing.
        for elem in orphan_elems:
            graph_elem.remove(elem)

    # Write to a temporary file first and swap it in, so a failed write
    # cannot leave a truncated graph file behind.
    tmp_path = f"{self.graph_data_path}.tmp"
    try:
        tree.write(tmp_path, encoding="utf-8", xml_declaration=True)
        os.replace(tmp_path, self.graph_data_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # Reload the graph and rebuild the derived metadata.
    self.graph = di_graph.load_from_file(self.graph_data_path)
    self._rebuild_metadata_from_graph()

    return {
        "deleted": deleted_nodes,
        "skipped": skipped_nodes,
        "orphan_removed": orphan_removed,
    }