LPMM 知识库删除能力与自检脚本增强(附关键健壮性修复)
为 LPMM 新增安全可控的删除能力:
- KGManager.delete_paragraphs:支持按段落/实体哈希删除图节点及关联边,可选清理孤立实体,并从图中重建元数据。
- 统一删除脚本 scripts/delete_lpmm_items.py:支持按批次(OpenIE 文件)、哈希文件、原始文本段落、关键字搜索进行删除,内置 dry-run 和最大节点数保护。

新增自检与回归脚本:
- scripts/inspect_lpmm_batch.py / scripts/inspect_lpmm_global.py:用于批次级和全局状态检查。
- scripts/test_lpmm_retrieval.py:一键初始化 LPMM 并用固定问题测试检索效果。

健壮性与性能保护:
- 在 KGManager.kg_search 中对 ent_appear_cnt 缺失增加兜底,避免实体权重计算时的 KeyError。
- 增加同义实体数量限制与 PPR 节点/关系阈值,必要时自动退回纯向量检索。

文档补充:
- docs-src/lpmm_user_guide.md:面向零基础用户的导入/删除/自检脚本使用指南。
- docs-src/lpmm_parameters_guide.md:[lpmm_knowledge] 关键参数说明与简单调参建议。
This commit is contained in:
132
scripts/inspect_lpmm_batch.py
Normal file
132
scripts/inspect_lpmm_batch.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
# 确保能导入 src.*
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
||||
|
||||
from src.chat.knowledge.utils.hash import get_sha256
|
||||
from src.chat.knowledge.embedding_store import EmbeddingManager
|
||||
from src.chat.knowledge.kg_manager import KGManager
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("inspect_lpmm_batch")
|
||||
|
||||
|
||||
def load_openie_hashes(path: Path) -> Tuple[List[str], List[str], List[str]]:
    """Extract paragraph / entity / relation hashes from an OpenIE JSON file.

    Entities include both the items under ``extracted_entities`` and the
    subject/object of every triple, matching how the knowledge graph is built.

    Args:
        path: Path to an OpenIE output JSON file with a top-level ``docs`` list.

    Returns:
        Three order-preserving, de-duplicated lists:
        ``(paragraph_hashes, entity_hashes, relation_hashes)``.
    """
    with path.open("r", encoding="utf-8") as fp:
        payload = json.load(fp)

    paragraph_hashes: List[str] = []
    entity_hashes: List[str] = []
    relation_hashes: List[str] = []

    for document in payload.get("docs", []):
        if not isinstance(document, dict):
            continue

        # The document's "idx" field already carries the paragraph hash.
        paragraph_id = document.get("idx")
        if isinstance(paragraph_id, str) and paragraph_id.strip():
            paragraph_hashes.append(paragraph_id.strip())

        raw_entities = document.get("extracted_entities", [])
        if isinstance(raw_entities, list):
            entity_hashes.extend(
                get_sha256(name) for name in raw_entities if isinstance(name, str)
            )

        raw_triples = document.get("extracted_triples", [])
        if isinstance(raw_triples, list):
            for triple in raw_triples:
                if not (isinstance(triple, list) and len(triple) == 3):
                    continue
                # Subject and object also participate in the graph as entities.
                subject, _, obj = triple
                if isinstance(subject, str):
                    entity_hashes.append(get_sha256(subject))
                if isinstance(obj, str):
                    entity_hashes.append(get_sha256(obj))
                relation_hashes.append(get_sha256(str(tuple(triple))))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return (
        list(dict.fromkeys(paragraph_hashes)),
        list(dict.fromkeys(entity_hashes)),
        list(dict.fromkeys(relation_hashes)),
    )
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: report how much of an OpenIE batch still exists.

    Loads the current embedding stores and knowledge graph from disk, then
    counts how many paragraph / entity / relation keys derived from the given
    OpenIE file remain in each, printing a summary suitable for comparing the
    state before and after a deletion, plus a few surviving sample nodes.

    Exits with status 1 when the OpenIE file is missing or the stores fail to
    load.
    """
    arg_parser = argparse.ArgumentParser(
        description="检查指定 OpenIE 文件对应批次在当前向量库与 KG 中的存在情况(用于验证删除效果)。"
    )
    arg_parser.add_argument("--openie-file", required=True, help="OpenIE 输出 JSON 文件路径")
    cli_args = arg_parser.parse_args()

    openie_path = Path(cli_args.openie_file)
    if not openie_path.exists():
        logger.error(f"OpenIE 文件不存在: {openie_path}")
        sys.exit(1)

    pg_hashes, ent_hashes, rel_hashes = load_openie_hashes(openie_path)
    logger.info(
        f"从 {openie_path.name} 解析到 段落 {len(pg_hashes)} 条,实体 {len(ent_hashes)} 个,关系 {len(rel_hashes)} 条"
    )

    # Load the current embedding stores and knowledge graph from disk.
    embedding_mgr = EmbeddingManager()
    kg_mgr = KGManager()
    try:
        embedding_mgr.load_from_file()
        kg_mgr.load_from_file()
    except Exception as load_err:
        logger.error(f"加载当前知识库失败: {load_err}")
        sys.exit(1)

    graph_nodes = set(kg_mgr.graph.get_node_list())

    # Build the store keys for each item kind once, then count survivors.
    pg_keys = [f"paragraph-{h}" for h in pg_hashes]
    ent_keys = [f"entity-{h}" for h in ent_hashes]
    rel_keys = [f"relation-{h}" for h in rel_hashes]

    # bool sums: True counts as 1, so this equals the original `sum(1 for ... if ...)`.
    pg_in_vec = sum(k in embedding_mgr.paragraphs_embedding_store.store for k in pg_keys)
    pg_in_kg = sum(k in graph_nodes for k in pg_keys)
    ent_in_vec = sum(k in embedding_mgr.entities_embedding_store.store for k in ent_keys)
    ent_in_kg = sum(k in graph_nodes for k in ent_keys)
    # Relations only live in the vector store, not as graph nodes.
    rel_in_vec = sum(k in embedding_mgr.relation_embedding_store.store for k in rel_keys)

    print("==== 批次存在情况(删除前/后对比用) ====")
    print(f"段落: 总计 {len(pg_keys)}, 向量库剩余 {pg_in_vec}, KG 中剩余 {pg_in_kg}")
    print(f"实体: 总计 {len(ent_keys)}, 向量库剩余 {ent_in_vec}, KG 中剩余 {ent_in_kg}")
    print(f"关系: 总计 {len(rel_keys)}, 向量库剩余 {rel_in_vec}")

    # Show up to three surviving nodes per kind so the operator can eyeball
    # whether their content looks sane.
    for keys, header in (
        (pg_keys, "\n仍在 KG 中的段落节点示例:"),
        (ent_keys, "\n仍在 KG 中的实体节点示例:"),
    ):
        samples = [k for k in keys if k in graph_nodes][:3]
        if not samples:
            continue
        print(header)
        for key in samples:
            node = kg_mgr.graph[key]
            text = node["content"] if "content" in node else key
            print(f"- {key}: {text[:80]}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user