LPMM 知识库删除能力与自检脚本增强(附关键健壮性修复)
为 LPMM 新增安全可控的删除能力:
- KGManager.delete_paragraphs:支持按段落/实体哈希删除图节点及关联边,可选清理孤立实体,并从图中重建元数据。
- 统一删除脚本 scripts/delete_lpmm_items.py:支持按批次(OpenIE 文件)、哈希文件、原始文本段落、关键字搜索进行删除,内置 dry-run 和最大节点数保护。

新增自检与回归脚本:
- scripts/inspect_lpmm_batch.py / scripts/inspect_lpmm_global.py:用于批次级和全局状态检查。
- scripts/test_lpmm_retrieval.py:一键初始化 LPMM 并用固定问题测试检索效果。

健壮性与性能保护:
- 在 KGManager.kg_search 中对 ent_appear_cnt 缺失增加兜底,避免实体权重计算时的 KeyError。
- 增加同义实体数量限制与 PPR 节点/关系阈值,必要时自动退回纯向量检索。

文档补充:
- docs-src/lpmm_user_guide.md:面向零基础用户的导入/删除/自检脚本使用指南。
- docs-src/lpmm_parameters_guide.md:[lpmm_knowledge] 关键参数说明与简单调参建议。
This commit is contained in:
132
scripts/inspect_lpmm_batch.py
Normal file
132
scripts/inspect_lpmm_batch.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
# 确保能导入 src.*
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
||||
|
||||
from src.chat.knowledge.utils.hash import get_sha256
|
||||
from src.chat.knowledge.embedding_store import EmbeddingManager
|
||||
from src.chat.knowledge.kg_manager import KGManager
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("inspect_lpmm_batch")
|
||||
|
||||
|
||||
def load_openie_hashes(path: Path) -> Tuple[List[str], List[str], List[str]]:
    """Extract paragraph / entity / relation hashes from an OpenIE JSON file.

    Entities include both the items under ``extracted_entities`` and the
    subject/object of every triple, matching how the knowledge graph is built.

    Args:
        path: Path to an OpenIE output JSON file with a top-level ``docs`` list.

    Returns:
        Three order-preserving, de-duplicated lists:
        ``(paragraph_hashes, entity_hashes, relation_hashes)``.
    """
    with path.open("r", encoding="utf-8") as fp:
        payload = json.load(fp)

    paragraph_hashes: List[str] = []
    entity_hashes: List[str] = []
    relation_hashes: List[str] = []

    for document in payload.get("docs", []):
        if not isinstance(document, dict):
            continue

        # The document's "idx" field already carries the paragraph hash.
        paragraph_id = document.get("idx")
        if isinstance(paragraph_id, str) and paragraph_id.strip():
            paragraph_hashes.append(paragraph_id.strip())

        raw_entities = document.get("extracted_entities", [])
        if isinstance(raw_entities, list):
            entity_hashes.extend(
                get_sha256(name) for name in raw_entities if isinstance(name, str)
            )

        raw_triples = document.get("extracted_triples", [])
        if isinstance(raw_triples, list):
            for triple in raw_triples:
                if not (isinstance(triple, list) and len(triple) == 3):
                    continue
                # Subject and object also participate in the graph as entities.
                subject, _, obj = triple
                if isinstance(subject, str):
                    entity_hashes.append(get_sha256(subject))
                if isinstance(obj, str):
                    entity_hashes.append(get_sha256(obj))
                relation_hashes.append(get_sha256(str(tuple(triple))))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return (
        list(dict.fromkeys(paragraph_hashes)),
        list(dict.fromkeys(entity_hashes)),
        list(dict.fromkeys(relation_hashes)),
    )
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: report how much of an OpenIE batch still exists.

    Loads the current embedding stores and knowledge graph from disk, then
    counts how many paragraph / entity / relation keys derived from the given
    OpenIE file remain in each, printing a summary suitable for comparing the
    state before and after a deletion, plus a few surviving sample nodes.

    Exits with status 1 when the OpenIE file is missing or the stores fail to
    load.
    """
    arg_parser = argparse.ArgumentParser(
        description="检查指定 OpenIE 文件对应批次在当前向量库与 KG 中的存在情况(用于验证删除效果)。"
    )
    arg_parser.add_argument("--openie-file", required=True, help="OpenIE 输出 JSON 文件路径")
    cli_args = arg_parser.parse_args()

    openie_path = Path(cli_args.openie_file)
    if not openie_path.exists():
        logger.error(f"OpenIE 文件不存在: {openie_path}")
        sys.exit(1)

    pg_hashes, ent_hashes, rel_hashes = load_openie_hashes(openie_path)
    logger.info(
        f"从 {openie_path.name} 解析到 段落 {len(pg_hashes)} 条,实体 {len(ent_hashes)} 个,关系 {len(rel_hashes)} 条"
    )

    # Load the current embedding stores and knowledge graph from disk.
    embedding_mgr = EmbeddingManager()
    kg_mgr = KGManager()
    try:
        embedding_mgr.load_from_file()
        kg_mgr.load_from_file()
    except Exception as load_err:
        logger.error(f"加载当前知识库失败: {load_err}")
        sys.exit(1)

    graph_nodes = set(kg_mgr.graph.get_node_list())

    # Build the store keys for each item kind once, then count survivors.
    pg_keys = [f"paragraph-{h}" for h in pg_hashes]
    ent_keys = [f"entity-{h}" for h in ent_hashes]
    rel_keys = [f"relation-{h}" for h in rel_hashes]

    # bool sums: True counts as 1, so this equals the original `sum(1 for ... if ...)`.
    pg_in_vec = sum(k in embedding_mgr.paragraphs_embedding_store.store for k in pg_keys)
    pg_in_kg = sum(k in graph_nodes for k in pg_keys)
    ent_in_vec = sum(k in embedding_mgr.entities_embedding_store.store for k in ent_keys)
    ent_in_kg = sum(k in graph_nodes for k in ent_keys)
    # Relations only live in the vector store, not as graph nodes.
    rel_in_vec = sum(k in embedding_mgr.relation_embedding_store.store for k in rel_keys)

    print("==== 批次存在情况(删除前/后对比用) ====")
    print(f"段落: 总计 {len(pg_keys)}, 向量库剩余 {pg_in_vec}, KG 中剩余 {pg_in_kg}")
    print(f"实体: 总计 {len(ent_keys)}, 向量库剩余 {ent_in_vec}, KG 中剩余 {ent_in_kg}")
    print(f"关系: 总计 {len(rel_keys)}, 向量库剩余 {rel_in_vec}")

    # Show up to three surviving nodes per kind so the operator can eyeball
    # whether their content looks sane.
    for keys, header in (
        (pg_keys, "\n仍在 KG 中的段落节点示例:"),
        (ent_keys, "\n仍在 KG 中的实体节点示例:"),
    ):
        samples = [k for k in keys if k in graph_nodes][:3]
        if not samples:
            continue
        print(header)
        for key in samples:
            node = kg_mgr.graph[key]
            text = node["content"] if "content" in node else key
            print(f"- {key}: {text[:80]}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user