为 LPMM 新增安全可控的删除能力:
- KGManager.delete_paragraphs 支持按段落/实体哈希删除图节点及关联边,可选清理孤立实体,并从图中重建元数据。
- 统一删除脚本 scripts/delete_lpmm_items.py,支持按批次(OpenIE 文件)、哈希文件、原始文本段落、关键字搜索进行删除,内置 dry-run 和最大节点数保护。

新增自检与回归脚本:
- scripts/inspect_lpmm_batch.py / scripts/inspect_lpmm_global.py 用于批次级和全局状态检查。
- scripts/test_lpmm_retrieval.py 一键初始化 LPMM 并用固定问题测试检索效果。

健壮性与性能保护:
- 在 KGManager.kg_search 中对 ent_appear_cnt 缺失增加兜底,避免实体权重计算时的 KeyError。
- 增加同义实体数量限制与 PPR 节点/关系阈值,必要时自动退回纯向量检索。

文档补充:
- docs-src/lpmm_user_guide.md:面向零基础用户的导入 / 删除 / 自检脚本使用指南。
- docs-src/lpmm_parameters_guide.md:[lpmm_knowledge] 关键参数说明与简单调参建议。
80 lines
2.4 KiB
Python
80 lines
2.4 KiB
Python
import os
|
||
from pathlib import Path
|
||
import sys # 新增系统模块导入
|
||
|
||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
||
from src.chat.knowledge.utils.hash import get_sha256
|
||
from src.common.logger import get_logger
|
||
|
||
# Module-level logger, obtained under the name "lpmm".
logger = get_logger("lpmm")
|
||
# Absolute path of the parent of the directory that contains this file.
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||
# Directory that is scanned for raw *.txt input files.
RAW_DATA_PATH = os.path.join(ROOT_PATH, "data/lpmm_raw_data")
|
||
# IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data/imported_lpmm_data")
|
||
|
||
|
||
def _process_text_file(file_path):
    """Split one UTF-8 text file into a list of paragraphs.

    A paragraph is a run of consecutive non-blank lines. Lines that are empty
    or contain only whitespace act as separators and are never part of a
    paragraph. Each paragraph keeps its internal line breaks and has its
    leading and trailing whitespace stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The paragraphs in the order they appear in the file; an empty list
        when the file holds no non-blank line.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        content = handle.read()

    blocks = []
    pending = []  # lines of the paragraph currently being collected
    for row in content.split("\n"):
        if row.strip():
            pending.append(row)
            continue
        # Blank line: close the paragraph that is open, if any.
        if pending:
            blocks.append("\n".join(pending).strip())
            pending = []

    # The file may end without a trailing blank line.
    if pending:
        blocks.append("\n".join(pending).strip())

    return blocks
|
||
|
||
|
||
def _process_multi_files() -> list[str]:
    """Collect the paragraphs of every ``*.txt`` file in ``RAW_DATA_PATH``.

    Files are processed in sorted path order so that the order of the
    returned paragraphs is the same on every run and on every filesystem.

    Returns:
        The paragraphs of all files, concatenated file by file.

    Raises:
        SystemExit: With exit status 1 when the directory contains no
            ``.txt`` file.
    """
    # Path.glob yields entries in an unspecified order; sort for determinism.
    raw_files = sorted(Path(RAW_DATA_PATH).glob("*.txt"))
    if not raw_files:
        logger.warning("警告: data/lpmm_raw_data 中没有找到任何 .txt 文件")
        sys.exit(1)

    # Process all files.
    all_paragraphs = []
    for file in raw_files:
        logger.info(f"正在处理文件: {file.name}")
        all_paragraphs.extend(_process_text_file(file))
    return all_paragraphs
|
||
|
||
|
||
def load_raw_data() -> tuple[list[str], list[str]]:
    """Load the raw paragraphs, hash them and drop duplicates.

    Paragraphs are obtained from ``_process_multi_files``. Each one is hashed
    with ``get_sha256``; a paragraph whose hash was already seen is logged as
    a warning and skipped, so only the first occurrence is kept. Items that
    are not ``str`` are logged as a warning and skipped as well.

    This function takes no arguments.

    Returns:
        A ``(sha256_list, raw_data)`` tuple of two lists of equal length, in
        that order:

        - sha256_list: the hash of each kept paragraph, in input order.
        - raw_data: the kept paragraphs; ``raw_data[i]`` is the text that
          produced ``sha256_list[i]``.
    """
    raw_paragraphs = _process_multi_files()

    seen_hashes = set()  # O(1) duplicate check; the list keeps the order
    sha256_list = []
    raw_data: list[str] = []
    for item in raw_paragraphs:
        if not isinstance(item, str):
            logger.warning(f"数据类型错误:{item}")
            continue

        pg_hash = get_sha256(item)
        if pg_hash in seen_hashes:
            logger.warning(f"重复数据:{item}")
            continue

        seen_hashes.add(pg_hash)
        sha256_list.append(pg_hash)
        raw_data.append(item)

    logger.info(f"共读取到{len(raw_data)}条数据")

    # NOTE: hashes come first, texts second; callers unpack in this order.
    return sha256_list, raw_data
|