Files
mai-bot/src/A_memorix/scripts/import_lpmm_json.py
A-Dawn 15d436b3a1 refactor: 将 A_Memorix 重构为主线长期记忆子系统并重建管理界面
- 将 A_Memorix 从旧 submodule / 插件形态迁入主线源码,主体落到 src/A_memorix
- 调整主程序接入方式,使 A_Memorix 作为源码内长期记忆子系统运行
- 回收父项目插件体系中针对 A_Memorix 的特判,减少对 plugin 通用层的侵入
- 将长期记忆配置、运行时、自检、导入、调优等能力收口到 memory 路由与主线服务层
- 重做长期记忆控制台与图谱页面,按 MaiBot 现有 dashboard 风格接入
- 补充实体关系图与证据视图双视图能力,支持查看节点、关系、段落及其证据链路
- 新增长期记忆配置编辑器与 memory-api,支持主线内配置管理
- 补齐删除管理能力:删除预览、混合删除、来源批量删除、删除操作恢复
- 优化删除预览与删除操作详情的前端展示,支持分页、检索,并以实体名/关系内容/段落摘要替代单纯 hash 展示
- 修复图谱与控制台相关前端问题,包括证据视图切换、查询触发时机、删除弹层空值保护等
- 新增或更新 A_Memorix 相关测试、WebUI 路由测试、前端 vitest 测试与辅助验证脚本
- 移除旧 plugins/A_memorix、.gitmodules 及相关历史维护文档
2026-04-03 08:08:24 +08:00

166 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
LPMM OpenIE JSON 导入工具。
功能:
1. 读取符合 LPMM 规范的 OpenIE JSON 文件
2. 转换为 A_Memorix 的统一导入格式
3. 复用 `process_knowledge.py` 中的 `AutoImporter` 直接入库
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List
from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
# Shared rich console; the progress display created in main() renders through it.
console = Console()
# Imported purely for its import-time side effects (hence the unused-import
# suppression). NOTE(review): presumably this prepares sys.path so that the
# project imports further down resolve - confirm against _bootstrap.
import _bootstrap # noqa: F401
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="将 LPMM OpenIE JSON 导入 A_Memorix")
parser.add_argument("path", help="LPMM JSON 文件路径或目录")
parser.add_argument("--force", action="store_true", help="强制重新导入")
parser.add_argument("--concurrency", "-c", type=int, default=5, help="并发数")
return parser
# Answer -h/--help before the project imports below are attempted
# (presumably so usage can be shown even when those imports are unavailable).
if not {"-h", "--help"}.isdisjoint(sys.argv[1:]):
    _build_arg_parser().print_help()
    sys.exit(0)

# Project-level dependencies; a failed import is reported on stdout and the
# script exits with status 1 instead of surfacing a raw traceback.
try:
    from process_knowledge import AutoImporter
    from A_memorix.core.utils.hash import compute_paragraph_hash
    from src.common.logger import get_logger
except ImportError as import_error:  # pragma: no cover - script bootstrap
    print(f"导入模块失败,请确认 PYTHONPATH 与工作区结构: {import_error}")
    sys.exit(1)

# Module-wide logger used by the converter and by main().
logger = get_logger("A_Memorix.LPMMImport")
class LPMMConverter:
    """Convert LPMM OpenIE JSON payloads into the A_Memorix import format.

    The converter keeps no state between calls, so one instance can be reused
    for any number of files.
    """

    def convert_lpmm_to_memorix(self, lpmm_data: Dict[str, Any], filename: str) -> Dict[str, Any]:
        """Translate one parsed LPMM document into an A_Memorix payload.

        Args:
            lpmm_data: Parsed JSON content. Its ``docs`` list is read; each
                entry may provide ``passage``, ``extracted_triples`` and
                ``extracted_entities``.
            filename: Recorded as the ``source`` of every paragraph and
                included in the warning logged when no docs are present.

        Returns:
            A dict with two keys:
            ``paragraphs`` - one entry per non-empty passage, carrying
            ``hash`` (from ``compute_paragraph_hash``), ``content``,
            ``source``, ``entities`` and ``relations``;
            ``entities`` - sorted, de-duplicated names collected from the
            per-document entity lists and from relation subjects/objects.

        Malformed input is skipped rather than raised: a payload without a
        usable ``docs`` list yields an empty result (with a warning), and
        ``docs`` entries that are not JSON objects or have an empty passage
        are ignored so one bad record cannot discard the rest of the file.
        """
        memorix_data: Dict[str, Any] = {"paragraphs": [], "entities": []}

        raw_docs = lpmm_data.get("docs") if isinstance(lpmm_data, dict) else None
        docs = raw_docs if isinstance(raw_docs, (list, tuple)) else []
        if not docs:
            logger.warning(f"文件中未找到 docs 字段: {filename}")
            return memorix_data

        all_entities = set()
        for doc in docs:
            if not isinstance(doc, dict):
                # A stray non-object entry must not abort the whole file.
                continue
            content = str(doc.get("passage", "") or "").strip()
            if not content:
                continue

            relations = self._parse_relations(doc.get("extracted_triples"))
            entities = self._parse_entities(doc.get("extracted_entities"))

            all_entities.update(entities)
            for relation in relations:
                # Relation endpoints count as entities too; blanks are ignored.
                all_entities.update(
                    name for name in (relation["subject"], relation["object"]) if name
                )

            memorix_data["paragraphs"].append(
                {
                    "hash": compute_paragraph_hash(content),
                    "content": content,
                    "source": filename,
                    "entities": entities,
                    "relations": relations,
                }
            )

        memorix_data["entities"] = sorted(all_entities)
        return memorix_data

    @staticmethod
    def _parse_relations(raw_triples: Any) -> List[Dict[str, str]]:
        """Build relation dicts from every well-formed 3-item triple; skip the rest."""
        if not isinstance(raw_triples, (list, tuple)):
            return []
        return [
            {
                "subject": str(triple[0] or "").strip(),
                "predicate": str(triple[1] or "").strip(),
                "object": str(triple[2] or "").strip(),
            }
            for triple in raw_triples
            if isinstance(triple, (list, tuple)) and len(triple) == 3
        ]

    @staticmethod
    def _parse_entities(raw_entities: Any) -> List[str]:
        """Return the stripped, non-empty entity names in their original order."""
        if not isinstance(raw_entities, (list, tuple)):
            # A bare string here would otherwise be split into characters.
            return []
        stripped = (str(item or "").strip() for item in raw_entities)
        return [name for name in stripped if name]
async def main() -> None:
    """Command-line entry point: convert LPMM OpenIE JSON files and import them.

    Steps:
        1. Parse the command line and resolve the target path. For a
           directory, ``*-openie.json`` files are preferred and any ``*.json``
           file is the fallback; files are processed in sorted order.
        2. Create and initialize an ``AutoImporter`` with the requested
           ``force`` / ``concurrency`` settings.
        3. For each file: load the JSON, convert it with ``LPMMConverter`` and
           pass the result to ``importer.import_json_data`` while a rich
           progress bar tracks the paragraphs.

    A failure in one file is logged with its traceback and the remaining
    files are still processed. Problems with the path, the file list or the
    importer initialization are logged and the function returns early.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    target_path = Path(args.path)
    if not target_path.exists():
        logger.error(f"路径不存在: {target_path}")
        return

    if target_path.is_dir():
        # glob() order is filesystem-dependent; sort for reproducible runs.
        files_to_process = sorted(target_path.glob("*-openie.json")) or sorted(target_path.glob("*.json"))
    else:
        files_to_process = [target_path]
    if not files_to_process:
        logger.error("未找到可处理的 JSON 文件")
        return

    importer = AutoImporter(force=bool(args.force), concurrency=int(args.concurrency))
    if not await importer.initialize():
        logger.error("初始化存储失败")
        return

    converter = LPMMConverter()
    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeElapsedColumn(),
            console=console,
            transient=False,
        ) as progress:
            for json_file in files_to_process:
                logger.info(f"正在转换并导入: {json_file.name}")
                try:
                    with open(json_file, "r", encoding="utf-8") as handle:
                        lpmm_data = json.load(handle)

                    memorix_data = converter.convert_lpmm_to_memorix(lpmm_data, json_file.name)
                    total_items = len(memorix_data.get("paragraphs", []))
                    if total_items <= 0:
                        logger.warning(f"转换结果为空: {json_file.name}")
                        continue

                    task_id = progress.add_task(f"Importing {json_file.name}", total=total_items)

                    # Bind this file's task id at definition time so the
                    # callback keeps advancing the right bar even if it is
                    # invoked after the loop has moved on to another file.
                    def update_progress(step: int = 1, _task_id: Any = task_id) -> None:
                        progress.advance(_task_id, advance=step)

                    await importer.import_json_data(
                        memorix_data,
                        filename=f"lpmm_{json_file.name}",
                        progress_callback=update_progress,
                    )
                except Exception as exc:
                    # Per-file boundary: report and continue with the next file.
                    logger.error(f"处理文件 {json_file.name} 失败: {exc}\n{traceback.format_exc()}")
    finally:
        # Also runs on cancellation / Ctrl-C, which bypass `except Exception`,
        # so the importer is closed on every exit path.
        await importer.close()

    logger.info("全部处理完成")
if __name__ == "__main__":
    # Script entry: on Windows, switch asyncio to the selector event loop
    # policy before the loop is created, then run main() to completion.
    if sys.platform == "win32":  # pragma: no cover
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)
    asyncio.run(main())