Files
mai-bot/plugins/A_memorix/scripts/import_lpmm_json.py
DawnARC 71b3a828c6 添加 A_Memorix 插件 v2.0.0(包含运行时与文档)
引入 A_Memorix 插件 v2.0.0:新增大量运行时组件、存储/模式更新、检索能力提升、管理工具、导入/调优工作流以及相关文档。关键新增内容包括:lifecycle_orchestrator、SDKMemoryKernel/运行时初始化器、新的存储层与 metadata_store 变更(SCHEMA_VERSION v8)、检索增强(双路径检索、图关系召回、稀疏 BM25),以及多种工具服务(episode/person_profile/relation/segmentation/tuning/search execution)。同时新增 Web 导入/摘要导入器及大量维护脚本。还更新了插件清单、embedding API 适配器、plugin.py、requirements/pyproject,以及主入口文件,使新插件接入项目。该变更为 2.0.0 版本发布做好准备,实现统一的 SDK Tool 接口并扩展整体运行能力。
2026-03-19 00:09:04 +08:00

173 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
LPMM OpenIE JSON 导入工具。
功能:
1. 读取符合 LPMM 规范的 OpenIE JSON 文件
2. 转换为 A_Memorix 的统一导入格式
3. 复用 `process_knowledge.py` 中的 `AutoImporter` 直接入库
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List
from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
# Shared Rich console; the progress display created in main() renders through it.
console = Console()
# Directory layout, derived from this script's own location:
#   CURRENT_DIR    -> directory containing this script
#   PLUGIN_ROOT    -> parent of CURRENT_DIR
#   WORKSPACE_ROOT -> parent of PLUGIN_ROOT
#   MAIBOT_ROOT    -> "MaiBot" directory directly under WORKSPACE_ROOT
CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
WORKSPACE_ROOT = PLUGIN_ROOT.parent
MAIBOT_ROOT = WORKSPACE_ROOT.joinpath("MaiBot")

# Put each directory at the front of sys.path unless it is already listed.
# Because every insert goes to index 0, directories later in the tuple end up
# ahead of earlier ones.
for path in (CURRENT_DIR, WORKSPACE_ROOT, MAIBOT_ROOT, PLUGIN_ROOT):
    path_str = str(path)
    if path_str in sys.path:
        continue
    sys.path.insert(0, path_str)
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="将 LPMM OpenIE JSON 导入 A_Memorix")
parser.add_argument("path", help="LPMM JSON 文件路径或目录")
parser.add_argument("--force", action="store_true", help="强制重新导入")
parser.add_argument("--concurrency", "-c", type=int, default=5, help="并发数")
return parser
# Answer -h/--help immediately, before the project-level imports further down
# are attempted (presumably so usage stays printable even if they would fail).
if not {"-h", "--help"}.isdisjoint(sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)
# Project-level imports; they rely on the sys.path entries set up earlier in
# this script.
try:
    from process_knowledge import AutoImporter
    from A_memorix.core.utils.hash import compute_paragraph_hash
    from src.common.logger import get_logger
except ImportError as exc:  # pragma: no cover - script bootstrap
    # The project logger is unavailable at this point, so report directly.
    # Diagnostics go to stderr so they never mix with regular stdout output.
    print(f"导入模块失败,请确认 PYTHONPATH 与工作区结构: {exc}", file=sys.stderr)
    raise SystemExit(1) from exc

# Module-wide logger used by the converter and by main().
logger = get_logger("A_Memorix.LPMMImport")
class LPMMConverter:
    """Convert LPMM OpenIE documents into the A_Memorix import payload."""

    def convert_lpmm_to_memorix(self, lpmm_data: Dict[str, Any], filename: str) -> Dict[str, Any]:
        """Build the A_Memorix import dictionary from one LPMM OpenIE document set.

        Args:
            lpmm_data: Parsed LPMM JSON. Its ``docs`` value is read as a list of
                objects with ``passage``, ``extracted_triples`` and
                ``extracted_entities`` keys.
            filename: Name recorded as the ``source`` of every paragraph and
                used in the warning emitted when ``docs`` is missing or empty.

        Returns:
            A dict with two keys:

            * ``paragraphs``: one entry per doc with a non-blank passage,
              holding ``hash`` (from ``compute_paragraph_hash``), ``content``,
              ``source``, ``entities`` and ``relations``.
            * ``entities``: sorted, de-duplicated names gathered from the
              extracted entities and from relation subjects/objects.

        Entries of ``docs`` that are not dicts, docs with a blank passage, and
        triples that are not 3-item sequences are skipped rather than raising,
        so one malformed record does not discard the rest of the file.
        """
        memorix_data: Dict[str, Any] = {"paragraphs": [], "entities": []}
        docs = lpmm_data.get("docs", []) or []
        if not docs:
            logger.warning(f"文件中未找到 docs 字段: {filename}")
            return memorix_data

        all_entities = set()
        for doc in docs:
            # A malformed (non-object) entry must not abort the whole file.
            if not isinstance(doc, dict):
                continue

            content = str(doc.get("passage", "") or "").strip()
            if not content:
                continue

            # Triples with a blank component are still forwarded as relations;
            # only their non-blank subjects/objects feed the entity set below.
            relations: List[Dict[str, str]] = [
                {
                    "subject": str(triple[0] or "").strip(),
                    "predicate": str(triple[1] or "").strip(),
                    "object": str(triple[2] or "").strip(),
                }
                for triple in doc.get("extracted_triples", []) or []
                if isinstance(triple, (list, tuple)) and len(triple) == 3
            ]

            # Normalise each entity name once and drop the blank ones.
            normalised = (str(item or "").strip() for item in doc.get("extracted_entities", []) or [])
            entities = [name for name in normalised if name]

            all_entities.update(entities)
            for relation in relations:
                if relation["subject"]:
                    all_entities.add(relation["subject"])
                if relation["object"]:
                    all_entities.add(relation["object"])

            memorix_data["paragraphs"].append(
                {
                    "hash": compute_paragraph_hash(content),
                    "content": content,
                    "source": filename,
                    "entities": entities,
                    "relations": relations,
                }
            )

        memorix_data["entities"] = sorted(all_entities)
        return memorix_data
async def main() -> None:
    """Convert LPMM OpenIE JSON input and import it through ``AutoImporter``.

    The positional CLI argument may name a single JSON file or a directory.
    For a directory, files matching ``*-openie.json`` are used; if none match,
    every ``*.json`` file in it is used instead. Each file is converted with
    ``LPMMConverter`` and handed to ``AutoImporter.import_json_data``. A
    failure in one file is logged and does not stop the remaining files.

    Returns:
        None. Problems (missing path, no input files, storage initialisation
        failure) are reported through the logger rather than raised.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    target_path = Path(args.path)
    if not target_path.exists():
        logger.error(f"路径不存在: {target_path}")
        return

    if target_path.is_dir():
        # Sorted so the processing order is reproducible; glob order is not.
        files_to_process = sorted(target_path.glob("*-openie.json")) or sorted(target_path.glob("*.json"))
    else:
        files_to_process = [target_path]
    if not files_to_process:
        logger.error("未找到可处理的 JSON 文件")
        return

    importer = AutoImporter(force=bool(args.force), concurrency=int(args.concurrency))
    if not await importer.initialize():
        logger.error("初始化存储失败")
        return

    converter = LPMMConverter()
    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeElapsedColumn(),
            console=console,
            transient=False,
        ) as progress:
            for json_file in files_to_process:
                logger.info(f"正在转换并导入: {json_file.name}")
                try:
                    with open(json_file, "r", encoding="utf-8") as handle:
                        lpmm_data = json.load(handle)

                    memorix_data = converter.convert_lpmm_to_memorix(lpmm_data, json_file.name)
                    total_items = len(memorix_data.get("paragraphs", []))
                    if total_items <= 0:
                        logger.warning(f"转换结果为空: {json_file.name}")
                        continue

                    task_id = progress.add_task(f"Importing {json_file.name}", total=total_items)

                    def update_progress(step: int = 1) -> None:
                        # Used only during the awaited import call below, so
                        # task_id still refers to this file's progress task.
                        progress.advance(task_id, advance=step)

                    await importer.import_json_data(
                        memorix_data,
                        filename=f"lpmm_{json_file.name}",
                        progress_callback=update_progress,
                    )
                except Exception as exc:
                    # Per-file boundary: log the failure and move on.
                    logger.error(f"处理文件 {json_file.name} 失败: {exc}\n{traceback.format_exc()}")
    finally:
        # Always release the importer, including on cancellation or Ctrl-C,
        # which the per-file ``except Exception`` above does not catch.
        await importer.close()

    logger.info("全部处理完成")
# Script entry point: run the async main() to completion.
if __name__ == "__main__":
    if sys.platform == "win32":  # pragma: no cover
        # On Windows, switch asyncio to the selector event loop policy
        # before the loop is created by asyncio.run().
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)
    asyncio.run(main())