引入 A_Memorix 插件 v2.0.0:新增大量运行时组件、存储/模式更新、检索能力提升、管理工具、导入/调优工作流以及相关文档。关键新增内容包括:lifecycle_orchestrator、SDKMemoryKernel/运行时初始化器、新的存储层与 metadata_store 变更(SCHEMA_VERSION v8)、检索增强(双路径检索、图关系召回、稀疏 BM25),以及多种工具服务(episode/person_profile/relation/segmentation/tuning/search execution)。同时新增 Web 导入/摘要导入器及大量维护脚本。还更新了插件清单、embedding API 适配器、plugin.py、requirements/pyproject,以及主入口文件,使新插件接入项目。该变更为 2.0.0 版本发布做好准备,实现统一的 SDK Tool 接口并扩展整体运行能力。
173 lines
5.9 KiB
Python
173 lines
5.9 KiB
Python
#!/usr/bin/env python3
"""
LPMM OpenIE JSON import tool.

What it does:
1. Reads OpenIE JSON files that follow the LPMM specification.
2. Converts them into A_Memorix's unified import format.
3. Reuses `AutoImporter` from `process_knowledge.py` to store the result directly.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import sys
|
|
import traceback
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List
|
|
|
|
from rich.console import Console
|
|
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
|
|
# Shared rich console; main() hands it to the Progress display it creates.
console = Console()
|
|
|
|
# Directory layout, derived from the resolved location of this script.
CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
WORKSPACE_ROOT = PLUGIN_ROOT.parent
MAIBOT_ROOT = WORKSPACE_ROOT / "MaiBot"

# Make each root importable. Every missing entry is pushed to the front of
# sys.path, so roots later in the tuple end up ahead of earlier ones.
for path in (CURRENT_DIR, WORKSPACE_ROOT, MAIBOT_ROOT, PLUGIN_ROOT):
    path_str = str(path)
    if path_str in sys.path:
        continue
    sys.path.insert(0, path_str)
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(description="将 LPMM OpenIE JSON 导入 A_Memorix")
|
|
parser.add_argument("path", help="LPMM JSON 文件路径或目录")
|
|
parser.add_argument("--force", action="store_true", help="强制重新导入")
|
|
parser.add_argument("--concurrency", "-c", type=int, default=5, help="并发数")
|
|
return parser
|
|
|
|
|
|
# Serve -h/--help right away, before the project imports further down are
# attempted, so the usage text is available even if those imports would fail.
if {"-h", "--help"} & set(sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)
|
|
|
|
|
|
# Project-level imports; they rely on the sys.path entries added earlier in
# this file. A failed import ends the script with exit status 1.
try:
    from process_knowledge import AutoImporter
    from A_memorix.core.utils.hash import compute_paragraph_hash
    from src.common.logger import get_logger
except ImportError as import_error:  # pragma: no cover - script bootstrap
    print(f"导入模块失败,请确认 PYTHONPATH 与工作区结构: {import_error}")
    sys.exit(1)
|
|
|
|
|
|
# Module-level logger, obtained from the project's get_logger helper.
logger = get_logger("A_Memorix.LPMMImport")
|
|
|
|
|
|
class LPMMConverter:
    """Convert parsed LPMM OpenIE JSON into the dict structure handed to the importer."""

    @staticmethod
    def _clean_text(value: Any) -> str:
        """Return *value* as a stripped string; falsy values become ``""``."""
        return str(value or "").strip()

    def convert_lpmm_to_memorix(self, lpmm_data: Dict[str, Any], filename: str) -> Dict[str, Any]:
        """Translate one parsed LPMM OpenIE payload.

        Args:
            lpmm_data: Parsed JSON payload. Its ``docs`` list is read; from
                each doc the keys ``passage``, ``extracted_triples`` and
                ``extracted_entities`` are used.
            filename: Recorded as the ``source`` of every paragraph and
                included in log messages.

        Returns:
            A dict with two keys. ``paragraphs`` holds one entry per doc with
            a non-empty passage (``hash``, ``content``, ``source``,
            ``entities``, ``relations``). ``entities`` is the sorted,
            de-duplicated list of names gathered from the entity lists and
            from relation subjects and objects. Both lists are empty when no
            usable ``docs`` are present.
        """
        memorix_data: Dict[str, Any] = {"paragraphs": [], "entities": []}

        # A payload that is not a JSON object, or whose "docs" is not a
        # sequence, is handled like a missing field instead of raising.
        raw_docs = lpmm_data.get("docs") if isinstance(lpmm_data, dict) else None
        docs = raw_docs if isinstance(raw_docs, (list, tuple)) else []
        if not docs:
            logger.warning(f"文件中未找到 docs 字段: {filename}")
            return memorix_data

        clean = self._clean_text
        all_entities = set()
        skipped = 0
        for doc in docs:
            # One malformed entry must not discard the rest of the file.
            if not isinstance(doc, dict):
                skipped += 1
                continue

            content = clean(doc.get("passage", ""))
            if not content:
                continue

            # Only exact three-element triples are kept; anything else is ignored.
            relations: List[Dict[str, str]] = [
                {
                    "subject": clean(triple[0]),
                    "predicate": clean(triple[1]),
                    "object": clean(triple[2]),
                }
                for triple in doc.get("extracted_triples", []) or []
                if isinstance(triple, (list, tuple)) and len(triple) == 3
            ]

            # Normalise each entity once and drop the ones that end up empty.
            entities = [name for name in map(clean, doc.get("extracted_entities", []) or []) if name]

            all_entities.update(entities)
            for relation in relations:
                all_entities.update(name for name in (relation["subject"], relation["object"]) if name)

            memorix_data["paragraphs"].append(
                {
                    "hash": compute_paragraph_hash(content),
                    "content": content,
                    "source": filename,
                    "entities": entities,
                    "relations": relations,
                }
            )

        if skipped:
            logger.warning(f"跳过 {skipped} 条格式无效的 doc: {filename}")

        memorix_data["entities"] = sorted(all_entities)
        return memorix_data
|
|
|
|
|
|
async def main() -> None:
    """Parse the command line, then convert and import every selected LPMM JSON file.

    A directory argument selects its ``*-openie.json`` files, or every
    ``*.json`` file when none match; a file argument is used as is. Files are
    processed in sorted order. A failure in one file is logged with its
    traceback and does not stop the remaining files. The importer is closed
    even when processing is interrupted.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    target_path = Path(args.path)
    if not target_path.exists():
        logger.error(f"路径不存在: {target_path}")
        return

    if target_path.is_dir():
        # Sorted so the import order does not depend on directory listing order.
        files_to_process = sorted(target_path.glob("*-openie.json")) or sorted(target_path.glob("*.json"))
    else:
        files_to_process = [target_path]

    if not files_to_process:
        logger.error("未找到可处理的 JSON 文件")
        return

    importer = AutoImporter(force=bool(args.force), concurrency=int(args.concurrency))
    if not await importer.initialize():
        logger.error("初始化存储失败")
        return

    try:
        converter = LPMMConverter()
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeElapsedColumn(),
            console=console,
            transient=False,
        ) as progress:
            for json_file in files_to_process:
                logger.info(f"正在转换并导入: {json_file.name}")
                try:
                    with open(json_file, "r", encoding="utf-8") as handle:
                        lpmm_data = json.load(handle)

                    memorix_data = converter.convert_lpmm_to_memorix(lpmm_data, json_file.name)
                    total_items = len(memorix_data.get("paragraphs", []))
                    if total_items <= 0:
                        logger.warning(f"转换结果为空: {json_file.name}")
                        continue

                    # One progress task per file, sized by its paragraph count.
                    task_id = progress.add_task(f"Importing {json_file.name}", total=total_items)

                    def update_progress(step: int = 1) -> None:
                        progress.advance(task_id, advance=step)

                    await importer.import_json_data(
                        memorix_data,
                        filename=f"lpmm_{json_file.name}",
                        progress_callback=update_progress,
                    )
                except Exception as exc:
                    # Per-file boundary: record the failure and move on to the next file.
                    logger.error(f"处理文件 {json_file.name} 失败: {exc}\n{traceback.format_exc()}")
    finally:
        # Runs on normal completion as well as on Ctrl-C or task cancellation,
        # neither of which is caught by the per-file handler above.
        await importer.close()

    logger.info("全部处理完成")
|
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): Windows is switched to the selector event loop policy
    # before the script runs; the reason is not visible in this file.
    # Presumably a dependency needs it. Confirm.
    if sys.platform == "win32":  # pragma: no cover
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)
    asyncio.run(main())
|