refactor: rework A_Memorix into the mainline long-term memory subsystem and rebuild its management UI

- Move A_Memorix out of its old submodule/plugin form into the mainline source tree, with the bulk landing in src/A_memorix
- Rewire the main program so A_Memorix runs as an in-tree long-term memory subsystem
- Remove the A_Memorix special cases from the parent project's plugin system, reducing intrusion into the generic plugin layer
- Consolidate long-term memory configuration, runtime, self-check, import, and tuning into the memory routes and the mainline service layer
- Rebuild the long-term memory console and graph pages, integrated in the style of the existing MaiBot dashboard
- Add dual entity-relation graph and evidence views, supporting inspection of nodes, relations, paragraphs, and their evidence chains
- Add a long-term memory config editor and memory-api for in-tree configuration management
- Complete deletion management: delete preview, hybrid delete, batch delete by source, and recovery of delete operations
- Improve the frontend display of delete previews and delete-operation details with pagination and search, showing entity names / relation content / paragraph summaries instead of bare hashes
- Fix graph and console frontend issues, including evidence-view switching, query trigger timing, and null guards in the delete dialog
- Add or update A_Memorix tests, WebUI route tests, frontend vitest tests, and helper verification scripts
- Remove the old plugins/A_memorix, .gitmodules, and related legacy maintenance docs
This commit is contained in:
A-Dawn
2026-04-03 08:08:24 +08:00
parent bf5eb45709
commit 15d436b3a1
136 changed files with 52533 additions and 629 deletions


@@ -0,0 +1,22 @@
from __future__ import annotations
import sys
from pathlib import Path
CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
SRC_ROOT = PLUGIN_ROOT.parent
PROJECT_ROOT = SRC_ROOT.parent
WORKSPACE_ROOT = PROJECT_ROOT
MAIBOT_ROOT = PROJECT_ROOT
for _path in (SRC_ROOT, PROJECT_ROOT, PLUGIN_ROOT):
_path_str = str(_path)
if _path_str not in sys.path:
sys.path.insert(0, _path_str)
from A_memorix.paths import config_path, default_data_dir, resolve_repo_path
DEFAULT_CONFIG_PATH = config_path()
DEFAULT_DATA_DIR = default_data_dir()
DEFAULT_DB_PATH = PROJECT_ROOT / "data" / "MaiBot.db"
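# Sibling scripts import these constants instead of repeating the sys.path setup;
# a minimal usage sketch (assuming this module sits next to them as _bootstrap.py):
#   from _bootstrap import DEFAULT_CONFIG_PATH, DEFAULT_DATA_DIR, resolve_repo_path
#   data_dir = resolve_repo_path("data/plugins/a-dawn.a-memorix", fallback=DEFAULT_DATA_DIR)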


@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
A_Memorix consistency audit script.
Outputs:
1. paragraph/entity/relation vector coverage
2. relation vector_state distribution
3. orphan vector count (vector present but metadata missing)
4. counts of mismatches between vector_state and the vector files
"""
from __future__ import annotations
import argparse
import json
import pickle
import sys
from pathlib import Path
from typing import Any, Dict, Set
from _bootstrap import DEFAULT_DATA_DIR, resolve_repo_path
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Audit A_Memorix vector consistency")
parser.add_argument(
"--data-dir",
default=str(DEFAULT_DATA_DIR),
help="A_Memorix data directory (default: data/plugins/a-dawn.a-memorix)",
)
parser.add_argument("--json-out", default="", help="Optional: path to write the JSON report")
parser.add_argument(
"--strict",
action="store_true",
help="Exit non-zero if any consistency anomaly is found",
)
return parser
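# Typical invocations (illustrative; flags as defined above):
#   python audit_vector_consistency.py --data-dir data/plugins/a-dawn.a-memorix
#   python audit_vector_consistency.py --strict --json-out out/audit.json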
# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
sys.exit(0)
try:
from A_memorix.core.storage.vector_store import VectorStore
from A_memorix.core.storage.metadata_store import MetadataStore
from A_memorix.core.storage import QuantizationType
except Exception as e: # pragma: no cover
print(f"❌ Failed to import core modules: {e}")
sys.exit(1)
def _safe_ratio(numerator: int, denominator: int) -> float:
if denominator <= 0:
return 0.0
return float(numerator) / float(denominator)
def _load_vector_store(data_dir: Path) -> VectorStore:
meta_path = data_dir / "vectors" / "vectors_metadata.pkl"
if not meta_path.exists():
raise FileNotFoundError(f"Vector metadata file not found: {meta_path}")
with open(meta_path, "rb") as f:
meta = pickle.load(f)
dimension = int(meta.get("dimension", 1024))
store = VectorStore(
dimension=max(1, dimension),
quantization_type=QuantizationType.INT8,
data_dir=data_dir / "vectors",
)
if store.has_data():
store.load()
return store
def _load_metadata_store(data_dir: Path) -> MetadataStore:
store = MetadataStore(data_dir=data_dir / "metadata")
store.connect()
return store
def _hash_set(metadata_store: MetadataStore, table: str) -> Set[str]:
return {str(h) for h in metadata_store.list_hashes(table)}
def _relation_state_stats(metadata_store: MetadataStore) -> Dict[str, int]:
return metadata_store.count_relations_by_vector_state()
def run_audit(data_dir: Path) -> Dict[str, Any]:
vector_store = _load_vector_store(data_dir)
metadata_store = _load_metadata_store(data_dir)
try:
paragraph_hashes = _hash_set(metadata_store, "paragraphs")
entity_hashes = _hash_set(metadata_store, "entities")
relation_hashes = _hash_set(metadata_store, "relations")
known_hashes = set(getattr(vector_store, "_known_hashes", set()))
live_vector_hashes = {h for h in known_hashes if h in vector_store}
para_vector_hits = len(paragraph_hashes & live_vector_hashes)
ent_vector_hits = len(entity_hashes & live_vector_hashes)
rel_vector_hits = len(relation_hashes & live_vector_hashes)
orphan_vector_hashes = sorted(
live_vector_hashes - paragraph_hashes - entity_hashes - relation_hashes
)
relation_rows = metadata_store.get_relations()
ready_but_missing = 0
not_ready_but_present = 0
for row in relation_rows:
h = str(row.get("hash") or "")
state = str(row.get("vector_state") or "none").lower()
in_vector = h in live_vector_hashes
if state == "ready" and not in_vector:
ready_but_missing += 1
if state != "ready" and in_vector:
not_ready_but_present += 1
relation_states = _relation_state_stats(metadata_store)
rel_total = max(0, int(relation_states.get("total", len(relation_hashes))))
ready_count = max(0, int(relation_states.get("ready", 0)))
result = {
"counts": {
"paragraphs": len(paragraph_hashes),
"entities": len(entity_hashes),
"relations": len(relation_hashes),
"vectors_live": len(live_vector_hashes),
},
"coverage": {
"paragraph_vector_coverage": _safe_ratio(para_vector_hits, len(paragraph_hashes)),
"entity_vector_coverage": _safe_ratio(ent_vector_hits, len(entity_hashes)),
"relation_vector_coverage": _safe_ratio(rel_vector_hits, len(relation_hashes)),
"relation_ready_coverage": _safe_ratio(ready_count, rel_total),
},
"relation_states": relation_states,
"orphans": {
"vector_only_count": len(orphan_vector_hashes),
"vector_only_sample": orphan_vector_hashes[:30],
},
"consistency_checks": {
"ready_but_missing_vector": ready_but_missing,
"not_ready_but_vector_present": not_ready_but_present,
},
}
return result
finally:
metadata_store.close()
def main() -> int:
parser = _build_arg_parser()
args = parser.parse_args()
data_dir = resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR)
if not data_dir.exists():
print(f"❌ Data directory does not exist: {data_dir}")
return 2
try:
result = run_audit(data_dir)
except Exception as e:
print(f"❌ Audit failed: {e}")
return 2
print("=== A_Memorix Vector Consistency Audit ===")
print(f"data_dir: {data_dir}")
print(f"paragraphs: {result['counts']['paragraphs']}")
print(f"entities: {result['counts']['entities']}")
print(f"relations: {result['counts']['relations']}")
print(f"vectors_live: {result['counts']['vectors_live']}")
print(
"coverage: "
f"paragraph={result['coverage']['paragraph_vector_coverage']:.3f}, "
f"entity={result['coverage']['entity_vector_coverage']:.3f}, "
f"relation={result['coverage']['relation_vector_coverage']:.3f}, "
f"relation_ready={result['coverage']['relation_ready_coverage']:.3f}"
)
print(f"relation_states: {result['relation_states']}")
print(
"consistency_checks: "
f"ready_but_missing_vector={result['consistency_checks']['ready_but_missing_vector']}, "
f"not_ready_but_vector_present={result['consistency_checks']['not_ready_but_vector_present']}"
)
print(f"orphan_vectors: {result['orphans']['vector_only_count']}")
if args.json_out:
out_path = Path(args.json_out).resolve()
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"json_out: {out_path}")
has_anomaly = (
result["orphans"]["vector_only_count"] > 0
or result["consistency_checks"]["ready_but_missing_vector"] > 0
)
if args.strict and has_anomaly:
return 1
return 0
if __name__ == "__main__":
raise SystemExit(main())


@@ -0,0 +1,265 @@
#!/usr/bin/env python3
"""
One-off relation vector backfill script (canary / offline execution).
Purpose:
1. Backfill vectors for relations whose vector_state is in (none, failed, pending).
2. Support concurrency control to reduce total runtime.
3. Serve as a canary-stage verification tool, used together with audit_vector_consistency.py.
4. Optionally include drifted records ("ready" but vector missing) for repair.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List
import tomlkit
from _bootstrap import DEFAULT_CONFIG_PATH, DEFAULT_DATA_DIR, resolve_repo_path
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="One-off relation vector backfill")
parser.add_argument(
"--config",
default=str(DEFAULT_CONFIG_PATH),
help="Config file path (default: config/a_memorix.toml)",
)
parser.add_argument(
"--data-dir",
default=str(DEFAULT_DATA_DIR),
help="Data directory (default: data/plugins/a-dawn.a-memorix)",
)
parser.add_argument(
"--states",
default="none,failed,pending",
help="Comma-separated list of vector states to process",
)
parser.add_argument("--limit", type=int, default=50000, help="Maximum number of rows to process")
parser.add_argument("--concurrency", type=int, default=8, help="Concurrency level")
parser.add_argument("--max-retry", type=int, default=None, help="Filter by maximum retry count")
parser.add_argument(
"--include-ready-missing",
action="store_true",
help="Also include relations with vector_state=ready but a missing vector",
)
parser.add_argument("--dry-run", action="store_true", help="Count candidates only, no writes")
return parser
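# Canary rollout sketch (illustrative; the backfill script's filename is an assumption):
#   python backfill_relation_vectors.py --dry-run
#   python backfill_relation_vectors.py --concurrency 8 --include-ready-missing
#   python audit_vector_consistency.py --strict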
# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
raise SystemExit(0)
from A_memorix.core.storage import (
VectorStore,
GraphStore,
MetadataStore,
QuantizationType,
SparseMatrixFormat,
)
from A_memorix.core.embedding import create_embedding_api_adapter
from A_memorix.core.utils.relation_write_service import RelationWriteService
def _load_config(config_path: Path) -> Dict[str, Any]:
with open(config_path, "r", encoding="utf-8") as f:
raw = tomlkit.load(f)
return dict(raw) if isinstance(raw, dict) else {}
def _build_vector_store(data_dir: Path, emb_cfg: Dict[str, Any]) -> VectorStore:
q_type = str(emb_cfg.get("quantization_type", "int8")).lower()
if q_type != "int8":
raise ValueError(
"embedding.quantization_type only allows int8 (SQ8) in vNext."
" Run scripts/release_vnext_migrate.py migrate first."
)
dim = int(emb_cfg.get("dimension", 1024))
store = VectorStore(
dimension=max(1, dim),
quantization_type=QuantizationType.INT8,
data_dir=data_dir / "vectors",
)
if store.has_data():
store.load()
return store
def _build_graph_store(data_dir: Path, graph_cfg: Dict[str, Any]) -> GraphStore:
fmt = str(graph_cfg.get("sparse_matrix_format", "csr")).lower()
fmt_map = {
"csr": SparseMatrixFormat.CSR,
"csc": SparseMatrixFormat.CSC,
}
store = GraphStore(
matrix_format=fmt_map.get(fmt, SparseMatrixFormat.CSR),
data_dir=data_dir / "graph",
)
if store.has_data():
store.load()
return store
def _build_metadata_store(data_dir: Path) -> MetadataStore:
store = MetadataStore(data_dir=data_dir / "metadata")
store.connect()
return store
def _build_embedding_manager(emb_cfg: Dict[str, Any]):
retry_cfg = emb_cfg.get("retry", {})
if not isinstance(retry_cfg, dict):
retry_cfg = {}
return create_embedding_api_adapter(
batch_size=int(emb_cfg.get("batch_size", 32)),
max_concurrent=int(emb_cfg.get("max_concurrent", 5)),
default_dimension=int(emb_cfg.get("dimension", 1024)),
model_name=str(emb_cfg.get("model_name", "auto")),
retry_config=retry_cfg,
)
async def _process_rows(
service: RelationWriteService,
rows: List[Dict[str, Any]],
concurrency: int,
) -> Dict[str, int]:
semaphore = asyncio.Semaphore(max(1, int(concurrency)))
stat = {"success": 0, "failed": 0, "skipped": 0}
async def _worker(row: Dict[str, Any]) -> None:
async with semaphore:
result = await service.ensure_relation_vector(
hash_value=str(row["hash"]),
subject=str(row.get("subject", "")),
predicate=str(row.get("predicate", "")),
obj=str(row.get("object", "")),
)
if result.vector_state == "ready":
if result.vector_written:
stat["success"] += 1
else:
stat["skipped"] += 1
else:
stat["failed"] += 1
await asyncio.gather(*[_worker(row) for row in rows])
return stat
async def main_async(args: argparse.Namespace) -> int:
config_path = resolve_repo_path(args.config, fallback=DEFAULT_CONFIG_PATH)
if not config_path.exists():
print(f"❌ Config file does not exist: {config_path}")
return 2
cfg = _load_config(config_path)
emb_cfg = cfg.get("embedding", {}) if isinstance(cfg, dict) else {}
graph_cfg = cfg.get("graph", {}) if isinstance(cfg, dict) else {}
retrieval_cfg = cfg.get("retrieval", {}) if isinstance(cfg, dict) else {}
rv_cfg = retrieval_cfg.get("relation_vectorization", {}) if isinstance(retrieval_cfg, dict) else {}
if not isinstance(emb_cfg, dict):
emb_cfg = {}
if not isinstance(graph_cfg, dict):
graph_cfg = {}
if not isinstance(rv_cfg, dict):
rv_cfg = {}
data_dir = resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR)
if not data_dir.exists():
print(f"❌ Data directory does not exist: {data_dir}")
return 2
print(f"data_dir: {data_dir}")
print(f"config: {config_path}")
vector_store = _build_vector_store(data_dir, emb_cfg)
graph_store = _build_graph_store(data_dir, graph_cfg)
metadata_store = _build_metadata_store(data_dir)
embedding_manager = _build_embedding_manager(emb_cfg)
service = RelationWriteService(
metadata_store=metadata_store,
graph_store=graph_store,
vector_store=vector_store,
embedding_manager=embedding_manager,
)
try:
states = [s.strip() for s in str(args.states).split(",") if s.strip()]
if not states:
states = ["none", "failed", "pending"]
max_retry = int(args.max_retry) if args.max_retry is not None else int(rv_cfg.get("max_retry", 3))
limit = int(args.limit)
rows = metadata_store.list_relations_by_vector_state(
states=states,
limit=max(1, limit),
max_retry=max(1, max_retry),
)
added_ready_missing = 0
if args.include_ready_missing:
ready_rows = metadata_store.list_relations_by_vector_state(
states=["ready"],
limit=max(1, limit),
max_retry=max(1, max_retry),
)
ready_missing_rows = [
row for row in ready_rows if str(row.get("hash", "")) not in vector_store
]
added_ready_missing = len(ready_missing_rows)
if ready_missing_rows:
dedup: Dict[str, Dict[str, Any]] = {}
for row in rows:
dedup[str(row.get("hash", ""))] = row
for row in ready_missing_rows:
dedup.setdefault(str(row.get("hash", "")), row)
rows = list(dedup.values())[: max(1, limit)]
print(f"candidates: {len(rows)} (states={states}, max_retry={max_retry})")
if args.include_ready_missing:
print(f"ready_missing_candidates_added: {added_ready_missing}")
if not rows:
return 0
if args.dry_run:
print("dry_run=true; no writes performed.")
return 0
started = time.time()
stat = await _process_rows(
service=service,
rows=rows,
concurrency=int(args.concurrency),
)
elapsed = (time.time() - started) * 1000.0
vector_store.save()
graph_store.save()
state_stats = metadata_store.count_relations_by_vector_state()
output = {
"processed": len(rows),
"success": int(stat["success"]),
"failed": int(stat["failed"]),
"skipped": int(stat["skipped"]),
"elapsed_ms": elapsed,
"state_stats": state_stats,
}
print(json.dumps(output, ensure_ascii=False, indent=2))
return 0 if stat["failed"] == 0 else 1
finally:
metadata_store.close()
def parse_args() -> argparse.Namespace:
return _build_arg_parser().parse_args()
if __name__ == "__main__":
arguments = parse_args()
raise SystemExit(asyncio.run(main_async(arguments)))


@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Backfill paragraph temporal fields.
Default policy:
1. if a paragraph is missing event_time/event_time_start/event_time_end,
2. and created_at exists,
3. write event_time=created_at, time_granularity=day, time_confidence=0.2.
"""
from __future__ import annotations
import argparse
from pathlib import Path  # needed: backfill() annotates data_dir as Path
from _bootstrap import DEFAULT_DATA_DIR, resolve_repo_path
from A_memorix.core.storage import MetadataStore # noqa: E402
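# Example runs (illustrative; the script filename is an assumption):
#   python backfill_temporal_fields.py --dry-run
#   python backfill_temporal_fields.py --limit 50000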
def backfill(
data_dir: Path,
dry_run: bool,
limit: int,
no_created_fallback: bool,
) -> int:
store = MetadataStore(data_dir=data_dir)
store.connect()
summary = store.backfill_temporal_metadata_from_created_at(
limit=limit,
dry_run=dry_run,
no_created_fallback=no_created_fallback,
)
store.close()
if dry_run:
print(f"[dry-run] candidates={summary['candidates']}")
return int(summary["candidates"])
if no_created_fallback:
print(f"skip update (no-created-fallback), candidates={summary['candidates']}")
return 0
print(f"updated={summary['updated']}")
return int(summary["updated"])
def main() -> int:
parser = argparse.ArgumentParser(description="Backfill temporal metadata for A_Memorix paragraphs")
parser.add_argument("--data-dir", default=str(DEFAULT_DATA_DIR), help="Data directory")
parser.add_argument("--dry-run", action="store_true", help="Count only, no writes")
parser.add_argument("--limit", type=int, default=100000, help="Maximum number of rows to process")
parser.add_argument(
"--no-created-fallback",
action="store_true",
help="Do not backfill from created_at; only report the candidate count",
)
args = parser.parse_args()
backfill(
data_dir=resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR),
dry_run=args.dry_run,
limit=max(1, int(args.limit)),
no_created_fallback=args.no_created_fallback,
)
return 0
if __name__ == "__main__":
raise SystemExit(main())


@@ -0,0 +1,530 @@
#!/usr/bin/env python3
"""
LPMM to A_memorix storage converter.
Features:
1. Read the LPMM parquet files (paragraph.parquet, entity.parquet, relation.parquet)
2. Read the LPMM graph file (graph.graphml or graph_structure.pkl)
3. Write directly into the A_memorix binary VectorStore and sparse GraphStore
4. Bypass embedding generation to save tokens
"""
import sys
import os
import json
import argparse
import asyncio
import pickle
import logging
from pathlib import Path
from typing import Dict, Any, List, Tuple
import numpy as np
import tomlkit
from _bootstrap import DEFAULT_CONFIG_PATH, resolve_repo_path
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Convert LPMM data into the A_memorix format")
parser.add_argument("--input", "-i", required=True, help="Input directory containing the LPMM data (parquet, graphml)")
parser.add_argument("--output", "-o", required=True, help="Output directory for the A_memorix data")
parser.add_argument("--dim", type=int, default=384, help="Embedding dimension (must match the LPMM model)")
parser.add_argument("--batch-size", type=int, default=1024, help="Parquet batch read size (default 1024)")
parser.add_argument(
"--skip-relation-vector-rebuild",
action="store_true",
help="Skip rebuilding relation vectors from relation metadata (rebuild is on by default)",
)
return parser
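# Example run (illustrative; the script filename and paths are assumptions):
#   python lpmm_converter.py -i data/lpmm_export -o data/plugins/a-dawn.a-memorix --dim 384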
# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
sys.exit(0)
# Logging: prefer MaiBot's unified logging; fall back to standard logging on failure.
try:
from src.common.logger import get_logger
logger = get_logger("A_Memorix.LPMMConverter")
except Exception:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("A_Memorix.LPMMConverter")
try:
import networkx as nx
from scipy import sparse
import pyarrow.parquet as pq
except ImportError as e:
logger.error(f"Missing dependency: {e}")
logger.error("Install with: pip install pandas pyarrow networkx scipy")
sys.exit(1)
try:
from A_memorix.core.storage.vector_store import VectorStore
from A_memorix.core.storage.graph_store import GraphStore
from A_memorix.core.storage.metadata_store import MetadataStore
from A_memorix.core.storage import QuantizationType, SparseMatrixFormat
from A_memorix.core.embedding import create_embedding_api_adapter
from A_memorix.core.utils.relation_write_service import RelationWriteService
except ImportError as e:
logger.error(f"Failed to import A_memorix core modules: {e}")
logger.error("Make sure you run in the correct environment with all dependencies installed.")
sys.exit(1)
class LPMMConverter:
def __init__(
self,
lpmm_data_dir: Path,
output_dir: Path,
dimension: int = 384,
batch_size: int = 1024,
rebuild_relation_vectors: bool = True,
):
self.lpmm_dir = lpmm_data_dir
self.output_dir = output_dir
self.dimension = dimension
self.batch_size = max(1, int(batch_size))
self.rebuild_relation_vectors = bool(rebuild_relation_vectors)
self.vector_dir = output_dir / "vectors"
self.graph_dir = output_dir / "graph"
self.metadata_dir = output_dir / "metadata"
self.vector_store = None
self.graph_store = None
self.metadata_store = None
self.embedding_manager = None
self.relation_write_service = None
# LPMM original ID -> A_memorix ID mapping (used for graph rewriting)
self.id_mapping: Dict[str, str] = {}
def _register_id_mapping(self, raw_id: Any, mapped_id: str, p_type: str) -> None:
"""Record an ID mapping, accepting both prefixed and unprefixed ID forms."""
if raw_id is None:
return
raw = str(raw_id).strip()
if not raw:
return
self.id_mapping[raw] = mapped_id
prefix = f"{p_type}-"
if raw.startswith(prefix):
self.id_mapping[raw[len(prefix):]] = mapped_id
else:
self.id_mapping[prefix + raw] = mapped_id
def _map_node_id(self, node: Any) -> str:
"""Map a graph node ID to its converted A_memorix ID."""
node_key = str(node)
return self.id_mapping.get(node_key, node_key)
def initialize_stores(self):
"""Initialize empty A_memorix stores."""
logger.info(f"Initializing stores at {self.output_dir}...")
# Initialize VectorStore (A_memorix defaults to INT8 quantization)
self.vector_store = VectorStore(
dimension=self.dimension,
quantization_type=QuantizationType.INT8,
data_dir=self.vector_dir
)
self.vector_store.clear()  # clear any old data
# Initialize GraphStore (CSR format)
self.graph_store = GraphStore(
matrix_format=SparseMatrixFormat.CSR,
data_dir=self.graph_dir
)
self.graph_store.clear()
# Initialize MetadataStore
self.metadata_store = MetadataStore(data_dir=self.metadata_dir)
self.metadata_store.connect()
# Clearing the metadata tables would be ideal here, but it is risky; for a
# conversion we assume a fresh start or a deliberate overwrite. The
# MetadataStore is SQLite-backed, so a fresh directory simply gets a new
# database file.
if self.rebuild_relation_vectors:
self._init_relation_vector_service()
def _load_plugin_config(self) -> Dict[str, Any]:
config_path = DEFAULT_CONFIG_PATH
if not config_path.exists():
return {}
try:
with open(config_path, "r", encoding="utf-8") as f:
parsed = tomlkit.load(f)
return dict(parsed) if isinstance(parsed, dict) else {}
except Exception as e:
logger.warning(f"Failed to read config.toml; using the default embedding config: {e}")
return {}
def _init_relation_vector_service(self) -> None:
if not self.rebuild_relation_vectors:
return
cfg = self._load_plugin_config()
emb_cfg = cfg.get("embedding", {}) if isinstance(cfg, dict) else {}
if not isinstance(emb_cfg, dict):
emb_cfg = {}
try:
self.embedding_manager = create_embedding_api_adapter(
batch_size=int(emb_cfg.get("batch_size", 32)),
max_concurrent=int(emb_cfg.get("max_concurrent", 5)),
default_dimension=int(emb_cfg.get("dimension", self.dimension)),
model_name=str(emb_cfg.get("model_name", "auto")),
retry_config=emb_cfg.get("retry", {}) if isinstance(emb_cfg.get("retry", {}), dict) else {},
)
self.relation_write_service = RelationWriteService(
metadata_store=self.metadata_store,
graph_store=self.graph_store,
vector_store=self.vector_store,
embedding_manager=self.embedding_manager,
)
except Exception as e:
self.embedding_manager = None
self.relation_write_service = None
logger.warning(f"Failed to initialize the relation vector rebuild service; skipping relation vector backfill: {e}")
async def _rebuild_relation_vectors(self) -> None:
if not self.rebuild_relation_vectors:
return
if self.relation_write_service is None:
logger.warning("Relation vector rebuild is enabled but the write service is unavailable; skipped.")
return
rows = self.metadata_store.get_relations()
if not rows:
logger.info("No relation metadata found; relation vector rebuild not needed.")
return
success = 0
failed = 0
skipped = 0
for row in rows:
result = await self.relation_write_service.ensure_relation_vector(
hash_value=str(row["hash"]),
subject=str(row.get("subject", "")),
predicate=str(row.get("predicate", "")),
obj=str(row.get("object", "")),
)
if result.vector_state == "ready":
if result.vector_written:
success += 1
else:
skipped += 1
else:
failed += 1
logger.info(
"Relation vector rebuild complete: "
f"total={len(rows)} "
f"success={success} "
f"skipped={skipped} "
f"failed={failed}"
)
@staticmethod
def _parse_relation_text(text: str) -> Tuple[str, str, str]:
"""Parse a relation string into (subject, predicate, object).
Accepts "s | p | o", "s -> p -> o", or whitespace-separated triples;
returns empty strings when no triple can be recovered.
"""
raw = str(text or "").strip()
if not raw:
return "", "", ""
if "|" in raw:
parts = [p.strip() for p in raw.split("|") if p.strip()]
if len(parts) >= 3:
return parts[0], parts[1], parts[2]
if "->" in raw:
parts = [p.strip() for p in raw.split("->") if p.strip()]
if len(parts) >= 3:
return parts[0], parts[1], parts[2]
pieces = raw.split()
if len(pieces) >= 3:
return pieces[0], pieces[1], " ".join(pieces[2:])
return "", "", ""
def _import_relation_metadata_from_parquet(self, relation_path: Path) -> int:
if not relation_path.exists():
return 0
try:
parquet_file = pq.ParquetFile(relation_path)
except Exception as e:
logger.warning(f"Failed to read relation.parquet; skipping relation metadata import: {e}")
return 0
cols = set(parquet_file.schema_arrow.names)
has_triple_cols = {"subject", "predicate", "object"}.issubset(cols)
content_col = "str" if "str" in cols else ("content" if "content" in cols else "")
imported_hashes = set()
imported = 0
for record_batch in parquet_file.iter_batches(batch_size=self.batch_size):
df_batch = record_batch.to_pandas()
for _, row in df_batch.iterrows():
subject = ""
predicate = ""
obj = ""
if has_triple_cols:
subject = str(row.get("subject", "") or "").strip()
predicate = str(row.get("predicate", "") or "").strip()
obj = str(row.get("object", "") or "").strip()
elif content_col:
subject, predicate, obj = self._parse_relation_text(row.get(content_col, ""))
if not (subject and predicate and obj):
continue
rel_hash = self.metadata_store.add_relation(
subject=subject,
predicate=predicate,
obj=obj,
source_paragraph=None,
)
if rel_hash in imported_hashes:
continue
imported_hashes.add(rel_hash)
self.graph_store.add_edges([(subject, obj)], relation_hashes=[rel_hash])
try:
self.metadata_store.set_relation_vector_state(rel_hash, "none")
except Exception:
pass
imported += 1
return imported
def convert_vectors(self):
"""Convert the Parquet vectors into the VectorStore."""
# LPMM default file names
parquet_files = {
"paragraph": self.lpmm_dir / "paragraph.parquet",
"entity": self.lpmm_dir / "entity.parquet",
"relation": self.lpmm_dir / "relation.parquet"
}
total_vectors = 0
for p_type, p_path in parquet_files.items():
# Relation vectors produced here cannot be guaranteed to map 1:1 to the
# MetadataStore relation records; importing them directly would pollute
# recall results (hits could not be traced back to relation metadata).
if p_type == "relation":
relation_count = self._import_relation_metadata_from_parquet(p_path)
logger.warning(
"Skipping relation.parquet vector import (to preserve consistency); "
f"imported relation metadata rows: {relation_count}"
)
continue
if not p_path.exists():
logger.warning(f"File not found: {p_path}, skipping {p_type} vectors.")
continue
logger.info(f"Processing {p_type} vectors from {p_path}...")
try:
parquet_file = pq.ParquetFile(p_path)
total_rows = parquet_file.metadata.num_rows
if total_rows == 0:
logger.info(f"{p_path} is empty, skipping.")
continue
# LPMM Schema: 'hash', 'embedding', 'str'
cols = parquet_file.schema_arrow.names
# Compatibility check
content_col = 'str' if 'str' in cols else 'content'
emb_col = 'embedding'
hash_col = 'hash'
if content_col not in cols or emb_col not in cols:
logger.error(f"Required columns missing in {p_path} (need {content_col}, {emb_col}). Found: {cols}")
continue
batch_columns = [content_col, emb_col]
if hash_col in cols:
batch_columns.append(hash_col)
processed_rows = 0
added_for_type = 0
batch_idx = 0
for record_batch in parquet_file.iter_batches(
batch_size=self.batch_size,
columns=batch_columns,
):
batch_idx += 1
df_batch = record_batch.to_pandas()
embeddings_list = []
ids_list = []
# Build the metadata mapping as we go
for _, row in df_batch.iterrows():
processed_rows += 1
content = row[content_col]
emb = row[emb_col]
if content is None or (isinstance(content, float) and np.isnan(content)):
continue
content = str(content).strip()
if not content:
continue
if emb is None or len(emb) == 0:
continue
# Write to MetadataStore first, and use the canonical hash it returns as
# the vector ID so retrieval hits can be mapped back to metadata.
store_id = None
if p_type == "paragraph":
store_id = self.metadata_store.add_paragraph(
content=content,
source="lpmm_import",
knowledge_type="factual",
)
elif p_type == "entity":
store_id = self.metadata_store.add_entity(name=content)
else:
continue
raw_hash = row[hash_col] if hash_col in df_batch.columns else None
if raw_hash is not None and not (isinstance(raw_hash, float) and np.isnan(raw_hash)):
self._register_id_mapping(raw_hash, store_id, p_type)
# Ensure the embedding is a numpy array
emb_np = np.array(emb, dtype=np.float32)
if emb_np.shape[0] != self.dimension:
logger.error(f"Dimension mismatch: {emb_np.shape[0]} vs {self.dimension}")
continue
embeddings_list.append(emb_np)
ids_list.append(store_id)
if embeddings_list:
# Add to the vector store in batches
vectors_np = np.stack(embeddings_list)
count = self.vector_store.add(vectors_np, ids_list)
added_for_type += count
total_vectors += count
if batch_idx == 1 or batch_idx % 10 == 0:
logger.info(
f"[{p_type}] batch {batch_idx}: scanned {processed_rows}/{total_rows}, imported {added_for_type}"
)
logger.info(
f"{p_type} vector pass complete: scanned {processed_rows}, imported {added_for_type}"
)
except Exception as e:
logger.error(f"Error while processing {p_path}: {e}")
# Persist the vector store
self.vector_store.save()
logger.info(f"Vector conversion complete. Total vectors: {total_vectors}")
def convert_graph(self):
"""Convert the LPMM graph into the GraphStore."""
# LPMM's default file name is rag-graph.graphml
graph_files = [
self.lpmm_dir / "rag-graph.graphml",
self.lpmm_dir / "graph.graphml",
self.lpmm_dir / "graph_structure.pkl"
]
nx_graph = None
for g_path in graph_files:
if g_path.exists():
logger.info(f"Found graph file: {g_path}")
try:
if g_path.suffix == ".graphml":
nx_graph = nx.read_graphml(g_path)
elif g_path.suffix == ".pkl":
with open(g_path, "rb") as f:
data = pickle.load(f)
# LPMM may wrap the graph in a container class
if hasattr(data, "graph") and isinstance(data.graph, nx.Graph):
nx_graph = data.graph
elif isinstance(data, nx.Graph):
nx_graph = data
break
except Exception as e:
logger.error(f"Failed to load {g_path}: {e}")
if nx_graph is None:
logger.warning("No valid graph file found; skipping graph conversion.")
return
logger.info(f"Loaded graph with {nx_graph.number_of_nodes()} nodes and {nx_graph.number_of_edges()} edges.")
# 1. Add nodes.
# LPMM node IDs are usually hashes or prefixed strings; map them into the
# A_memorix format. If LPMM already uses "entity-HASH", it matches A_memorix.
nodes_to_add = []
node_attrs = {}
for node, attrs in nx_graph.nodes(data=True):
# Assume LPMM uses consistent "entity-..." / "paragraph-..." naming
mapped_node = self._map_node_id(node)
nodes_to_add.append(mapped_node)
if attrs:
node_attrs[mapped_node] = attrs
self.graph_store.add_nodes(nodes_to_add, node_attrs)
# 2. Add edges
edges_to_add = []
weights = []
for u, v, data in nx_graph.edges(data=True):
weight = data.get("weight", 1.0)
edges_to_add.append((self._map_node_id(u), self._map_node_id(v)))
weights.append(float(weight))
# Sync relations into the MetadataStore where possible. Graph edges do not
# always carry a relation predicate: if the LPMM edge data had a 'predicate'
# we could add it to metadata, but LPMM edges are usually weighted
# aggregates, so predicate information is lost in the simple graph.
if edges_to_add:
self.graph_store.add_edges(edges_to_add, weights)
self.graph_store.save()
logger.info("Graph conversion complete.")
def run(self):
self.initialize_stores()
self.convert_vectors()
self.convert_graph()
asyncio.run(self._rebuild_relation_vectors())
self.vector_store.save()
self.graph_store.save()
self.metadata_store.close()
logger.info("All conversions completed successfully.")
def main():
parser = _build_arg_parser()
args = parser.parse_args()
input_path = resolve_repo_path(args.input)
output_path = resolve_repo_path(args.output)
if not input_path.exists():
logger.error(f"Input directory does not exist: {input_path}")
sys.exit(1)
converter = LPMMConverter(
input_path,
output_path,
dimension=args.dim,
batch_size=args.batch_size,
rebuild_relation_vectors=not bool(args.skip_relation_vector_rebuild),
)
converter.run()
if __name__ == "__main__":
main()


@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
LPMM OpenIE JSON 导入工具。
功能:
1. 读取符合 LPMM 规范的 OpenIE JSON 文件
2. 转换为 A_Memorix 的统一导入格式
3. 复用 `process_knowledge.py` 中的 `AutoImporter` 直接入库
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List
from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
console = Console()
import _bootstrap # noqa: F401
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Import LPMM OpenIE JSON into A_Memorix")
parser.add_argument("path", help="Path to an LPMM JSON file or directory")
parser.add_argument("--force", action="store_true", help="Force re-import")
parser.add_argument("--concurrency", "-c", type=int, default=5, help="Concurrency level")
return parser
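# Example runs (illustrative; the script filename is an assumption):
#   python import_lpmm_openie.py data/openie/sample-openie.json
#   python import_lpmm_openie.py data/openie/ --concurrency 5 --force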
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
raise SystemExit(0)
try:
from process_knowledge import AutoImporter
from A_memorix.core.utils.hash import compute_paragraph_hash
from src.common.logger import get_logger
except ImportError as exc: # pragma: no cover - script bootstrap
print(f"Failed to import modules; check PYTHONPATH and the workspace layout: {exc}")
raise SystemExit(1)
logger = get_logger("A_Memorix.LPMMImport")
class LPMMConverter:
def convert_lpmm_to_memorix(self, lpmm_data: Dict[str, Any], filename: str) -> Dict[str, Any]:
memorix_data = {"paragraphs": [], "entities": []}
docs = lpmm_data.get("docs", []) or []
if not docs:
logger.warning(f"No docs field found in the file: {filename}")
return memorix_data
all_entities = set()
for doc in docs:
content = str(doc.get("passage", "") or "").strip()
if not content:
continue
relations: List[Dict[str, str]] = []
for triple in doc.get("extracted_triples", []) or []:
if isinstance(triple, list) and len(triple) == 3:
relations.append(
{
"subject": str(triple[0] or "").strip(),
"predicate": str(triple[1] or "").strip(),
"object": str(triple[2] or "").strip(),
}
)
entities = [str(item or "").strip() for item in doc.get("extracted_entities", []) or [] if str(item or "").strip()]
all_entities.update(entities)
for relation in relations:
if relation["subject"]:
all_entities.add(relation["subject"])
if relation["object"]:
all_entities.add(relation["object"])
memorix_data["paragraphs"].append(
{
"hash": compute_paragraph_hash(content),
"content": content,
"source": filename,
"entities": entities,
"relations": relations,
}
)
memorix_data["entities"] = sorted(all_entities)
return memorix_data
async def main() -> None:
parser = _build_arg_parser()
args = parser.parse_args()
target_path = Path(args.path)
if not target_path.exists():
logger.error(f"Path does not exist: {target_path}")
return
if target_path.is_dir():
files_to_process = list(target_path.glob("*-openie.json")) or list(target_path.glob("*.json"))
else:
files_to_process = [target_path]
if not files_to_process:
logger.error("No JSON files found to process")
return
importer = AutoImporter(force=bool(args.force), concurrency=int(args.concurrency))
if not await importer.initialize():
logger.error("Failed to initialize storage")
return
converter = LPMMConverter()
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
TimeElapsedColumn(),
console=console,
transient=False,
) as progress:
for json_file in files_to_process:
logger.info(f"Converting and importing: {json_file.name}")
try:
with open(json_file, "r", encoding="utf-8") as handle:
lpmm_data = json.load(handle)
memorix_data = converter.convert_lpmm_to_memorix(lpmm_data, json_file.name)
total_items = len(memorix_data.get("paragraphs", []))
if total_items <= 0:
logger.warning(f"Conversion produced no paragraphs: {json_file.name}")
continue
task_id = progress.add_task(f"Importing {json_file.name}", total=total_items)
def update_progress(step: int = 1) -> None:
progress.advance(task_id, advance=step)
await importer.import_json_data(
memorix_data,
filename=f"lpmm_{json_file.name}",
progress_callback=update_progress,
)
except Exception as exc:
logger.error(f"Failed to process file {json_file.name}: {exc}\n{traceback.format_exc()}")
await importer.close()
logger.info("All processing complete")
if __name__ == "__main__":
if sys.platform == "win32": # pragma: no cover
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(main())


@@ -0,0 +1,99 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import asyncio
import json
import sqlite3
from datetime import datetime
from typing import Any, Dict
from _bootstrap import DEFAULT_DATA_DIR, DEFAULT_DB_PATH, PLUGIN_ROOT, resolve_repo_path
from A_memorix.core.runtime.sdk_memory_kernel import SDKMemoryKernel # noqa: E402
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Migrate MaiBot chat_history into A_Memorix")
parser.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="MaiBot SQLite path")
parser.add_argument("--data-dir", default=str(DEFAULT_DATA_DIR), help="A_Memorix data directory")
parser.add_argument("--limit", type=int, default=0, help="Limit the number of rows to migrate (0 = all)")
parser.add_argument("--dry-run", action="store_true", help="Preview only, no writes")
return parser.parse_args()
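# Example runs (illustrative; the script filename is an assumption):
#   python migrate_chat_history.py --dry-run
#   python migrate_chat_history.py --limit 1000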
def _to_timestamp(value: Any) -> float | None:
if value is None:
return None
if isinstance(value, (int, float)):
return float(value)
text = str(value).strip()
if not text:
return None
try:
return datetime.fromisoformat(text).timestamp()
except ValueError:
return None
async def _main() -> int:
args = _parse_args()
db_path = resolve_repo_path(args.db_path, fallback=DEFAULT_DB_PATH)
if not db_path.exists():
print(f"Database does not exist: {db_path}")
return 1
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
sql = """
SELECT id, session_id, start_timestamp, end_timestamp, participants, theme, keywords, summary
FROM chat_history
ORDER BY id ASC
"""
if int(args.limit or 0) > 0:
sql += " LIMIT ?"
rows = conn.execute(sql, (int(args.limit),)).fetchall()
else:
rows = conn.execute(sql).fetchall()
conn.close()
print(f"chat_history rows to process: {len(rows)}")
if args.dry_run:
for row in rows[:5]:
print(f"- id={row['id']} session={row['session_id']} theme={row['theme']}")
return 0
data_dir = resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR)
kernel = SDKMemoryKernel(plugin_root=PLUGIN_ROOT, config={"storage": {"data_dir": str(data_dir)}})
await kernel.initialize()
migrated = 0
skipped = 0
for row in rows:
participants = json.loads(row["participants"]) if row["participants"] else []
keywords = json.loads(row["keywords"]) if row["keywords"] else []
theme = str(row["theme"] or "").strip()
summary = str(row["summary"] or "").strip()
text = f"主题:{theme}\n概括:{summary}".strip()
result: Dict[str, Any] = await kernel.ingest_summary(
external_id=f"chat_history:{row['id']}",
chat_id=str(row["session_id"] or ""),
text=text,
participants=participants,
time_start=_to_timestamp(row["start_timestamp"]),
time_end=_to_timestamp(row["end_timestamp"]),
tags=keywords,
metadata={"theme": theme, "source_row_id": int(row["id"])},
)
if result.get("stored_ids"):
migrated += 1
else:
skipped += 1
print(f"Migration complete: migrated={migrated} skipped={skipped}")
print(json.dumps(kernel.memory_stats(), ensure_ascii=False))
kernel.close()
return 0
if __name__ == "__main__":
raise SystemExit(asyncio.run(_main()))

File diff suppressed because it is too large.


@@ -0,0 +1,109 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import asyncio
import json
import sqlite3
from typing import Any, Dict, List
from _bootstrap import DEFAULT_DATA_DIR, DEFAULT_DB_PATH, PLUGIN_ROOT, resolve_repo_path
from A_memorix.core.runtime.sdk_memory_kernel import SDKMemoryKernel # noqa: E402
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Migrate MaiBot person_info.memory_points into A_Memorix")
parser.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="MaiBot SQLite path")
parser.add_argument("--data-dir", default=str(DEFAULT_DATA_DIR), help="A_Memorix data directory")
parser.add_argument("--limit", type=int, default=0, help="Limit the number of people to migrate (0 = all)")
parser.add_argument("--dry-run", action="store_true", help="Preview only, no writes")
return parser.parse_args()
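# Example runs (illustrative; the script filename is an assumption):
#   python migrate_person_memory.py --dry-run
#   python migrate_person_memory.py --db-path data/MaiBot.db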
def _parse_memory_points(raw_value: Any) -> List[Dict[str, Any]]:
try:
values = json.loads(raw_value) if raw_value else []
except Exception:
values = []
items: List[Dict[str, Any]] = []
for index, item in enumerate(values):
text = str(item or "").strip()
if not text:
continue
parts = text.split(":")
if len(parts) >= 3:
category = parts[0].strip()
content = ":".join(parts[1:-1]).strip()
weight = parts[-1].strip()
else:
category = "其他"
content = text
weight = "1.0"
if content:
items.append({"index": index, "category": category or "其他", "content": content, "weight": weight or "1.0"})
return items
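# Memory points are stored as "category:content:weight" joined by fullwidth colons.
# Illustrative parse (hypothetical data):
#   "hobby:likes photography:0.8"
#     -> {"index": 0, "category": "hobby", "content": "likes photography", "weight": "0.8"}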
async def _main() -> int:
args = _parse_args()
db_path = resolve_repo_path(args.db_path, fallback=DEFAULT_DB_PATH)
if not db_path.exists():
print(f"Database does not exist: {db_path}")
return 1
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
sql = """
SELECT person_id, person_name, user_nickname, memory_points
FROM person_info
WHERE memory_points IS NOT NULL AND memory_points != ''
ORDER BY id ASC
"""
if int(args.limit or 0) > 0:
sql += " LIMIT ?"
rows = conn.execute(sql, (int(args.limit),)).fetchall()
else:
rows = conn.execute(sql).fetchall()
conn.close()
preview_total = sum(len(_parse_memory_points(row["memory_points"])) for row in rows)
print(f"person_info: people to migrate: {len(rows)}, memory points: {preview_total}")
if args.dry_run:
for row in rows[:5]:
print(f"- person_id={row['person_id']} person_name={row['person_name'] or row['user_nickname']}")
return 0
data_dir = resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR)
kernel = SDKMemoryKernel(plugin_root=PLUGIN_ROOT, config={"storage": {"data_dir": str(data_dir)}})
await kernel.initialize()
migrated = 0
skipped = 0
for row in rows:
person_id = str(row["person_id"] or "").strip()
if not person_id:
continue
display_name = str(row["person_name"] or row["user_nickname"] or "").strip()
for item in _parse_memory_points(row["memory_points"]):
result: Dict[str, Any] = await kernel.ingest_text(
external_id=f"person_memory:{person_id}:{item['index']}",
source_type="person_fact",
text=f"[{item['category']}] {item['content']}",
person_ids=[person_id],
tags=[item["category"]],
entities=[person_id, display_name] if display_name else [person_id],
metadata={"category": item["category"], "weight": item["weight"], "display_name": display_name},
)
if result.get("stored_ids"):
migrated += 1
else:
skipped += 1
print(f"Migration complete: migrated={migrated} skipped={skipped}")
print(json.dumps(kernel.memory_stats(), ensure_ascii=False))
kernel.close()
return 0
if __name__ == "__main__":
raise SystemExit(asyncio.run(_main()))


@@ -0,0 +1,720 @@
#!/usr/bin/env python3
"""
Knowledge base auto-import script (strategy-aware version).
Features:
1. Scan .txt files under data/plugins/a-dawn.a-memorix/raw
2. Check data/import_manifest.json to see whether a file was already imported
3. Process files with the Strategy pattern (Narrative/Factual/Quote)
4. Store the generated data directly into VectorStore/GraphStore/MetadataStore
5. Update the manifest
"""
import sys
import os
import json
import asyncio
import time
import random
import hashlib
import tomlkit
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
from rich.console import Console
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
console = Console()
class LLMGenerationError(Exception):
pass
from _bootstrap import DEFAULT_CONFIG_PATH, DEFAULT_DATA_DIR
# Data directories
DATA_DIR = DEFAULT_DATA_DIR
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
MANIFEST_PATH = DATA_DIR / "import_manifest.json"
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="A_Memorix Knowledge Importer (Strategy-Aware)")
parser.add_argument("--force", action="store_true", help="Force re-import")
parser.add_argument("--clear-manifest", action="store_true", help="Clear manifest")
parser.add_argument(
"--type",
"-t",
default="auto",
help="Target import strategy override (auto/narrative/factual/quote)",
)
parser.add_argument("--concurrency", "-c", type=int, default=5)
parser.add_argument(
"--chat-log",
action="store_true",
help="Chat-log import mode: force the narrative strategy and use the LLM to semantically extract event_time/event_time_range",
)
parser.add_argument(
"--chat-reference-time",
default=None,
help="Reference point for relative time expressions in chat_log mode (e.g. 2026/02/12 10:30); defaults to the current local time",
)
return parser
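# Example runs (illustrative):
#   python process_knowledge.py --type auto --concurrency 5
#   python process_knowledge.py --chat-log --chat-reference-time "2026/02/12 10:30"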
# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
sys.exit(0)
try:
import A_memorix.core as core_module
import A_memorix.core.storage as storage_module
from src.common.logger import get_logger
from src.services import llm_service as llm_api
from src.config.config import global_config, model_config
VectorStore = core_module.VectorStore
GraphStore = core_module.GraphStore
MetadataStore = core_module.MetadataStore
ImportStrategy = core_module.ImportStrategy
create_embedding_api_adapter = core_module.create_embedding_api_adapter
RelationWriteService = getattr(core_module, "RelationWriteService", None)
looks_like_quote_text = storage_module.looks_like_quote_text
parse_import_strategy = storage_module.parse_import_strategy
resolve_stored_knowledge_type = storage_module.resolve_stored_knowledge_type
select_import_strategy = storage_module.select_import_strategy
from A_memorix.core.utils.time_parser import normalize_time_meta
from A_memorix.core.utils.import_payloads import normalize_paragraph_import_item
from A_memorix.core.strategies.base import BaseStrategy, ProcessedChunk, KnowledgeType as StratKnowledgeType
from A_memorix.core.strategies.narrative import NarrativeStrategy
from A_memorix.core.strategies.factual import FactualStrategy
from A_memorix.core.strategies.quote import QuoteStrategy
except ImportError as e:
print(f"❌ Failed to import modules: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
logger = get_logger("A_Memorix.AutoImport")
def _log_before_retry(retry_state) -> None:
"""Log retry information in the project's unified logging style."""
exc = None
if getattr(retry_state, "outcome", None) is not None and retry_state.outcome.failed:
exc = retry_state.outcome.exception()
next_sleep = getattr(getattr(retry_state, "next_action", None), "sleep", None)
logger.warning(
"LLM call about to retry: "
f"attempt={getattr(retry_state, 'attempt_number', '?')} "
f"next_sleep={next_sleep} "
f"error={exc}"
)
class AutoImporter:
def __init__(
self,
force: bool = False,
clear_manifest: bool = False,
target_type: str = "auto",
concurrency: int = 5,
chat_log: bool = False,
chat_reference_time: Optional[str] = None,
):
self.vector_store: Optional[VectorStore] = None
self.graph_store: Optional[GraphStore] = None
self.metadata_store: Optional[MetadataStore] = None
self.embedding_manager = None
self.relation_write_service = None
self.plugin_config = {}
self.manifest = {}
self.force = force
self.clear_manifest = clear_manifest
self.chat_log = chat_log
parsed_target_type = parse_import_strategy(target_type, default=ImportStrategy.AUTO)
self.target_type = ImportStrategy.NARRATIVE.value if chat_log else parsed_target_type.value
self.chat_reference_dt = self._parse_reference_time(chat_reference_time)
if self.chat_log and parsed_target_type not in {ImportStrategy.AUTO, ImportStrategy.NARRATIVE}:
logger.warning(
f"chat_log mode enabled; target_type={target_type} will be overridden to narrative"
)
self.concurrency_limit = concurrency
self.semaphore = None
self.storage_lock = None
async def initialize(self):
logger.info(f"Initializing... (concurrency: {self.concurrency_limit})")
self.semaphore = asyncio.Semaphore(self.concurrency_limit)
self.storage_lock = asyncio.Lock()
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
if self.clear_manifest:
logger.info("🧹 Clearing manifest")
self.manifest = {}
self._save_manifest()
elif MANIFEST_PATH.exists():
try:
with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
self.manifest = json.load(f)
except Exception:
self.manifest = {}
config_path = DEFAULT_CONFIG_PATH
try:
with open(config_path, "r", encoding="utf-8") as f:
self.plugin_config = tomlkit.load(f)
except Exception as e:
logger.error(f"Failed to load the A_Memorix config: {e}")
return False
try:
await self._init_stores()
except Exception as e:
logger.error(f"Failed to initialize stores: {e}")
return False
return True
async def _init_stores(self):
# ... (Same as original)
self.embedding_manager = create_embedding_api_adapter(
batch_size=self.plugin_config.get("embedding", {}).get("batch_size", 32),
default_dimension=self.plugin_config.get("embedding", {}).get("dimension", 384),
model_name=self.plugin_config.get("embedding", {}).get("model_name", "auto"),
retry_config=self.plugin_config.get("embedding", {}).get("retry", {}),
)
try:
dim = await self.embedding_manager._detect_dimension()
except Exception:
dim = self.embedding_manager.default_dimension
q_type_str = str(self.plugin_config.get("embedding", {}).get("quantization_type", "int8") or "int8").lower()
# Need to access QuantizationType from storage_module if not imported globally
QuantizationType = storage_module.QuantizationType
if q_type_str != "int8":
raise ValueError(
"embedding.quantization_type only allows int8 (SQ8) in vNext."
" Run scripts/release_vnext_migrate.py migrate first."
)
self.vector_store = VectorStore(
dimension=dim,
quantization_type=QuantizationType.INT8,
data_dir=DATA_DIR / "vectors"
)
SparseMatrixFormat = storage_module.SparseMatrixFormat
m_fmt_str = self.plugin_config.get("graph", {}).get("sparse_matrix_format", "csr")
m_map = {"csr": SparseMatrixFormat.CSR, "csc": SparseMatrixFormat.CSC}
self.graph_store = GraphStore(
matrix_format=m_map.get(m_fmt_str, SparseMatrixFormat.CSR),
data_dir=DATA_DIR / "graph"
)
self.metadata_store = MetadataStore(data_dir=DATA_DIR / "metadata")
self.metadata_store.connect()
if RelationWriteService is not None:
self.relation_write_service = RelationWriteService(
metadata_store=self.metadata_store,
graph_store=self.graph_store,
vector_store=self.vector_store,
embedding_manager=self.embedding_manager,
)
if self.vector_store.has_data(): self.vector_store.load()
if self.graph_store.has_data(): self.graph_store.load()
def _should_write_relation_vectors(self) -> bool:
retrieval_cfg = self.plugin_config.get("retrieval", {})
if not isinstance(retrieval_cfg, dict):
return False
rv_cfg = retrieval_cfg.get("relation_vectorization", {})
if not isinstance(rv_cfg, dict):
return False
return bool(rv_cfg.get("enabled", False)) and bool(rv_cfg.get("write_on_import", True))
def load_file(self, file_path: Path) -> str:
with open(file_path, "r", encoding="utf-8") as f:
return f.read()
def get_file_hash(self, content: str) -> str:
return hashlib.md5(content.encode("utf-8")).hexdigest()
def _parse_reference_time(self, value: Optional[str]) -> datetime:
"""Parse the chat_log reference time (used to resolve relative time expressions)."""
if not value:
return datetime.now()
formats = [
"%Y/%m/%d %H:%M:%S",
"%Y/%m/%d %H:%M",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M",
"%Y/%m/%d",
"%Y-%m-%d",
]
text = str(value).strip()
for fmt in formats:
try:
return datetime.strptime(text, fmt)
except ValueError:
continue
logger.warning(
f"Could not parse chat_reference_time={value}; falling back to the current local time"
)
return datetime.now()
async def _extract_chat_time_meta_with_llm(
self,
text: str,
model_config: Any,
) -> Optional[Dict[str, Any]]:
"""
Use the LLM to extract temporal information from the semantics of chat text.
Supports converting relative time expressions into absolute times.
"""
if not text.strip():
return None
reference_now = self.chat_reference_dt.strftime("%Y/%m/%d %H:%M")
prompt = f"""You are a time extraction engine for chat logs.
Extract temporal information from the following chat paragraph.
Rules:
1. Use semantic understanding, not regex matching.
2. Convert relative expressions (e.g., yesterday evening, last Friday morning) to absolute local datetime using reference_now.
3. If a time span exists, return event_time_start/event_time_end.
4. If only one point in time exists, return event_time.
5. If no reliable time can be inferred, return all time fields as null.
6. Output ONLY valid JSON. No markdown, no explanation.
reference_now: {reference_now}
timezone: local system timezone
Allowed output formats for time values:
- "YYYY/MM/DD"
- "YYYY/MM/DD HH:mm"
JSON schema:
{{
"event_time": null,
"event_time_start": null,
"event_time_end": null,
"time_range": null,
"time_granularity": "day",
"time_confidence": 0.0
}}
Chat paragraph:
\"\"\"{text}\"\"\"
"""
try:
result = await self._llm_call(prompt, model_config)
except Exception as e:
logger.warning(f"chat_log temporal extraction failed: {e}")
return None
if not isinstance(result, dict):
return None
raw_time_meta = {
"event_time": result.get("event_time"),
"event_time_start": result.get("event_time_start"),
"event_time_end": result.get("event_time_end"),
"time_range": result.get("time_range"),
"time_granularity": result.get("time_granularity"),
"time_confidence": result.get("time_confidence"),
}
try:
normalized = normalize_time_meta(raw_time_meta)
except Exception as e:
logger.warning(f"chat_log temporal extraction result unusable; ignored: {e}")
return None
has_effective_time = any(
key in normalized
for key in ("event_time", "event_time_start", "event_time_end")
)
if not has_effective_time:
return None
return normalized
def _determine_strategy(self, filename: str, content: str) -> BaseStrategy:
"""Layer 1: Global Strategy Routing"""
strategy = select_import_strategy(
content,
override=self.target_type,
chat_log=self.chat_log,
)
if self.chat_log:
logger.info(f"chat_log mode: {filename} forces NarrativeStrategy")
elif strategy == ImportStrategy.QUOTE:
logger.info(f"Auto-detected Quote/Lyric type for {filename}")
if strategy == ImportStrategy.FACTUAL:
return FactualStrategy(filename)
if strategy == ImportStrategy.QUOTE:
return QuoteStrategy(filename)
return NarrativeStrategy(filename)
def _chunk_rescue(self, chunk: ProcessedChunk, filename: str) -> Optional[BaseStrategy]:
"""Layer 2: Chunk-level rescue strategies"""
# If we are already in Quote strategy, no need to rescue
if chunk.type == StratKnowledgeType.QUOTE:
return None
if looks_like_quote_text(chunk.chunk.text):
logger.info(f" > Rescuing chunk {chunk.chunk.index} as Quote")
return QuoteStrategy(filename)
return None
async def process_and_import(self):
if not await self.initialize(): return
files = list(RAW_DIR.glob("*.txt"))
logger.info(f"Found {len(files)} files in {RAW_DIR}")
if not files: return
tasks = []
for file_path in files:
tasks.append(asyncio.create_task(self._process_single_file(file_path)))
results = await asyncio.gather(*tasks, return_exceptions=True)
success_count = sum(1 for r in results if r is True)
logger.info(f"Main pass complete: {success_count}/{len(files)} files processed successfully")
if self.vector_store: self.vector_store.save()
if self.graph_store: self.graph_store.save()
async def _process_single_file(self, file_path: Path) -> bool:
filename = file_path.name
async with self.semaphore:
try:
content = self.load_file(file_path)
file_hash = self.get_file_hash(content)
if not self.force and filename in self.manifest:
record = self.manifest[filename]
if record.get("hash") == file_hash and record.get("imported"):
logger.info(f"Skipping already-imported file: {filename}")
return False
logger.info(f">>> Processing: {filename}")
# 1. Strategy Selection
strategy = self._determine_strategy(filename, content)
logger.info(f" Strategy: {strategy.__class__.__name__}")
# 2. Split (Strategy-Aware)
initial_chunks = strategy.split(content)
logger.info(f" Initial chunks: {len(initial_chunks)}")
processed_data = {"paragraphs": [], "entities": [], "relations": []}
# 3. Extract Loop
model_config = await self._select_model()
for i, chunk in enumerate(initial_chunks):
current_strategy = strategy
# Layer 2: Chunk Rescue
rescue_strategy = self._chunk_rescue(chunk, filename)
if rescue_strategy:
# Do not re-split; treat the whole chunk text as a single block for the
# rescue strategy and retag the existing chunk object in place.
chunk.type = StratKnowledgeType.QUOTE
chunk.flags.verbatim = True
chunk.flags.requires_llm = False # Quotes don't usually need LLM
current_strategy = rescue_strategy
# Extraction
if chunk.flags.requires_llm:
result_chunk = await current_strategy.extract(chunk, lambda p: self._llm_call(p, model_config))
else:
# For quotes, extract might be just pass through or regex
result_chunk = await current_strategy.extract(chunk)
time_meta = None
if self.chat_log:
time_meta = await self._extract_chat_time_meta_with_llm(
result_chunk.chunk.text,
model_config,
)
# Normalize Data
self._normalize_and_aggregate(
result_chunk,
processed_data,
time_meta=time_meta,
)
logger.info(f" Processed chunk {i+1}/{len(initial_chunks)}")
# 4. Save Json
json_path = PROCESSED_DIR / f"{file_path.stem}.json"
with open(json_path, "w", encoding="utf-8") as f:
json.dump(processed_data, f, ensure_ascii=False, indent=2)
# 5. Import to DB
async with self.storage_lock:
await self._import_to_db(processed_data)
self.manifest[filename] = {
"hash": file_hash,
"timestamp": time.time(),
"imported": True
}
self._save_manifest()
self.vector_store.save()
self.graph_store.save()
logger.info(f"✅ File {filename} processed and imported")
return True
except Exception as e:
logger.error(f"❌ Failed to process {filename}: {e}")
import traceback
traceback.print_exc()
return False
def _normalize_and_aggregate(
self,
chunk: ProcessedChunk,
all_data: Dict,
time_meta: Optional[Dict[str, Any]] = None,
):
"""Convert strategy-specific data to unified generic format for storage."""
# Generic fields
para_item = {
"content": chunk.chunk.text,
"source": chunk.source.file,
"knowledge_type": resolve_stored_knowledge_type(
chunk.type.value,
content=chunk.chunk.text,
).value,
"entities": [],
"relations": []
}
data = chunk.data
# 1. Triples (Factual)
if "triples" in data:
for t in data["triples"]:
para_item["relations"].append({
"subject": t.get("subject"),
"predicate": t.get("predicate"),
"object": t.get("object")
})
# Auto-add entities from triples
para_item["entities"].extend([t.get("subject"), t.get("object")])
# 2. Events & Relations (Narrative)
if "events" in data:
# Store events as entities: they then take part in graph linking and
# entity-level retrieval like any other entity.
para_item["entities"].extend(data["events"])
if "relations" in data: # Narrative also outputs relations list
para_item["relations"].extend(data["relations"])
for r in data["relations"]:
para_item["entities"].extend([r.get("subject"), r.get("object")])
# 3. Verbatim Entities (Quote)
if "verbatim_entities" in data:
para_item["entities"].extend(data["verbatim_entities"])
# Dedupe per paragraph
para_item["entities"] = list(set([e for e in para_item["entities"] if e]))
if time_meta:
para_item["time_meta"] = time_meta
all_data["paragraphs"].append(para_item)
all_data["entities"].extend(para_item["entities"])
if "relations" in para_item:
all_data["relations"].extend(para_item["relations"])
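# Illustrative shape of one normalized paragraph item (values are hypothetical,
# structure mirrors the construction above):
# {
#     "content": "Alice met Bob in Paris.",
#     "source": "notes.txt",
#     "knowledge_type": "narrative",
#     "entities": ["Alice", "Bob"],
#     "relations": [{"subject": "Alice", "predicate": "met", "object": "Bob"}],
#     "time_meta": {...},  # only present for chat-log imports
# }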
@retry(
retry=retry_if_exception_type((LLMGenerationError, json.JSONDecodeError)),
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=2, max=10),
before_sleep=_log_before_retry
)
async def _llm_call(self, prompt: str, model_config: Any) -> Dict:
"""Generic LLM Caller"""
success, response, _, _ = await llm_api.generate_with_model(
prompt=prompt,
model_config=model_config,
request_type="Script.ProcessKnowledge"
)
if success:
txt = response.strip()
if "```" in txt:
    # Strip fenced output; handle both ```json and bare ``` fences.
    marker = "```json" if "```json" in txt else "```"
    txt = txt.split(marker, 1)[1].split("```", 1)[0].strip()
try:
return json.loads(txt)
except json.JSONDecodeError:
# Fallback: try to find first { and last }
start = txt.find('{')
end = txt.rfind('}')
if start != -1 and end != -1:
return json.loads(txt[start:end+1])
raise
else:
raise LLMGenerationError("LLM generation failed")
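# Fence-handling sketch for _llm_call (inputs are illustrative):
#   '```json\n{"a": 1}\n```'        -> {"a": 1}
#   'note:\n```\n{"a": 1}\n```'      -> {"a": 1}   (bare-fence branch)
#   'prefix {"a": 1} suffix'         -> {"a": 1}   (first-{ / last-} fallback)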
async def _select_model(self) -> Any:
models = llm_api.get_available_models()
if not models:
    raise ValueError("No LLM models")
config_model = self.plugin_config.get("advanced", {}).get("extraction_model", "auto")
if config_model != "auto" and config_model in models:
return models[config_model]
for task_key in ["lpmm_entity_extract", "lpmm_rdf_build", "embedding"]:
if task_key in models:
    return models[task_key]
return next(iter(models.values()))
# Shared storage helpers reused by both the file pipeline and JSON import
async def _add_entity_with_vector(self, name: str, source_paragraph: Optional[str] = None) -> str:
# Register metadata + graph node, then attach a vector on a best-effort basis
hash_value = self.metadata_store.add_entity(name, source_paragraph=source_paragraph)
self.graph_store.add_nodes([name])
try:
    emb = await self.embedding_manager.encode(name)
    self.vector_store.add(emb.reshape(1, -1), [hash_value])
except ValueError:
    pass  # vector for this hash already exists
except Exception:
    pass  # embedding failure must not block entity creation
return hash_value
async def import_json_data(self, data: Dict, filename: str = "script_import", progress_callback=None):
"""Public import entrypoint for pre-processed JSON payloads."""
if not self.storage_lock:
raise RuntimeError("Importer is not initialized. Call initialize() first.")
async with self.storage_lock:
await self._import_to_db(data, progress_callback=progress_callback)
self.manifest[filename] = {
"hash": self.get_file_hash(json.dumps(data, ensure_ascii=False, sort_keys=True)),
"timestamp": time.time(),
"imported": True,
}
self._save_manifest()
self.vector_store.save()
self.graph_store.save()
async def _import_to_db(self, data: Dict, progress_callback=None):
# Core import loop; defensive against partial or dirty payloads
with self.graph_store.batch_update():
for item in data.get("paragraphs", []):
paragraph = normalize_paragraph_import_item(
item,
default_source="script",
)
content = paragraph["content"]
source = paragraph["source"]
k_type_val = paragraph["knowledge_type"]
h_val = self.metadata_store.add_paragraph(
content=content,
source=source,
knowledge_type=k_type_val,
time_meta=paragraph["time_meta"],
)
if h_val not in self.vector_store:
try:
emb = await self.embedding_manager.encode(content)
self.vector_store.add(emb.reshape(1, -1), [h_val])
except Exception as e:
logger.error(f" Paragraph vector write failed: {e}")
para_entities = paragraph["entities"]
for entity in para_entities:
if entity:
await self._add_entity_with_vector(entity, source_paragraph=h_val)
para_relations = paragraph["relations"]
for rel in para_relations:
s, p, o = rel.get("subject"), rel.get("predicate"), rel.get("object")
if s and p and o:
await self._add_entity_with_vector(s, source_paragraph=h_val)
await self._add_entity_with_vector(o, source_paragraph=h_val)
confidence = float(rel.get("confidence", 1.0) or 1.0)
rel_meta = rel.get("metadata", {})
write_vector = self._should_write_relation_vectors()
if self.relation_write_service is not None:
await self.relation_write_service.upsert_relation_with_vector(
subject=s,
predicate=p,
obj=o,
confidence=confidence,
source_paragraph=h_val,
metadata=rel_meta if isinstance(rel_meta, dict) else {},
write_vector=write_vector,
)
else:
rel_hash = self.metadata_store.add_relation(
s,
p,
o,
confidence=confidence,
source_paragraph=h_val,
metadata=rel_meta if isinstance(rel_meta, dict) else {},
)
self.graph_store.add_edges([(s, o)], relation_hashes=[rel_hash])
try:
self.metadata_store.set_relation_vector_state(rel_hash, "none")
except Exception:
pass  # best-effort: vector-state bookkeeping must not fail the import
if progress_callback:
    progress_callback(1)
async def close(self):
if self.metadata_store:
    self.metadata_store.close()
def _save_manifest(self):
with open(MANIFEST_PATH, "w", encoding="utf-8") as f:
json.dump(self.manifest, f, ensure_ascii=False, indent=2)
async def main():
parser = _build_arg_parser()
args = parser.parse_args()
if not global_config:
    # Host config is unavailable; nothing to import against.
    return
importer = AutoImporter(
force=args.force,
clear_manifest=args.clear_manifest,
target_type=args.type,
concurrency=args.concurrency,
chat_log=args.chat_log,
chat_reference_time=args.chat_reference_time,
)
await importer.process_and_import()
await importer.close()
if __name__ == "__main__":
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(main())

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""Episode source-level rebuild tool."""
from __future__ import annotations
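# Usage sketch (script name and source values are illustrative):
#   python rebuild_episodes.py --source chat_2025.txt          # enqueue one source
#   python rebuild_episodes.py --all --wait                    # rebuild everything synchronously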
import argparse
import asyncio
import sys
from typing import Any, Dict, List
from _bootstrap import DEFAULT_CONFIG_PATH, DEFAULT_DATA_DIR, resolve_repo_path
try:
import tomlkit # type: ignore
except Exception: # pragma: no cover
tomlkit = None
from A_memorix.core.storage import MetadataStore
from A_memorix.core.utils.episode_service import EpisodeService
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Rebuild A_Memorix episodes by source")
parser.add_argument("--data-dir", default=str(DEFAULT_DATA_DIR), help="plugin data directory")
parser.add_argument("--source", type=str, help="enqueue/rebuild a single source")
parser.add_argument("--all", action="store_true", help="enqueue/rebuild all sources")
parser.add_argument("--wait", action="store_true", help="run the rebuild synchronously inside this script")
return parser
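# --help/-h fast path: avoid heavy host/plugin bootstrap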
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
raise SystemExit(0)
def _load_plugin_config() -> Dict[str, Any]:
config_path = DEFAULT_CONFIG_PATH
if tomlkit is None or not config_path.exists():
return {}
try:
with open(config_path, "r", encoding="utf-8") as handle:
parsed = tomlkit.load(handle)
return dict(parsed) if isinstance(parsed, dict) else {}
except Exception:
return {}
def _resolve_sources(store: MetadataStore, *, source: str | None, rebuild_all: bool) -> List[str]:
if rebuild_all:
return list(store.list_episode_sources_for_rebuild())
token = str(source or "").strip()
if not token:
raise ValueError("either --source or --all must be provided")
return [token]
async def _run_rebuilds(store: MetadataStore, plugin_config: Dict[str, Any], sources: List[str]) -> int:
service = EpisodeService(metadata_store=store, plugin_config=plugin_config)
failures: List[str] = []
for source in sources:
started = store.mark_episode_source_running(source)
if not started:
failures.append(f"{source}: unable_to_mark_running")
continue
try:
result = await service.rebuild_source(source)
store.mark_episode_source_done(source)
print(
"rebuilt"
f" source={source}"
f" paragraphs={int(result.get('paragraph_count') or 0)}"
f" groups={int(result.get('group_count') or 0)}"
f" episodes={int(result.get('episode_count') or 0)}"
f" fallback={int(result.get('fallback_count') or 0)}"
)
except Exception as exc:
err = str(exc)[:500]
store.mark_episode_source_failed(source, err)
failures.append(f"{source}: {err}")
print(f"failed source={source} error={err}")
if failures:
for item in failures:
print(item)
return 1
return 0
def main() -> int:
parser = _build_arg_parser()
args = parser.parse_args()
if bool(args.all) == bool(args.source):
parser.error("exactly one of --source or --all must be given")
store = MetadataStore(data_dir=resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR) / "metadata")
store.connect()
try:
sources = _resolve_sources(store, source=args.source, rebuild_all=bool(args.all))
if not sources:
print("no sources to rebuild")
return 0
enqueued = 0
reason = "script_rebuild_all" if args.all else "script_rebuild_source"
for source in sources:
enqueued += int(store.enqueue_episode_source_rebuild(source, reason=reason))
print(f"enqueued={enqueued} sources={len(sources)}")
if not args.wait:
return 0
plugin_config = _load_plugin_config()
return asyncio.run(_run_rebuilds(store, plugin_config, sources))
finally:
store.close()
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,744 @@
#!/usr/bin/env python3
"""
vNext release migration entrypoint for A_Memorix.
Subcommands:
- preflight: detect legacy config/data/schema risks
- migrate: offline migrate config + vectors + metadata schema + graph edge hash map
- verify: strict post-migration consistency checks
"""
from __future__ import annotations
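# Usage sketch (script name is illustrative, flags as defined below):
#   python vnext_migrate.py preflight --strict
#   python vnext_migrate.py migrate --dry-run
#   python vnext_migrate.py migrate --verify-after --json-out report.json
#   python vnext_migrate.py verify --strict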
import argparse
import json
import pickle
import sqlite3
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
import tomlkit
from _bootstrap import DEFAULT_CONFIG_PATH, DEFAULT_DATA_DIR, resolve_repo_path
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="A_Memorix vNext release migration tool")
parser.add_argument(
"--config",
default=str(DEFAULT_CONFIG_PATH),
help="config.toml path (default: config/a_memorix.toml)",
)
parser.add_argument(
"--data-dir",
default="",
help="optional data dir override; default resolved from config.storage.data_dir",
)
parser.add_argument("--json-out", default="", help="optional JSON report output path")
sub = parser.add_subparsers(dest="command", required=True)
p_preflight = sub.add_parser("preflight", help="scan legacy risks")
p_preflight.add_argument("--strict", action="store_true", help="return 1 if any error check exists")
p_migrate = sub.add_parser("migrate", help="run offline migration")
p_migrate.add_argument("--dry-run", action="store_true", help="only print planned changes")
p_migrate.add_argument(
"--verify-after",
action="store_true",
help="run verify automatically after migrate",
)
p_verify = sub.add_parser("verify", help="post-migration verification")
p_verify.add_argument("--strict", action="store_true", help="return 1 if any error check exists")
return parser
# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
raise SystemExit(0)
try:
from A_memorix.core.storage import GraphStore, KnowledgeType, MetadataStore, QuantizationType, VectorStore
from A_memorix.core.storage.metadata_store import SCHEMA_VERSION
except Exception as e: # pragma: no cover
print(f"❌ failed to import storage modules: {e}")
raise SystemExit(2)
@dataclass
class CheckItem:
code: str
level: str
message: str
details: Optional[Dict[str, Any]] = None
def to_dict(self) -> Dict[str, Any]:
out = {
"code": self.code,
"level": self.level,
"message": self.message,
}
if self.details:
out["details"] = self.details
return out
def _read_toml(path: Path) -> Dict[str, Any]:
text = path.read_text(encoding="utf-8")
return tomlkit.parse(text)
def _write_toml(path: Path, data: Dict[str, Any]) -> None:
path.write_text(tomlkit.dumps(data), encoding="utf-8")
def _get_nested(obj: Dict[str, Any], keys: Sequence[str], default: Any = None) -> Any:
cur: Any = obj
for k in keys:
if not isinstance(cur, dict) or k not in cur:
return default
cur = cur[k]
return cur
def _ensure_table(obj: Dict[str, Any], key: str) -> Dict[str, Any]:
if key not in obj or not isinstance(obj[key], dict):
obj[key] = tomlkit.table()
return obj[key]
def _resolve_data_dir(config_doc: Dict[str, Any], explicit_data_dir: Optional[str]) -> Path:
if explicit_data_dir:
return resolve_repo_path(explicit_data_dir, fallback=DEFAULT_DATA_DIR)
raw = str(_get_nested(config_doc, ("storage", "data_dir"), "./data") or "./data").strip()
return resolve_repo_path(raw, fallback=DEFAULT_DATA_DIR)
def _sqlite_table_exists(conn: sqlite3.Connection, table: str) -> bool:
row = conn.execute(
"SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1",
(table,),
).fetchone()
return row is not None
def _collect_hash_alias_conflicts(conn: sqlite3.Connection) -> Dict[str, List[str]]:
hashes: List[str] = []
if _sqlite_table_exists(conn, "relations"):
rows = conn.execute("SELECT hash FROM relations").fetchall()
hashes.extend(str(r[0]) for r in rows if r and r[0])
if _sqlite_table_exists(conn, "deleted_relations"):
rows = conn.execute("SELECT hash FROM deleted_relations").fetchall()
hashes.extend(str(r[0]) for r in rows if r and r[0])
alias_map: Dict[str, str] = {}
conflicts: Dict[str, set[str]] = {}
for h in hashes:
if len(h) != 64:
continue
alias = h[:32]
old = alias_map.get(alias)
if old is None:
alias_map[alias] = h
continue
if old != h:
conflicts.setdefault(alias, set()).update({old, h})
return {k: sorted(v) for k, v in conflicts.items()}
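# Worked example (hashes are hypothetical): two distinct 64-char relation
# hashes sharing the same first 32 chars collide on the short alias:
#   h1 = "a" * 32 + "0" * 32
#   h2 = "a" * 32 + "f" * 32
#   -> conflicts == {"a" * 32: [h1, h2]}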
def _collect_invalid_knowledge_types(conn: sqlite3.Connection) -> List[str]:
if not _sqlite_table_exists(conn, "paragraphs"):
return []
allowed = {item.value for item in KnowledgeType}
rows = conn.execute("SELECT DISTINCT knowledge_type FROM paragraphs").fetchall()
invalid: List[str] = []
for row in rows:
raw = row[0]
value = str(raw).strip().lower() if raw is not None else ""
if value not in allowed:
invalid.append(str(raw) if raw is not None else "")
return sorted(set(invalid))
def _guess_vector_dimension(config_doc: Dict[str, Any], vectors_dir: Path) -> int:
meta_path = vectors_dir / "vectors_metadata.pkl"
if meta_path.exists():
try:
with open(meta_path, "rb") as f:
meta = pickle.load(f)
dim = int(meta.get("dimension", 0))
if dim > 0:
return dim
except Exception:
pass
try:
dim_cfg = int(_get_nested(config_doc, ("embedding", "dimension"), 1024))
if dim_cfg > 0:
return dim_cfg
except Exception:
pass
return 1024
def _preflight_impl(config_path: Path, data_dir: Path) -> Dict[str, Any]:
checks: List[CheckItem] = []
facts: Dict[str, Any] = {
"config_path": str(config_path),
"data_dir": str(data_dir),
}
if not config_path.exists():
checks.append(CheckItem("CFG-00", "error", f"config not found: {config_path}"))
return {"ok": False, "checks": [c.to_dict() for c in checks], "facts": facts}
config_doc = _read_toml(config_path)
tool_mode = str(_get_nested(config_doc, ("routing", "tool_search_mode"), "forward") or "").strip().lower()
summary_model = _get_nested(config_doc, ("summarization", "model_name"), ["auto"])
summary_knowledge_type = str(
_get_nested(config_doc, ("summarization", "default_knowledge_type"), "narrative") or "narrative"
).strip().lower()
quantization = str(_get_nested(config_doc, ("embedding", "quantization_type"), "int8") or "").strip().lower()
facts["routing.tool_search_mode"] = tool_mode
facts["summarization.model_name_type"] = type(summary_model).__name__
facts["summarization.default_knowledge_type"] = summary_knowledge_type
facts["embedding.quantization_type"] = quantization
if tool_mode == "legacy":
checks.append(
CheckItem(
"CP-04",
"error",
"routing.tool_search_mode=legacy is no longer accepted at runtime",
)
)
elif tool_mode not in {"forward", "disabled"}:
checks.append(
CheckItem(
"CP-04",
"error",
f"routing.tool_search_mode invalid value: {tool_mode}",
)
)
if isinstance(summary_model, str):
checks.append(
CheckItem(
"CP-11",
"error",
"summarization.model_name must be List[str], string legacy format detected",
)
)
elif not isinstance(summary_model, list) or any(not isinstance(x, str) for x in summary_model):
checks.append(
CheckItem(
"CP-11",
"error",
"summarization.model_name must be List[str]",
)
)
if summary_knowledge_type not in {item.value for item in KnowledgeType}:
checks.append(
CheckItem(
"CP-13",
"error",
f"invalid summarization.default_knowledge_type: {summary_knowledge_type}",
)
)
if quantization != "int8":
checks.append(
CheckItem(
"UG-07",
"error",
"embedding.quantization_type must be int8 in vNext",
)
)
vectors_dir = data_dir / "vectors"
npy_path = vectors_dir / "vectors.npy"
bin_path = vectors_dir / "vectors.bin"
ids_bin_path = vectors_dir / "vectors_ids.bin"
facts["vectors.npy_exists"] = npy_path.exists()
facts["vectors.bin_exists"] = bin_path.exists()
facts["vectors_ids.bin_exists"] = ids_bin_path.exists()
if npy_path.exists() and not (bin_path.exists() and ids_bin_path.exists()):
checks.append(
CheckItem(
"CP-07",
"error",
"legacy vectors.npy detected; offline migrate required",
{"npy_path": str(npy_path)},
)
)
metadata_db = data_dir / "metadata" / "metadata.db"
facts["metadata_db_exists"] = metadata_db.exists()
relation_count = 0
if metadata_db.exists():
conn = sqlite3.connect(str(metadata_db))
try:
has_schema_table = _sqlite_table_exists(conn, "schema_migrations")
facts["schema_migrations_exists"] = has_schema_table
has_paragraph_backfill = _sqlite_table_exists(conn, "paragraph_vector_backfill")
facts["paragraph_vector_backfill_exists"] = has_paragraph_backfill
if not has_schema_table:
checks.append(
CheckItem(
"CP-08",
"error",
"schema_migrations table missing (legacy metadata schema)",
)
)
else:
row = conn.execute("SELECT MAX(version) FROM schema_migrations").fetchone()
version = int(row[0]) if row and row[0] is not None else 0
facts["schema_version"] = version
if version != SCHEMA_VERSION:
checks.append(
CheckItem(
"CP-08",
"error",
f"schema version mismatch: current={version}, expected={SCHEMA_VERSION}",
)
)
elif not has_paragraph_backfill:
checks.append(
CheckItem(
"CP-14",
"error",
"paragraph_vector_backfill table missing under current schema version",
)
)
if _sqlite_table_exists(conn, "relations"):
row = conn.execute("SELECT COUNT(*) FROM relations").fetchone()
relation_count = int(row[0]) if row and row[0] is not None else 0
facts["relations_count"] = relation_count
conflicts = _collect_hash_alias_conflicts(conn)
facts["alias_conflict_count"] = len(conflicts)
if conflicts:
checks.append(
CheckItem(
"CP-05",
"error",
"32-bit relation hash alias conflict detected",
{"aliases": sorted(conflicts.keys())[:20], "total": len(conflicts)},
)
)
invalid_knowledge_types = _collect_invalid_knowledge_types(conn)
facts["invalid_knowledge_type_values"] = invalid_knowledge_types
if invalid_knowledge_types:
checks.append(
CheckItem(
"CP-12",
"error",
"invalid paragraph knowledge_type values detected",
{"values": invalid_knowledge_types[:20], "total": len(invalid_knowledge_types)},
)
)
finally:
conn.close()
else:
checks.append(
CheckItem(
"META-00",
"warning",
"metadata.db not found, schema checks skipped",
)
)
graph_meta_path = data_dir / "graph" / "graph_metadata.pkl"
facts["graph_metadata_exists"] = graph_meta_path.exists()
if relation_count > 0:
if not graph_meta_path.exists():
checks.append(
CheckItem(
"CP-06",
"error",
"relations exist but graph metadata missing",
)
)
else:
try:
with open(graph_meta_path, "rb") as f:
graph_meta = pickle.load(f)
edge_hash_map = graph_meta.get("edge_hash_map", {})
edge_hash_map_size = len(edge_hash_map) if isinstance(edge_hash_map, dict) else 0
facts["edge_hash_map_size"] = edge_hash_map_size
if edge_hash_map_size <= 0:
checks.append(
CheckItem(
"CP-06",
"error",
"edge_hash_map missing/empty while relations exist",
)
)
except Exception as e:
checks.append(
CheckItem(
"CP-06",
"error",
f"failed to read graph metadata: {e}",
)
)
has_error = any(c.level == "error" for c in checks)
return {
"ok": not has_error,
"checks": [c.to_dict() for c in checks],
"facts": facts,
}
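# Illustrative preflight report (values are hypothetical):
# {
#     "ok": False,
#     "checks": [{"code": "CP-07", "level": "error",
#                 "message": "legacy vectors.npy detected; offline migrate required"}],
#     "facts": {"vectors.npy_exists": True, "metadata_db_exists": True, ...},
# }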
def _migrate_config(config_doc: Dict[str, Any]) -> Dict[str, Any]:
changes: Dict[str, Any] = {}
routing = _ensure_table(config_doc, "routing")
mode_raw = str(routing.get("tool_search_mode", "forward") or "").strip().lower()
mode_new = mode_raw
if mode_raw == "legacy" or mode_raw not in {"forward", "disabled"}:
mode_new = "forward"
if mode_new != mode_raw:
routing["tool_search_mode"] = mode_new
changes["routing.tool_search_mode"] = {"old": mode_raw, "new": mode_new}
summary = _ensure_table(config_doc, "summarization")
summary_model = summary.get("model_name", ["auto"])
if isinstance(summary_model, str):
normalized = [summary_model.strip() or "auto"]
summary["model_name"] = normalized
changes["summarization.model_name"] = {"old": summary_model, "new": normalized}
elif not isinstance(summary_model, list):
normalized = ["auto"]
summary["model_name"] = normalized
changes["summarization.model_name"] = {"old": str(type(summary_model)), "new": normalized}
elif any(not isinstance(x, str) for x in summary_model):
normalized = [str(x).strip() for x in summary_model if str(x).strip()]
if not normalized:
normalized = ["auto"]
summary["model_name"] = normalized
changes["summarization.model_name"] = {"old": summary_model, "new": normalized}
default_knowledge_type = str(summary.get("default_knowledge_type", "narrative") or "").strip().lower()
allowed_knowledge_types = {item.value for item in KnowledgeType}
if default_knowledge_type not in allowed_knowledge_types:
summary["default_knowledge_type"] = "narrative"
changes["summarization.default_knowledge_type"] = {
"old": default_knowledge_type,
"new": "narrative",
}
embedding = _ensure_table(config_doc, "embedding")
quantization = str(embedding.get("quantization_type", "int8") or "").strip().lower()
if quantization != "int8":
embedding["quantization_type"] = "int8"
changes["embedding.quantization_type"] = {"old": quantization, "new": "int8"}
return changes
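# Illustrative changes payload when a legacy config is normalized
# (old values are hypothetical):
# {
#     "routing.tool_search_mode": {"old": "legacy", "new": "forward"},
#     "summarization.model_name": {"old": "gpt-x", "new": ["gpt-x"]},
#     "embedding.quantization_type": {"old": "float32", "new": "int8"},
# }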
def _migrate_impl(config_path: Path, data_dir: Path, dry_run: bool) -> Dict[str, Any]:
config_doc = _read_toml(config_path)
result: Dict[str, Any] = {
"config_path": str(config_path),
"data_dir": str(data_dir),
"dry_run": bool(dry_run),
"steps": {},
}
config_changes = _migrate_config(config_doc)
result["steps"]["config"] = {"changed": bool(config_changes), "changes": config_changes}
if config_changes and not dry_run:
_write_toml(config_path, config_doc)
vectors_dir = data_dir / "vectors"
vectors_dir.mkdir(parents=True, exist_ok=True)
npy_path = vectors_dir / "vectors.npy"
bin_path = vectors_dir / "vectors.bin"
ids_bin_path = vectors_dir / "vectors_ids.bin"
if npy_path.exists() and not (bin_path.exists() and ids_bin_path.exists()):
if dry_run:
result["steps"]["vector"] = {"migrated": False, "reason": "dry_run"}
else:
dim = _guess_vector_dimension(config_doc, vectors_dir)
store = VectorStore(
dimension=max(1, int(dim)),
quantization_type=QuantizationType.INT8,
data_dir=vectors_dir,
)
result["steps"]["vector"] = store.migrate_legacy_npy(vectors_dir)
else:
result["steps"]["vector"] = {"migrated": False, "reason": "not_required"}
metadata_dir = data_dir / "metadata"
metadata_dir.mkdir(parents=True, exist_ok=True)
metadata_db = metadata_dir / "metadata.db"
triples: List[Tuple[str, str, str, str]] = []
relation_count = 0
metadata_result: Dict[str, Any] = {"migrated": False, "reason": "not_required"}
if metadata_db.exists():
store = MetadataStore(data_dir=metadata_dir)
store.connect(enforce_schema=False)
try:
if dry_run:
metadata_result = {"migrated": False, "reason": "dry_run"}
else:
metadata_result = store.run_legacy_migration_for_vnext()
relation_count = int(store.count_relations())
if relation_count > 0:
triples = [(str(s), str(p), str(o), str(h)) for s, p, o, h in store.get_all_triples()]
finally:
store.close()
result["steps"]["metadata"] = metadata_result
graph_dir = data_dir / "graph"
graph_dir.mkdir(parents=True, exist_ok=True)
graph_matrix_format = str(_get_nested(config_doc, ("graph", "sparse_matrix_format"), "csr") or "csr")
graph_store = GraphStore(matrix_format=graph_matrix_format, data_dir=graph_dir)
graph_step: Dict[str, Any] = {
"rebuilt": False,
"mapped_hashes": 0,
"relation_count": relation_count,
"topology_rebuilt_from_relations": False,
}
if relation_count > 0:
if dry_run:
graph_step["reason"] = "dry_run"
else:
if graph_store.has_data():
graph_store.load()
mapped = graph_store.rebuild_edge_hash_map(triples)
# Fallback: when legacy graph nodes/edges have drifted from the
# relations table, rebuild the topology directly from relations.
if mapped <= 0 or not graph_store.has_edge_hash_map():
    nodes = sorted({s for s, _, _, _ in triples} | {o for _, _, o, _ in triples})
edges = [(s, o) for s, _, o, _ in triples]
hashes = [h for _, _, _, h in triples]
graph_store.clear()
if nodes:
graph_store.add_nodes(nodes)
if edges:
mapped = graph_store.add_edges(edges, relation_hashes=hashes)
else:
mapped = 0
graph_step.update(
{
"topology_rebuilt_from_relations": True,
"rebuilt_nodes": len(nodes),
"rebuilt_edges": int(graph_store.num_edges),
}
)
graph_store.save()
graph_step.update({"rebuilt": True, "mapped_hashes": int(mapped)})
else:
graph_step["reason"] = "no_relations"
result["steps"]["graph"] = graph_step
return result
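# Illustrative result["steps"] after a non-dry-run (counts are hypothetical):
# {
#     "config":   {"changed": True, "changes": {...}},
#     "vector":   {"migrated": False, "reason": "not_required"},
#     "metadata": {...},  # payload from run_legacy_migration_for_vnext()
#     "graph":    {"rebuilt": True, "mapped_hashes": 128, "relation_count": 128,
#                  "topology_rebuilt_from_relations": False},
# }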
def _verify_impl(config_path: Path, data_dir: Path) -> Dict[str, Any]:
checks: List[CheckItem] = []
facts: Dict[str, Any] = {
"config_path": str(config_path),
"data_dir": str(data_dir),
}
if not config_path.exists():
checks.append(CheckItem("CFG-00", "error", f"config not found: {config_path}"))
return {"ok": False, "checks": [c.to_dict() for c in checks], "facts": facts}
config_doc = _read_toml(config_path)
mode = str(_get_nested(config_doc, ("routing", "tool_search_mode"), "forward") or "").strip().lower()
if mode not in {"forward", "disabled"}:
checks.append(CheckItem("CP-04", "error", f"invalid routing.tool_search_mode: {mode}"))
summary_model = _get_nested(config_doc, ("summarization", "model_name"), ["auto"])
if not isinstance(summary_model, list) or any(not isinstance(x, str) for x in summary_model):
checks.append(CheckItem("CP-11", "error", "summarization.model_name must be List[str]"))
summary_knowledge_type = str(
_get_nested(config_doc, ("summarization", "default_knowledge_type"), "narrative") or "narrative"
).strip().lower()
if summary_knowledge_type not in {item.value for item in KnowledgeType}:
checks.append(
CheckItem("CP-13", "error", f"invalid summarization.default_knowledge_type: {summary_knowledge_type}")
)
quantization = str(_get_nested(config_doc, ("embedding", "quantization_type"), "int8") or "").strip().lower()
if quantization != "int8":
checks.append(CheckItem("UG-07", "error", "embedding.quantization_type must be int8"))
vectors_dir = data_dir / "vectors"
npy_path = vectors_dir / "vectors.npy"
bin_path = vectors_dir / "vectors.bin"
ids_bin_path = vectors_dir / "vectors_ids.bin"
if npy_path.exists() and not (bin_path.exists() and ids_bin_path.exists()):
checks.append(CheckItem("CP-07", "error", "legacy vectors.npy still exists without bin migration"))
metadata_dir = data_dir / "metadata"
store = MetadataStore(data_dir=metadata_dir)
try:
store.connect(enforce_schema=True)
schema_version = store.get_schema_version()
facts["schema_version"] = schema_version
if schema_version != SCHEMA_VERSION:
checks.append(CheckItem("CP-08", "error", f"schema version mismatch: {schema_version}"))
relation_count = int(store.count_relations())
facts["relations_count"] = relation_count
conflicts = {}
invalid_knowledge_types: List[str] = []
db_path = metadata_dir / "metadata.db"
if db_path.exists():
conn = sqlite3.connect(str(db_path))
try:
has_paragraph_backfill = _sqlite_table_exists(conn, "paragraph_vector_backfill")
facts["paragraph_vector_backfill_exists"] = bool(has_paragraph_backfill)
if not has_paragraph_backfill:
checks.append(
CheckItem(
"CP-14",
"error",
"paragraph_vector_backfill table missing after migration",
)
)
conflicts = _collect_hash_alias_conflicts(conn)
invalid_knowledge_types = _collect_invalid_knowledge_types(conn)
finally:
conn.close()
if conflicts:
checks.append(
CheckItem(
"CP-05",
"error",
"alias conflicts still exist after migration",
{"aliases": sorted(conflicts.keys())[:20], "total": len(conflicts)},
)
)
if invalid_knowledge_types:
checks.append(
CheckItem(
"CP-12",
"error",
"invalid paragraph knowledge_type values remain after migration",
{"values": invalid_knowledge_types[:20], "total": len(invalid_knowledge_types)},
)
)
if relation_count > 0:
graph_dir = data_dir / "graph"
if not (graph_dir / "graph_metadata.pkl").exists():
checks.append(CheckItem("CP-06", "error", "graph metadata missing while relations exist"))
else:
matrix_format = str(_get_nested(config_doc, ("graph", "sparse_matrix_format"), "csr") or "csr")
graph_store = GraphStore(matrix_format=matrix_format, data_dir=graph_dir)
graph_store.load()
if not graph_store.has_edge_hash_map():
checks.append(CheckItem("CP-06", "error", "edge_hash_map is empty"))
except Exception as e:
checks.append(CheckItem("CP-08", "error", f"metadata strict connect failed: {e}"))
finally:
try:
store.close()
except Exception:
pass
has_error = any(c.level == "error" for c in checks)
return {
"ok": not has_error,
"checks": [c.to_dict() for c in checks],
"facts": facts,
}
def _print_report(title: str, report: Dict[str, Any]) -> None:
print(f"=== {title} ===")
print(f"ok: {bool(report.get('ok', True))}")
facts = report.get("facts", {})
if facts:
print("facts:")
for k in sorted(facts.keys()):
print(f" - {k}: {facts[k]}")
checks = report.get("checks", [])
if checks:
print("checks:")
for item in checks:
print(f" - [{item.get('level')}] {item.get('code')}: {item.get('message')}")
else:
print("checks: none")
def _write_json_if_needed(path: str, payload: Dict[str, Any]) -> None:
if not path:
return
out = Path(path).expanduser().resolve()
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"json_out: {out}")
def main() -> int:
parser = _build_arg_parser()
args = parser.parse_args()
config_path = resolve_repo_path(args.config, fallback=DEFAULT_CONFIG_PATH)
if not config_path.exists():
print(f"❌ config not found: {config_path}")
return 2
config_doc = _read_toml(config_path)
data_dir = _resolve_data_dir(config_doc, args.data_dir)
if args.command == "preflight":
report = _preflight_impl(config_path, data_dir)
_print_report("vNext Preflight", report)
_write_json_if_needed(args.json_out, report)
has_error = any(item.get("level") == "error" for item in report.get("checks", []))
if args.strict and has_error:
return 1
return 0
if args.command == "migrate":
payload = _migrate_impl(config_path, data_dir, dry_run=bool(args.dry_run))
print("=== vNext Migrate ===")
print(json.dumps(payload, ensure_ascii=False, indent=2))
verify_report = None
if args.verify_after and not args.dry_run:
verify_report = _verify_impl(config_path, data_dir)
_print_report("vNext Verify (after migrate)", verify_report)
payload["verify_after"] = verify_report
_write_json_if_needed(args.json_out, payload)
if verify_report is not None:
has_error = any(item.get("level") == "error" for item in verify_report.get("checks", []))
if has_error:
return 1
return 0
if args.command == "verify":
report = _verify_impl(config_path, data_dir)
_print_report("vNext Verify", report)
_write_json_if_needed(args.json_out, report)
has_error = any(item.get("level") == "error" for item in report.get("checks", []))
if args.strict and has_error:
return 1
return 0
return 2
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""Run A_Memorix runtime self-check against real embedding/runtime configuration."""
from __future__ import annotations
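# Usage sketch (script name is illustrative, flags as defined below):
#   python runtime_self_check.py --json                    # probe against an isolated temp dir
#   python runtime_self_check.py --use-config-data-dir     # probe against the real data dir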
import argparse
import asyncio
import json
import sys
import tempfile
from pathlib import Path
from typing import Any
import tomlkit
from _bootstrap import DEFAULT_CONFIG_PATH, DEFAULT_DATA_DIR, resolve_repo_path
def _build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="A_Memorix runtime self-check")
parser.add_argument(
"--config",
default=str(DEFAULT_CONFIG_PATH),
help="config.toml path (default: config/a_memorix.toml)",
)
parser.add_argument(
"--data-dir",
default="",
help="optional data dir override; default resolved from config.storage.data_dir",
)
parser.add_argument(
"--use-config-data-dir",
action="store_true",
help="use config.storage.data_dir directly instead of an isolated temp dir",
)
parser.add_argument(
"--sample-text",
default="A_Memorix runtime self check",
help="sample text used for real embedding probe",
)
parser.add_argument("--json", action="store_true", help="print JSON report")
return parser
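# --help/-h fast path: avoid heavy host/plugin bootstrap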
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
_build_arg_parser().print_help()
raise SystemExit(0)
from A_memorix.core.runtime.lifecycle_orchestrator import initialize_storage_async
from A_memorix.core.utils.runtime_self_check import run_embedding_runtime_self_check
def _load_config(path: Path) -> dict[str, Any]:
with open(path, "r", encoding="utf-8") as f:
raw = tomlkit.load(f)
return dict(raw) if isinstance(raw, dict) else {}
def _nested_get(config: dict[str, Any], key: str, default: Any = None) -> Any:
current: Any = config
for part in key.split("."):
if isinstance(current, dict) and part in current:
current = current[part]
else:
return default
return current
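# Minimal stand-in for the plugin host: initialize_storage_async() is expected
# to populate the *_store / embedding_manager attributes on this object
# (assumption inferred from how the stub is used in _main_async below).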
class _PluginStub:
def __init__(self, config: dict[str, Any]):
self.config = config
self.vector_store = None
self.graph_store = None
self.metadata_store = None
self.embedding_manager = None
self.sparse_index = None
self.relation_write_service = None
def get_config(self, key: str, default: Any = None) -> Any:
return _nested_get(self.config, key, default)
async def _main_async(args: argparse.Namespace) -> int:
config_path = resolve_repo_path(args.config, fallback=DEFAULT_CONFIG_PATH)
if not config_path.exists():
print(f"❌ config not found: {config_path}")
return 2
config = _load_config(config_path)
temp_dir_ctx = None
if args.data_dir:
storage_dir = str(resolve_repo_path(args.data_dir, fallback=DEFAULT_DATA_DIR))
elif args.use_config_data_dir:
raw_data_dir = str(_nested_get(config, "storage.data_dir", "./data") or "./data").strip()
storage_dir = str(resolve_repo_path(raw_data_dir, fallback=DEFAULT_DATA_DIR))
else:
temp_dir_ctx = tempfile.TemporaryDirectory(prefix="memorix-runtime-self-check-")
storage_dir = temp_dir_ctx.name
storage_cfg = config.setdefault("storage", {})
storage_cfg["data_dir"] = storage_dir
plugin = _PluginStub(config)
try:
await initialize_storage_async(plugin)
report = await run_embedding_runtime_self_check(
config=config,
vector_store=plugin.vector_store,
embedding_manager=plugin.embedding_manager,
sample_text=str(args.sample_text or "A_Memorix runtime self check"),
)
report["data_dir"] = storage_dir
report["isolated_data_dir"] = temp_dir_ctx is not None
if args.json:
print(json.dumps(report, ensure_ascii=False, indent=2))
else:
print("A_Memorix Runtime Self-Check")
print(f"ok: {report.get('ok')}")
print(f"code: {report.get('code')}")
print(f"message: {report.get('message')}")
print(f"configured_dimension: {report.get('configured_dimension')}")
print(f"vector_store_dimension: {report.get('vector_store_dimension')}")
print(f"detected_dimension: {report.get('detected_dimension')}")
print(f"encoded_dimension: {report.get('encoded_dimension')}")
print(f"elapsed_ms: {float(report.get('elapsed_ms', 0.0)):.2f}")
return 0 if bool(report.get("ok")) else 1
finally:
if plugin.metadata_store is not None:
try:
plugin.metadata_store.close()
except Exception:
pass
if temp_dir_ctx is not None:
temp_dir_ctx.cleanup()
def main() -> int:
parser = _build_arg_parser()
args = parser.parse_args()
return asyncio.run(_main_async(args))
if __name__ == "__main__":
raise SystemExit(main())