Add A_Memorix plugin v2.0.0 (runtime and documentation)
Introduce the A_Memorix plugin v2.0.0: a large set of new runtime components, storage/schema updates, retrieval improvements, admin tools, import/tuning workflows, and the accompanying documentation. Key additions include the lifecycle_orchestrator, the SDKMemoryKernel/runtime initializer, a new storage layer with metadata_store changes (SCHEMA_VERSION v8), retrieval enhancements (dual-path retrieval, graph-relation recall, sparse BM25), and a set of tool services (episode/person_profile/relation/segmentation/tuning/search execution). Also adds web import/digest importers and numerous maintenance scripts. Updates the plugin manifest, the embedding API adapter, plugin.py, requirements/pyproject, and the main entry point so the new plugin is wired into the project. This change prepares the 2.0.0 release, implementing a unified SDK Tool interface and extending overall runtime capabilities.
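The maintenance scripts added in this commit are plain CLI entry points. As a minimal sketch of the grey-release loop described in backfill_relation_vectors.py below (flags and paths are taken from the scripts in this diff; driving them via subprocess like this is an assumption, not part of the commit):

# Hypothetical driver: backfill missing relation vectors, then audit consistency in strict mode.
import subprocess
import sys

SCRIPTS = "plugins/A_memorix/scripts"

# 1) Fill vectors for relations whose vector_state is none/failed/pending.
subprocess.run(
    [sys.executable, f"{SCRIPTS}/backfill_relation_vectors.py", "--concurrency", "8"],
    check=True,
)

# 2) Audit coverage/orphans; with --strict a non-zero exit code signals drift.
audit = subprocess.run(
    [sys.executable, f"{SCRIPTS}/audit_vector_consistency.py", "--strict", "--json-out", "audit_report.json"],
)
print("consistency OK" if audit.returncode == 0 else "drift detected, see audit_report.json")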
213
plugins/A_memorix/scripts/audit_vector_consistency.py
Normal file
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
A_Memorix 一致性审计脚本。

输出内容:
1. paragraph/entity/relation 向量覆盖率
2. relation vector_state 分布
3. 孤儿向量数量(向量存在但 metadata 不存在)
4. 状态与向量文件不一致统计
"""

from __future__ import annotations

import argparse
import json
import pickle
import sys
from pathlib import Path
from typing import Any, Dict, Set


CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PLUGIN_ROOT))


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="审计 A_Memorix 向量一致性")
    parser.add_argument(
        "--data-dir",
        default=str(PLUGIN_ROOT / "data"),
        help="A_Memorix 数据目录(默认: plugins/A_memorix/data)",
    )
    parser.add_argument("--json-out", default="", help="可选:输出 JSON 文件路径")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="若发现一致性异常则返回非 0 退出码",
    )
    return parser


# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    sys.exit(0)

try:
    from core.storage.vector_store import VectorStore
    from core.storage.metadata_store import MetadataStore
    from core.storage import QuantizationType
except Exception as e:  # pragma: no cover
    print(f"❌ 导入核心模块失败: {e}")
    sys.exit(1)


def _safe_ratio(numerator: int, denominator: int) -> float:
    if denominator <= 0:
        return 0.0
    return float(numerator) / float(denominator)


def _load_vector_store(data_dir: Path) -> VectorStore:
    meta_path = data_dir / "vectors" / "vectors_metadata.pkl"
    if not meta_path.exists():
        raise FileNotFoundError(f"未找到向量元数据文件: {meta_path}")

    with open(meta_path, "rb") as f:
        meta = pickle.load(f)
    dimension = int(meta.get("dimension", 1024))

    store = VectorStore(
        dimension=max(1, dimension),
        quantization_type=QuantizationType.INT8,
        data_dir=data_dir / "vectors",
    )
    if store.has_data():
        store.load()
    return store


def _load_metadata_store(data_dir: Path) -> MetadataStore:
    store = MetadataStore(data_dir=data_dir / "metadata")
    store.connect()
    return store


def _hash_set(metadata_store: MetadataStore, table: str) -> Set[str]:
    return {str(h) for h in metadata_store.list_hashes(table)}


def _relation_state_stats(metadata_store: MetadataStore) -> Dict[str, int]:
    return metadata_store.count_relations_by_vector_state()


def run_audit(data_dir: Path) -> Dict[str, Any]:
    vector_store = _load_vector_store(data_dir)
    metadata_store = _load_metadata_store(data_dir)
    try:
        paragraph_hashes = _hash_set(metadata_store, "paragraphs")
        entity_hashes = _hash_set(metadata_store, "entities")
        relation_hashes = _hash_set(metadata_store, "relations")

        known_hashes = set(getattr(vector_store, "_known_hashes", set()))
        live_vector_hashes = {h for h in known_hashes if h in vector_store}

        para_vector_hits = len(paragraph_hashes & live_vector_hashes)
        ent_vector_hits = len(entity_hashes & live_vector_hashes)
        rel_vector_hits = len(relation_hashes & live_vector_hashes)

        orphan_vector_hashes = sorted(
            live_vector_hashes - paragraph_hashes - entity_hashes - relation_hashes
        )

        relation_rows = metadata_store.get_relations()
        ready_but_missing = 0
        not_ready_but_present = 0
        for row in relation_rows:
            h = str(row.get("hash") or "")
            state = str(row.get("vector_state") or "none").lower()
            in_vector = h in live_vector_hashes
            if state == "ready" and not in_vector:
                ready_but_missing += 1
            if state != "ready" and in_vector:
                not_ready_but_present += 1

        relation_states = _relation_state_stats(metadata_store)
        rel_total = max(0, int(relation_states.get("total", len(relation_hashes))))
        ready_count = max(0, int(relation_states.get("ready", 0)))

        result = {
            "counts": {
                "paragraphs": len(paragraph_hashes),
                "entities": len(entity_hashes),
                "relations": len(relation_hashes),
                "vectors_live": len(live_vector_hashes),
            },
            "coverage": {
                "paragraph_vector_coverage": _safe_ratio(para_vector_hits, len(paragraph_hashes)),
                "entity_vector_coverage": _safe_ratio(ent_vector_hits, len(entity_hashes)),
                "relation_vector_coverage": _safe_ratio(rel_vector_hits, len(relation_hashes)),
                "relation_ready_coverage": _safe_ratio(ready_count, rel_total),
            },
            "relation_states": relation_states,
            "orphans": {
                "vector_only_count": len(orphan_vector_hashes),
                "vector_only_sample": orphan_vector_hashes[:30],
            },
            "consistency_checks": {
                "ready_but_missing_vector": ready_but_missing,
                "not_ready_but_vector_present": not_ready_but_present,
            },
        }
        return result
    finally:
        metadata_store.close()


def main() -> int:
    parser = _build_arg_parser()
    args = parser.parse_args()

    data_dir = Path(args.data_dir).resolve()
    if not data_dir.exists():
        print(f"❌ 数据目录不存在: {data_dir}")
        return 2

    try:
        result = run_audit(data_dir)
    except Exception as e:
        print(f"❌ 审计失败: {e}")
        return 2

    print("=== A_Memorix Vector Consistency Audit ===")
    print(f"data_dir: {data_dir}")
    print(f"paragraphs: {result['counts']['paragraphs']}")
    print(f"entities: {result['counts']['entities']}")
    print(f"relations: {result['counts']['relations']}")
    print(f"vectors_live: {result['counts']['vectors_live']}")
    print(
        "coverage: "
        f"paragraph={result['coverage']['paragraph_vector_coverage']:.3f}, "
        f"entity={result['coverage']['entity_vector_coverage']:.3f}, "
        f"relation={result['coverage']['relation_vector_coverage']:.3f}, "
        f"relation_ready={result['coverage']['relation_ready_coverage']:.3f}"
    )
    print(f"relation_states: {result['relation_states']}")
    print(
        "consistency_checks: "
        f"ready_but_missing_vector={result['consistency_checks']['ready_but_missing_vector']}, "
        f"not_ready_but_vector_present={result['consistency_checks']['not_ready_but_vector_present']}"
    )
    print(f"orphan_vectors: {result['orphans']['vector_only_count']}")

    if args.json_out:
        out_path = Path(args.json_out).resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"json_out: {out_path}")

    has_anomaly = (
        result["orphans"]["vector_only_count"] > 0
        or result["consistency_checks"]["ready_but_missing_vector"] > 0
    )
    if args.strict and has_anomaly:
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
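For reference, the audit can also be driven programmatically. A minimal sketch, assuming sys.path is prepared exactly as the script above does so that the core.storage imports resolve; the 0.95 threshold is illustrative, not part of the plugin:

# Programmatic use of run_audit() from the script above.
from pathlib import Path

report = run_audit(Path("plugins/A_memorix/data"))
coverage = report["coverage"]["relation_vector_coverage"]
orphans = report["orphans"]["vector_only_count"]
if coverage < 0.95 or orphans > 0:
    # Drift detected: backfill_relation_vectors.py (next file) is the companion repair tool.
    print(f"relation coverage {coverage:.3f}, orphan vectors {orphans}")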
270
plugins/A_memorix/scripts/backfill_relation_vectors.py
Normal file
@@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""
关系向量一次性回填脚本(灰度/离线执行)。

用途:
1. 对 relations 中 vector_state in (none, failed, pending) 的记录补齐向量。
2. 支持并发控制,降低总耗时。
3. 可作为灰度阶段验证工具,与 audit_vector_consistency.py 配合使用。
4. 可选自动纳入“ready 但向量缺失”的漂移记录进行修复。
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List

import tomlkit


CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PLUGIN_ROOT))


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="关系向量一次性回填")
    parser.add_argument(
        "--config",
        default=str(PLUGIN_ROOT / "config.toml"),
        help="配置文件路径(默认 plugins/A_memorix/config.toml)",
    )
    parser.add_argument(
        "--data-dir",
        default=str(PLUGIN_ROOT / "data"),
        help="数据目录(默认 plugins/A_memorix/data)",
    )
    parser.add_argument(
        "--states",
        default="none,failed,pending",
        help="待处理状态列表,逗号分隔",
    )
    parser.add_argument("--limit", type=int, default=50000, help="最大处理数量")
    parser.add_argument("--concurrency", type=int, default=8, help="并发数")
    parser.add_argument("--max-retry", type=int, default=None, help="最大重试次数过滤")
    parser.add_argument(
        "--include-ready-missing",
        action="store_true",
        help="额外纳入 vector_state=ready 但向量缺失的关系",
    )
    parser.add_argument("--dry-run", action="store_true", help="仅统计候选,不写入")
    return parser


# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)

from core.storage import (
    VectorStore,
    GraphStore,
    MetadataStore,
    QuantizationType,
    SparseMatrixFormat,
)
from core.embedding import create_embedding_api_adapter
from core.utils.relation_write_service import RelationWriteService


def _load_config(config_path: Path) -> Dict[str, Any]:
    with open(config_path, "r", encoding="utf-8") as f:
        raw = tomlkit.load(f)
    return dict(raw) if isinstance(raw, dict) else {}


def _build_vector_store(data_dir: Path, emb_cfg: Dict[str, Any]) -> VectorStore:
    q_type = str(emb_cfg.get("quantization_type", "int8")).lower()
    if q_type != "int8":
        raise ValueError(
            "embedding.quantization_type 在 vNext 仅允许 int8(SQ8)。"
            " 请先执行 scripts/release_vnext_migrate.py migrate。"
        )
    dim = int(emb_cfg.get("dimension", 1024))
    store = VectorStore(
        dimension=max(1, dim),
        quantization_type=QuantizationType.INT8,
        data_dir=data_dir / "vectors",
    )
    if store.has_data():
        store.load()
    return store


def _build_graph_store(data_dir: Path, graph_cfg: Dict[str, Any]) -> GraphStore:
    fmt = str(graph_cfg.get("sparse_matrix_format", "csr")).lower()
    fmt_map = {
        "csr": SparseMatrixFormat.CSR,
        "csc": SparseMatrixFormat.CSC,
    }
    store = GraphStore(
        matrix_format=fmt_map.get(fmt, SparseMatrixFormat.CSR),
        data_dir=data_dir / "graph",
    )
    if store.has_data():
        store.load()
    return store


def _build_metadata_store(data_dir: Path) -> MetadataStore:
    store = MetadataStore(data_dir=data_dir / "metadata")
    store.connect()
    return store


def _build_embedding_manager(emb_cfg: Dict[str, Any]):
    retry_cfg = emb_cfg.get("retry", {})
    if not isinstance(retry_cfg, dict):
        retry_cfg = {}
    return create_embedding_api_adapter(
        batch_size=int(emb_cfg.get("batch_size", 32)),
        max_concurrent=int(emb_cfg.get("max_concurrent", 5)),
        default_dimension=int(emb_cfg.get("dimension", 1024)),
        model_name=str(emb_cfg.get("model_name", "auto")),
        retry_config=retry_cfg,
    )


async def _process_rows(
    service: RelationWriteService,
    rows: List[Dict[str, Any]],
    concurrency: int,
) -> Dict[str, int]:
    semaphore = asyncio.Semaphore(max(1, int(concurrency)))
    stat = {"success": 0, "failed": 0, "skipped": 0}

    async def _worker(row: Dict[str, Any]) -> None:
        async with semaphore:
            result = await service.ensure_relation_vector(
                hash_value=str(row["hash"]),
                subject=str(row.get("subject", "")),
                predicate=str(row.get("predicate", "")),
                obj=str(row.get("object", "")),
            )
            if result.vector_state == "ready":
                if result.vector_written:
                    stat["success"] += 1
                else:
                    stat["skipped"] += 1
            else:
                stat["failed"] += 1

    await asyncio.gather(*[_worker(row) for row in rows])
    return stat


async def main_async(args: argparse.Namespace) -> int:
    config_path = Path(args.config).resolve()
    if not config_path.exists():
        print(f"❌ 配置文件不存在: {config_path}")
        return 2

    cfg = _load_config(config_path)
    emb_cfg = cfg.get("embedding", {}) if isinstance(cfg, dict) else {}
    graph_cfg = cfg.get("graph", {}) if isinstance(cfg, dict) else {}
    retrieval_cfg = cfg.get("retrieval", {}) if isinstance(cfg, dict) else {}
    rv_cfg = retrieval_cfg.get("relation_vectorization", {}) if isinstance(retrieval_cfg, dict) else {}
    if not isinstance(emb_cfg, dict):
        emb_cfg = {}
    if not isinstance(graph_cfg, dict):
        graph_cfg = {}
    if not isinstance(rv_cfg, dict):
        rv_cfg = {}

    data_dir = Path(args.data_dir).resolve()
    if not data_dir.exists():
        print(f"❌ 数据目录不存在: {data_dir}")
        return 2

    print(f"data_dir: {data_dir}")
    print(f"config: {config_path}")

    vector_store = _build_vector_store(data_dir, emb_cfg)
    graph_store = _build_graph_store(data_dir, graph_cfg)
    metadata_store = _build_metadata_store(data_dir)
    embedding_manager = _build_embedding_manager(emb_cfg)
    service = RelationWriteService(
        metadata_store=metadata_store,
        graph_store=graph_store,
        vector_store=vector_store,
        embedding_manager=embedding_manager,
    )

    try:
        states = [s.strip() for s in str(args.states).split(",") if s.strip()]
        if not states:
            states = ["none", "failed", "pending"]
        max_retry = int(args.max_retry) if args.max_retry is not None else int(rv_cfg.get("max_retry", 3))
        limit = int(args.limit)

        rows = metadata_store.list_relations_by_vector_state(
            states=states,
            limit=max(1, limit),
            max_retry=max(1, max_retry),
        )
        added_ready_missing = 0
        if args.include_ready_missing:
            ready_rows = metadata_store.list_relations_by_vector_state(
                states=["ready"],
                limit=max(1, limit),
                max_retry=max(1, max_retry),
            )
            ready_missing_rows = [
                row for row in ready_rows if str(row.get("hash", "")) not in vector_store
            ]
            added_ready_missing = len(ready_missing_rows)
            if ready_missing_rows:
                dedup: Dict[str, Dict[str, Any]] = {}
                for row in rows:
                    dedup[str(row.get("hash", ""))] = row
                for row in ready_missing_rows:
                    dedup.setdefault(str(row.get("hash", "")), row)
                rows = list(dedup.values())[: max(1, limit)]
        print(f"candidates: {len(rows)} (states={states}, max_retry={max_retry})")
        if args.include_ready_missing:
            print(f"ready_missing_candidates_added: {added_ready_missing}")
        if not rows:
            return 0

        if args.dry_run:
            print("dry_run=true,未执行写入。")
            return 0

        started = time.time()
        stat = await _process_rows(
            service=service,
            rows=rows,
            concurrency=int(args.concurrency),
        )
        elapsed = (time.time() - started) * 1000.0

        vector_store.save()
        graph_store.save()
        state_stats = metadata_store.count_relations_by_vector_state()
        output = {
            "processed": len(rows),
            "success": int(stat["success"]),
            "failed": int(stat["failed"]),
            "skipped": int(stat["skipped"]),
            "elapsed_ms": elapsed,
            "state_stats": state_stats,
        }
        print(json.dumps(output, ensure_ascii=False, indent=2))
        return 0 if stat["failed"] == 0 else 1
    finally:
        metadata_store.close()


def parse_args() -> argparse.Namespace:
    return _build_arg_parser().parse_args()


if __name__ == "__main__":
    arguments = parse_args()
    raise SystemExit(asyncio.run(main_async(arguments)))
73
plugins/A_memorix/scripts/backfill_temporal_metadata.py
Normal file
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
回填段落时序字段。

默认策略:
1. 若段落缺失 event_time/event_time_start/event_time_end
2. 且存在 created_at
3. 写入 event_time=created_at, time_granularity=day, time_confidence=0.2
"""

from __future__ import annotations

import argparse
from pathlib import Path
import sys


CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from plugins.A_memorix.core.storage import MetadataStore  # noqa: E402


def backfill(
    data_dir: Path,
    dry_run: bool,
    limit: int,
    no_created_fallback: bool,
) -> int:
    store = MetadataStore(data_dir=data_dir)
    store.connect()
    summary = store.backfill_temporal_metadata_from_created_at(
        limit=limit,
        dry_run=dry_run,
        no_created_fallback=no_created_fallback,
    )
    store.close()
    if dry_run:
        print(f"[dry-run] candidates={summary['candidates']}")
        return int(summary["candidates"])
    if no_created_fallback:
        print(f"skip update (no-created-fallback), candidates={summary['candidates']}")
        return 0
    print(f"updated={summary['updated']}")
    return int(summary["updated"])


def main() -> int:
    parser = argparse.ArgumentParser(description="Backfill temporal metadata for A_Memorix paragraphs")
    parser.add_argument("--data-dir", default=str(PLUGIN_ROOT / "data"), help="数据目录")
    parser.add_argument("--dry-run", action="store_true", help="仅统计,不写入")
    parser.add_argument("--limit", type=int, default=100000, help="最大处理条数")
    parser.add_argument(
        "--no-created-fallback",
        action="store_true",
        help="不使用 created_at 回填,仅输出候选数量",
    )
    args = parser.parse_args()

    backfill(
        data_dir=Path(args.data_dir),
        dry_run=args.dry_run,
        limit=max(1, int(args.limit)),
        no_created_fallback=args.no_created_fallback,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -46,9 +46,14 @@ if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
     _build_arg_parser().print_help()
     sys.exit(0)
 
-# 设置日志
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-logger = logging.getLogger("LPMM_Converter")
+# 设置日志:优先复用 MaiBot 统一日志体系,失败时回退到标准 logging。
+try:
+    from src.common.logger import get_logger
+
+    logger = get_logger("A_Memorix.LPMMConverter")
+except Exception:
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    logger = logging.getLogger("A_Memorix.LPMMConverter")
 
 try:
     import networkx as nx
@@ -225,11 +230,11 @@ class LPMMConverter:
                 failed += 1
 
         logger.info(
-            "关系向量重建完成: total=%s success=%s skipped=%s failed=%s",
-            len(rows),
-            success,
-            skipped,
-            failed,
+            "关系向量重建完成: "
+            f"total={len(rows)} "
+            f"success={success} "
+            f"skipped={skipped} "
+            f"failed={failed}"
         )
 
     @staticmethod
@@ -317,8 +322,8 @@ class LPMMConverter:
             if p_type == "relation":
                 relation_count = self._import_relation_metadata_from_parquet(p_path)
                 logger.warning(
-                    "跳过 relation.parquet 向量导入(保持一致性);已导入关系元数据: %s",
-                    relation_count,
+                    "跳过 relation.parquet 向量导入(保持一致性);"
+                    f"已导入关系元数据: {relation_count}"
                 )
                 continue
 
172
plugins/A_memorix/scripts/import_lpmm_json.py
Normal file
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
LPMM OpenIE JSON 导入工具。

功能:
1. 读取符合 LPMM 规范的 OpenIE JSON 文件
2. 转换为 A_Memorix 的统一导入格式
3. 复用 `process_knowledge.py` 中的 `AutoImporter` 直接入库
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List

from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn

console = Console()

CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
WORKSPACE_ROOT = PLUGIN_ROOT.parent
MAIBOT_ROOT = WORKSPACE_ROOT / "MaiBot"
for path in (CURRENT_DIR, WORKSPACE_ROOT, MAIBOT_ROOT, PLUGIN_ROOT):
    path_str = str(path)
    if path_str not in sys.path:
        sys.path.insert(0, path_str)


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="将 LPMM OpenIE JSON 导入 A_Memorix")
    parser.add_argument("path", help="LPMM JSON 文件路径或目录")
    parser.add_argument("--force", action="store_true", help="强制重新导入")
    parser.add_argument("--concurrency", "-c", type=int, default=5, help="并发数")
    return parser


if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)


try:
    from process_knowledge import AutoImporter
    from A_memorix.core.utils.hash import compute_paragraph_hash
    from src.common.logger import get_logger
except ImportError as exc:  # pragma: no cover - script bootstrap
    print(f"导入模块失败,请确认 PYTHONPATH 与工作区结构: {exc}")
    raise SystemExit(1)


logger = get_logger("A_Memorix.LPMMImport")


class LPMMConverter:
    def convert_lpmm_to_memorix(self, lpmm_data: Dict[str, Any], filename: str) -> Dict[str, Any]:
        memorix_data = {"paragraphs": [], "entities": []}
        docs = lpmm_data.get("docs", []) or []
        if not docs:
            logger.warning(f"文件中未找到 docs 字段: {filename}")
            return memorix_data

        all_entities = set()
        for doc in docs:
            content = str(doc.get("passage", "") or "").strip()
            if not content:
                continue

            relations: List[Dict[str, str]] = []
            for triple in doc.get("extracted_triples", []) or []:
                if isinstance(triple, list) and len(triple) == 3:
                    relations.append(
                        {
                            "subject": str(triple[0] or "").strip(),
                            "predicate": str(triple[1] or "").strip(),
                            "object": str(triple[2] or "").strip(),
                        }
                    )

            entities = [str(item or "").strip() for item in doc.get("extracted_entities", []) or [] if str(item or "").strip()]
            all_entities.update(entities)
            for relation in relations:
                if relation["subject"]:
                    all_entities.add(relation["subject"])
                if relation["object"]:
                    all_entities.add(relation["object"])

            memorix_data["paragraphs"].append(
                {
                    "hash": compute_paragraph_hash(content),
                    "content": content,
                    "source": filename,
                    "entities": entities,
                    "relations": relations,
                }
            )

        memorix_data["entities"] = sorted(all_entities)
        return memorix_data


async def main() -> None:
    parser = _build_arg_parser()
    args = parser.parse_args()

    target_path = Path(args.path)
    if not target_path.exists():
        logger.error(f"路径不存在: {target_path}")
        return

    if target_path.is_dir():
        files_to_process = list(target_path.glob("*-openie.json")) or list(target_path.glob("*.json"))
    else:
        files_to_process = [target_path]

    if not files_to_process:
        logger.error("未找到可处理的 JSON 文件")
        return

    importer = AutoImporter(force=bool(args.force), concurrency=int(args.concurrency))
    if not await importer.initialize():
        logger.error("初始化存储失败")
        return

    converter = LPMMConverter()
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TimeElapsedColumn(),
        console=console,
        transient=False,
    ) as progress:
        for json_file in files_to_process:
            logger.info(f"正在转换并导入: {json_file.name}")
            try:
                with open(json_file, "r", encoding="utf-8") as handle:
                    lpmm_data = json.load(handle)
                memorix_data = converter.convert_lpmm_to_memorix(lpmm_data, json_file.name)
                total_items = len(memorix_data.get("paragraphs", []))
                if total_items <= 0:
                    logger.warning(f"转换结果为空: {json_file.name}")
                    continue

                task_id = progress.add_task(f"Importing {json_file.name}", total=total_items)

                def update_progress(step: int = 1) -> None:
                    progress.advance(task_id, advance=step)

                await importer.import_json_data(
                    memorix_data,
                    filename=f"lpmm_{json_file.name}",
                    progress_callback=update_progress,
                )
            except Exception as exc:
                logger.error(f"处理文件 {json_file.name} 失败: {exc}\n{traceback.format_exc()}")

    await importer.close()
    logger.info("全部处理完成")


if __name__ == "__main__":
    if sys.platform == "win32":  # pragma: no cover
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())
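For orientation, the input shape convert_lpmm_to_memorix expects can be sketched as follows. The field names ("docs", "passage", "extracted_triples", "extracted_entities") come straight from the reader above; the concrete values are invented for illustration:

# Illustrative LPMM OpenIE payload; only the fields read above matter.
lpmm_example = {
    "docs": [
        {
            "passage": "示例段落:A_Memorix 支持图关系召回。",
            "extracted_triples": [["A_Memorix", "支持", "图关系召回"]],
            "extracted_entities": ["A_Memorix", "图关系召回"],
        }
    ]
}
# LPMMConverter().convert_lpmm_to_memorix(lpmm_example, "demo-openie.json") yields one
# paragraph (with hash/content/source), its entities, and one subject/predicate/object relation.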
1714
plugins/A_memorix/scripts/migrate_maibot_memory.py
Normal file
File diff suppressed because it is too large
728
plugins/A_memorix/scripts/process_knowledge.py
Normal file
@@ -0,0 +1,728 @@
#!/usr/bin/env python3
"""
知识库自动导入脚本 (Strategy-Aware Version)

功能:
1. 扫描 plugins/A_memorix/data/raw 下的 .txt 文件
2. 检查 data/import_manifest.json 确认是否已导入
3. 使用 Strategy 模式处理文件 (Narrative/Factual/Quote)
4. 将生成的数据直接存入 VectorStore/GraphStore/MetadataStore
5. 更新 manifest
"""

import sys
import os
import json
import asyncio
import time
import random
import hashlib
import tomlkit
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
from rich.console import Console
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

console = Console()


class LLMGenerationError(Exception):
    pass


# 路径设置
current_dir = Path(__file__).resolve().parent
plugin_root = current_dir.parent
workspace_root = plugin_root.parent
maibot_root = workspace_root / "MaiBot"
for path in (workspace_root, maibot_root, plugin_root):
    path_str = str(path)
    if path_str not in sys.path:
        sys.path.insert(0, path_str)

# 数据目录
DATA_DIR = plugin_root / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
MANIFEST_PATH = DATA_DIR / "import_manifest.json"


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="A_Memorix Knowledge Importer (Strategy-Aware)")
    parser.add_argument("--force", action="store_true", help="Force re-import")
    parser.add_argument("--clear-manifest", action="store_true", help="Clear manifest")
    parser.add_argument(
        "--type",
        "-t",
        default="auto",
        help="Target import strategy override (auto/narrative/factual/quote)",
    )
    parser.add_argument("--concurrency", "-c", type=int, default=5)
    parser.add_argument(
        "--chat-log",
        action="store_true",
        help="聊天记录导入模式:强制 narrative 策略,并使用 LLM 语义抽取 event_time/event_time_range",
    )
    parser.add_argument(
        "--chat-reference-time",
        default=None,
        help="chat_log 模式的相对时间参考点(如 2026/02/12 10:30);不传则使用当前本地时间",
    )
    return parser


# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    sys.exit(0)


try:
    import A_memorix.core as core_module
    import A_memorix.core.storage as storage_module
    from src.common.logger import get_logger
    from src.services import llm_service as llm_api
    from src.config.config import global_config, model_config

    VectorStore = core_module.VectorStore
    GraphStore = core_module.GraphStore
    MetadataStore = core_module.MetadataStore
    ImportStrategy = core_module.ImportStrategy
    create_embedding_api_adapter = core_module.create_embedding_api_adapter
    RelationWriteService = getattr(core_module, "RelationWriteService", None)

    looks_like_quote_text = storage_module.looks_like_quote_text
    parse_import_strategy = storage_module.parse_import_strategy
    resolve_stored_knowledge_type = storage_module.resolve_stored_knowledge_type
    select_import_strategy = storage_module.select_import_strategy

    from A_memorix.core.utils.time_parser import normalize_time_meta
    from A_memorix.core.utils.import_payloads import normalize_paragraph_import_item
    from A_memorix.core.strategies.base import BaseStrategy, ProcessedChunk, KnowledgeType as StratKnowledgeType
    from A_memorix.core.strategies.narrative import NarrativeStrategy
    from A_memorix.core.strategies.factual import FactualStrategy
    from A_memorix.core.strategies.quote import QuoteStrategy

except ImportError as e:
    print(f"❌ 无法导入模块: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

logger = get_logger("A_Memorix.AutoImport")


def _log_before_retry(retry_state) -> None:
    """使用项目统一日志风格记录重试信息。"""
    exc = None
    if getattr(retry_state, "outcome", None) is not None and retry_state.outcome.failed:
        exc = retry_state.outcome.exception()
    next_sleep = getattr(getattr(retry_state, "next_action", None), "sleep", None)
    logger.warning(
        "LLM 调用即将重试: "
        f"attempt={getattr(retry_state, 'attempt_number', '?')} "
        f"next_sleep={next_sleep} "
        f"error={exc}"
    )


class AutoImporter:
    def __init__(
        self,
        force: bool = False,
        clear_manifest: bool = False,
        target_type: str = "auto",
        concurrency: int = 5,
        chat_log: bool = False,
        chat_reference_time: Optional[str] = None,
    ):
        self.vector_store: Optional[VectorStore] = None
        self.graph_store: Optional[GraphStore] = None
        self.metadata_store: Optional[MetadataStore] = None
        self.embedding_manager = None
        self.relation_write_service = None
        self.plugin_config = {}
        self.manifest = {}
        self.force = force
        self.clear_manifest = clear_manifest
        self.chat_log = chat_log
        parsed_target_type = parse_import_strategy(target_type, default=ImportStrategy.AUTO)
        self.target_type = ImportStrategy.NARRATIVE.value if chat_log else parsed_target_type.value
        self.chat_reference_dt = self._parse_reference_time(chat_reference_time)
        if self.chat_log and parsed_target_type not in {ImportStrategy.AUTO, ImportStrategy.NARRATIVE}:
            logger.warning(
                f"chat_log 模式已启用,target_type={target_type} 将被覆盖为 narrative"
            )
        self.concurrency_limit = concurrency
        self.semaphore = None
        self.storage_lock = None

    async def initialize(self):
        logger.info(f"正在初始化... (并发数: {self.concurrency_limit})")
        self.semaphore = asyncio.Semaphore(self.concurrency_limit)
        self.storage_lock = asyncio.Lock()

        RAW_DIR.mkdir(parents=True, exist_ok=True)
        PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

        if self.clear_manifest:
            logger.info("🧹 清理 Manifest")
            self.manifest = {}
            self._save_manifest()
        elif MANIFEST_PATH.exists():
            try:
                with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
                    self.manifest = json.load(f)
            except Exception:
                self.manifest = {}

        config_path = plugin_root / "config.toml"
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                self.plugin_config = tomlkit.load(f)
        except Exception as e:
            logger.error(f"加载插件配置失败: {e}")
            return False

        try:
            await self._init_stores()
        except Exception as e:
            logger.error(f"初始化存储失败: {e}")
            return False

        return True

    async def _init_stores(self):
        # ... (Same as original)
        self.embedding_manager = create_embedding_api_adapter(
            batch_size=self.plugin_config.get("embedding", {}).get("batch_size", 32),
            default_dimension=self.plugin_config.get("embedding", {}).get("dimension", 384),
            model_name=self.plugin_config.get("embedding", {}).get("model_name", "auto"),
            retry_config=self.plugin_config.get("embedding", {}).get("retry", {}),
        )
        try:
            dim = await self.embedding_manager._detect_dimension()
        except Exception:
            dim = self.embedding_manager.default_dimension

        q_type_str = str(self.plugin_config.get("embedding", {}).get("quantization_type", "int8") or "int8").lower()
        # Need to access QuantizationType from storage_module if not imported globally
        QuantizationType = storage_module.QuantizationType
        if q_type_str != "int8":
            raise ValueError(
                "embedding.quantization_type 在 vNext 仅允许 int8(SQ8)。"
                " 请先执行 scripts/release_vnext_migrate.py migrate。"
            )

        self.vector_store = VectorStore(
            dimension=dim,
            quantization_type=QuantizationType.INT8,
            data_dir=DATA_DIR / "vectors"
        )

        SparseMatrixFormat = storage_module.SparseMatrixFormat
        m_fmt_str = self.plugin_config.get("graph", {}).get("sparse_matrix_format", "csr")
        m_map = {"csr": SparseMatrixFormat.CSR, "csc": SparseMatrixFormat.CSC}

        self.graph_store = GraphStore(
            matrix_format=m_map.get(m_fmt_str, SparseMatrixFormat.CSR),
            data_dir=DATA_DIR / "graph"
        )

        self.metadata_store = MetadataStore(data_dir=DATA_DIR / "metadata")
        self.metadata_store.connect()

        if RelationWriteService is not None:
            self.relation_write_service = RelationWriteService(
                metadata_store=self.metadata_store,
                graph_store=self.graph_store,
                vector_store=self.vector_store,
                embedding_manager=self.embedding_manager,
            )

        if self.vector_store.has_data(): self.vector_store.load()
        if self.graph_store.has_data(): self.graph_store.load()

    def _should_write_relation_vectors(self) -> bool:
        retrieval_cfg = self.plugin_config.get("retrieval", {})
        if not isinstance(retrieval_cfg, dict):
            return False
        rv_cfg = retrieval_cfg.get("relation_vectorization", {})
        if not isinstance(rv_cfg, dict):
            return False
        return bool(rv_cfg.get("enabled", False)) and bool(rv_cfg.get("write_on_import", True))

    def load_file(self, file_path: Path) -> str:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    def get_file_hash(self, content: str) -> str:
        return hashlib.md5(content.encode("utf-8")).hexdigest()

    def _parse_reference_time(self, value: Optional[str]) -> datetime:
        """解析 chat_log 模式的参考时间(用于相对时间语义解析)。"""
        if not value:
            return datetime.now()
        formats = [
            "%Y/%m/%d %H:%M:%S",
            "%Y/%m/%d %H:%M",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M",
            "%Y/%m/%d",
            "%Y-%m-%d",
        ]
        text = str(value).strip()
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        logger.warning(
            f"无法解析 chat_reference_time={value},将回退为当前本地时间"
        )
        return datetime.now()

    async def _extract_chat_time_meta_with_llm(
        self,
        text: str,
        model_config: Any,
    ) -> Optional[Dict[str, Any]]:
        """
        使用 LLM 从聊天文本语义中抽取时间信息。
        支持将相对时间表达转换为绝对时间。
        """
        if not text.strip():
            return None

        reference_now = self.chat_reference_dt.strftime("%Y/%m/%d %H:%M")
        prompt = f"""You are a time extraction engine for chat logs.
Extract temporal information from the following chat paragraph.

Rules:
1. Use semantic understanding, not regex matching.
2. Convert relative expressions (e.g., yesterday evening, last Friday morning) to absolute local datetime using reference_now.
3. If a time span exists, return event_time_start/event_time_end.
4. If only one point in time exists, return event_time.
5. If no reliable time can be inferred, return all time fields as null.
6. Output ONLY valid JSON. No markdown, no explanation.

reference_now: {reference_now}
timezone: local system timezone

Allowed output formats for time values:
- "YYYY/MM/DD"
- "YYYY/MM/DD HH:mm"

JSON schema:
{{
  "event_time": null,
  "event_time_start": null,
  "event_time_end": null,
  "time_range": null,
  "time_granularity": "day",
  "time_confidence": 0.0
}}

Chat paragraph:
\"\"\"{text}\"\"\"
"""
        try:
            result = await self._llm_call(prompt, model_config)
        except Exception as e:
            logger.warning(f"chat_log 时间语义抽取失败: {e}")
            return None

        if not isinstance(result, dict):
            return None

        raw_time_meta = {
            "event_time": result.get("event_time"),
            "event_time_start": result.get("event_time_start"),
            "event_time_end": result.get("event_time_end"),
            "time_range": result.get("time_range"),
            "time_granularity": result.get("time_granularity"),
            "time_confidence": result.get("time_confidence"),
        }
        try:
            normalized = normalize_time_meta(raw_time_meta)
        except Exception as e:
            logger.warning(f"chat_log 时间语义抽取结果不可用,已忽略: {e}")
            return None

        has_effective_time = any(
            key in normalized
            for key in ("event_time", "event_time_start", "event_time_end")
        )
        if not has_effective_time:
            return None

        return normalized

    def _determine_strategy(self, filename: str, content: str) -> BaseStrategy:
        """Layer 1: Global Strategy Routing"""
        strategy = select_import_strategy(
            content,
            override=self.target_type,
            chat_log=self.chat_log,
        )
        if self.chat_log:
            logger.info(f"chat_log 模式: {filename} 强制使用 NarrativeStrategy")
        elif strategy == ImportStrategy.QUOTE:
            logger.info(f"Auto-detected Quote/Lyric type for {filename}")

        if strategy == ImportStrategy.FACTUAL:
            return FactualStrategy(filename)
        if strategy == ImportStrategy.QUOTE:
            return QuoteStrategy(filename)
        return NarrativeStrategy(filename)

    def _chunk_rescue(self, chunk: ProcessedChunk, filename: str) -> Optional[BaseStrategy]:
        """Layer 2: Chunk-level rescue strategies"""
        # If we are already in Quote strategy, no need to rescue
        if chunk.type == StratKnowledgeType.QUOTE:
            return None

        if looks_like_quote_text(chunk.chunk.text):
            logger.info(f" > Rescuing chunk {chunk.chunk.index} as Quote")
            return QuoteStrategy(filename)

        return None

    async def process_and_import(self):
        if not await self.initialize(): return

        files = list(RAW_DIR.glob("*.txt"))
        logger.info(f"扫描到 {len(files)} 个文件 in {RAW_DIR}")

        if not files: return

        tasks = []
        for file_path in files:
            tasks.append(asyncio.create_task(self._process_single_file(file_path)))

        results = await asyncio.gather(*tasks, return_exceptions=True)

        success_count = sum(1 for r in results if r is True)
        logger.info(f"本次主处理完成,共成功处理 {success_count}/{len(files)} 个文件")

        if self.vector_store: self.vector_store.save()
        if self.graph_store: self.graph_store.save()

    async def _process_single_file(self, file_path: Path) -> bool:
        filename = file_path.name
        async with self.semaphore:
            try:
                content = self.load_file(file_path)
                file_hash = self.get_file_hash(content)

                if not self.force and filename in self.manifest:
                    record = self.manifest[filename]
                    if record.get("hash") == file_hash and record.get("imported"):
                        logger.info(f"跳过已导入文件: {filename}")
                        return False

                logger.info(f">>> 开始处理: {filename}")

                # 1. Strategy Selection
                strategy = self._determine_strategy(filename, content)
                logger.info(f"  策略: {strategy.__class__.__name__}")

                # 2. Split (Strategy-Aware)
                initial_chunks = strategy.split(content)
                logger.info(f"  初步分块: {len(initial_chunks)}")

                processed_data = {"paragraphs": [], "entities": [], "relations": []}

                # 3. Extract Loop
                model_config = await self._select_model()

                for i, chunk in enumerate(initial_chunks):
                    current_strategy = strategy
                    # Layer 2: Chunk Rescue
                    rescue_strategy = self._chunk_rescue(chunk, filename)
                    if rescue_strategy:
                        # Re-split? No, just re-process this text as a single chunk using the rescue strategy
                        # But rescue strategy might want to split it further?
                        # Simplification: Treat the whole chunk text as one block for the rescue strategy
                        # OR create a single chunk object for it.
                        # Creating a new chunk using rescue strategy logic might be complex if split behavior differs.
                        # Let's just instantiate a chunk of the new type manually
                        chunk.type = StratKnowledgeType.QUOTE
                        chunk.flags.verbatim = True
                        chunk.flags.requires_llm = False  # Quotes don't usually need LLM
                        current_strategy = rescue_strategy

                    # Extraction
                    if chunk.flags.requires_llm:
                        result_chunk = await current_strategy.extract(chunk, lambda p: self._llm_call(p, model_config))
                    else:
                        # For quotes, extract might be just pass through or regex
                        result_chunk = await current_strategy.extract(chunk)

                    time_meta = None
                    if self.chat_log:
                        time_meta = await self._extract_chat_time_meta_with_llm(
                            result_chunk.chunk.text,
                            model_config,
                        )

                    # Normalize Data
                    self._normalize_and_aggregate(
                        result_chunk,
                        processed_data,
                        time_meta=time_meta,
                    )

                    logger.info(f"  已处理块 {i+1}/{len(initial_chunks)}")

                # 4. Save Json
                json_path = PROCESSED_DIR / f"{file_path.stem}.json"
                with open(json_path, "w", encoding="utf-8") as f:
                    json.dump(processed_data, f, ensure_ascii=False, indent=2)

                # 5. Import to DB
                async with self.storage_lock:
                    await self._import_to_db(processed_data)

                self.manifest[filename] = {
                    "hash": file_hash,
                    "timestamp": time.time(),
                    "imported": True
                }
                self._save_manifest()
                self.vector_store.save()
                self.graph_store.save()
                logger.info(f"✅ 文件 {filename} 处理并导入完成")
                return True

            except Exception as e:
                logger.error(f"❌ 处理失败 {filename}: {e}")
                import traceback
                traceback.print_exc()
                return False

    def _normalize_and_aggregate(
        self,
        chunk: ProcessedChunk,
        all_data: Dict,
        time_meta: Optional[Dict[str, Any]] = None,
    ):
        """Convert strategy-specific data to unified generic format for storage."""
        # Generic fields
        para_item = {
            "content": chunk.chunk.text,
            "source": chunk.source.file,
            "knowledge_type": resolve_stored_knowledge_type(
                chunk.type.value,
                content=chunk.chunk.text,
            ).value,
            "entities": [],
            "relations": []
        }

        data = chunk.data

        # 1. Triples (Factual)
        if "triples" in data:
            for t in data["triples"]:
                para_item["relations"].append({
                    "subject": t.get("subject"),
                    "predicate": t.get("predicate"),
                    "object": t.get("object")
                })
                # Auto-add entities from triples
                para_item["entities"].extend([t.get("subject"), t.get("object")])

        # 2. Events & Relations (Narrative)
        if "events" in data:
            # Store events as content/metadata? Or entities?
            # For now maybe just keep them in logic, or add as 'Event' entities?
            # Creating entities for events is good.
            para_item["entities"].extend(data["events"])

        if "relations" in data:  # Narrative also outputs relations list
            para_item["relations"].extend(data["relations"])
            for r in data["relations"]:
                para_item["entities"].extend([r.get("subject"), r.get("object")])

        # 3. Verbatim Entities (Quote)
        if "verbatim_entities" in data:
            para_item["entities"].extend(data["verbatim_entities"])

        # Dedupe per paragraph
        para_item["entities"] = list(set([e for e in para_item["entities"] if e]))

        if time_meta:
            para_item["time_meta"] = time_meta

        all_data["paragraphs"].append(para_item)
        all_data["entities"].extend(para_item["entities"])
        if "relations" in para_item:
            all_data["relations"].extend(para_item["relations"])

    @retry(
        retry=retry_if_exception_type((LLMGenerationError, json.JSONDecodeError)),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        before_sleep=_log_before_retry
    )
    async def _llm_call(self, prompt: str, model_config: Any) -> Dict:
        """Generic LLM Caller"""
        success, response, _, _ = await llm_api.generate_with_model(
            prompt=prompt,
            model_config=model_config,
            request_type="Script.ProcessKnowledge"
        )
        if success:
            txt = response.strip()
            if "```" in txt:
                txt = txt.split("```json")[-1].split("```")[0].strip()
            try:
                return json.loads(txt)
            except json.JSONDecodeError:
                # Fallback: try to find first { and last }
                start = txt.find('{')
                end = txt.rfind('}')
                if start != -1 and end != -1:
                    return json.loads(txt[start:end+1])
                raise
        else:
            raise LLMGenerationError("LLM generation failed")

    async def _select_model(self) -> Any:
        models = llm_api.get_available_models()
        if not models: raise ValueError("No LLM models")

        config_model = self.plugin_config.get("advanced", {}).get("extraction_model", "auto")
        if config_model != "auto" and config_model in models:
            return models[config_model]

        for task_key in ["lpmm_entity_extract", "lpmm_rdf_build", "embedding"]:
            if task_key in models: return models[task_key]

        return models[list(models.keys())[0]]

    # Re-use existing methods
    async def _add_entity_with_vector(self, name: str, source_paragraph: Optional[str] = None) -> str:
        # Same as before
        hash_value = self.metadata_store.add_entity(name, source_paragraph=source_paragraph)
        self.graph_store.add_nodes([name])
        try:
            emb = await self.embedding_manager.encode(name)
            try:
                self.vector_store.add(emb.reshape(1, -1), [hash_value])
            except ValueError: pass
        except Exception: pass
        return hash_value

    async def import_json_data(self, data: Dict, filename: str = "script_import", progress_callback=None):
        """Public import entrypoint for pre-processed JSON payloads."""
        if not self.storage_lock:
            raise RuntimeError("Importer is not initialized. Call initialize() first.")

        async with self.storage_lock:
            await self._import_to_db(data, progress_callback=progress_callback)
        self.manifest[filename] = {
            "hash": self.get_file_hash(json.dumps(data, ensure_ascii=False, sort_keys=True)),
            "timestamp": time.time(),
            "imported": True,
        }
        self._save_manifest()
        self.vector_store.save()
        self.graph_store.save()

    async def _import_to_db(self, data: Dict, progress_callback=None):
        # Same logic, but ensure robust
        with self.graph_store.batch_update():
            for item in data.get("paragraphs", []):
                paragraph = normalize_paragraph_import_item(
                    item,
                    default_source="script",
                )
                content = paragraph["content"]
                source = paragraph["source"]
                k_type_val = paragraph["knowledge_type"]

                h_val = self.metadata_store.add_paragraph(
                    content=content,
                    source=source,
                    knowledge_type=k_type_val,
                    time_meta=paragraph["time_meta"],
                )

                if h_val not in self.vector_store:
                    try:
                        emb = await self.embedding_manager.encode(content)
                        self.vector_store.add(emb.reshape(1, -1), [h_val])
                    except Exception as e:
                        logger.error(f"  Vector fail: {e}")

                para_entities = paragraph["entities"]
                for entity in para_entities:
                    if entity:
                        await self._add_entity_with_vector(entity, source_paragraph=h_val)

                para_relations = paragraph["relations"]
                for rel in para_relations:
                    s, p, o = rel.get("subject"), rel.get("predicate"), rel.get("object")
                    if s and p and o:
                        await self._add_entity_with_vector(s, source_paragraph=h_val)
                        await self._add_entity_with_vector(o, source_paragraph=h_val)
                        confidence = float(rel.get("confidence", 1.0) or 1.0)
                        rel_meta = rel.get("metadata", {})
                        write_vector = self._should_write_relation_vectors()
                        if self.relation_write_service is not None:
                            await self.relation_write_service.upsert_relation_with_vector(
                                subject=s,
                                predicate=p,
                                obj=o,
                                confidence=confidence,
                                source_paragraph=h_val,
                                metadata=rel_meta if isinstance(rel_meta, dict) else {},
                                write_vector=write_vector,
                            )
                        else:
                            rel_hash = self.metadata_store.add_relation(
                                s,
                                p,
                                o,
                                confidence=confidence,
                                source_paragraph=h_val,
                                metadata=rel_meta if isinstance(rel_meta, dict) else {},
                            )
                            self.graph_store.add_edges([(s, o)], relation_hashes=[rel_hash])
                            try:
                                self.metadata_store.set_relation_vector_state(rel_hash, "none")
                            except Exception:
                                pass

                if progress_callback: progress_callback(1)

    async def close(self):
        if self.metadata_store: self.metadata_store.close()

    def _save_manifest(self):
        with open(MANIFEST_PATH, "w", encoding="utf-8") as f:
            json.dump(self.manifest, f, ensure_ascii=False, indent=2)


async def main():
    parser = _build_arg_parser()
    args = parser.parse_args()

    if not global_config: return

    importer = AutoImporter(
        force=args.force,
        clear_manifest=args.clear_manifest,
        target_type=args.type,
        concurrency=args.concurrency,
        chat_log=args.chat_log,
        chat_reference_time=args.chat_reference_time,
    )
    await importer.process_and_import()
    await importer.close()


if __name__ == "__main__":
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())
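For reference, the unified payload that import_json_data()/_import_to_db() consume (as assembled by _normalize_and_aggregate() above) can be sketched as follows. The field names mirror the code above; the "narrative" knowledge_type string and all concrete values are assumptions made for illustration:

# Illustrative pre-processed payload for AutoImporter.import_json_data().
payload_example = {
    "paragraphs": [
        {
            "content": "示例段落文本。",
            "source": "example.txt",
            "knowledge_type": "narrative",
            "entities": ["实体A", "实体B"],
            "relations": [
                {"subject": "实体A", "predicate": "关联", "object": "实体B", "confidence": 0.9}
            ],
            "time_meta": {"event_time": "2026/02/12 10:30", "time_granularity": "day", "time_confidence": 0.2},
        }
    ],
    "entities": ["实体A", "实体B"],
    "relations": [{"subject": "实体A", "predicate": "关联", "object": "实体B"}],
}
# Typical use (after `await importer.initialize()`):
#     await importer.import_json_data(payload_example, filename="manual_payload")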
127
plugins/A_memorix/scripts/rebuild_episodes.py
Normal file
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
"""Episode source 级重建工具。"""

from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path
from typing import Any, Dict, List

CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
WORKSPACE_ROOT = PLUGIN_ROOT.parent
MAIBOT_ROOT = WORKSPACE_ROOT / "MaiBot"
for path in (WORKSPACE_ROOT, MAIBOT_ROOT, PLUGIN_ROOT):
    path_str = str(path)
    if path_str not in sys.path:
        sys.path.insert(0, path_str)

try:
    import tomlkit  # type: ignore
except Exception:  # pragma: no cover
    tomlkit = None

from A_memorix.core.storage import MetadataStore
from A_memorix.core.utils.episode_service import EpisodeService


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Rebuild A_Memorix episodes by source")
    parser.add_argument("--data-dir", default=str(PLUGIN_ROOT / "data"), help="插件数据目录")
    parser.add_argument("--source", type=str, help="指定单个 source 入队/重建")
    parser.add_argument("--all", action="store_true", help="对所有 source 入队/重建")
    parser.add_argument("--wait", action="store_true", help="在脚本内同步执行重建")
    return parser


if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)


def _load_plugin_config() -> Dict[str, Any]:
    config_path = PLUGIN_ROOT / "config.toml"
    if tomlkit is None or not config_path.exists():
        return {}
    try:
        with open(config_path, "r", encoding="utf-8") as handle:
            parsed = tomlkit.load(handle)
            return dict(parsed) if isinstance(parsed, dict) else {}
    except Exception:
        return {}


def _resolve_sources(store: MetadataStore, *, source: str | None, rebuild_all: bool) -> List[str]:
    if rebuild_all:
        return list(store.list_episode_sources_for_rebuild())
    token = str(source or "").strip()
    if not token:
        raise ValueError("必须提供 --source 或 --all")
    return [token]


async def _run_rebuilds(store: MetadataStore, plugin_config: Dict[str, Any], sources: List[str]) -> int:
    service = EpisodeService(metadata_store=store, plugin_config=plugin_config)
    failures: List[str] = []
    for source in sources:
        started = store.mark_episode_source_running(source)
        if not started:
            failures.append(f"{source}: unable_to_mark_running")
            continue
        try:
            result = await service.rebuild_source(source)
            store.mark_episode_source_done(source)
            print(
                "rebuilt"
                f" source={source}"
                f" paragraphs={int(result.get('paragraph_count') or 0)}"
                f" groups={int(result.get('group_count') or 0)}"
                f" episodes={int(result.get('episode_count') or 0)}"
                f" fallback={int(result.get('fallback_count') or 0)}"
            )
        except Exception as exc:
            err = str(exc)[:500]
            store.mark_episode_source_failed(source, err)
            failures.append(f"{source}: {err}")
            print(f"failed source={source} error={err}")

    if failures:
        for item in failures:
            print(item)
        return 1
    return 0


def main() -> int:
    parser = _build_arg_parser()
    args = parser.parse_args()
    if bool(args.all) == bool(args.source):
        parser.error("必须且只能选择一个:--source 或 --all")

    store = MetadataStore(data_dir=Path(args.data_dir) / "metadata")
    store.connect()
    try:
        sources = _resolve_sources(store, source=args.source, rebuild_all=bool(args.all))
        if not sources:
            print("no sources to rebuild")
            return 0

        enqueued = 0
        reason = "script_rebuild_all" if args.all else "script_rebuild_source"
        for source in sources:
            enqueued += int(store.enqueue_episode_source_rebuild(source, reason=reason))
        print(f"enqueued={enqueued} sources={len(sources)}")

        if not args.wait:
            return 0

        plugin_config = _load_plugin_config()
        return asyncio.run(_run_rebuilds(store, plugin_config, sources))
    finally:
        store.close()


if __name__ == "__main__":
    raise SystemExit(main())
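As a usage sketch, the in-process equivalent of running this script with --source <name> --wait, composed only from the helpers defined above; the source name is a made-up placeholder, not a real identifier from the dataset.

# Illustrative sketch only -- not part of this commit. Equivalent to
# "--source <name> --wait", built from this script's own helpers.
store = MetadataStore(data_dir=PLUGIN_ROOT / "data" / "metadata")
store.connect()
try:
    sources = _resolve_sources(store, source="group_123456", rebuild_all=False)  # placeholder source
    for s in sources:
        store.enqueue_episode_source_rebuild(s, reason="script_rebuild_source")
    exit_code = asyncio.run(_run_rebuilds(store, _load_plugin_config(), sources))
finally:
    store.close()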
731
plugins/A_memorix/scripts/release_vnext_migrate.py
Normal file
@@ -0,0 +1,731 @@
#!/usr/bin/env python3
"""
vNext release migration entrypoint for A_Memorix.

Subcommands:
- preflight: detect legacy config/data/schema risks
- migrate: offline migrate config + vectors + metadata schema + graph edge hash map
- verify: strict post-migration consistency checks
"""

from __future__ import annotations

import argparse
import json
import pickle
import sqlite3
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

import tomlkit


CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PLUGIN_ROOT))


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="A_Memorix vNext release migration tool")
    parser.add_argument(
        "--config",
        default=str(PLUGIN_ROOT / "config.toml"),
        help="config.toml path (default: plugins/A_memorix/config.toml)",
    )
    parser.add_argument(
        "--data-dir",
        default="",
        help="optional data dir override; default resolved from config.storage.data_dir",
    )
    parser.add_argument("--json-out", default="", help="optional JSON report output path")

    sub = parser.add_subparsers(dest="command", required=True)

    p_preflight = sub.add_parser("preflight", help="scan legacy risks")
    p_preflight.add_argument("--strict", action="store_true", help="return 1 if any error check exists")

    p_migrate = sub.add_parser("migrate", help="run offline migration")
    p_migrate.add_argument("--dry-run", action="store_true", help="only print planned changes")
    p_migrate.add_argument(
        "--verify-after",
        action="store_true",
        help="run verify automatically after migrate",
    )

    p_verify = sub.add_parser("verify", help="post-migration verification")
    p_verify.add_argument("--strict", action="store_true", help="return 1 if any error check exists")
    return parser


# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)

try:
    from core.storage import GraphStore, KnowledgeType, MetadataStore, QuantizationType, VectorStore
    from core.storage.metadata_store import SCHEMA_VERSION
except Exception as e:  # pragma: no cover
    print(f"❌ failed to import storage modules: {e}")
    raise SystemExit(2)


@dataclass
class CheckItem:
    code: str
    level: str
    message: str
    details: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        out = {
            "code": self.code,
            "level": self.level,
            "message": self.message,
        }
        if self.details:
            out["details"] = self.details
        return out


def _read_toml(path: Path) -> Dict[str, Any]:
    text = path.read_text(encoding="utf-8")
    return tomlkit.parse(text)


def _write_toml(path: Path, data: Dict[str, Any]) -> None:
    path.write_text(tomlkit.dumps(data), encoding="utf-8")


def _get_nested(obj: Dict[str, Any], keys: Sequence[str], default: Any = None) -> Any:
    cur: Any = obj
    for k in keys:
        if not isinstance(cur, dict) or k not in cur:
            return default
        cur = cur[k]
    return cur


def _ensure_table(obj: Dict[str, Any], key: str) -> Dict[str, Any]:
    if key not in obj or not isinstance(obj[key], dict):
        obj[key] = tomlkit.table()
    return obj[key]


def _resolve_data_dir(config_doc: Dict[str, Any], explicit_data_dir: Optional[str]) -> Path:
    if explicit_data_dir:
        return Path(explicit_data_dir).expanduser().resolve()
    raw = str(_get_nested(config_doc, ("storage", "data_dir"), "./data") or "./data").strip()
    if raw.startswith("."):
        return (PLUGIN_ROOT / raw).resolve()
    return Path(raw).expanduser().resolve()


def _sqlite_table_exists(conn: sqlite3.Connection, table: str) -> bool:
    row = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1",
        (table,),
    ).fetchone()
    return row is not None


def _collect_hash_alias_conflicts(conn: sqlite3.Connection) -> Dict[str, List[str]]:
    hashes: List[str] = []
    if _sqlite_table_exists(conn, "relations"):
        rows = conn.execute("SELECT hash FROM relations").fetchall()
        hashes.extend(str(r[0]) for r in rows if r and r[0])
    if _sqlite_table_exists(conn, "deleted_relations"):
        rows = conn.execute("SELECT hash FROM deleted_relations").fetchall()
        hashes.extend(str(r[0]) for r in rows if r and r[0])

    alias_map: Dict[str, str] = {}
    conflicts: Dict[str, set[str]] = {}
    for h in hashes:
        if len(h) != 64:
            continue
        alias = h[:32]
        old = alias_map.get(alias)
        if old is None:
            alias_map[alias] = h
            continue
        if old != h:
            conflicts.setdefault(alias, set()).update({old, h})
    return {k: sorted(v) for k, v in conflicts.items()}


def _collect_invalid_knowledge_types(conn: sqlite3.Connection) -> List[str]:
    if not _sqlite_table_exists(conn, "paragraphs"):
        return []

    allowed = {item.value for item in KnowledgeType}
    rows = conn.execute("SELECT DISTINCT knowledge_type FROM paragraphs").fetchall()
    invalid: List[str] = []
    for row in rows:
        raw = row[0]
        value = str(raw).strip().lower() if raw is not None else ""
        if value not in allowed:
            invalid.append(str(raw) if raw is not None else "")
    return sorted(set(invalid))


def _guess_vector_dimension(config_doc: Dict[str, Any], vectors_dir: Path) -> int:
    meta_path = vectors_dir / "vectors_metadata.pkl"
    if meta_path.exists():
        try:
            with open(meta_path, "rb") as f:
                meta = pickle.load(f)
            dim = int(meta.get("dimension", 0))
            if dim > 0:
                return dim
        except Exception:
            pass
    try:
        dim_cfg = int(_get_nested(config_doc, ("embedding", "dimension"), 1024))
        if dim_cfg > 0:
            return dim_cfg
    except Exception:
        pass
    return 1024


def _preflight_impl(config_path: Path, data_dir: Path) -> Dict[str, Any]:
    checks: List[CheckItem] = []
    facts: Dict[str, Any] = {
        "config_path": str(config_path),
        "data_dir": str(data_dir),
    }

    if not config_path.exists():
        checks.append(CheckItem("CFG-00", "error", f"config not found: {config_path}"))
        return {"ok": False, "checks": [c.to_dict() for c in checks], "facts": facts}

    config_doc = _read_toml(config_path)
    tool_mode = str(_get_nested(config_doc, ("routing", "tool_search_mode"), "forward") or "").strip().lower()
    summary_model = _get_nested(config_doc, ("summarization", "model_name"), ["auto"])
    summary_knowledge_type = str(
        _get_nested(config_doc, ("summarization", "default_knowledge_type"), "narrative") or "narrative"
    ).strip().lower()
    quantization = str(_get_nested(config_doc, ("embedding", "quantization_type"), "int8") or "").strip().lower()

    facts["routing.tool_search_mode"] = tool_mode
    facts["summarization.model_name_type"] = type(summary_model).__name__
    facts["summarization.default_knowledge_type"] = summary_knowledge_type
    facts["embedding.quantization_type"] = quantization

    if tool_mode == "legacy":
        checks.append(
            CheckItem(
                "CP-04",
                "error",
                "routing.tool_search_mode=legacy is no longer accepted at runtime",
            )
        )
    elif tool_mode not in {"forward", "disabled"}:
        checks.append(
            CheckItem(
                "CP-04",
                "error",
                f"routing.tool_search_mode invalid value: {tool_mode}",
            )
        )

    if isinstance(summary_model, str):
        checks.append(
            CheckItem(
                "CP-11",
                "error",
                "summarization.model_name must be List[str], string legacy format detected",
            )
        )
    elif not isinstance(summary_model, list) or any(not isinstance(x, str) for x in summary_model):
        checks.append(
            CheckItem(
                "CP-11",
                "error",
                "summarization.model_name must be List[str]",
            )
        )

    if summary_knowledge_type not in {item.value for item in KnowledgeType}:
        checks.append(
            CheckItem(
                "CP-13",
                "error",
                f"invalid summarization.default_knowledge_type: {summary_knowledge_type}",
            )
        )

    if quantization != "int8":
        checks.append(
            CheckItem(
                "UG-07",
                "error",
                "embedding.quantization_type must be int8 in vNext",
            )
        )

    vectors_dir = data_dir / "vectors"
    npy_path = vectors_dir / "vectors.npy"
    bin_path = vectors_dir / "vectors.bin"
    ids_bin_path = vectors_dir / "vectors_ids.bin"
    facts["vectors.npy_exists"] = npy_path.exists()
    facts["vectors.bin_exists"] = bin_path.exists()
    facts["vectors_ids.bin_exists"] = ids_bin_path.exists()

    if npy_path.exists() and not (bin_path.exists() and ids_bin_path.exists()):
        checks.append(
            CheckItem(
                "CP-07",
                "error",
                "legacy vectors.npy detected; offline migrate required",
                {"npy_path": str(npy_path)},
            )
        )

    metadata_db = data_dir / "metadata" / "metadata.db"
    facts["metadata_db_exists"] = metadata_db.exists()
    relation_count = 0
    if metadata_db.exists():
        conn = sqlite3.connect(str(metadata_db))
        try:
            has_schema_table = _sqlite_table_exists(conn, "schema_migrations")
            facts["schema_migrations_exists"] = has_schema_table
            if not has_schema_table:
                checks.append(
                    CheckItem(
                        "CP-08",
                        "error",
                        "schema_migrations table missing (legacy metadata schema)",
                    )
                )
            else:
                row = conn.execute("SELECT MAX(version) FROM schema_migrations").fetchone()
                version = int(row[0]) if row and row[0] is not None else 0
                facts["schema_version"] = version
                if version != SCHEMA_VERSION:
                    checks.append(
                        CheckItem(
                            "CP-08",
                            "error",
                            f"schema version mismatch: current={version}, expected={SCHEMA_VERSION}",
                        )
                    )

            if _sqlite_table_exists(conn, "relations"):
                row = conn.execute("SELECT COUNT(*) FROM relations").fetchone()
                relation_count = int(row[0]) if row and row[0] is not None else 0
            facts["relations_count"] = relation_count

            conflicts = _collect_hash_alias_conflicts(conn)
            facts["alias_conflict_count"] = len(conflicts)
            if conflicts:
                checks.append(
                    CheckItem(
                        "CP-05",
                        "error",
                        "32-bit relation hash alias conflict detected",
                        {"aliases": sorted(conflicts.keys())[:20], "total": len(conflicts)},
                    )
                )

            invalid_knowledge_types = _collect_invalid_knowledge_types(conn)
            facts["invalid_knowledge_type_values"] = invalid_knowledge_types
            if invalid_knowledge_types:
                checks.append(
                    CheckItem(
                        "CP-12",
                        "error",
                        "invalid paragraph knowledge_type values detected",
                        {"values": invalid_knowledge_types[:20], "total": len(invalid_knowledge_types)},
                    )
                )
        finally:
            conn.close()
    else:
        checks.append(
            CheckItem(
                "META-00",
                "warning",
                "metadata.db not found, schema checks skipped",
            )
        )

    graph_meta_path = data_dir / "graph" / "graph_metadata.pkl"
    facts["graph_metadata_exists"] = graph_meta_path.exists()
    if relation_count > 0:
        if not graph_meta_path.exists():
            checks.append(
                CheckItem(
                    "CP-06",
                    "error",
                    "relations exist but graph metadata missing",
                )
            )
        else:
            try:
                with open(graph_meta_path, "rb") as f:
                    graph_meta = pickle.load(f)
                edge_hash_map = graph_meta.get("edge_hash_map", {})
                edge_hash_map_size = len(edge_hash_map) if isinstance(edge_hash_map, dict) else 0
                facts["edge_hash_map_size"] = edge_hash_map_size
                if edge_hash_map_size <= 0:
                    checks.append(
                        CheckItem(
                            "CP-06",
                            "error",
                            "edge_hash_map missing/empty while relations exist",
                        )
                    )
            except Exception as e:
                checks.append(
                    CheckItem(
                        "CP-06",
                        "error",
                        f"failed to read graph metadata: {e}",
                    )
                )

    has_error = any(c.level == "error" for c in checks)
    return {
        "ok": not has_error,
        "checks": [c.to_dict() for c in checks],
        "facts": facts,
    }


def _migrate_config(config_doc: Dict[str, Any]) -> Dict[str, Any]:
    changes: Dict[str, Any] = {}

    routing = _ensure_table(config_doc, "routing")
    mode_raw = str(routing.get("tool_search_mode", "forward") or "").strip().lower()
    mode_new = mode_raw
    if mode_raw == "legacy" or mode_raw not in {"forward", "disabled"}:
        mode_new = "forward"
    if mode_new != mode_raw:
        routing["tool_search_mode"] = mode_new
        changes["routing.tool_search_mode"] = {"old": mode_raw, "new": mode_new}

    summary = _ensure_table(config_doc, "summarization")
    summary_model = summary.get("model_name", ["auto"])
    if isinstance(summary_model, str):
        normalized = [summary_model.strip() or "auto"]
        summary["model_name"] = normalized
        changes["summarization.model_name"] = {"old": summary_model, "new": normalized}
    elif not isinstance(summary_model, list):
        normalized = ["auto"]
        summary["model_name"] = normalized
        changes["summarization.model_name"] = {"old": str(type(summary_model)), "new": normalized}
    elif any(not isinstance(x, str) for x in summary_model):
        normalized = [str(x).strip() for x in summary_model if str(x).strip()]
        if not normalized:
            normalized = ["auto"]
        summary["model_name"] = normalized
        changes["summarization.model_name"] = {"old": summary_model, "new": normalized}

    default_knowledge_type = str(summary.get("default_knowledge_type", "narrative") or "").strip().lower()
    allowed_knowledge_types = {item.value for item in KnowledgeType}
    if default_knowledge_type not in allowed_knowledge_types:
        summary["default_knowledge_type"] = "narrative"
        changes["summarization.default_knowledge_type"] = {
            "old": default_knowledge_type,
            "new": "narrative",
        }

    embedding = _ensure_table(config_doc, "embedding")
    quantization = str(embedding.get("quantization_type", "int8") or "").strip().lower()
    if quantization != "int8":
        embedding["quantization_type"] = "int8"
        changes["embedding.quantization_type"] = {"old": quantization, "new": "int8"}

    return changes


def _migrate_impl(config_path: Path, data_dir: Path, dry_run: bool) -> Dict[str, Any]:
    config_doc = _read_toml(config_path)
    result: Dict[str, Any] = {
        "config_path": str(config_path),
        "data_dir": str(data_dir),
        "dry_run": bool(dry_run),
        "steps": {},
    }

    config_changes = _migrate_config(config_doc)
    result["steps"]["config"] = {"changed": bool(config_changes), "changes": config_changes}
    if config_changes and not dry_run:
        _write_toml(config_path, config_doc)

    vectors_dir = data_dir / "vectors"
    vectors_dir.mkdir(parents=True, exist_ok=True)
    npy_path = vectors_dir / "vectors.npy"
    bin_path = vectors_dir / "vectors.bin"
    ids_bin_path = vectors_dir / "vectors_ids.bin"
    if npy_path.exists() and not (bin_path.exists() and ids_bin_path.exists()):
        if dry_run:
            result["steps"]["vector"] = {"migrated": False, "reason": "dry_run"}
        else:
            dim = _guess_vector_dimension(config_doc, vectors_dir)
            store = VectorStore(
                dimension=max(1, int(dim)),
                quantization_type=QuantizationType.INT8,
                data_dir=vectors_dir,
            )
            result["steps"]["vector"] = store.migrate_legacy_npy(vectors_dir)
    else:
        result["steps"]["vector"] = {"migrated": False, "reason": "not_required"}

    metadata_dir = data_dir / "metadata"
    metadata_dir.mkdir(parents=True, exist_ok=True)
    metadata_db = metadata_dir / "metadata.db"
    triples: List[Tuple[str, str, str, str]] = []
    relation_count = 0

    metadata_result: Dict[str, Any] = {"migrated": False, "reason": "not_required"}
    if metadata_db.exists():
        store = MetadataStore(data_dir=metadata_dir)
        store.connect(enforce_schema=False)
        try:
            if dry_run:
                metadata_result = {"migrated": False, "reason": "dry_run"}
            else:
                metadata_result = store.run_legacy_migration_for_vnext()
            relation_count = int(store.count_relations())
            if relation_count > 0:
                triples = [(str(s), str(p), str(o), str(h)) for s, p, o, h in store.get_all_triples()]
        finally:
            store.close()
    result["steps"]["metadata"] = metadata_result

    graph_dir = data_dir / "graph"
    graph_dir.mkdir(parents=True, exist_ok=True)
    graph_matrix_format = str(_get_nested(config_doc, ("graph", "sparse_matrix_format"), "csr") or "csr")
    graph_store = GraphStore(matrix_format=graph_matrix_format, data_dir=graph_dir)
    graph_step: Dict[str, Any] = {
        "rebuilt": False,
        "mapped_hashes": 0,
        "relation_count": relation_count,
        "topology_rebuilt_from_relations": False,
    }
    if relation_count > 0:
        if dry_run:
            graph_step["reason"] = "dry_run"
        else:
            if graph_store.has_data():
                graph_store.load()

            mapped = graph_store.rebuild_edge_hash_map(triples)

            # 兜底:历史数据里 graph 节点/边与 relations 脱节时,直接从 relations 重建图。
            if mapped <= 0 or not graph_store.has_edge_hash_map():
                nodes = sorted({s for s, _, o, _ in triples} | {o for _, _, o, _ in triples})
                edges = [(s, o) for s, _, o, _ in triples]
                hashes = [h for _, _, _, h in triples]

                graph_store.clear()
                if nodes:
                    graph_store.add_nodes(nodes)
                if edges:
                    mapped = graph_store.add_edges(edges, relation_hashes=hashes)
                else:
                    mapped = 0
                graph_step.update(
                    {
                        "topology_rebuilt_from_relations": True,
                        "rebuilt_nodes": len(nodes),
                        "rebuilt_edges": int(graph_store.num_edges),
                    }
                )

            graph_store.save()
            graph_step.update({"rebuilt": True, "mapped_hashes": int(mapped)})
    else:
        graph_step["reason"] = "no_relations"
    result["steps"]["graph"] = graph_step

    return result


def _verify_impl(config_path: Path, data_dir: Path) -> Dict[str, Any]:
    checks: List[CheckItem] = []
    facts: Dict[str, Any] = {
        "config_path": str(config_path),
        "data_dir": str(data_dir),
    }

    if not config_path.exists():
        checks.append(CheckItem("CFG-00", "error", f"config not found: {config_path}"))
        return {"ok": False, "checks": [c.to_dict() for c in checks], "facts": facts}

    config_doc = _read_toml(config_path)
    mode = str(_get_nested(config_doc, ("routing", "tool_search_mode"), "forward") or "").strip().lower()
    if mode not in {"forward", "disabled"}:
        checks.append(CheckItem("CP-04", "error", f"invalid routing.tool_search_mode: {mode}"))

    summary_model = _get_nested(config_doc, ("summarization", "model_name"), ["auto"])
    if not isinstance(summary_model, list) or any(not isinstance(x, str) for x in summary_model):
        checks.append(CheckItem("CP-11", "error", "summarization.model_name must be List[str]"))
    summary_knowledge_type = str(
        _get_nested(config_doc, ("summarization", "default_knowledge_type"), "narrative") or "narrative"
    ).strip().lower()
    if summary_knowledge_type not in {item.value for item in KnowledgeType}:
        checks.append(
            CheckItem("CP-13", "error", f"invalid summarization.default_knowledge_type: {summary_knowledge_type}")
        )

    quantization = str(_get_nested(config_doc, ("embedding", "quantization_type"), "int8") or "").strip().lower()
    if quantization != "int8":
        checks.append(CheckItem("UG-07", "error", "embedding.quantization_type must be int8"))

    vectors_dir = data_dir / "vectors"
    npy_path = vectors_dir / "vectors.npy"
    bin_path = vectors_dir / "vectors.bin"
    ids_bin_path = vectors_dir / "vectors_ids.bin"
    if npy_path.exists() and not (bin_path.exists() and ids_bin_path.exists()):
        checks.append(CheckItem("CP-07", "error", "legacy vectors.npy still exists without bin migration"))

    metadata_dir = data_dir / "metadata"
    store = MetadataStore(data_dir=metadata_dir)
    try:
        store.connect(enforce_schema=True)
        schema_version = store.get_schema_version()
        facts["schema_version"] = schema_version
        if schema_version != SCHEMA_VERSION:
            checks.append(CheckItem("CP-08", "error", f"schema version mismatch: {schema_version}"))

        relation_count = int(store.count_relations())
        facts["relations_count"] = relation_count

        conflicts = {}
        invalid_knowledge_types: List[str] = []
        db_path = metadata_dir / "metadata.db"
        if db_path.exists():
            conn = sqlite3.connect(str(db_path))
            try:
                conflicts = _collect_hash_alias_conflicts(conn)
                invalid_knowledge_types = _collect_invalid_knowledge_types(conn)
            finally:
                conn.close()
        if conflicts:
            checks.append(
                CheckItem(
                    "CP-05",
                    "error",
                    "alias conflicts still exist after migration",
                    {"aliases": sorted(conflicts.keys())[:20], "total": len(conflicts)},
                )
            )
        if invalid_knowledge_types:
            checks.append(
                CheckItem(
                    "CP-12",
                    "error",
                    "invalid paragraph knowledge_type values remain after migration",
                    {"values": invalid_knowledge_types[:20], "total": len(invalid_knowledge_types)},
                )
            )

        if relation_count > 0:
            graph_dir = data_dir / "graph"
            if not (graph_dir / "graph_metadata.pkl").exists():
                checks.append(CheckItem("CP-06", "error", "graph metadata missing while relations exist"))
            else:
                matrix_format = str(_get_nested(config_doc, ("graph", "sparse_matrix_format"), "csr") or "csr")
                graph_store = GraphStore(matrix_format=matrix_format, data_dir=graph_dir)
                graph_store.load()
                if not graph_store.has_edge_hash_map():
                    checks.append(CheckItem("CP-06", "error", "edge_hash_map is empty"))
    except Exception as e:
        checks.append(CheckItem("CP-08", "error", f"metadata strict connect failed: {e}"))
    finally:
        try:
            store.close()
        except Exception:
            pass

    has_error = any(c.level == "error" for c in checks)
    return {
        "ok": not has_error,
        "checks": [c.to_dict() for c in checks],
        "facts": facts,
    }


def _print_report(title: str, report: Dict[str, Any]) -> None:
    print(f"=== {title} ===")
    print(f"ok: {bool(report.get('ok', True))}")
    facts = report.get("facts", {})
    if facts:
        print("facts:")
        for k in sorted(facts.keys()):
            print(f"  - {k}: {facts[k]}")
    checks = report.get("checks", [])
    if checks:
        print("checks:")
        for item in checks:
            print(f"  - [{item.get('level')}] {item.get('code')}: {item.get('message')}")
    else:
        print("checks: none")


def _write_json_if_needed(path: str, payload: Dict[str, Any]) -> None:
    if not path:
        return
    out = Path(path).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"json_out: {out}")


def main() -> int:
    parser = _build_arg_parser()
    args = parser.parse_args()
    config_path = Path(args.config).expanduser().resolve()
    if not config_path.exists():
        print(f"❌ config not found: {config_path}")
        return 2
    config_doc = _read_toml(config_path)
    data_dir = _resolve_data_dir(config_doc, args.data_dir)

    if args.command == "preflight":
        report = _preflight_impl(config_path, data_dir)
        _print_report("vNext Preflight", report)
        _write_json_if_needed(args.json_out, report)
        has_error = any(item.get("level") == "error" for item in report.get("checks", []))
        if args.strict and has_error:
            return 1
        return 0

    if args.command == "migrate":
        payload = _migrate_impl(config_path, data_dir, dry_run=bool(args.dry_run))
        print("=== vNext Migrate ===")
        print(json.dumps(payload, ensure_ascii=False, indent=2))

        verify_report = None
        if args.verify_after and not args.dry_run:
            verify_report = _verify_impl(config_path, data_dir)
            _print_report("vNext Verify (after migrate)", verify_report)
            payload["verify_after"] = verify_report

        _write_json_if_needed(args.json_out, payload)
        if verify_report is not None:
            has_error = any(item.get("level") == "error" for item in verify_report.get("checks", []))
            if has_error:
                return 1
        return 0

    if args.command == "verify":
        report = _verify_impl(config_path, data_dir)
        _print_report("vNext Verify", report)
        _write_json_if_needed(args.json_out, report)
        has_error = any(item.get("level") == "error" for item in report.get("checks", []))
        if args.strict and has_error:
            return 1
        return 0

    return 2


if __name__ == "__main__":
    raise SystemExit(main())
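To make the preflight → migrate → verify sequence from the module docstring concrete, here is a small in-process sketch built only from the functions defined above; the config path resolution is illustrative, and in practice the same flow is normally driven through the CLI subcommands.

# Illustrative sketch only -- not part of this commit. Chains the three
# migration phases in-process using this script's own helpers.
config_path = (PLUGIN_ROOT / "config.toml").resolve()
config_doc = _read_toml(config_path)
data_dir = _resolve_data_dir(config_doc, None)

pre = _preflight_impl(config_path, data_dir)
_print_report("vNext Preflight", pre)
if not pre["ok"]:
    _migrate_impl(config_path, data_dir, dry_run=False)
    post = _verify_impl(config_path, data_dir)
    _print_report("vNext Verify", post)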
152
plugins/A_memorix/scripts/runtime_self_check.py
Normal file
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
"""Run A_Memorix runtime self-check against real embedding/runtime configuration."""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
import tempfile
from pathlib import Path
from typing import Any

import tomlkit


CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PLUGIN_ROOT))


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="A_Memorix runtime self-check")
    parser.add_argument(
        "--config",
        default=str(PLUGIN_ROOT / "config.toml"),
        help="config.toml path (default: plugins/A_memorix/config.toml)",
    )
    parser.add_argument(
        "--data-dir",
        default="",
        help="optional data dir override; default resolved from config.storage.data_dir",
    )
    parser.add_argument(
        "--use-config-data-dir",
        action="store_true",
        help="use config.storage.data_dir directly instead of an isolated temp dir",
    )
    parser.add_argument(
        "--sample-text",
        default="A_Memorix runtime self check",
        help="sample text used for real embedding probe",
    )
    parser.add_argument("--json", action="store_true", help="print JSON report")
    return parser


if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    raise SystemExit(0)

from core.runtime.lifecycle_orchestrator import initialize_storage_async
from core.utils.runtime_self_check import run_embedding_runtime_self_check


def _load_config(path: Path) -> dict[str, Any]:
    with open(path, "r", encoding="utf-8") as f:
        raw = tomlkit.load(f)
    return dict(raw) if isinstance(raw, dict) else {}


def _nested_get(config: dict[str, Any], key: str, default: Any = None) -> Any:
    current: Any = config
    for part in key.split("."):
        if isinstance(current, dict) and part in current:
            current = current[part]
        else:
            return default
    return current


class _PluginStub:
    def __init__(self, config: dict[str, Any]):
        self.config = config
        self.vector_store = None
        self.graph_store = None
        self.metadata_store = None
        self.embedding_manager = None
        self.sparse_index = None
        self.relation_write_service = None

    def get_config(self, key: str, default: Any = None) -> Any:
        return _nested_get(self.config, key, default)


async def _main_async(args: argparse.Namespace) -> int:
    config_path = Path(args.config).resolve()
    if not config_path.exists():
        print(f"❌ 配置文件不存在: {config_path}")
        return 2

    config = _load_config(config_path)
    temp_dir_ctx = None
    if args.data_dir:
        storage_dir = str(Path(args.data_dir).resolve())
    elif args.use_config_data_dir:
        raw_data_dir = str(_nested_get(config, "storage.data_dir", "./data") or "./data").strip()
        if raw_data_dir.startswith("."):
            storage_dir = str((config_path.parent / raw_data_dir).resolve())
        else:
            storage_dir = str(Path(raw_data_dir).resolve())
    else:
        temp_dir_ctx = tempfile.TemporaryDirectory(prefix="memorix-runtime-self-check-")
        storage_dir = temp_dir_ctx.name

    storage_cfg = config.setdefault("storage", {})
    storage_cfg["data_dir"] = storage_dir

    plugin = _PluginStub(config)
    try:
        await initialize_storage_async(plugin)
        report = await run_embedding_runtime_self_check(
            config=config,
            vector_store=plugin.vector_store,
            embedding_manager=plugin.embedding_manager,
            sample_text=str(args.sample_text or "A_Memorix runtime self check"),
        )
        report["data_dir"] = storage_dir
        report["isolated_data_dir"] = temp_dir_ctx is not None
        if args.json:
            print(json.dumps(report, ensure_ascii=False, indent=2))
        else:
            print("A_Memorix Runtime Self-Check")
            print(f"ok: {report.get('ok')}")
            print(f"code: {report.get('code')}")
            print(f"message: {report.get('message')}")
            print(f"configured_dimension: {report.get('configured_dimension')}")
            print(f"vector_store_dimension: {report.get('vector_store_dimension')}")
            print(f"detected_dimension: {report.get('detected_dimension')}")
            print(f"encoded_dimension: {report.get('encoded_dimension')}")
            print(f"elapsed_ms: {float(report.get('elapsed_ms', 0.0)):.2f}")
        return 0 if bool(report.get("ok")) else 1
    finally:
        if plugin.metadata_store is not None:
            try:
                plugin.metadata_store.close()
            except Exception:
                pass
        if temp_dir_ctx is not None:
            temp_dir_ctx.cleanup()


def main() -> int:
    parser = _build_arg_parser()
    args = parser.parse_args()
    return asyncio.run(_main_async(args))


if __name__ == "__main__":
    raise SystemExit(main())
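For orientation, the report printed with --json has roughly the following shape; the keys are the ones read back by the plain-text printer above plus the data_dir/isolated_data_dir fields this script sets, while every value below is invented purely for illustration.

# Illustrative report shape only -- values are made up, not real output.
example_report = {
    "ok": True,
    "code": "ok",  # assumption: actual codes come from run_embedding_runtime_self_check
    "message": "embedding runtime reachable",
    "configured_dimension": 1024,
    "vector_store_dimension": 1024,
    "detected_dimension": 1024,
    "encoded_dimension": 1024,
    "elapsed_ms": 142.37,
    "data_dir": "/tmp/memorix-runtime-self-check-abc123",
    "isolated_data_dir": True,
}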