引入 A_Memorix 插件 v2.0.0:新增大量运行时组件、存储/模式更新、检索能力提升、管理工具、导入/调优工作流以及相关文档。关键新增内容包括:lifecycle_orchestrator、SDKMemoryKernel/运行时初始化器、新的存储层与 metadata_store 变更(SCHEMA_VERSION v8)、检索增强(双路径检索、图关系召回、稀疏 BM25),以及多种工具服务(episode/person_profile/relation/segmentation/tuning/search execution)。同时新增 Web 导入/摘要导入器及大量维护脚本。还更新了插件清单、embedding API 适配器、plugin.py、requirements/pyproject,以及主入口文件,使新插件接入项目。该变更为 2.0.0 版本发布做好准备,实现统一的 SDK Tool 接口并扩展整体运行能力。
214 lines
7.3 KiB
Python
214 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
A_Memorix 一致性审计脚本。
|
||
|
||
输出内容:
|
||
1. paragraph/entity/relation 向量覆盖率
|
||
2. relation vector_state 分布
|
||
3. 孤儿向量数量(向量存在但 metadata 不存在)
|
||
4. 状态与向量文件不一致统计
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import pickle
|
||
import sys
|
||
from pathlib import Path
|
||
from typing import Any, Dict, Set
|
||
|
||
|
||
CURRENT_DIR = Path(__file__).resolve().parent
|
||
PLUGIN_ROOT = CURRENT_DIR.parent
|
||
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
|
||
sys.path.insert(0, str(PROJECT_ROOT))
|
||
sys.path.insert(0, str(PLUGIN_ROOT))
|
||
|
||
def _build_arg_parser() -> argparse.ArgumentParser:
|
||
parser = argparse.ArgumentParser(description="审计 A_Memorix 向量一致性")
|
||
parser.add_argument(
|
||
"--data-dir",
|
||
default=str(PLUGIN_ROOT / "data"),
|
||
help="A_Memorix 数据目录(默认: plugins/A_memorix/data)",
|
||
)
|
||
parser.add_argument("--json-out", default="", help="可选:输出 JSON 文件路径")
|
||
parser.add_argument(
|
||
"--strict",
|
||
action="store_true",
|
||
help="若发现一致性异常则返回非 0 退出码",
|
||
)
|
||
return parser
|
||
|
||
|
||
# --help/-h fast path: avoid heavy host/plugin bootstrap
|
||
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
|
||
_build_arg_parser().print_help()
|
||
sys.exit(0)
|
||
|
||
try:
|
||
from core.storage.vector_store import VectorStore
|
||
from core.storage.metadata_store import MetadataStore
|
||
from core.storage import QuantizationType
|
||
except Exception as e: # pragma: no cover
|
||
print(f"❌ 导入核心模块失败: {e}")
|
||
sys.exit(1)
|
||
|
||
|
||
def _safe_ratio(numerator: int, denominator: int) -> float:
|
||
if denominator <= 0:
|
||
return 0.0
|
||
return float(numerator) / float(denominator)
|
||
|
||
|
||
def _load_vector_store(data_dir: Path) -> VectorStore:
|
||
meta_path = data_dir / "vectors" / "vectors_metadata.pkl"
|
||
if not meta_path.exists():
|
||
raise FileNotFoundError(f"未找到向量元数据文件: {meta_path}")
|
||
|
||
with open(meta_path, "rb") as f:
|
||
meta = pickle.load(f)
|
||
dimension = int(meta.get("dimension", 1024))
|
||
|
||
store = VectorStore(
|
||
dimension=max(1, dimension),
|
||
quantization_type=QuantizationType.INT8,
|
||
data_dir=data_dir / "vectors",
|
||
)
|
||
if store.has_data():
|
||
store.load()
|
||
return store
|
||
|
||
|
||
def _load_metadata_store(data_dir: Path) -> MetadataStore:
|
||
store = MetadataStore(data_dir=data_dir / "metadata")
|
||
store.connect()
|
||
return store
|
||
|
||
|
||
def _hash_set(metadata_store: MetadataStore, table: str) -> Set[str]:
|
||
return {str(h) for h in metadata_store.list_hashes(table)}
|
||
|
||
|
||
def _relation_state_stats(metadata_store: MetadataStore) -> Dict[str, int]:
|
||
return metadata_store.count_relations_by_vector_state()
|
||
|
||
|
||
def run_audit(data_dir: Path) -> Dict[str, Any]:
|
||
vector_store = _load_vector_store(data_dir)
|
||
metadata_store = _load_metadata_store(data_dir)
|
||
try:
|
||
paragraph_hashes = _hash_set(metadata_store, "paragraphs")
|
||
entity_hashes = _hash_set(metadata_store, "entities")
|
||
relation_hashes = _hash_set(metadata_store, "relations")
|
||
|
||
known_hashes = set(getattr(vector_store, "_known_hashes", set()))
|
||
live_vector_hashes = {h for h in known_hashes if h in vector_store}
|
||
|
||
para_vector_hits = len(paragraph_hashes & live_vector_hashes)
|
||
ent_vector_hits = len(entity_hashes & live_vector_hashes)
|
||
rel_vector_hits = len(relation_hashes & live_vector_hashes)
|
||
|
||
orphan_vector_hashes = sorted(
|
||
live_vector_hashes - paragraph_hashes - entity_hashes - relation_hashes
|
||
)
|
||
|
||
relation_rows = metadata_store.get_relations()
|
||
ready_but_missing = 0
|
||
not_ready_but_present = 0
|
||
for row in relation_rows:
|
||
h = str(row.get("hash") or "")
|
||
state = str(row.get("vector_state") or "none").lower()
|
||
in_vector = h in live_vector_hashes
|
||
if state == "ready" and not in_vector:
|
||
ready_but_missing += 1
|
||
if state != "ready" and in_vector:
|
||
not_ready_but_present += 1
|
||
|
||
relation_states = _relation_state_stats(metadata_store)
|
||
rel_total = max(0, int(relation_states.get("total", len(relation_hashes))))
|
||
ready_count = max(0, int(relation_states.get("ready", 0)))
|
||
|
||
result = {
|
||
"counts": {
|
||
"paragraphs": len(paragraph_hashes),
|
||
"entities": len(entity_hashes),
|
||
"relations": len(relation_hashes),
|
||
"vectors_live": len(live_vector_hashes),
|
||
},
|
||
"coverage": {
|
||
"paragraph_vector_coverage": _safe_ratio(para_vector_hits, len(paragraph_hashes)),
|
||
"entity_vector_coverage": _safe_ratio(ent_vector_hits, len(entity_hashes)),
|
||
"relation_vector_coverage": _safe_ratio(rel_vector_hits, len(relation_hashes)),
|
||
"relation_ready_coverage": _safe_ratio(ready_count, rel_total),
|
||
},
|
||
"relation_states": relation_states,
|
||
"orphans": {
|
||
"vector_only_count": len(orphan_vector_hashes),
|
||
"vector_only_sample": orphan_vector_hashes[:30],
|
||
},
|
||
"consistency_checks": {
|
||
"ready_but_missing_vector": ready_but_missing,
|
||
"not_ready_but_vector_present": not_ready_but_present,
|
||
},
|
||
}
|
||
return result
|
||
finally:
|
||
metadata_store.close()
|
||
|
||
|
||
def main() -> int:
|
||
parser = _build_arg_parser()
|
||
args = parser.parse_args()
|
||
|
||
data_dir = Path(args.data_dir).resolve()
|
||
if not data_dir.exists():
|
||
print(f"❌ 数据目录不存在: {data_dir}")
|
||
return 2
|
||
|
||
try:
|
||
result = run_audit(data_dir)
|
||
except Exception as e:
|
||
print(f"❌ 审计失败: {e}")
|
||
return 2
|
||
|
||
print("=== A_Memorix Vector Consistency Audit ===")
|
||
print(f"data_dir: {data_dir}")
|
||
print(f"paragraphs: {result['counts']['paragraphs']}")
|
||
print(f"entities: {result['counts']['entities']}")
|
||
print(f"relations: {result['counts']['relations']}")
|
||
print(f"vectors_live: {result['counts']['vectors_live']}")
|
||
print(
|
||
"coverage: "
|
||
f"paragraph={result['coverage']['paragraph_vector_coverage']:.3f}, "
|
||
f"entity={result['coverage']['entity_vector_coverage']:.3f}, "
|
||
f"relation={result['coverage']['relation_vector_coverage']:.3f}, "
|
||
f"relation_ready={result['coverage']['relation_ready_coverage']:.3f}"
|
||
)
|
||
print(f"relation_states: {result['relation_states']}")
|
||
print(
|
||
"consistency_checks: "
|
||
f"ready_but_missing_vector={result['consistency_checks']['ready_but_missing_vector']}, "
|
||
f"not_ready_but_vector_present={result['consistency_checks']['not_ready_but_vector_present']}"
|
||
)
|
||
print(f"orphan_vectors: {result['orphans']['vector_only_count']}")
|
||
|
||
if args.json_out:
|
||
out_path = Path(args.json_out).resolve()
|
||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||
with open(out_path, "w", encoding="utf-8") as f:
|
||
json.dump(result, f, ensure_ascii=False, indent=2)
|
||
print(f"json_out: {out_path}")
|
||
|
||
has_anomaly = (
|
||
result["orphans"]["vector_only_count"] > 0
|
||
or result["consistency_checks"]["ready_but_missing_vector"] > 0
|
||
)
|
||
if args.strict and has_anomaly:
|
||
return 1
|
||
return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
raise SystemExit(main())
|