mai-bot/plugins/A_memorix/scripts/audit_vector_consistency.py
DawnARC 71b3a828c6 Add A_Memorix plugin v2.0.0 (runtime and documentation)
Introduces A_Memorix plugin v2.0.0: a large set of new runtime components, storage/schema updates, retrieval improvements, admin tools, import/tuning workflows, and accompanying documentation. Key additions include lifecycle_orchestrator, SDKMemoryKernel/runtime initializers, a new storage layer with metadata_store changes (SCHEMA_VERSION v8), retrieval enhancements (dual-path retrieval, graph-relation recall, sparse BM25), and several tool services (episode/person_profile/relation/segmentation/tuning/search execution). Also adds web and summary importers plus numerous maintenance scripts, and updates the plugin manifest, embedding API adapter, plugin.py, requirements/pyproject, and the main entry point to wire the new plugin into the project. This change prepares the 2.0.0 release, implementing a unified SDK Tool interface and extending overall runtime capability.
2026-03-19 00:09:04 +08:00

#!/usr/bin/env python3
"""
A_Memorix consistency audit script.

Reports:
1. paragraph/entity/relation vector coverage
2. relation vector_state distribution
3. orphan vector count (vector present but no metadata row)
4. mismatches between recorded state and the on-disk vector file
"""
from __future__ import annotations

import argparse
import json
import pickle
import sys
from pathlib import Path
from typing import Any, Dict, Set
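
# Make the host project root and the plugin root importable so the
# `core.*` imports below resolve when the script is run directly.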
CURRENT_DIR = Path(__file__).resolve().parent
PLUGIN_ROOT = CURRENT_DIR.parent
PROJECT_ROOT = PLUGIN_ROOT.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PLUGIN_ROOT))


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Audit A_Memorix vector consistency")
    parser.add_argument(
        "--data-dir",
        default=str(PLUGIN_ROOT / "data"),
        help="A_Memorix data directory (default: plugins/A_memorix/data)",
    )
    parser.add_argument("--json-out", default="", help="Optional: path of the JSON output file")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Return a non-zero exit code when consistency anomalies are found",
    )
    return parser


# --help/-h fast path: avoid heavy host/plugin bootstrap
if any(arg in {"-h", "--help"} for arg in sys.argv[1:]):
    _build_arg_parser().print_help()
    sys.exit(0)

try:
    from core.storage.vector_store import VectorStore
    from core.storage.metadata_store import MetadataStore
    from core.storage import QuantizationType
except Exception as e:  # pragma: no cover
    print(f"❌ Failed to import core modules: {e}")
    sys.exit(1)


def _safe_ratio(numerator: int, denominator: int) -> float:
    if denominator <= 0:
        return 0.0
    return float(numerator) / float(denominator)
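

# Both stores are opened directly from their on-disk files, so the audit
# does not need the plugin runtime to be running.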
def _load_vector_store(data_dir: Path) -> VectorStore:
    meta_path = data_dir / "vectors" / "vectors_metadata.pkl"
    if not meta_path.exists():
        raise FileNotFoundError(f"Vector metadata file not found: {meta_path}")
    with open(meta_path, "rb") as f:
        meta = pickle.load(f)
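    # The pickled metadata is assumed to carry at least {"dimension": int};
    # fall back to 1024 when the key is absent, clamped to >= 1 below.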
    dimension = int(meta.get("dimension", 1024))
    store = VectorStore(
        dimension=max(1, dimension),
        quantization_type=QuantizationType.INT8,
        data_dir=data_dir / "vectors",
    )
    if store.has_data():
        store.load()
    return store


def _load_metadata_store(data_dir: Path) -> MetadataStore:
    store = MetadataStore(data_dir=data_dir / "metadata")
    store.connect()
    return store


def _hash_set(metadata_store: MetadataStore, table: str) -> Set[str]:
    return {str(h) for h in metadata_store.list_hashes(table)}
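

# Expected return shape (inferred from how run_audit consumes it):
# a mapping with at least {"total": int, "ready": int}, plus any other
# vector_state buckets.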
def _relation_state_stats(metadata_store: MetadataStore) -> Dict[str, int]:
    return metadata_store.count_relations_by_vector_state()


def run_audit(data_dir: Path) -> Dict[str, Any]:
    vector_store = _load_vector_store(data_dir)
    metadata_store = _load_metadata_store(data_dir)
    try:
        paragraph_hashes = _hash_set(metadata_store, "paragraphs")
        entity_hashes = _hash_set(metadata_store, "entities")
        relation_hashes = _hash_set(metadata_store, "relations")
        known_hashes = set(getattr(vector_store, "_known_hashes", set()))
        live_vector_hashes = {h for h in known_hashes if h in vector_store}
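
        # "Live" vectors are known hashes still present in the store; coverage
        # counts metadata rows backed by a live vector, orphans are live
        # vectors with no metadata row in any table.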
        para_vector_hits = len(paragraph_hashes & live_vector_hashes)
        ent_vector_hits = len(entity_hashes & live_vector_hashes)
        rel_vector_hits = len(relation_hashes & live_vector_hashes)
        orphan_vector_hashes = sorted(
            live_vector_hashes - paragraph_hashes - entity_hashes - relation_hashes
        )
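
        # Cross-check each relation's persisted vector_state against what is
        # actually present in the vector file.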
        relation_rows = metadata_store.get_relations()
        ready_but_missing = 0
        not_ready_but_present = 0
        for row in relation_rows:
            h = str(row.get("hash") or "")
            state = str(row.get("vector_state") or "none").lower()
            in_vector = h in live_vector_hashes
            if state == "ready" and not in_vector:
                ready_but_missing += 1
            if state != "ready" and in_vector:
                not_ready_but_present += 1

        relation_states = _relation_state_stats(metadata_store)
        rel_total = max(0, int(relation_states.get("total", len(relation_hashes))))
        ready_count = max(0, int(relation_states.get("ready", 0)))

        result = {
            "counts": {
                "paragraphs": len(paragraph_hashes),
                "entities": len(entity_hashes),
                "relations": len(relation_hashes),
                "vectors_live": len(live_vector_hashes),
            },
            "coverage": {
                "paragraph_vector_coverage": _safe_ratio(para_vector_hits, len(paragraph_hashes)),
                "entity_vector_coverage": _safe_ratio(ent_vector_hits, len(entity_hashes)),
                "relation_vector_coverage": _safe_ratio(rel_vector_hits, len(relation_hashes)),
                "relation_ready_coverage": _safe_ratio(ready_count, rel_total),
            },
            "relation_states": relation_states,
            "orphans": {
                "vector_only_count": len(orphan_vector_hashes),
                "vector_only_sample": orphan_vector_hashes[:30],
            },
            "consistency_checks": {
                "ready_but_missing_vector": ready_but_missing,
                "not_ready_but_vector_present": not_ready_but_present,
            },
        }
        return result
    finally:
        metadata_store.close()
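

# A report looks roughly like the following (illustrative values only):
#   {
#     "counts": {"paragraphs": 1200, "entities": 300, "relations": 450, "vectors_live": 1930},
#     "coverage": {"paragraph_vector_coverage": 0.998, ..., "relation_ready_coverage": 0.951},
#     "relation_states": {...},
#     "orphans": {"vector_only_count": 2, "vector_only_sample": ["..."]},
#     "consistency_checks": {"ready_but_missing_vector": 0, "not_ready_but_vector_present": 3},
#   }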
def main() -> int:
    parser = _build_arg_parser()
    args = parser.parse_args()
    data_dir = Path(args.data_dir).resolve()
    if not data_dir.exists():
        print(f"❌ Data directory does not exist: {data_dir}")
        return 2
    try:
        result = run_audit(data_dir)
    except Exception as e:
        print(f"❌ Audit failed: {e}")
        return 2

    print("=== A_Memorix Vector Consistency Audit ===")
    print(f"data_dir: {data_dir}")
    print(f"paragraphs: {result['counts']['paragraphs']}")
    print(f"entities: {result['counts']['entities']}")
    print(f"relations: {result['counts']['relations']}")
    print(f"vectors_live: {result['counts']['vectors_live']}")
    print(
        "coverage: "
        f"paragraph={result['coverage']['paragraph_vector_coverage']:.3f}, "
        f"entity={result['coverage']['entity_vector_coverage']:.3f}, "
        f"relation={result['coverage']['relation_vector_coverage']:.3f}, "
        f"relation_ready={result['coverage']['relation_ready_coverage']:.3f}"
    )
    print(f"relation_states: {result['relation_states']}")
    print(
        "consistency_checks: "
        f"ready_but_missing_vector={result['consistency_checks']['ready_but_missing_vector']}, "
        f"not_ready_but_vector_present={result['consistency_checks']['not_ready_but_vector_present']}"
    )
    print(f"orphan_vectors: {result['orphans']['vector_only_count']}")
    if args.json_out:
        out_path = Path(args.json_out).resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"json_out: {out_path}")

    has_anomaly = (
        result["orphans"]["vector_only_count"] > 0
        or result["consistency_checks"]["ready_but_missing_vector"] > 0
    )
    if args.strict and has_anomaly:
        return 1
    return 0
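

# Exit codes: 0 = audit completed (anomalies tolerated unless --strict),
# 1 = --strict and anomalies found, 2 = missing data directory or audit error.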
if __name__ == "__main__":
    raise SystemExit(main())