fix(A_memorix): normalize imported data and fix progress display

Add robust normalization and validation for imported data and LLM output, fix the progress-calculation logic, and improve the UI summary display.
DawnARC
2026-05-06 22:41:35 +08:00
parent 1bb6f514e7
commit 9acf03d24b
9 changed files with 514 additions and 49 deletions

View File

@@ -0,0 +1,21 @@
from pathlib import Path

from src.A_memorix.core.storage.metadata_store import MetadataStore


def test_get_all_sources_ignores_soft_deleted_paragraphs(tmp_path: Path) -> None:
    store = MetadataStore(data_dir=tmp_path)
    store.connect()
    try:
        live_hash = store.add_paragraph("Alice 喜欢地图", source="live-source")
        deleted_hash = store.add_paragraph("Bob 喜欢咖啡", source="deleted-source")
        assert live_hash
        store.mark_as_deleted([deleted_hash], "paragraph")
        sources = store.get_all_sources()
    finally:
        store.close()

    assert [item["source"] for item in sources] == ["live-source"]
    assert sources[0]["count"] == 1
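
The assertions pin the contract: soft-deleted paragraphs must drop out of both the source list and the per-source counts. A minimal sketch of a get_all_sources that would satisfy this, assuming MetadataStore is backed by SQLite with a paragraphs table carrying source and is_deleted columns (the schema, column names, and self._conn are assumptions, not the plugin's actual implementation):

# Sketch only: assumes a SQLite connection and an is_deleted soft-delete flag.
def get_all_sources(self) -> list[dict[str, object]]:
    rows = self._conn.execute(
        """
        SELECT source, COUNT(*) AS count
        FROM paragraphs
        WHERE is_deleted = 0   -- soft-deleted rows never reach the summary
        GROUP BY source
        ORDER BY source
        """
    ).fetchall()
    return [{"source": source, "count": count} for source, count in rows]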

View File

@@ -1,6 +1,11 @@
 import pytest
-from src.A_memorix.core.utils.summary_importer import SummaryImporter
+from src.A_memorix.core.utils.summary_importer import (
+    SummaryImporter,
+    _message_timestamp,
+    _normalize_entity_items,
+    _normalize_relation_items,
+)
 from src.config.model_configs import TaskConfig
 from src.services import llm_service as llm_api
@@ -46,3 +51,22 @@ def test_resolve_summary_model_config_rejects_legacy_string_selector(monkeypatch
     with pytest.raises(ValueError, match="List\\[str\\]"):
         importer._resolve_summary_model_config()


def test_summary_importer_normalizes_llm_entities_and_relations():
    assert _normalize_entity_items(["Alice", {"name": "地图"}, ["bad"], "Alice"]) == ["Alice", "地图"]
    assert _normalize_entity_items("Alice") == []
    assert _normalize_relation_items(
        [
            {"subject": "Alice", "predicate": "持有", "object": "地图"},
            {"subject": "Alice", "predicate": "", "object": "地图"},
            ["bad"],
        ]
    ) == [{"subject": "Alice", "predicate": "持有", "object": "地图"}]


def test_summary_importer_message_timestamp_accepts_time_fallback():
    class Message:
        time = 123.5

    assert _message_timestamp(Message()) == 123.5
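
Read together, these assertions fully pin the helpers' behavior: only lists are accepted as input, string items and {"name": ...} dicts become deduplicated names, a triple needs all three fields non-empty, and a bare time attribute works as a timestamp fallback. Minimal sketches that reproduce exactly this tested behavior (the real module may differ in details; the primary timestamp attribute name is assumed to be timestamp):

from typing import Any


def _normalize_entity_items(raw: Any) -> list[str]:
    # Only a list is accepted; a bare string (or anything else) yields [].
    if not isinstance(raw, list):
        return []
    names: list[str] = []
    for item in raw:
        if isinstance(item, str):
            name = item.strip()
        elif isinstance(item, dict):
            name = str(item.get("name", "")).strip()
        else:
            continue  # nested lists and other shapes are skipped
        if name and name not in names:
            names.append(name)  # dedupe while preserving order
    return names


def _normalize_relation_items(raw: Any) -> list[dict[str, str]]:
    if not isinstance(raw, list):
        return []
    relations: list[dict[str, str]] = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        subject = str(item.get("subject", "")).strip()
        predicate = str(item.get("predicate", "")).strip()
        obj = str(item.get("object", "")).strip()
        if subject and predicate and obj:  # an empty field disqualifies the triple
            relations.append({"subject": subject, "predicate": predicate, "object": obj})
    return relations


def _message_timestamp(message: Any) -> float | None:
    # Prefer an explicit timestamp, fall back to a bare `time` attribute.
    for attr in ("timestamp", "time"):
        value = getattr(message, attr, None)
        if isinstance(value, (int, float)):
            return float(value)
    return None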

View File

@@ -0,0 +1,117 @@
from types import SimpleNamespace

import numpy as np
import pytest

from src.A_memorix.core.strategies.base import ChunkContext, KnowledgeType, ProcessedChunk, SourceInfo
from src.A_memorix.core.utils.web_import_manager import ImportTaskManager


class _DummyMetadataStore:
    def __init__(self) -> None:
        self.paragraphs: list[dict[str, object]] = []
        self.entities: list[str] = []
        self.relations: list[tuple[str, str, str]] = []

    def add_paragraph(self, **kwargs):
        self.paragraphs.append(dict(kwargs))
        return f"paragraph-{len(self.paragraphs)}"

    def add_entity(self, *, name: str, source_paragraph: str = "") -> str:
        del source_paragraph
        self.entities.append(name)
        return f"entity-{name}"

    def add_relation(self, *, subject: str, predicate: str, obj: str, **kwargs) -> str:
        del kwargs
        self.relations.append((subject, predicate, obj))
        return f"relation-{len(self.relations)}"

    def set_relation_vector_state(self, rel_hash: str, state: str) -> None:
        del rel_hash, state


class _DummyGraphStore:
    def __init__(self) -> None:
        self.nodes: list[list[str]] = []
        self.edges: list[list[tuple[str, str]]] = []

    def add_nodes(self, nodes):
        self.nodes.append(list(nodes))

    def add_edges(self, edges, relation_hashes=None):
        del relation_hashes
        self.edges.append(list(edges))


class _DummyVectorStore:
    def __contains__(self, item: str) -> bool:
        del item
        return False

    def add(self, vectors, ids):
        del vectors, ids


class _DummyEmbeddingManager:
    async def encode(self, text: str) -> np.ndarray:
        del text
        return np.ones(4, dtype=np.float32)


def _build_manager() -> tuple[ImportTaskManager, _DummyMetadataStore]:
    metadata_store = _DummyMetadataStore()
    plugin = SimpleNamespace(
        metadata_store=metadata_store,
        graph_store=_DummyGraphStore(),
        vector_store=_DummyVectorStore(),
        embedding_manager=_DummyEmbeddingManager(),
        relation_write_service=None,
        get_config=lambda key, default=None: default,
        _is_embedding_degraded=lambda: False,
        _allow_metadata_only_write=lambda: True,
        write_paragraph_vector_or_enqueue=None,
    )
    manager = ImportTaskManager(plugin)
    return manager, metadata_store


def _build_chunk(data) -> ProcessedChunk:
    return ProcessedChunk(
        type=KnowledgeType.FACTUAL,
        source=SourceInfo(file="demo.txt", offset_start=0, offset_end=4),
        chunk=ChunkContext(chunk_id="chunk-1", index=0, text="Alice 持有地图"),
        data=data,
    )


@pytest.mark.asyncio
async def test_persist_processed_chunk_rejects_non_object_before_paragraph_write() -> None:
    manager, metadata_store = _build_manager()
    file_record = SimpleNamespace(source_path="", source_kind="paste", name="demo.txt")

    with pytest.raises(ValueError, match="分块抽取结果 必须返回 JSON 对象"):
        await manager._persist_processed_chunk(file_record, _build_chunk(["bad"]))

    assert metadata_store.paragraphs == []


@pytest.mark.asyncio
async def test_persist_processed_chunk_skips_invalid_nested_items() -> None:
    manager, metadata_store = _build_manager()
    file_record = SimpleNamespace(source_path="", source_kind="paste", name="demo.txt")

    await manager._persist_processed_chunk(
        file_record,
        _build_chunk(
            {
                "triples": [{"subject": "Alice", "predicate": "持有", "object": "地图"}, ["bad"]],
                "relations": [{"subject": "Alice", "predicate": "", "object": "地图"}],
                "entities": ["Alice", {"name": "地图"}, ["bad"]],
            }
        ),
    )

    assert len(metadata_store.paragraphs) == 1
    assert set(metadata_store.entities) >= {"Alice", "地图"}
    assert metadata_store.relations == [("Alice", "持有", "地图")]
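
These two tests pin the write path's guardrails: a non-dict extraction result must raise before any paragraph is written, while malformed nested items are silently dropped and the valid ones are persisted exactly once. A hedged sketch of that validation phase, reusing the normalizers from summary_importer above; self._plugin and the exact persistence calls are assumptions, and vector/graph writes are elided:

# Sketch of the validation phase only; the real method also handles vectors,
# graph edges, and enqueueing, which are omitted here.
async def _persist_processed_chunk(self, file_record, chunk) -> None:
    data = chunk.data
    if not isinstance(data, dict):
        # Fail before touching storage, matching the first test's
        # assertion that no paragraph is written.
        raise ValueError("分块抽取结果 必须返回 JSON 对象")

    entities = _normalize_entity_items(data.get("entities", []))
    triples = _normalize_relation_items(data.get("triples", []))
    triples += _normalize_relation_items(data.get("relations", []))

    store = self._plugin.metadata_store  # attribute name assumed
    paragraph_hash = store.add_paragraph(text=chunk.chunk.text, source=file_record.name)
    for name in entities:
        store.add_entity(name=name, source_paragraph=paragraph_hash)
    for triple in triples:
        store.add_relation(
            subject=triple["subject"],
            predicate=triple["predicate"],
            obj=triple["object"],
        )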