This commit is contained in:
SengokuCola
2026-05-07 00:05:39 +08:00
9 changed files with 590 additions and 61 deletions

View File

@@ -2623,6 +2623,7 @@ class MetadataStore:
SELECT source, COUNT(*) as count, MAX(created_at) as last_updated
FROM paragraphs
WHERE source IS NOT NULL AND source != ''
+ AND (is_deleted IS NULL OR is_deleted = 0)
GROUP BY source
ORDER BY last_updated DESC
""")

View File

@@ -56,6 +56,63 @@ SUMMARY_PROMPT_TEMPLATE = """
Note: the summary should be narrative, so that it can serve as part of long-term memory. Use the entities' actual names directly; do not use placeholders such as e1/e2.
"""
+ def _normalize_entity_items(raw_entities: Any) -> List[str]:
+ if not isinstance(raw_entities, list):
+ return []
+ entities: List[str] = []
+ seen = set()
+ for item in raw_entities:
+ if isinstance(item, str):
+ name = item.strip()
+ elif isinstance(item, dict):
+ name = str(item.get("name") or item.get("label") or item.get("entity") or "").strip()
+ else:
+ name = ""
+ if not name:
+ continue
+ key = name.lower()
+ if key in seen:
+ continue
+ seen.add(key)
+ entities.append(name)
+ return entities
+
+ def _normalize_relation_items(raw_relations: Any) -> List[Dict[str, str]]:
+ if not isinstance(raw_relations, list):
+ return []
+ relations: List[Dict[str, str]] = []
+ for item in raw_relations:
+ if not isinstance(item, dict):
+ continue
+ subject = str(item.get("subject", "") or "").strip()
+ predicate = str(item.get("predicate", "") or "").strip()
+ obj = str(item.get("object", "") or "").strip()
+ if not (subject and predicate and obj):
+ continue
+ relations.append({"subject": subject, "predicate": predicate, "object": obj})
+ return relations
+
+ def _message_timestamp(message: Any) -> Optional[float]:
+ for attr_name in ("timestamp", "time"):
+ value = getattr(message, attr_name, None)
+ if value is None:
+ continue
+ timestamp_func = getattr(value, "timestamp", None)
+ if callable(timestamp_func):
+ try:
+ return float(timestamp_func())
+ except Exception:
+ continue
+ try:
+ return float(value)
+ except Exception:
+ continue
+ return None
class SummaryImporter:
"""总结并导入知识的工具类"""
@@ -312,14 +369,12 @@ class SummaryImporter:
if not data or "summary" not in data:
return False, "解析 LLM 响应失败或总结为空"
summary_text = data["summary"]
entities = data.get("entities", [])
relations = data.get("relations", [])
msg_times = [
float(getattr(getattr(msg, "timestamp", None), "timestamp", lambda: 0.0)())
for msg in messages
if getattr(msg, "time", None) is not None
]
summary_text = str(data["summary"] or "").strip()
if not summary_text:
return False, "解析 LLM 响应失败或总结为空"
entities = _normalize_entity_items(data.get("entities"))
relations = _normalize_relation_items(data.get("relations"))
msg_times = [timestamp for msg in messages if (timestamp := _message_timestamp(msg)) is not None]
time_meta = {}
if msg_times:
time_meta = {
@@ -455,8 +510,8 @@ class SummaryImporter:
if not isinstance(rv_cfg, dict):
rv_cfg = {}
write_vector = bool(rv_cfg.get("enabled", False)) and bool(rv_cfg.get("write_on_import", True))
- for rel in relations:
- s, p, o = rel.get("subject"), rel.get("predicate"), rel.get("object")
+ for rel in _normalize_relation_items(relations):
+ s, p, o = rel["subject"], rel["predicate"], rel["object"]
if all([s, p, o]):
if self.relation_write_service is not None:
await self.relation_write_service.upsert_relation_with_vector(

View File

@@ -6,6 +6,11 @@ Web Import Task Manager
from __future__ import annotations
+ from collections import deque
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable, Dict, List, Optional, Tuple
import asyncio
import hashlib
import json
@@ -15,24 +20,24 @@ import sys
import time
import traceback
import uuid
- from collections import deque
- from dataclasses import dataclass, field
- from datetime import datetime
- from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Tuple
from src.common.logger import get_logger
from src.services import llm_service as llm_api
from ...paths import default_data_dir, repo_root, resolve_repo_path, scripts_root
from ..storage import (
+ KnowledgeType,
+ MetadataStore,
parse_import_strategy,
resolve_stored_knowledge_type,
select_import_strategy,
- KnowledgeType,
- MetadataStore,
)
+ from ..storage.knowledge_types import ImportStrategy
from ..storage.type_detection import looks_like_quote_text
+ from ..strategies.base import KnowledgeType as StrategyKnowledgeType, ProcessedChunk
+ from ..strategies.factual import FactualStrategy
+ from ..strategies.narrative import NarrativeStrategy
+ from ..strategies.quote import QuoteStrategy
from ..utils.import_payloads import (
ImportPayloadValidationError,
is_probable_hash_token,
@@ -42,11 +47,6 @@ from ..utils.import_payloads import (
)
from ..utils.runtime_self_check import ensure_runtime_self_check
from ..utils.time_parser import normalize_time_meta
- from ..storage.knowledge_types import ImportStrategy
- from ..strategies.base import ProcessedChunk, KnowledgeType as StrategyKnowledgeType
- from ..strategies.narrative import NarrativeStrategy
- from ..strategies.factual import FactualStrategy
- from ..strategies.quote import QuoteStrategy
logger = get_logger("A_Memorix.WebImportManager")
@@ -141,6 +141,44 @@ def _coerce_list(value: Any) -> List[str]:
return out
+ def _coerce_import_data_dict(value: Any, *, context: str) -> Dict[str, Any]:
+ """Ensure the LLM extraction result is an object, so the write phase cannot partially commit."""
+ if value is None:
+ return {}
+ if isinstance(value, dict):
+ return value
+ raise ValueError(f"{context} must return a JSON object; got type: {type(value).__name__}")
+
+ def _normalize_import_relation_list(value: Any) -> List[Dict[str, str]]:
+ if not isinstance(value, list):
+ return []
+ relations: List[Dict[str, str]] = []
+ for item in value:
+ relation = normalize_relation_import_item(item)
+ if relation is not None:
+ relations.append(relation)
+ return relations
+
+ def _normalize_import_entity_list(value: Any) -> List[str]:
+ if not isinstance(value, list):
+ return []
+ entities: List[str] = []
+ seen = set()
+ for item in value:
+ name = normalize_entity_import_item(item)
+ if not name:
+ continue
+ key = name.lower()
+ if key in seen:
+ continue
+ seen.add(key)
+ entities.append(name)
+ return entities
def _parse_optional_positive_int(value: Any, field_name: str) -> Optional[int]:
if value is None:
return None
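`_coerce_import_data_dict` is the fail-fast half of this change: a malformed extraction (a list, a bare string) now raises before any paragraph or relation rows are written, instead of half-succeeding. A sketch, assuming the function above is in scope:

```python
# None is tolerated (empty result); a dict passes through; anything
# else aborts loudly before the write phase can partially commit.
print(_coerce_import_data_dict(None, context="demo"))             # {}
print(_coerce_import_data_dict({"triples": []}, context="demo"))  # {'triples': []}
try:
    _coerce_import_data_dict(["not", "a", "dict"], context="demo")
except ValueError as e:
    print(e)  # demo must return a JSON object; got type: list
```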
@@ -2002,7 +2040,7 @@ class ImportTaskManager:
if total <= 0:
total = max(1, scanned)
- progress = max(0.0, min(1.0, float(scanned) / float(total))) if total > 0 else 0.0
+ chunk_progress = max(0.0, min(1.0, float(scanned) / float(total))) if total > 0 else 0.0
preview = f"scanned={scanned}/{total}, migrated={migrated}, bad={bad}, last_id={last_id}"
async with self._lock:
@@ -2017,14 +2055,14 @@ class ImportTaskManager:
if c.status not in {"completed", "failed", "cancelled"}:
c.status = "writing"
c.step = "migrating"
- c.progress = progress
+ c.progress = chunk_progress
c.content_preview = preview
c.updated_at = _now()
f.total_chunks = total
f.done_chunks = done
f.failed_chunks = bad
f.cancelled_chunks = 0
- f.progress = progress
+ self._recompute_file_progress(f)
if f.status not in {"failed", "cancelled"}:
f.status = "writing"
f.current_step = "migrating"
@@ -2171,7 +2209,7 @@ class ImportTaskManager:
f.done_chunks = max(0, min(f.done_chunks, f.total_chunks))
f.failed_chunks = max(0, min(f.failed_chunks, f.total_chunks))
f.cancelled_chunks = 0
- f.progress = 1.0
+ self._recompute_file_progress(f)
f.status = "completed"
f.current_step = "completed"
if bad_rows > 0 and not f.error:
@@ -3128,6 +3166,7 @@ class ImportTaskManager:
if is_probable_hash_token(content):
logger.warning(f"跳过疑似哈希段落写入: source={self._source_label(file_record)} preview={content[:32]}")
return
+ data = _coerce_import_data_dict(processed.data, context="chunk extraction result")
para_hash = self.plugin.metadata_store.add_paragraph(
content=content,
source=self._source_label(file_record),
@@ -3145,31 +3184,25 @@ class ImportTaskManager:
f"web_import text paragraph 向量写入降级: hash={para_hash[:8]} detail={vector_result.get('detail')}"
)
- data = processed.data or {}
entities: List[str] = []
relations: List[Tuple[str, str, str]] = []
- for triple in data.get("triples", []):
- s = str(triple.get("subject", "")).strip()
- p = str(triple.get("predicate", "")).strip()
- o = str(triple.get("object", "")).strip()
- if s and p and o:
- relations.append((s, p, o))
- entities.extend([s, o])
+ for triple in _normalize_import_relation_list(data.get("triples")):
+ s = triple["subject"]
+ p = triple["predicate"]
+ o = triple["object"]
+ relations.append((s, p, o))
+ entities.extend([s, o])
- for rel in data.get("relations", []):
- s = str(rel.get("subject", "")).strip()
- p = str(rel.get("predicate", "")).strip()
- o = str(rel.get("object", "")).strip()
- if s and p and o:
- relations.append((s, p, o))
- entities.extend([s, o])
+ for rel in _normalize_import_relation_list(data.get("relations")):
+ s = rel["subject"]
+ p = rel["predicate"]
+ o = rel["object"]
+ relations.append((s, p, o))
+ entities.extend([s, o])
for k in ("entities", "events", "verbatim_entities"):
- for e in data.get(k, []):
- name = str(e or "").strip()
- if name and not is_probable_hash_token(name):
- entities.append(name)
+ entities.extend(_normalize_import_entity_list(data.get(k)))
uniq_entities = list({x.strip().lower(): x.strip() for x in entities if str(x).strip()}.values())
for name in uniq_entities:
@@ -3294,12 +3327,12 @@ class ImportTaskManager:
txt = txt[4:].strip()
try:
- return json.loads(txt)
+ return _coerce_import_data_dict(json.loads(txt), context="LLM extraction result")
except Exception:
s = txt.find("{")
e = txt.rfind("}")
if s >= 0 and e > s:
- return json.loads(txt[s : e + 1])
+ return _coerce_import_data_dict(json.loads(txt[s : e + 1]), context="LLM extraction result")
raise
except Exception as err:
last_error = err
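The parse path above strips an optional markdown fence, attempts a straight `json.loads`, and only then falls back to slicing the outermost `{...}` span; the new `_coerce_import_data_dict` wrapper ensures both branches yield an object. A self-contained restatement of the pattern (`parse_llm_json` is an illustrative name, not from this commit):

```python
# Strip a possible ```json fence, try a direct parse, then fall back to
# the outermost brace-delimited slice -- tolerant of chatty LLM output.
import json
from typing import Any, Dict

def parse_llm_json(txt: str) -> Dict[str, Any]:
    txt = txt.strip()
    if txt.startswith("```"):
        txt = txt.strip("`").strip()
        if txt[:4].lower() == "json":
            txt = txt[4:].strip()
    try:
        return json.loads(txt)
    except Exception:
        s, e = txt.find("{"), txt.rfind("}")
        if s >= 0 and e > s:
            return json.loads(txt[s : e + 1])
        raise

print(parse_llm_json('Sure! Here it is: {"summary": "ok"} Hope that helps.'))
# {'summary': 'ok'}
```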
@@ -3370,6 +3403,7 @@ JSON schema:
logger.warning(f"chat_log 时间语义抽取失败: {e}")
return None
+ result = _coerce_import_data_dict(result, context="chat_log time extraction result")
raw_time_meta = {
"event_time": result.get("event_time"),
"event_time_start": result.get("event_time_start"),
@@ -3541,9 +3575,7 @@ JSON schema:
additional_cancelled += 1
if additional_cancelled > 0:
f.cancelled_chunks += additional_cancelled
- f.progress = self._compute_ratio(
- f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks
- )
+ self._recompute_file_progress(f)
f.updated_at = _now()
task.updated_at = _now()
self._recompute_task_progress(task)
@@ -3601,7 +3633,7 @@ JSON schema:
c.progress = 1.0
c.updated_at = _now()
f.done_chunks += 1
- f.progress = self._compute_ratio(f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks)
+ self._recompute_file_progress(f)
f.updated_at = _now()
self._recompute_task_progress(task)
@@ -3629,7 +3661,7 @@ JSON schema:
c.progress = 1.0
c.updated_at = _now()
f.failed_chunks += 1
- f.progress = self._compute_ratio(f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks)
+ self._recompute_file_progress(f)
if not f.error:
f.error = str(error)
f.updated_at = _now()
@@ -3653,7 +3685,7 @@ JSON schema:
c.progress = 1.0
c.updated_at = _now()
f.cancelled_chunks += 1
- f.progress = self._compute_ratio(f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks)
+ self._recompute_file_progress(f)
f.updated_at = _now()
self._recompute_task_progress(task)
@@ -3681,6 +3713,9 @@ JSON schema:
return 1.0
return max(0.0, min(1.0, float(done) / float(total)))
+ def _recompute_file_progress(self, file_record: ImportFileRecord) -> None:
+ file_record.progress = self._compute_ratio(file_record.done_chunks, file_record.total_chunks)
def _recompute_task_progress(self, task: ImportTaskRecord) -> None:
total = 0
done = 0
@@ -3695,7 +3730,7 @@ JSON schema:
task.done_chunks = done
task.failed_chunks = failed
task.cancelled_chunks = cancelled
- task.progress = self._compute_ratio(done + failed + cancelled, total)
+ task.progress = self._compute_ratio(done, total)
task.updated_at = _now()
async def _should_cleanup_task_temp(self, task_id: str) -> bool:
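With `_recompute_file_progress` and the changed task formula, progress now counts only completed chunks; failed and cancelled chunks no longer advance the bar. A small sketch of the before/after arithmetic (`compute_ratio` mirrors `_compute_ratio` as shown in this diff):

```python
# Same clamped ratio as _compute_ratio: empty totals count as done.
def compute_ratio(done: int, total: int) -> float:
    if total <= 0:
        return 1.0
    return max(0.0, min(1.0, float(done) / float(total)))

done, failed, cancelled, total = 3, 2, 1, 10
print(compute_ratio(done + failed + cancelled, total))  # 0.6 <- old rule
print(compute_ratio(done, total))                       # 0.3 <- new rule: only done counts
```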
@@ -3728,9 +3763,7 @@ JSON schema:
additional_cancelled += 1
if additional_cancelled > 0:
f.cancelled_chunks += additional_cancelled
- f.progress = self._compute_ratio(
- f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks
- )
+ self._recompute_file_progress(f)
f.updated_at = _now()
task.status = "cancelled"
task.current_step = "cancelled"