Merge branch 'dev' of https://github.com/Mai-with-u/MaiBot into dev
This commit is contained in:
@@ -2623,6 +2623,7 @@ class MetadataStore:
|
||||
SELECT source, COUNT(*) as count, MAX(created_at) as last_updated
|
||||
FROM paragraphs
|
||||
WHERE source IS NOT NULL AND source != ''
|
||||
AND (is_deleted IS NULL OR is_deleted = 0)
|
||||
GROUP BY source
|
||||
ORDER BY last_updated DESC
|
||||
""")
|
||||
|
||||
@@ -56,6 +56,63 @@ SUMMARY_PROMPT_TEMPLATE = """
|
||||
注意:总结应具有叙事性,能够作为长程记忆的一部分。直接使用实体的实际名称,不要使用 e1/e2 等代号。
|
||||
"""
|
||||
|
||||
|
||||
def _normalize_entity_items(raw_entities: Any) -> List[str]:
|
||||
if not isinstance(raw_entities, list):
|
||||
return []
|
||||
entities: List[str] = []
|
||||
seen = set()
|
||||
for item in raw_entities:
|
||||
if isinstance(item, str):
|
||||
name = item.strip()
|
||||
elif isinstance(item, dict):
|
||||
name = str(item.get("name") or item.get("label") or item.get("entity") or "").strip()
|
||||
else:
|
||||
name = ""
|
||||
if not name:
|
||||
continue
|
||||
key = name.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
entities.append(name)
|
||||
return entities
|
||||
|
||||
|
||||
def _normalize_relation_items(raw_relations: Any) -> List[Dict[str, str]]:
|
||||
if not isinstance(raw_relations, list):
|
||||
return []
|
||||
relations: List[Dict[str, str]] = []
|
||||
for item in raw_relations:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
subject = str(item.get("subject", "") or "").strip()
|
||||
predicate = str(item.get("predicate", "") or "").strip()
|
||||
obj = str(item.get("object", "") or "").strip()
|
||||
if not (subject and predicate and obj):
|
||||
continue
|
||||
relations.append({"subject": subject, "predicate": predicate, "object": obj})
|
||||
return relations
|
||||
|
||||
|
||||
def _message_timestamp(message: Any) -> Optional[float]:
|
||||
for attr_name in ("timestamp", "time"):
|
||||
value = getattr(message, attr_name, None)
|
||||
if value is None:
|
||||
continue
|
||||
timestamp_func = getattr(value, "timestamp", None)
|
||||
if callable(timestamp_func):
|
||||
try:
|
||||
return float(timestamp_func())
|
||||
except Exception:
|
||||
continue
|
||||
try:
|
||||
return float(value)
|
||||
except Exception:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
class SummaryImporter:
|
||||
"""总结并导入知识的工具类"""
|
||||
|
||||
@@ -312,14 +369,12 @@ class SummaryImporter:
|
||||
if not data or "summary" not in data:
|
||||
return False, "解析 LLM 响应失败或总结为空"
|
||||
|
||||
summary_text = data["summary"]
|
||||
entities = data.get("entities", [])
|
||||
relations = data.get("relations", [])
|
||||
msg_times = [
|
||||
float(getattr(getattr(msg, "timestamp", None), "timestamp", lambda: 0.0)())
|
||||
for msg in messages
|
||||
if getattr(msg, "time", None) is not None
|
||||
]
|
||||
summary_text = str(data["summary"] or "").strip()
|
||||
if not summary_text:
|
||||
return False, "解析 LLM 响应失败或总结为空"
|
||||
entities = _normalize_entity_items(data.get("entities"))
|
||||
relations = _normalize_relation_items(data.get("relations"))
|
||||
msg_times = [timestamp for msg in messages if (timestamp := _message_timestamp(msg)) is not None]
|
||||
time_meta = {}
|
||||
if msg_times:
|
||||
time_meta = {
|
||||
@@ -455,8 +510,8 @@ class SummaryImporter:
|
||||
if not isinstance(rv_cfg, dict):
|
||||
rv_cfg = {}
|
||||
write_vector = bool(rv_cfg.get("enabled", False)) and bool(rv_cfg.get("write_on_import", True))
|
||||
for rel in relations:
|
||||
s, p, o = rel.get("subject"), rel.get("predicate"), rel.get("object")
|
||||
for rel in _normalize_relation_items(relations):
|
||||
s, p, o = rel["subject"], rel["predicate"], rel["object"]
|
||||
if all([s, p, o]):
|
||||
if self.relation_write_service is not None:
|
||||
await self.relation_write_service.upsert_relation_with_vector(
|
||||
|
||||
@@ -6,6 +6,11 @@ Web Import Task Manager
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import deque
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
@@ -15,24 +20,24 @@ import sys
|
||||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
from collections import deque
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.services import llm_service as llm_api
|
||||
|
||||
from ...paths import default_data_dir, repo_root, resolve_repo_path, scripts_root
|
||||
from ..storage import (
|
||||
KnowledgeType,
|
||||
MetadataStore,
|
||||
parse_import_strategy,
|
||||
resolve_stored_knowledge_type,
|
||||
select_import_strategy,
|
||||
KnowledgeType,
|
||||
MetadataStore,
|
||||
)
|
||||
from ..storage.knowledge_types import ImportStrategy
|
||||
from ..storage.type_detection import looks_like_quote_text
|
||||
from ..strategies.base import KnowledgeType as StrategyKnowledgeType, ProcessedChunk
|
||||
from ..strategies.factual import FactualStrategy
|
||||
from ..strategies.narrative import NarrativeStrategy
|
||||
from ..strategies.quote import QuoteStrategy
|
||||
from ..utils.import_payloads import (
|
||||
ImportPayloadValidationError,
|
||||
is_probable_hash_token,
|
||||
@@ -42,11 +47,6 @@ from ..utils.import_payloads import (
|
||||
)
|
||||
from ..utils.runtime_self_check import ensure_runtime_self_check
|
||||
from ..utils.time_parser import normalize_time_meta
|
||||
from ..storage.knowledge_types import ImportStrategy
|
||||
from ..strategies.base import ProcessedChunk, KnowledgeType as StrategyKnowledgeType
|
||||
from ..strategies.narrative import NarrativeStrategy
|
||||
from ..strategies.factual import FactualStrategy
|
||||
from ..strategies.quote import QuoteStrategy
|
||||
|
||||
logger = get_logger("A_Memorix.WebImportManager")
|
||||
|
||||
@@ -141,6 +141,44 @@ def _coerce_list(value: Any) -> List[str]:
|
||||
return out
|
||||
|
||||
|
||||
def _coerce_import_data_dict(value: Any, *, context: str) -> Dict[str, Any]:
|
||||
"""确保 LLM 抽取结果是对象,避免写入阶段出现部分提交。"""
|
||||
|
||||
if value is None:
|
||||
return {}
|
||||
if isinstance(value, dict):
|
||||
return value
|
||||
raise ValueError(f"{context} 必须返回 JSON 对象,当前类型: {type(value).__name__}")
|
||||
|
||||
|
||||
def _normalize_import_relation_list(value: Any) -> List[Dict[str, str]]:
|
||||
if not isinstance(value, list):
|
||||
return []
|
||||
relations: List[Dict[str, str]] = []
|
||||
for item in value:
|
||||
relation = normalize_relation_import_item(item)
|
||||
if relation is not None:
|
||||
relations.append(relation)
|
||||
return relations
|
||||
|
||||
|
||||
def _normalize_import_entity_list(value: Any) -> List[str]:
|
||||
if not isinstance(value, list):
|
||||
return []
|
||||
entities: List[str] = []
|
||||
seen = set()
|
||||
for item in value:
|
||||
name = normalize_entity_import_item(item)
|
||||
if not name:
|
||||
continue
|
||||
key = name.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
entities.append(name)
|
||||
return entities
|
||||
|
||||
|
||||
def _parse_optional_positive_int(value: Any, field_name: str) -> Optional[int]:
|
||||
if value is None:
|
||||
return None
|
||||
@@ -2002,7 +2040,7 @@ class ImportTaskManager:
|
||||
if total <= 0:
|
||||
total = max(1, scanned)
|
||||
|
||||
progress = max(0.0, min(1.0, float(scanned) / float(total))) if total > 0 else 0.0
|
||||
chunk_progress = max(0.0, min(1.0, float(scanned) / float(total))) if total > 0 else 0.0
|
||||
preview = f"scanned={scanned}/{total}, migrated={migrated}, bad={bad}, last_id={last_id}"
|
||||
|
||||
async with self._lock:
|
||||
@@ -2017,14 +2055,14 @@ class ImportTaskManager:
|
||||
if c.status not in {"completed", "failed", "cancelled"}:
|
||||
c.status = "writing"
|
||||
c.step = "migrating"
|
||||
c.progress = progress
|
||||
c.progress = chunk_progress
|
||||
c.content_preview = preview
|
||||
c.updated_at = _now()
|
||||
f.total_chunks = total
|
||||
f.done_chunks = done
|
||||
f.failed_chunks = bad
|
||||
f.cancelled_chunks = 0
|
||||
f.progress = progress
|
||||
self._recompute_file_progress(f)
|
||||
if f.status not in {"failed", "cancelled"}:
|
||||
f.status = "writing"
|
||||
f.current_step = "migrating"
|
||||
@@ -2171,7 +2209,7 @@ class ImportTaskManager:
|
||||
f.done_chunks = max(0, min(f.done_chunks, f.total_chunks))
|
||||
f.failed_chunks = max(0, min(f.failed_chunks, f.total_chunks))
|
||||
f.cancelled_chunks = 0
|
||||
f.progress = 1.0
|
||||
self._recompute_file_progress(f)
|
||||
f.status = "completed"
|
||||
f.current_step = "completed"
|
||||
if bad_rows > 0 and not f.error:
|
||||
@@ -3128,6 +3166,7 @@ class ImportTaskManager:
|
||||
if is_probable_hash_token(content):
|
||||
logger.warning(f"跳过疑似哈希段落写入: source={self._source_label(file_record)} preview={content[:32]}")
|
||||
return
|
||||
data = _coerce_import_data_dict(processed.data, context="分块抽取结果")
|
||||
para_hash = self.plugin.metadata_store.add_paragraph(
|
||||
content=content,
|
||||
source=self._source_label(file_record),
|
||||
@@ -3145,31 +3184,25 @@ class ImportTaskManager:
|
||||
f"web_import text paragraph 向量写入降级: hash={para_hash[:8]} detail={vector_result.get('detail')}"
|
||||
)
|
||||
|
||||
data = processed.data or {}
|
||||
entities: List[str] = []
|
||||
relations: List[Tuple[str, str, str]] = []
|
||||
|
||||
for triple in data.get("triples", []):
|
||||
s = str(triple.get("subject", "")).strip()
|
||||
p = str(triple.get("predicate", "")).strip()
|
||||
o = str(triple.get("object", "")).strip()
|
||||
if s and p and o:
|
||||
relations.append((s, p, o))
|
||||
entities.extend([s, o])
|
||||
for triple in _normalize_import_relation_list(data.get("triples")):
|
||||
s = triple["subject"]
|
||||
p = triple["predicate"]
|
||||
o = triple["object"]
|
||||
relations.append((s, p, o))
|
||||
entities.extend([s, o])
|
||||
|
||||
for rel in data.get("relations", []):
|
||||
s = str(rel.get("subject", "")).strip()
|
||||
p = str(rel.get("predicate", "")).strip()
|
||||
o = str(rel.get("object", "")).strip()
|
||||
if s and p and o:
|
||||
relations.append((s, p, o))
|
||||
entities.extend([s, o])
|
||||
for rel in _normalize_import_relation_list(data.get("relations")):
|
||||
s = rel["subject"]
|
||||
p = rel["predicate"]
|
||||
o = rel["object"]
|
||||
relations.append((s, p, o))
|
||||
entities.extend([s, o])
|
||||
|
||||
for k in ("entities", "events", "verbatim_entities"):
|
||||
for e in data.get(k, []):
|
||||
name = str(e or "").strip()
|
||||
if name and not is_probable_hash_token(name):
|
||||
entities.append(name)
|
||||
entities.extend(_normalize_import_entity_list(data.get(k)))
|
||||
|
||||
uniq_entities = list({x.strip().lower(): x.strip() for x in entities if str(x).strip()}.values())
|
||||
for name in uniq_entities:
|
||||
@@ -3294,12 +3327,12 @@ class ImportTaskManager:
|
||||
txt = txt[4:].strip()
|
||||
|
||||
try:
|
||||
return json.loads(txt)
|
||||
return _coerce_import_data_dict(json.loads(txt), context="LLM 抽取结果")
|
||||
except Exception:
|
||||
s = txt.find("{")
|
||||
e = txt.rfind("}")
|
||||
if s >= 0 and e > s:
|
||||
return json.loads(txt[s : e + 1])
|
||||
return _coerce_import_data_dict(json.loads(txt[s : e + 1]), context="LLM 抽取结果")
|
||||
raise
|
||||
except Exception as err:
|
||||
last_error = err
|
||||
@@ -3370,6 +3403,7 @@ JSON schema:
|
||||
logger.warning(f"chat_log 时间语义抽取失败: {e}")
|
||||
return None
|
||||
|
||||
result = _coerce_import_data_dict(result, context="chat_log 时间抽取结果")
|
||||
raw_time_meta = {
|
||||
"event_time": result.get("event_time"),
|
||||
"event_time_start": result.get("event_time_start"),
|
||||
@@ -3541,9 +3575,7 @@ JSON schema:
|
||||
additional_cancelled += 1
|
||||
if additional_cancelled > 0:
|
||||
f.cancelled_chunks += additional_cancelled
|
||||
f.progress = self._compute_ratio(
|
||||
f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks
|
||||
)
|
||||
self._recompute_file_progress(f)
|
||||
f.updated_at = _now()
|
||||
task.updated_at = _now()
|
||||
self._recompute_task_progress(task)
|
||||
@@ -3601,7 +3633,7 @@ JSON schema:
|
||||
c.progress = 1.0
|
||||
c.updated_at = _now()
|
||||
f.done_chunks += 1
|
||||
f.progress = self._compute_ratio(f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks)
|
||||
self._recompute_file_progress(f)
|
||||
f.updated_at = _now()
|
||||
self._recompute_task_progress(task)
|
||||
|
||||
@@ -3629,7 +3661,7 @@ JSON schema:
|
||||
c.progress = 1.0
|
||||
c.updated_at = _now()
|
||||
f.failed_chunks += 1
|
||||
f.progress = self._compute_ratio(f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks)
|
||||
self._recompute_file_progress(f)
|
||||
if not f.error:
|
||||
f.error = str(error)
|
||||
f.updated_at = _now()
|
||||
@@ -3653,7 +3685,7 @@ JSON schema:
|
||||
c.progress = 1.0
|
||||
c.updated_at = _now()
|
||||
f.cancelled_chunks += 1
|
||||
f.progress = self._compute_ratio(f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks)
|
||||
self._recompute_file_progress(f)
|
||||
f.updated_at = _now()
|
||||
self._recompute_task_progress(task)
|
||||
|
||||
@@ -3681,6 +3713,9 @@ JSON schema:
|
||||
return 1.0
|
||||
return max(0.0, min(1.0, float(done) / float(total)))
|
||||
|
||||
def _recompute_file_progress(self, file_record: ImportFileRecord) -> None:
|
||||
file_record.progress = self._compute_ratio(file_record.done_chunks, file_record.total_chunks)
|
||||
|
||||
def _recompute_task_progress(self, task: ImportTaskRecord) -> None:
|
||||
total = 0
|
||||
done = 0
|
||||
@@ -3695,7 +3730,7 @@ JSON schema:
|
||||
task.done_chunks = done
|
||||
task.failed_chunks = failed
|
||||
task.cancelled_chunks = cancelled
|
||||
task.progress = self._compute_ratio(done + failed + cancelled, total)
|
||||
task.progress = self._compute_ratio(done, total)
|
||||
task.updated_at = _now()
|
||||
|
||||
async def _should_cleanup_task_temp(self, task_id: str) -> bool:
|
||||
@@ -3728,9 +3763,7 @@ JSON schema:
|
||||
additional_cancelled += 1
|
||||
if additional_cancelled > 0:
|
||||
f.cancelled_chunks += additional_cancelled
|
||||
f.progress = self._compute_ratio(
|
||||
f.done_chunks + f.failed_chunks + f.cancelled_chunks, f.total_chunks
|
||||
)
|
||||
self._recompute_file_progress(f)
|
||||
f.updated_at = _now()
|
||||
task.status = "cancelled"
|
||||
task.current_step = "cancelled"
|
||||
|
||||
Reference in New Issue
Block a user