feat：新增 A_Memorix 记忆插件

引入 A_Memorix 插件（v2.0.0）——一个轻量级的长期记忆提供器。新增插件清单（manifest）和入口（AMemorixPlugin），并提供完整的核心能力：嵌入（基于哈希的 EmbeddingAPIAdapter、EmbeddingManager、预设）、检索（双路径检索器、PageRank、图关系召回、BM25 稀疏索引、阈值与融合配置）、存储与元数据层，以及大量实用工具和迁移/转换脚本。同时更新 .gitignore 以允许 /plugins/A_memorix。该变更为在宿主应用中实现统一的记忆摄取、检索、分析与维护奠定了基础。
2026-03-18 21:33:15 +08:00
parent a5a6d2cb26
commit 999e7246e2
48 changed files with 17070 additions and 0 deletions
--- a/plugins/A_memorix/core/strategies/__init__.py
+++ b/plugins/A_memorix/core/strategies/__init__.py
--- a/plugins/A_memorix/core/strategies/base.py
+++ b/plugins/A_memorix/core/strategies/base.py
@@ -0,0 +1,89 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Optional, Union
+from dataclasses import dataclass, field
+from enum import Enum
+import hashlib
+
+class KnowledgeType(str, Enum):
+    """Kind of knowledge a chunk carries.
+
+    Members subclass ``str``; ``ProcessedChunk.to_dict`` emits their ``.value``.
+    """
+    NARRATIVE = "narrative"
+    FACTUAL = "factual"
+    QUOTE = "quote"
+    MIXED = "mixed"
+
+@dataclass
+class SourceInfo:
+    """Provenance of a chunk: originating file, offsets and checksum."""
+    file: str  # name of the file the chunk was cut from
+    offset_start: int  # start offset of the chunk, as reported by the producing strategy
+    offset_end: int  # end offset of the chunk, as reported by the producing strategy
+    checksum: str = ""  # digest of the chunk text; empty string when none was supplied
+
+@dataclass
+class ChunkContext:
+    """Identity, position and text of one chunk, plus free-form context."""
+    chunk_id: str  # identifier assigned by the strategy that produced the chunk
+    index: int  # ordinal assigned by the strategy that produced the chunk
+    context: Dict[str, Any] = field(default_factory=dict)  # free-form extras; a fresh dict per instance
+    text: str = ""  # the chunk text itself
+
+@dataclass
+class ChunkFlags:
+    """Per-chunk processing hints."""
+    verbatim: bool = False  # presumably marks text that must be kept word for word; confirm with consumers
+    requires_llm: bool = True  # whether extraction is expected to call an LLM; defaults to True
+
+@dataclass
+class ProcessedChunk:
+    """A chunk of source text bundled with its provenance, payload and flags."""
+
+    type: KnowledgeType
+    source: SourceInfo
+    chunk: ChunkContext
+    # Extraction payload, e.g. triples, events, verbatim_entities.
+    data: Dict[str, Any] = field(default_factory=dict)
+    flags: ChunkFlags = field(default_factory=ChunkFlags)
+
+    def to_dict(self) -> Dict:
+        """Return the chunk as a nested dictionary.
+
+        ``chunk.context`` and ``data`` are passed through by reference, not
+        copied, so later mutations of them are visible through the result.
+        """
+        origin = self.source
+        body = self.chunk
+        hints = self.flags
+
+        source_part = {}
+        source_part["file"] = origin.file
+        source_part["offset_start"] = origin.offset_start
+        source_part["offset_end"] = origin.offset_end
+        source_part["checksum"] = origin.checksum
+
+        chunk_part = {}
+        chunk_part["text"] = body.text
+        chunk_part["chunk_id"] = body.chunk_id
+        chunk_part["index"] = body.index
+        chunk_part["context"] = body.context
+
+        flags_part = {}
+        flags_part["verbatim"] = hints.verbatim
+        flags_part["requires_llm"] = hints.requires_llm
+
+        payload = {}
+        payload["type"] = self.type.value
+        payload["source"] = source_part
+        payload["chunk"] = chunk_part
+        payload["data"] = self.data
+        payload["flags"] = flags_part
+        return payload
+
+class BaseStrategy(ABC):
+    """Abstract base for chunking/extraction strategies bound to one file name."""
+
+    def __init__(self, filename: str):
+        # The file name is stamped onto every chunk the strategy produces.
+        self.filename = filename
+
+    @abstractmethod
+    def split(self, text: str) -> List[ProcessedChunk]:
+        """Split ``text`` into chunks according to the strategy."""
+
+    @abstractmethod
+    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
+        """Extract structured information from a text chunk."""
+
+    def calculate_checksum(self, text: str) -> str:
+        """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
+        hasher = hashlib.sha256()
+        hasher.update(text.encode("utf-8"))
+        return hasher.hexdigest()
+
+    def build_language_guard(self, text: str) -> str:
+        """Build the shared output-language constraint for extraction prompts.
+
+        The constraint is language-agnostic: it only asks that extracted
+        values stay in the language of the source text and are not translated.
+        ``text`` is currently unused and reserved for future extension.
+        """
+        del text  # reserved parameter, intentionally ignored for now
+        guard = (
+            "Focus on the original source language. Keep extracted events, entities, predicates "
+            "and objects in the same language as the source text, preserve names/terms as-is, "
+            "and do not translate."
+        )
+        return guard
--- a/plugins/A_memorix/core/strategies/factual.py
+++ b/plugins/A_memorix/core/strategies/factual.py
@@ -0,0 +1,98 @@
+import re
+from typing import List, Dict, Any
+from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext
+
+class FactualStrategy(BaseStrategy):
+    """Structure-aware splitter and triple/entity extractor for factual text."""
+
+    # Soft chunk size in characters. A chunk is closed once it reaches this
+    # size on a non-structural line, and force-closed at twice this size.
+    TARGET_SIZE = 600
+
+    def split(self, text: str) -> List[ProcessedChunk]:
+        """Cut ``text`` into line-aligned chunks of roughly ``TARGET_SIZE`` characters.
+
+        Lines are accumulated until the buffer reaches ``TARGET_SIZE``. The
+        cut is postponed while the current line is structural (list item,
+        definition, table row) so such runs stay together, but a buffer of
+        ``2 * TARGET_SIZE`` characters is always cut.
+
+        Each chunk records its real position, so
+        ``text[offset_start:offset_end]`` equals the chunk text.
+
+        Args:
+            text: The full source text.
+
+        Returns:
+            The chunks in document order, indexed from 0.
+        """
+        target_size = self.TARGET_SIZE
+        chunks: List[ProcessedChunk] = []
+        pending: List[str] = []
+        pending_len = 0  # buffered characters, counting one newline per line
+        chunk_start = 0  # offset in ``text`` of the first buffered line
+
+        for line in text.split('\n'):
+            pending.append(line)
+            pending_len += len(line) + 1
+
+            soft_cut = pending_len >= target_size and not self._is_structural_line(line)
+            hard_cut = pending_len >= target_size * 2
+            if soft_cut or hard_cut:
+                chunks.append(self._create_chunk(pending, len(chunks), chunk_start))
+                # pending_len includes the newline that separates this chunk
+                # from the next, so it is exactly the distance to advance.
+                chunk_start += pending_len
+                pending = []
+                pending_len = 0
+
+        if pending:
+            chunks.append(self._create_chunk(pending, len(chunks), chunk_start))
+
+        return chunks
+
+    def _is_structural_line(self, line: str) -> bool:
+        """Return True if ``line`` looks like a list item, definition or table row."""
+        line = line.strip()
+        if not line:
+            return False
+        # List item: "- x", "* x" or "1. x".
+        if re.match(r'^[\-\*]\s+', line) or re.match(r'^\d+\.\s+', line):
+            return True
+        # Definition ("term: definition"), with an ASCII or full-width colon.
+        if re.match(r'^[^：:]+[：:].+', line):
+            return True
+        # Table row, assuming markdown syntax.
+        if line.startswith('|') and line.endswith('|'):
+            return True
+        return False
+
+    def _create_chunk(self, lines: List[str], index: int, offset: int = 0) -> ProcessedChunk:
+        """Build a FACTUAL chunk from ``lines`` joined by newlines.
+
+        Args:
+            lines: The lines that make up the chunk.
+            index: Position of the chunk within the file; also used in ``chunk_id``.
+            offset: Offset of the chunk's first character in the source text.
+                Defaults to 0 for callers that do not track offsets.
+        """
+        text = "\n".join(lines)
+        return ProcessedChunk(
+            type=KnowledgeType.FACTUAL,
+            source=SourceInfo(
+                file=self.filename,
+                offset_start=offset,
+                offset_end=offset + len(text),
+                checksum=self.calculate_checksum(text)
+            ),
+            chunk=ChunkContext(
+                chunk_id=f"{self.filename}_{index}",
+                index=index,
+                text=text
+            )
+        )
+
+    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
+        """Ask the LLM for factual triples and entities and store them on ``chunk``.
+
+        Args:
+            chunk: The chunk to process; its ``data`` is overwritten.
+            llm_func: Awaitable callable that takes the prompt and returns the result.
+
+        Returns:
+            The same ``chunk`` object, with ``data`` set to the LLM result.
+
+        Raises:
+            ValueError: If ``llm_func`` is missing.
+        """
+        if not llm_func:
+            raise ValueError("LLM function required for Factual extraction")
+
+        language_guard = self.build_language_guard(chunk.chunk.text)
+        prompt = f"""You are a factual knowledge extraction engine.
+Extract factual triples and entities from the text.
+Preserve lists and definitions accurately.
+
+Language constraints:
+- {language_guard}
+- Preserve original names and domain terms exactly when possible.
+- JSON keys must stay exactly as: triples, entities, subject, predicate, object.
+
+Text:
+{chunk.chunk.text}
+
+Return ONLY valid JSON:
+{{
+  "triples": [
+    {{"subject": "Entity", "predicate": "Relationship", "object": "Entity"}}
+  ],
+  "entities": ["Entity1", "Entity2"]
+}}
+"""
+        result = await llm_func(prompt)
+
+        # The result is stored as returned. Per the original author's note, a
+        # later shared normalization step processes it, and the vector_store
+        # side expects relations as subject/predicate/object mappings.
+        chunk.data = result
+        return chunk
--- a/plugins/A_memorix/core/strategies/narrative.py
+++ b/plugins/A_memorix/core/strategies/narrative.py
@@ -0,0 +1,126 @@
+import re
+from typing import List, Dict, Any
+from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext
+
+class NarrativeStrategy(BaseStrategy):
+    """Scene-based splitter and event/relation extractor for narrative text."""
+
+    def split(self, text: str) -> List[ProcessedChunk]:
+        """Split ``text`` into scenes, then into overlapping windows per scene.
+
+        Chunk offsets are relative to ``text`` (not to the scene), so
+        ``text[offset_start:offset_end]`` equals the chunk text.
+
+        Args:
+            text: The full source text.
+
+        Returns:
+            All chunks, in scene order and then window order.
+        """
+        chunks: List[ProcessedChunk] = []
+        for scene_idx, (scene_text, scene_title, scene_offset) in enumerate(self._locate_scenes(text)):
+            chunks.extend(
+                self._sliding_window(scene_text, scene_title, scene_idx, base_offset=scene_offset)
+            )
+        return chunks
+
+    def _split_into_scenes(self, text: str) -> List[tuple[str, str]]:
+        """Split text into ``(scene_text, scene_title)`` pairs by headings or separators."""
+        return [(body, title) for body, title, _ in self._locate_scenes(text)]
+
+    def _locate_scenes(self, text: str) -> List[tuple[str, str, int]]:
+        """Split text into ``(scene_text, scene_title, start_offset)`` triples.
+
+        A scene starts at a markdown heading, a ``Chapter N`` line, or a
+        ``***`` / ``===`` separator line. Text before the first marker gets
+        the title "Start". If nothing is found, the whole text is returned as
+        one scene titled "Whole Text".
+        """
+        # Simple heuristic: lines starting with #, "Chapter <n>", or made of
+        # *** / === act as scene delimiters.
+        scene_pattern_str = r'^(?:#{1,6}\s+.*|Chapter\s+\d+|^\*{3,}$|^={3,}$)'
+
+        # The capture group keeps the delimiters, so the parts concatenate
+        # back to ``text`` and a running cursor yields exact offsets.
+        parts = re.split(f"({scene_pattern_str})", text, flags=re.MULTILINE)
+
+        cursor = 0
+        if parts and parts[0].strip() == "":
+            # Skip blank leading text but keep counting its length.
+            cursor = len(parts[0])
+            parts = parts[1:]
+
+        scenes = []
+        current_scene_title = "Start"
+        current_scene_content = []
+        current_scene_start = 0
+
+        for part in parts:
+            if re.match(scene_pattern_str, part, re.MULTILINE):
+                # A new marker closes the scene collected so far.
+                if current_scene_content:
+                    scenes.append(
+                        ("".join(current_scene_content), current_scene_title, current_scene_start)
+                    )
+                    current_scene_content = []
+                current_scene_title = part.strip()
+            else:
+                if not current_scene_content:
+                    current_scene_start = cursor
+                current_scene_content.append(part)
+            cursor += len(part)
+
+        if current_scene_content:
+            scenes.append(
+                ("".join(current_scene_content), current_scene_title, current_scene_start)
+            )
+
+        # No scene recognised: treat the whole text as a single scene.
+        if not scenes:
+            scenes = [(text, "Whole Text", 0)]
+
+        return scenes
+
+    def _sliding_window(self, text: str, scene_id: str, scene_idx: int, window_size=800, overlap=200, base_offset=0) -> List[ProcessedChunk]:
+        """Cut one scene into overlapping windows.
+
+        Args:
+            text: The scene text.
+            scene_id: Scene title stored in each chunk's context.
+            scene_idx: Scene ordinal, used in ``chunk_id``.
+            window_size: Maximum chunk length in characters.
+            overlap: Characters shared between consecutive chunks.
+            base_offset: Offset of ``text`` within the original source; added
+                to every chunk offset.
+
+        Returns:
+            The chunks of this scene, indexed from 0.
+
+        Raises:
+            ValueError: If the scene needs windowing and ``window_size`` is
+                not positive or not larger than ``overlap`` (the loop could
+                never advance).
+        """
+        if len(text) <= window_size:
+            return [self._create_chunk(text, scene_id, scene_idx, 0, base_offset)]
+
+        if window_size <= 0 or overlap >= window_size:
+            raise ValueError("window_size must be positive and larger than overlap")
+
+        chunks = []
+        total = len(text)
+        start = 0
+        local_idx = 0
+        while start < total:
+            end = min(start + window_size, total)
+            chunk_text = text[start:end]
+
+            # Prefer ending on the last newline so sentences are not cut
+            # mid-way. Only before the end of the text, and only when the
+            # newline lies in the second half of the window.
+            if end < total:
+                last_newline = chunk_text.rfind('\n')
+                if last_newline > window_size // 2:
+                    end = start + last_newline + 1
+                    chunk_text = text[start:end]
+
+            chunks.append(
+                self._create_chunk(chunk_text, scene_id, scene_idx, local_idx, base_offset + start)
+            )
+
+            if end < total:
+                # A newline-shortened chunk can be no longer than the overlap;
+                # always advance at least one character so the loop terminates.
+                start += max(1, len(chunk_text) - overlap)
+            else:
+                start += len(chunk_text)
+            local_idx += 1
+
+        return chunks
+
+    def _create_chunk(self, text: str, scene_id: str, scene_idx: int, local_idx: int, offset: int) -> ProcessedChunk:
+        """Build a NARRATIVE chunk starting at ``offset`` and spanning ``len(text)``."""
+        return ProcessedChunk(
+            type=KnowledgeType.NARRATIVE,
+            source=SourceInfo(
+                file=self.filename,
+                offset_start=offset,
+                offset_end=offset + len(text),
+                checksum=self.calculate_checksum(text)
+            ),
+            chunk=ChunkContext(
+                chunk_id=f"{self.filename}_{scene_idx}_{local_idx}",
+                index=local_idx,
+                text=text,
+                context={"scene_id": scene_id}
+            )
+        )
+
+    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
+        """Ask the LLM for events and character relations and store them on ``chunk``.
+
+        Args:
+            chunk: The chunk to process; its ``data`` is overwritten.
+            llm_func: Awaitable callable that takes the prompt and returns the result.
+
+        Returns:
+            The same ``chunk`` object, with ``data`` set to the LLM result.
+
+        Raises:
+            ValueError: If ``llm_func`` is missing.
+        """
+        if not llm_func:
+            raise ValueError("LLM function required for Narrative extraction")
+
+        language_guard = self.build_language_guard(chunk.chunk.text)
+        prompt = f"""You are a narrative knowledge extraction engine.
+Extract key events and character relations from the scene text.
+
+Language constraints:
+- {language_guard}
+- Preserve original names and terms exactly when possible.
+- JSON keys must stay exactly as: events, relations, subject, predicate, object.
+
+Scene:
+{chunk.chunk.context.get('scene_id')}
+
+Text:
+{chunk.chunk.text}
+
+Return ONLY valid JSON:
+{{
+  "events": ["event description 1", "event description 2"],
+  "relations": [
+    {{"subject": "CharacterA", "predicate": "relation", "object": "CharacterB"}}
+  ]
+}}
+"""
+        result = await llm_func(prompt)
+        chunk.data = result
+        return chunk
--- a/plugins/A_memorix/core/strategies/quote.py
+++ b/plugins/A_memorix/core/strategies/quote.py
@@ -0,0 +1,52 @@
+from typing import List, Dict, Any
+from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext, ChunkFlags
+
+class QuoteStrategy(BaseStrategy):
+    """Stanza-based splitter that keeps each block of text verbatim."""
+
+    def split(self, text: str) -> List[ProcessedChunk]:
+        """Split ``text`` on blank lines and emit one verbatim chunk per stanza.
+
+        Whitespace-only stanzas are skipped but still advance the running
+        offset and consume an index, so ids and offsets follow the raw split.
+        """
+        produced = []
+        cursor = 0
+
+        for position, block in enumerate(text.split("\n\n")):
+            begin = cursor
+            # Move past this stanza and the two-newline separator after it.
+            cursor = begin + len(block) + 2
+
+            if block.strip():
+                origin = SourceInfo(
+                    file=self.filename,
+                    offset_start=begin,
+                    offset_end=begin + len(block),
+                    checksum=self.calculate_checksum(block)
+                )
+                body = ChunkContext(
+                    chunk_id=f"{self.filename}_{position}",
+                    index=position,
+                    text=block
+                )
+                # No LLM by default; the flag can be overridden afterwards.
+                hints = ChunkFlags(verbatim=True, requires_llm=False)
+                produced.append(
+                    ProcessedChunk(
+                        type=KnowledgeType.QUOTE,
+                        source=origin,
+                        chunk=body,
+                        flags=hints
+                    )
+                )
+
+        return produced
+
+    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
+        """Store the chunk text as the single verbatim entity and return ``chunk``.
+
+        The core logic is a pass-through. The LLM branch is a placeholder for
+        optional metadata extraction and currently does nothing.
+        """
+        verbatim_payload = {}
+        verbatim_payload["verbatim_entities"] = [chunk.chunk.text]
+        chunk.data = verbatim_payload
+
+        wants_metadata = llm_func and chunk.flags.requires_llm
+        if wants_metadata:
+            # Optional: extract headers/metadata (not implemented yet).
+            pass
+
+        return chunk