feat:新增 A_Memorix 记忆插件

引入 A_Memorix 插件(v2.0.0)——一个轻量级的长期记忆提供器。新增插件清单(manifest)和入口(AMemorixPlugin),并提供完整的核心能力:嵌入(基于哈希的 EmbeddingAPIAdapter、EmbeddingManager、预设)、检索(双路径检索器、PageRank、图关系召回、BM25 稀疏索引、阈值与融合配置)、存储与元数据层,以及大量实用工具和迁移/转换脚本。同时更新 .gitignore 以允许 /plugins/A_memorix。该变更为在宿主应用中实现统一的记忆摄取、检索、分析与维护奠定了基础。
This commit is contained in:
DawnARC
2026-03-18 21:33:15 +08:00
parent a5a6d2cb26
commit 999e7246e2
48 changed files with 17070 additions and 0 deletions

View File

@@ -0,0 +1,89 @@
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, field
from enum import Enum
import hashlib
class KnowledgeType(str, Enum):
    """Categories a processed chunk can be tagged with.

    The enum mixes in ``str``, so every member is also a string that compares
    equal to its raw value (e.g. ``KnowledgeType.QUOTE == "quote"``).
    """

    # Tag applied by the narrative strategy.
    NARRATIVE = "narrative"
    # Tag applied by the factual strategy.
    FACTUAL = "factual"
    # Tag applied by the quote strategy.
    QUOTE = "quote"
    # Presumably for content combining several of the categories above.
    MIXED = "mixed"
@dataclass
class SourceInfo:
    """Provenance of a chunk: the originating file and the span it covers."""

    # Name of the file the chunk was cut from.
    file: str
    # Position where the chunk begins, as recorded by the splitting strategy.
    offset_start: int
    # Position where the chunk ends, as recorded by the splitting strategy.
    offset_end: int
    # Digest of the chunk text; stays empty when nothing was computed.
    checksum: str = ""
@dataclass
class ChunkContext:
    """Identity, ordering, auxiliary context and raw text of one chunk."""

    # Identifier assigned by the strategy that produced the chunk.
    chunk_id: str
    # Ordinal of the chunk as assigned by the strategy.
    index: int
    # Free-form extra context; every instance gets its own fresh dict.
    context: Dict[str, Any] = field(default_factory=dict)
    # The chunk's text content.
    text: str = ""
@dataclass
class ChunkFlags:
    """Boolean switches attached to a processed chunk."""

    # Presumably marks text that must be kept word-for-word; off by default.
    verbatim: bool = False
    # Whether extraction for this chunk is expected to use an LLM; on by default.
    requires_llm: bool = True
@dataclass
class ProcessedChunk:
    """A chunk of text together with its provenance, payload and flags."""

    # Knowledge category of the chunk.
    type: KnowledgeType
    # Where the chunk came from.
    source: SourceInfo
    # Identity and text of the chunk.
    chunk: ChunkContext
    # Extraction payload, e.g. triples, events, verbatim_entities.
    data: Dict[str, Any] = field(default_factory=dict)
    # Processing switches; a fresh ChunkFlags() per instance by default.
    flags: ChunkFlags = field(default_factory=ChunkFlags)

    def to_dict(self) -> Dict:
        """Return a nested plain-dict view of this chunk.

        The enum is flattened to its string value. ``data`` and the chunk's
        ``context`` are passed through by reference, not copied.
        """
        origin = self.source
        body = self.chunk
        switches = self.flags

        source_view = {
            "file": origin.file,
            "offset_start": origin.offset_start,
            "offset_end": origin.offset_end,
            "checksum": origin.checksum,
        }
        chunk_view = {
            "text": body.text,
            "chunk_id": body.chunk_id,
            "index": body.index,
            "context": body.context,
        }
        flags_view = {
            "verbatim": switches.verbatim,
            "requires_llm": switches.requires_llm,
        }
        return {
            "type": self.type.value,
            "source": source_view,
            "chunk": chunk_view,
            "data": self.data,
            "flags": flags_view,
        }
class BaseStrategy(ABC):
    """Abstract base for chunking/extraction strategies bound to one file name.

    Subclasses must implement :meth:`split` and :meth:`extract`; the checksum
    and language-guard helpers are shared.
    """

    def __init__(self, filename: str):
        """Remember the file name that produced chunks are attributed to."""
        self.filename = filename

    @abstractmethod
    def split(self, text: str) -> List[ProcessedChunk]:
        """Split the text into chunks according to the strategy."""

    @abstractmethod
    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Extract structured information from a text chunk."""

    def calculate_checksum(self, text: str) -> str:
        """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
        hasher = hashlib.sha256()
        hasher.update(text.encode("utf-8"))
        return hasher.hexdigest()

    def build_language_guard(self, text: str) -> str:
        """Build the unified output-language constraint for prompts.

        The guard is language-agnostic: it only demands that extracted values
        stay in the language of the source text and are not translated.
        ``text`` is currently unused; it is a reserved parameter kept for
        future extension.
        """
        sentences = (
            "Focus on the original source language.",
            "Keep extracted events, entities, predicates "
            "and objects in the same language as the source text, "
            "preserve names/terms as-is, and do not translate.",
        )
        return " ".join(sentences)

View File

@@ -0,0 +1,98 @@
import re
from typing import List, Dict, Any
from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext
class FactualStrategy(BaseStrategy):
    """Structure-aware chunking and triple extraction for factual text.

    Lines are accumulated until a soft size target is reached. The cut is
    postponed while the current line looks like part of a list, definition or
    table, and forced once the chunk has grown to twice the target.
    """

    # Soft chunk size in characters; twice this value is the hard limit.
    _TARGET_SIZE = 600

    # Compiled once instead of being re-parsed for every line.
    _BULLET_RE = re.compile(r'^[\-\*]\s+')
    _NUMBERED_RE = re.compile(r'^\d+\.\s+')
    _DEFINITION_RE = re.compile(r'^[^:]+[:].+')

    def split(self, text: str) -> List[ProcessedChunk]:
        """Split ``text`` into line-aligned chunks.

        Args:
            text: The full document text.

        Returns:
            Chunks in document order. Each chunk's ``index`` equals its
            position in the returned list, and its ``offset_start`` /
            ``offset_end`` are character offsets into ``text``.
            Whitespace-only chunks (e.g. the empty remainder after a trailing
            newline) are dropped, so empty input yields an empty list.
        """
        soft_limit = self._TARGET_SIZE
        hard_limit = soft_limit * 2

        chunks: List[ProcessedChunk] = []
        pending: List[str] = []
        pending_len = 0
        pending_start = 0  # offset in ``text`` of the first pending line
        cursor = 0  # offset in ``text`` of the line being read

        for line in text.split('\n'):
            if not pending:
                pending_start = cursor
            pending.append(line)
            # +1 accounts for the newline that separated this line from the next.
            pending_len += len(line) + 1
            cursor += len(line) + 1

            # Cut at the soft limit unless inside a compact structure;
            # always cut once the hard limit is reached.
            should_cut = pending_len >= hard_limit or (
                pending_len >= soft_limit and not self._is_structural_line(line)
            )
            if should_cut:
                self._flush(pending, pending_start, chunks)
                pending = []
                pending_len = 0

        if pending:
            self._flush(pending, pending_start, chunks)
        return chunks

    def _flush(self, lines: List[str], offset_start: int, chunks: List[ProcessedChunk]) -> None:
        """Append a chunk built from ``lines`` to ``chunks`` unless it is blank."""
        if any(line.strip() for line in lines):
            chunks.append(self._create_chunk(lines, len(chunks), offset_start))

    def _is_structural_line(self, line: str) -> bool:
        """Return True when ``line`` looks like a list item, definition or table row."""
        stripped = line.strip()
        if not stripped:
            return False
        # List items: "- x", "* x" or "1. x".
        if self._BULLET_RE.match(stripped) or self._NUMBERED_RE.match(stripped):
            return True
        # Definition items ("term: definition").
        if self._DEFINITION_RE.match(stripped):
            return True
        # Table rows (assuming markdown syntax).
        return stripped.startswith('|') and stripped.endswith('|')

    def _create_chunk(self, lines: List[str], index: int, offset_start: int = 0) -> ProcessedChunk:
        """Build a FACTUAL chunk from consecutive source lines.

        Args:
            lines: Consecutive lines of the source text (without newlines).
            index: Ordinal of the chunk; also used in the chunk id.
            offset_start: Character offset of the first line in the source
                text. Defaults to 0 for callers that do not track offsets.

        Returns:
            The chunk, whose text is the lines re-joined with newlines and
            whose ``offset_end`` is ``offset_start + len(text)``.
        """
        text = "\n".join(lines)
        return ProcessedChunk(
            type=KnowledgeType.FACTUAL,
            source=SourceInfo(
                file=self.filename,
                offset_start=offset_start,
                offset_end=offset_start + len(text),
                checksum=self.calculate_checksum(text)
            ),
            chunk=ChunkContext(
                chunk_id=f"{self.filename}_{index}",
                index=index,
                text=text
            )
        )

    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Ask the LLM for factual triples and entities and store the reply.

        Args:
            chunk: The chunk to process; its ``data`` is overwritten in place.
            llm_func: Awaitable callable that takes the prompt string.

        Returns:
            The same ``chunk`` object, with ``data`` set to whatever
            ``llm_func`` returned.

        Raises:
            ValueError: If ``llm_func`` is missing or falsy.
        """
        if not llm_func:
            raise ValueError("LLM function required for Factual extraction")
        language_guard = self.build_language_guard(chunk.chunk.text)
        prompt = f"""You are a factual knowledge extraction engine.
Extract factual triples and entities from the text.
Preserve lists and definitions accurately.
Language constraints:
- {language_guard}
- Preserve original names and domain terms exactly when possible.
- JSON keys must stay exactly as: triples, entities, subject, predicate, object.
Text:
{chunk.chunk.text}
Return ONLY valid JSON:
{{
"triples": [
{{"subject": "Entity", "predicate": "Relationship", "object": "Entity"}}
],
"entities": ["Entity1", "Entity2"]
}}
"""
        result = await llm_func(prompt)
        # The reply is stored as-is; a later unified normalization step is
        # expected to process it. The vector_store side expects relations as
        # subject/predicate/object mappings.
        chunk.data = result
        return chunk

View File

@@ -0,0 +1,126 @@
import re
from typing import List, Dict, Any
from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext
class NarrativeStrategy(BaseStrategy):
    """Scene-based chunking and event/relation extraction for narrative text.

    The text is first cut into scenes at heading-like separator lines; each
    scene is then covered by overlapping sliding windows.
    """

    # Separator lines: markdown headings, "Chapter N", or rules of *** / ===.
    # The outer capture group makes re.split keep the separators in its output.
    _SCENE_SPLIT_RE = re.compile(
        r'(^(?:#{1,6}\s+.*|Chapter\s+\d+|^\*{3,}$|^={3,}$))',
        re.MULTILINE,
    )

    def split(self, text: str) -> List[ProcessedChunk]:
        """Split ``text`` into scene-scoped, overlapping chunks.

        Args:
            text: The full document text.

        Returns:
            Chunks in document order, with ``offset_start`` / ``offset_end``
            measured from the start of ``text``. Scenes whose body is
            whitespace-only (e.g. a title directly followed by another title)
            produce no chunk but still consume a scene index, so the chunk ids
            of the remaining scenes are unaffected.
        """
        chunks = []
        for scene_idx, (scene_text, scene_title, base_offset) in enumerate(self._scene_spans(text)):
            if not scene_text.strip():
                continue
            chunks.extend(
                self._sliding_window(scene_text, scene_title, scene_idx, base_offset=base_offset)
            )
        return chunks

    def _split_into_scenes(self, text: str) -> List[tuple[str, str]]:
        """Split the text into ``(scene_text, scene_title)`` pairs at headings or separators."""
        return [(body, title) for body, title, _ in self._scene_spans(text)]

    def _scene_spans(self, text: str) -> List[tuple[str, str, int]]:
        """Return ``(scene_text, scene_title, offset)`` triples for ``text``.

        ``offset`` is the position of ``scene_text`` inside ``text``. Text
        before the first separator becomes a scene titled "Start" unless it is
        blank. If no scene is found at all, the whole text is returned as a
        single scene titled "Whole Text".
        """
        # With one capture group, re.split alternates body, separator, body, ...
        # Classifying by position avoids mistaking body text for a separator.
        pieces = self._SCENE_SPLIT_RE.split(text)

        scenes = []
        lead = pieces[0]
        if lead.strip():
            scenes.append((lead, "Start", 0))
        cursor = len(lead)

        for separator, body in zip(pieces[1::2], pieces[2::2]):
            cursor += len(separator)
            scenes.append((body, separator.strip(), cursor))
            cursor += len(body)

        # No scene recognized: treat the full text as one scene.
        if not scenes:
            scenes = [(text, "Whole Text", 0)]
        return scenes

    def _sliding_window(self, text: str, scene_id: str, scene_idx: int, window_size=800, overlap=200, base_offset: int = 0) -> List[ProcessedChunk]:
        """Cover one scene with overlapping windows.

        Args:
            text: The scene text.
            scene_id: Scene title stored in each chunk's context.
            scene_idx: Ordinal of the scene, used in chunk ids.
            window_size: Maximum chunk length in characters.
            overlap: Characters shared between consecutive chunks.
            base_offset: Position of ``text`` inside the source document;
                added to every chunk offset.

        Returns:
            A single chunk when the scene fits in one window, otherwise the
            sequence of overlapping chunks.
        """
        total = len(text)
        if total <= window_size:
            return [self._create_chunk(text, scene_id, scene_idx, 0, base_offset)]

        chunks = []
        start = 0
        local_idx = 0
        while start < total:
            end = min(start + window_size, total)
            # Prefer ending on the nearest newline to avoid cutting sentences,
            # but only before the end of the text and only when that keeps
            # more than half a window.
            if end < total:
                newline_at = text.rfind('\n', start, end)
                if newline_at - start > window_size // 2:
                    end = newline_at + 1
            piece = text[start:end]
            chunks.append(
                self._create_chunk(piece, scene_id, scene_idx, local_idx, base_offset + start)
            )
            local_idx += 1
            if end >= total:
                break
            # Step back by ``overlap``, but always advance so an oversized
            # overlap cannot stall the loop.
            start += max(1, len(piece) - overlap)
        return chunks

    def _create_chunk(self, text: str, scene_id: str, scene_idx: int, local_idx: int, offset: int) -> ProcessedChunk:
        """Build a NARRATIVE chunk that starts at ``offset`` and spans ``len(text)``."""
        return ProcessedChunk(
            type=KnowledgeType.NARRATIVE,
            source=SourceInfo(
                file=self.filename,
                offset_start=offset,
                offset_end=offset + len(text),
                checksum=self.calculate_checksum(text)
            ),
            chunk=ChunkContext(
                chunk_id=f"{self.filename}_{scene_idx}_{local_idx}",
                index=local_idx,
                text=text,
                context={"scene_id": scene_id}
            )
        )

    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Ask the LLM for key events and character relations and store the reply.

        Args:
            chunk: The chunk to process; its ``data`` is overwritten in place.
            llm_func: Awaitable callable that takes the prompt string.

        Returns:
            The same ``chunk`` object, with ``data`` set to whatever
            ``llm_func`` returned.

        Raises:
            ValueError: If ``llm_func`` is missing or falsy.
        """
        if not llm_func:
            raise ValueError("LLM function required for Narrative extraction")
        language_guard = self.build_language_guard(chunk.chunk.text)
        prompt = f"""You are a narrative knowledge extraction engine.
Extract key events and character relations from the scene text.
Language constraints:
- {language_guard}
- Preserve original names and terms exactly when possible.
- JSON keys must stay exactly as: events, relations, subject, predicate, object.
Scene:
{chunk.chunk.context.get('scene_id')}
Text:
{chunk.chunk.text}
Return ONLY valid JSON:
{{
"events": ["event description 1", "event description 2"],
"relations": [
{{"subject": "CharacterA", "predicate": "relation", "object": "CharacterB"}}
]
}}
"""
        result = await llm_func(prompt)
        chunk.data = result
        return chunk

View File

@@ -0,0 +1,52 @@
from typing import List, Dict, Any
from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext, ChunkFlags
class QuoteStrategy(BaseStrategy):
    """Verbatim strategy: each blank-line-separated stanza becomes one chunk."""

    def split(self, text: str) -> List[ProcessedChunk]:
        """Cut ``text`` at double newlines and wrap each non-blank stanza.

        Blank stanzas are skipped but still consume an index and advance the
        running offset, so indices may have gaps and offsets stay aligned with
        ``text``.
        """
        produced = []
        cursor = 0
        for position, stanza in enumerate(text.split("\n\n")):
            begin = cursor
            # Every stanza is followed by the two-character "\n\n" separator.
            cursor = begin + len(stanza) + 2
            if stanza.strip():
                provenance = SourceInfo(
                    file=self.filename,
                    offset_start=begin,
                    offset_end=begin + len(stanza),
                    checksum=self.calculate_checksum(stanza),
                )
                body = ChunkContext(
                    chunk_id=f"{self.filename}_{position}",
                    index=position,
                    text=stanza,
                )
                # No LLM by default; callers may flip requires_llm afterwards.
                switches = ChunkFlags(verbatim=True, requires_llm=False)
                produced.append(
                    ProcessedChunk(
                        type=KnowledgeType.QUOTE,
                        source=provenance,
                        chunk=body,
                        flags=switches,
                    )
                )
        return produced

    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Record the chunk text itself as the sole verbatim entity.

        The core logic is a pass-through: ``chunk.data`` is replaced by a dict
        holding the whole chunk text under ``verbatim_entities``. The LLM
        branch is a placeholder and currently does nothing.
        """
        verbatim_payload = {"verbatim_entities": [chunk.chunk.text]}
        chunk.data = verbatim_payload
        if llm_func:
            if chunk.flags.requires_llm:
                # Optional hook for metadata extraction; intentionally a no-op.
                pass
        return chunk