feat: 新增 A_Memorix 记忆插件
引入 A_Memorix 插件(v2.0.0)——一个轻量级的长期记忆提供器。新增插件清单(manifest)和入口(AMemorixPlugin),并提供完整的核心能力:嵌入(基于哈希的 EmbeddingAPIAdapter、EmbeddingManager、预设)、检索(双路径检索器、PageRank、图关系召回、BM25 稀疏索引、阈值与融合配置)、存储与元数据层,以及大量实用工具和迁移/转换脚本。同时更新 .gitignore 以允许 /plugins/A_memorix。该变更为在宿主应用中实现统一的记忆摄取、检索、分析与维护奠定了基础。
This commit is contained in:
0
plugins/A_memorix/core/strategies/__init__.py
Normal file
0
plugins/A_memorix/core/strategies/__init__.py
Normal file
89
plugins/A_memorix/core/strategies/base.py
Normal file
89
plugins/A_memorix/core/strategies/base.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
import hashlib
|
||||
|
||||
class KnowledgeType(str, Enum):
    """Category assigned to a chunk of ingested text.

    Members also subclass ``str``, so each member compares equal to its
    plain string value.
    """

    NARRATIVE = "narrative"
    FACTUAL = "factual"
    QUOTE = "quote"
    MIXED = "mixed"
|
||||
|
||||
@dataclass
class SourceInfo:
    """Provenance of a chunk: originating file, position and checksum."""

    # Name of the file the chunk was taken from.
    file: str
    # Start and end positions of the chunk, as recorded by the producer.
    offset_start: int
    offset_end: int
    # Digest of the chunk text; empty string when none was supplied.
    checksum: str = ""
|
||||
|
||||
@dataclass
class ChunkContext:
    """Identity, ordering and text of a single chunk."""

    # Identifier assigned by the strategy that produced the chunk.
    chunk_id: str
    # Position of the chunk as numbered by the producing strategy.
    index: int
    # Free-form extra context; each instance gets its own dict.
    context: Dict[str, Any] = field(default_factory=dict)
    # Raw text of the chunk.
    text: str = ""
|
||||
|
||||
@dataclass
class ChunkFlags:
    """Processing hints attached to a chunk."""

    # True when the chunk text must be kept word for word.
    verbatim: bool = False
    # True when extraction for this chunk is expected to go through an LLM.
    requires_llm: bool = True
|
||||
|
||||
@dataclass
class ProcessedChunk:
    """A chunk of source text with its provenance, flags and extraction output."""

    type: KnowledgeType
    source: SourceInfo
    chunk: ChunkContext
    # Extraction payload, e.g. triples, events, verbatim_entities.
    data: Dict[str, Any] = field(default_factory=dict)
    flags: ChunkFlags = field(default_factory=ChunkFlags)

    def to_dict(self) -> Dict:
        """Return the chunk as a nested dictionary of plain values.

        ``data`` and the chunk ``context`` are placed in the result by
        reference, not copied.
        """
        origin = self.source
        body = self.chunk
        hints = self.flags

        source_view = {
            "file": origin.file,
            "offset_start": origin.offset_start,
            "offset_end": origin.offset_end,
            "checksum": origin.checksum,
        }
        chunk_view = {
            "text": body.text,
            "chunk_id": body.chunk_id,
            "index": body.index,
            "context": body.context,
        }
        flags_view = {
            "verbatim": hints.verbatim,
            "requires_llm": hints.requires_llm,
        }

        return {
            "type": self.type.value,
            "source": source_view,
            "chunk": chunk_view,
            "data": self.data,
            "flags": flags_view,
        }
|
||||
|
||||
class BaseStrategy(ABC):
    """Common interface for chunking/extraction strategies bound to one file."""

    def __init__(self, filename: str):
        # Stored so subclasses can stamp it on the chunks they create.
        self.filename = filename

    @abstractmethod
    def split(self, text: str) -> List[ProcessedChunk]:
        """Split ``text`` into chunks according to the strategy."""

    @abstractmethod
    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Extract structured information from a chunk of text."""

    def calculate_checksum(self, text: str) -> str:
        """Return the hex SHA-256 digest of ``text`` encoded as UTF-8."""
        digest = hashlib.sha256()
        digest.update(text.encode("utf-8"))
        return digest.hexdigest()

    def build_language_guard(self, text: str) -> str:
        """Build the shared output-language constraint for extraction prompts.

        The constraint does not depend on the language of ``text``: it only
        asks that extracted values stay in the source language and are not
        translated.
        """
        del text  # reserved parameter, kept so the guard can be extended later
        return (
            "Focus on the original source language. "
            "Keep extracted events, entities, predicates and objects in the same "
            "language as the source text, "
            "preserve names/terms as-is, and do not translate."
        )
|
||||
98
plugins/A_memorix/core/strategies/factual.py
Normal file
98
plugins/A_memorix/core/strategies/factual.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import re
|
||||
from typing import List, Dict, Any
|
||||
from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext
|
||||
|
||||
class FactualStrategy(BaseStrategy):
    """Strategy for fact-dense text such as lists, definitions and tables."""

    def split(self, text: str) -> List[ProcessedChunk]:
        """Split ``text`` line by line into chunks of roughly 600 characters.

        A chunk is closed once it reaches the target size and the line just
        added is not structural (list item, definition, table row). A chunk
        that reaches twice the target size is closed regardless, so a long
        structural run cannot grow without bound.

        Each chunk records its real character offsets within ``text``:
        ``text[offset_start:offset_end]`` equals the chunk text.

        Args:
            text: Full source text.

        Returns:
            Chunks in source order.
        """
        target_size = 600
        chunks: List[ProcessedChunk] = []
        pending: List[str] = []
        pending_len = 0
        # Offset in ``text`` of the first pending line.
        chunk_start = 0

        for line in text.split('\n'):
            pending.append(line)
            # +1 accounts for the newline consumed by split().
            pending_len += len(line) + 1

            reached_target = pending_len >= target_size
            must_force = pending_len >= target_size * 2
            if (reached_target and not self._is_structural_line(line)) or must_force:
                chunks.append(self._create_chunk(pending, len(chunks), chunk_start))
                # pending_len == len(joined chunk) + 1, i.e. it already skips
                # the newline that separates this chunk from the next one.
                chunk_start += pending_len
                pending = []
                pending_len = 0

        if pending:
            chunks.append(self._create_chunk(pending, len(chunks), chunk_start))

        return chunks

    def _is_structural_line(self, line: str) -> bool:
        """Return True when ``line`` looks like a list item, definition or table row."""
        stripped = line.strip()
        if not stripped:
            return False
        # List items: "- x", "* x" or "1. x".
        if re.match(r'^[\-\*]\s+', stripped) or re.match(r'^\d+\.\s+', stripped):
            return True
        # Definitions of the form "term: definition" (full-width or ASCII colon).
        if re.match(r'^[^::]+[::].+', stripped):
            return True
        # Table rows, assuming markdown syntax.
        return stripped.startswith('|') and stripped.endswith('|')

    def _create_chunk(self, lines: List[str], index: int, offset_start: int = 0) -> ProcessedChunk:
        """Build a FACTUAL chunk from ``lines`` joined with newlines.

        Args:
            lines: Lines that make up the chunk.
            index: Position of the chunk in the split result.
            offset_start: Character offset of the chunk within the source text.

        Returns:
            The chunk, with ``offset_end`` set to ``offset_start`` plus the
            length of the joined text.
        """
        text = "\n".join(lines)
        return ProcessedChunk(
            type=KnowledgeType.FACTUAL,
            source=SourceInfo(
                file=self.filename,
                offset_start=offset_start,
                offset_end=offset_start + len(text),
                checksum=self.calculate_checksum(text),
            ),
            chunk=ChunkContext(
                chunk_id=f"{self.filename}_{index}",
                index=index,
                text=text,
            ),
        )

    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Ask the LLM for triples and entities and store the reply on ``chunk``.

        Args:
            chunk: Chunk whose text is sent for extraction; updated in place.
            llm_func: Awaitable callable that receives the prompt string.

        Returns:
            The same ``chunk`` with ``data`` set to whatever ``llm_func`` returned.

        Raises:
            ValueError: If ``llm_func`` is not provided.
        """
        if not llm_func:
            raise ValueError("LLM function required for Factual extraction")

        language_guard = self.build_language_guard(chunk.chunk.text)
        prompt = f"""You are a factual knowledge extraction engine.
Extract factual triples and entities from the text.
Preserve lists and definitions accurately.

Language constraints:
- {language_guard}
- Preserve original names and domain terms exactly when possible.
- JSON keys must stay exactly as: triples, entities, subject, predicate, object.

Text:
{chunk.chunk.text}

Return ONLY valid JSON:
{{
"triples": [
{{"subject": "Entity", "predicate": "Relationship", "object": "Entity"}}
],
"entities": ["Entity1", "Entity2"]
}}
"""
        result = await llm_func(prompt)

        # The reply is stored untouched. Per the original author's note, a later
        # shared normalization step processes it, and the vector_store side
        # expects relations as subject/predicate/object mappings.
        chunk.data = result
        return chunk
|
||||
126
plugins/A_memorix/core/strategies/narrative.py
Normal file
126
plugins/A_memorix/core/strategies/narrative.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import re
|
||||
from typing import List, Dict, Any
|
||||
from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext
|
||||
|
||||
class NarrativeStrategy(BaseStrategy):
    """Strategy for story-like text: scenes first, then overlapping windows."""

    def split(self, text: str) -> List[ProcessedChunk]:
        """Split ``text`` into scenes and window each scene into chunks.

        Chunk offsets are character positions within ``text`` (not within the
        scene), so ``text[offset_start:offset_end]`` equals the chunk text.

        Args:
            text: Full source text.

        Returns:
            Chunks in source order.
        """
        chunks: List[ProcessedChunk] = []
        for scene_idx, (scene_text, scene_title, scene_start) in enumerate(
            self._locate_scenes(text)
        ):
            chunks.extend(
                self._sliding_window(
                    scene_text, scene_title, scene_idx, base_offset=scene_start
                )
            )
        return chunks

    def _split_into_scenes(self, text: str) -> List[tuple[str, str]]:
        """Split ``text`` into ``(scene_text, scene_title)`` pairs by heading or separator."""
        return [(body, title) for body, title, _start in self._locate_scenes(text)]

    def _locate_scenes(self, text: str) -> List[tuple[str, str, int]]:
        """Split ``text`` into ``(scene_text, scene_title, start_offset)`` triples.

        ``start_offset`` is the character position of ``scene_text`` in ``text``.
        """
        # Simple heuristic: a scene starts at a markdown heading, a
        # "Chapter N" line, or a separator line of three or more '*' or '='.
        scene_pattern_str = r'^(?:#{1,6}\s+.*|Chapter\s+\d+|^\*{3,}$|^={3,}$)'

        # The capturing group keeps the separators in the result, so the parts
        # concatenate back to ``text`` and offsets can be tracked by length.
        parts = re.split(f"({scene_pattern_str})", text, flags=re.MULTILINE)

        scenes: List[tuple[str, str, int]] = []
        current_title = "Start"
        current_content: List[str] = []
        content_start = 0
        cursor = 0

        # A blank leading part is dropped, but its length still counts.
        if parts and parts[0].strip() == "":
            cursor = len(parts[0])
            parts = parts[1:]

        for part in parts:
            if re.match(scene_pattern_str, part, re.MULTILINE):
                # A separator closes the scene collected so far.
                if current_content:
                    scenes.append(("".join(current_content), current_title, content_start))
                    current_content = []
                current_title = part.strip()
            else:
                if not current_content:
                    content_start = cursor
                current_content.append(part)
            cursor += len(part)

        if current_content:
            scenes.append(("".join(current_content), current_title, content_start))

        # With no scene detected, the whole text is a single scene.
        if not scenes:
            scenes = [(text, "Whole Text", 0)]

        return scenes

    def _sliding_window(self, text: str, scene_id: str, scene_idx: int, window_size=800, overlap=200, base_offset=0) -> List[ProcessedChunk]:
        """Cut one scene into overlapping chunks.

        Args:
            text: Scene text.
            scene_id: Scene title stored in each chunk's context.
            scene_idx: Index of the scene, used in chunk ids.
            window_size: Maximum chunk length in characters.
            overlap: Characters shared between consecutive chunks.
            base_offset: Position of ``text`` in the full source text; added
                to every chunk offset.

        Returns:
            Chunks of the scene in order.

        Raises:
            ValueError: If ``window_size`` is not positive.
        """
        if window_size <= 0:
            raise ValueError("window_size must be positive")

        if len(text) <= window_size:
            return [self._create_chunk(text, scene_id, scene_idx, 0, base_offset)]

        chunks = []
        start = 0
        local_idx = 0
        while start < len(text):
            end = min(start + window_size, len(text))
            chunk_text = text[start:end]

            # Prefer ending on the nearest newline so sentences are not cut
            # abruptly. Only done before the end of the text, and only when
            # the chunk keeps more than half a window.
            if end < len(text):
                last_newline = chunk_text.rfind('\n')
                if last_newline > window_size // 2:
                    end = start + last_newline + 1
                    chunk_text = text[start:end]

            chunks.append(
                self._create_chunk(chunk_text, scene_id, scene_idx, local_idx, base_offset + start)
            )

            if end < len(text):
                # Always advance by at least one character, so a large
                # ``overlap`` can neither stall the loop nor move it backwards.
                start += max(1, len(chunk_text) - overlap)
            else:
                start += len(chunk_text)
            local_idx += 1

        return chunks

    def _create_chunk(self, text: str, scene_id: str, scene_idx: int, local_idx: int, offset: int) -> ProcessedChunk:
        """Build a NARRATIVE chunk for ``text`` starting at ``offset``."""
        return ProcessedChunk(
            type=KnowledgeType.NARRATIVE,
            source=SourceInfo(
                file=self.filename,
                offset_start=offset,
                offset_end=offset + len(text),
                checksum=self.calculate_checksum(text),
            ),
            chunk=ChunkContext(
                chunk_id=f"{self.filename}_{scene_idx}_{local_idx}",
                index=local_idx,
                text=text,
                context={"scene_id": scene_id},
            ),
        )

    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Ask the LLM for events and relations and store the reply on ``chunk``.

        Args:
            chunk: Chunk whose text and scene id are sent for extraction;
                updated in place.
            llm_func: Awaitable callable that receives the prompt string.

        Returns:
            The same ``chunk`` with ``data`` set to whatever ``llm_func`` returned.

        Raises:
            ValueError: If ``llm_func`` is not provided.
        """
        if not llm_func:
            raise ValueError("LLM function required for Narrative extraction")

        language_guard = self.build_language_guard(chunk.chunk.text)
        prompt = f"""You are a narrative knowledge extraction engine.
Extract key events and character relations from the scene text.

Language constraints:
- {language_guard}
- Preserve original names and terms exactly when possible.
- JSON keys must stay exactly as: events, relations, subject, predicate, object.

Scene:
{chunk.chunk.context.get('scene_id')}

Text:
{chunk.chunk.text}

Return ONLY valid JSON:
{{
"events": ["event description 1", "event description 2"],
"relations": [
{{"subject": "CharacterA", "predicate": "relation", "object": "CharacterB"}}
]
}}
"""
        result = await llm_func(prompt)
        chunk.data = result
        return chunk
|
||||
52
plugins/A_memorix/core/strategies/quote.py
Normal file
52
plugins/A_memorix/core/strategies/quote.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from typing import List, Dict, Any
|
||||
from .base import BaseStrategy, ProcessedChunk, KnowledgeType, SourceInfo, ChunkContext, ChunkFlags
|
||||
|
||||
class QuoteStrategy(BaseStrategy):
    """Strategy for verbatim material: one chunk per blank-line-separated stanza."""

    def split(self, text: str) -> List[ProcessedChunk]:
        """Split ``text`` on double newlines and emit one chunk per non-blank stanza.

        Blank stanzas are skipped but still advance the offset and consume an
        index, so chunk indices may have gaps.
        """
        separator = "\n\n"
        produced = []
        cursor = 0

        for position, stanza in enumerate(text.split(separator)):
            begin = cursor
            # Move past this stanza and the separator that followed it.
            cursor = begin + len(stanza) + 2
            if not stanza.strip():
                continue

            origin = SourceInfo(
                file=self.filename,
                offset_start=begin,
                offset_end=begin + len(stanza),
                checksum=self.calculate_checksum(stanza),
            )
            body = ChunkContext(
                chunk_id=f"{self.filename}_{position}",
                index=position,
                text=stanza,
            )
            # No LLM by default; callers may override the flag afterwards.
            hints = ChunkFlags(verbatim=True, requires_llm=False)

            produced.append(
                ProcessedChunk(
                    type=KnowledgeType.QUOTE,
                    source=origin,
                    chunk=body,
                    flags=hints,
                )
            )

        return produced

    async def extract(self, chunk: ProcessedChunk, llm_func=None) -> ProcessedChunk:
        """Store the chunk text itself as the single verbatim entity.

        ``llm_func`` is accepted for interface compatibility. The metadata
        branch below is a placeholder and currently does nothing.
        """
        # For quotes the text is the knowledge: pass it through unchanged.
        chunk.data = {"verbatim_entities": [chunk.chunk.text]}

        wants_metadata = bool(llm_func) and chunk.flags.requires_llm
        if wants_metadata:
            # Optional metadata extraction is not implemented yet.
            pass

        return chunk
|
||||
Reference in New Issue
Block a user