feat:记忆遗忘,优化记忆提取

This commit is contained in:
SengokuCola
2025-11-12 01:29:11 +08:00
parent 2d6eba7da1
commit 012e0460e5
14 changed files with 464 additions and 1359 deletions

View File

@@ -6,40 +6,12 @@
import json
import re
from difflib import SequenceMatcher
from typing import List, Tuple, Optional
from src.common.database.database_model import MemoryChest as MemoryChestModel
from src.common.logger import get_logger
from json_repair import repair_json
from src.config.config import global_config
logger = get_logger("memory_utils")
def get_all_titles(exclude_locked: bool = False) -> list[str]:
    """Return the titles of all memories in the memory chest.

    Args:
        exclude_locked: When True, skip titles of locked memories.
            Defaults to False.

    Returns:
        list[str]: All (optionally unlocked-only) memory titles; an empty
        list on any database error.
    """
    try:
        titles = []
        for memory in MemoryChestModel.select():
            # Ignore records without a title, and locked ones when requested.
            if not memory.title:
                continue
            if exclude_locked and memory.locked:
                continue
            titles.append(memory.title)
        return titles
    except Exception as e:
        # Fixed: was print(); use the module logger for consistency with
        # every other function in this module.
        logger.error(f"获取记忆标题时出错: {e}")
        return []
def parse_md_json(json_text: str) -> list[str]:
"""从Markdown格式的内容中提取JSON对象和推理内容"""
json_objects = []
@@ -134,259 +106,3 @@ def preprocess_text(text: str) -> str:
logger.error(f"预处理文本时出错: {e}")
return text
def fuzzy_find_memory_by_title(target_title: str, similarity_threshold: float = 0.9) -> List[Tuple[str, str, float]]:
    """Fuzzy-search stored memories by title.

    Args:
        target_title: Title to match against.
        similarity_threshold: Minimum similarity to include a memory
            (default 0.9).

    Returns:
        List[Tuple[str, str, float]]: Matches as (title, content, similarity),
        sorted by similarity in descending order; empty on error.
    """
    try:
        scored = [
            (record.title, record.content, score)
            for record in MemoryChestModel.select()
            if (score := calculate_similarity(target_title, record.title)) >= similarity_threshold
        ]
        # Best match first.
        return sorted(scored, key=lambda item: item[2], reverse=True)
    except Exception as e:
        logger.error(f"模糊查找记忆时出错: {e}")
        return []
def find_best_matching_memory(target_title: str, similarity_threshold: float = 0.9) -> Optional[Tuple[str, str, float]]:
    """Return the single best fuzzy title match, if any.

    Args:
        target_title: Title to match against.
        similarity_threshold: Minimum similarity required.

    Returns:
        Optional[Tuple[str, str, float]]: (title, content, similarity) of the
        best match, or None when nothing reaches the threshold.
    """
    try:
        candidates = fuzzy_find_memory_by_title(target_title, similarity_threshold)
        if not candidates:
            logger.info(f"未找到相似度 >= {similarity_threshold} 的记忆")
            return None
        # Results are already sorted descending by similarity; take the head.
        return candidates[0]
    except Exception as e:
        logger.error(f"查找最佳匹配记忆时出错: {e}")
        return None
def check_title_exists_fuzzy(target_title: str, similarity_threshold: float = 0.9) -> bool:
    """Check whether a similar title already exists (fuzzy match).

    Args:
        target_title: Title to look for.
        similarity_threshold: Minimum similarity (default 0.9 — a high bar
            to avoid false positives).

    Returns:
        bool: True when at least one similar title exists; False otherwise
        or on error.
    """
    try:
        hits = fuzzy_find_memory_by_title(target_title, similarity_threshold)
        if hits:
            logger.info(f"发现相似标题: '{hits[0][0]}' (相似度: {hits[0][2]:.3f})")
            return True
        logger.debug("未发现相似标题")
        return False
    except Exception as e:
        logger.error(f"检查标题是否存在时出错: {e}")
        return False
def get_memories_by_chat_id_weighted(target_chat_id: str, same_chat_weight: float = 0.95, other_chat_weight: float = 0.05) -> List[Tuple[str, str, str]]:
    """Sample unlocked memories, heavily favoring the given chat.

    All unlocked memories belonging to ``target_chat_id`` are always included;
    a small number of memories from other chats is mixed in, sized by the
    ``other_chat_weight`` / ``same_chat_weight`` ratio. NOTE(review): the
    weights act only as a ratio for the mix-in count, not as independent
    sampling probabilities.

    Args:
        target_chat_id: Chat ID whose memories should dominate the sample.
        same_chat_weight: Weight for same-chat memories (default 0.95).
        other_chat_weight: Weight for other-chat memories (default 0.05).

    Returns:
        List[Tuple[str, str, str]]: Selected memories as
        (title, content, chat_id). Empty when no memory belongs to
        ``target_chat_id`` or on error.
    """
    import random  # fixed: hoisted out of the sampling branch so the import is unconditional

    try:
        # Partition unlocked, titled memories by chat ownership.
        same_chat_memories = []
        other_chat_memories = []
        for memory in MemoryChestModel.select():
            if memory.title and not memory.locked:  # locked memories are never sampled
                if memory.chat_id == target_chat_id:
                    same_chat_memories.append((memory.title, memory.content, memory.chat_id))
                else:
                    other_chat_memories.append((memory.title, memory.content, memory.chat_id))

        # Without any same-chat memory there is nothing to anchor the sample on.
        if not same_chat_memories:
            logger.warning(f"未找到chat_id为 '{target_chat_id}' 的记忆")
            return []

        total_same = len(same_chat_memories)
        total_other = len(other_chat_memories)

        # Mix-in count for other-chat memories: at least 1 (when any exist),
        # scaled by the weight ratio, capped at the number available.
        if total_other > 0:
            other_sample_count = max(1, min(total_other, int(total_same * other_chat_weight / same_chat_weight)))
        else:
            other_sample_count = 0

        # Same-chat memories are always taken in full (they carry the high weight).
        selected_memories = list(same_chat_memories)

        if other_sample_count > 0 and total_other > 0:
            other_selected = random.sample(other_chat_memories, min(other_sample_count, total_other))
            selected_memories.extend(other_selected)

        logger.info(f"加权抽样结果: 同chat_id记忆 {len(same_chat_memories)}个,其他chat_id记忆 {min(other_sample_count, total_other)}个")
        return selected_memories
    except Exception as e:
        logger.error(f"按chat_id加权抽样记忆时出错: {e}")
        return []
def get_memory_titles_by_chat_id_weighted(target_chat_id: str, same_chat_weight: float = 0.95, other_chat_weight: float = 0.05) -> List[str]:
    """Return only the titles from a chat-weighted memory sample.

    Thin wrapper over get_memories_by_chat_id_weighted used when merging
    memories: the contents are not needed, only the candidate titles.

    Args:
        target_chat_id: Chat ID whose memories should dominate the sample.
        same_chat_weight: Weight for same-chat memories (default 0.95).
        other_chat_weight: Weight for other-chat memories (default 0.05).

    Returns:
        List[str]: Titles of the sampled memories; empty on error.
    """
    try:
        sampled = get_memories_by_chat_id_weighted(target_chat_id, same_chat_weight, other_chat_weight)
        return [title for title, _content, _chat in sampled]
    except Exception as e:
        logger.error(f"按chat_id加权抽样记忆标题时出错: {e}")
        return []
def find_most_similar_memory_by_chat_id(target_title: str, target_chat_id: str, similarity_threshold: float = 0.5) -> Optional[Tuple[str, str, float]]:
    """Find the most similar memory within a specific chat.

    Args:
        target_title: Title to compare against.
        target_chat_id: Only memories with this chat ID are considered.
        similarity_threshold: Minimum similarity to accept (default 0.5).
            (Fixed: docstring previously claimed 0.7, contradicting the
            signature default.)

    Returns:
        Optional[Tuple[str, str, float]]: Most similar memory as
        (title, content, similarity), or None when nothing qualifies.
    """
    try:
        # Collect unlocked, titled memories belonging to the target chat.
        same_chat_memories = [
            (memory.title, memory.content)
            for memory in MemoryChestModel.select()
            if memory.title and not memory.locked and memory.chat_id == target_chat_id
        ]
        if not same_chat_memories:
            logger.warning(f"未找到chat_id为 '{target_chat_id}' 的记忆")
            return None

        # Track the single best candidate above all others.
        best_match = None
        best_similarity = 0.0
        for title, content in same_chat_memories:
            # Skip an exact (whitespace-insensitive) self match.
            if title.strip() == target_title.strip():
                continue
            similarity = calculate_similarity(target_title, title)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = (title, content, similarity)

        if best_match and best_similarity >= similarity_threshold:
            logger.info(f"找到最相似记忆: '{best_match[0]}' (相似度: {best_similarity:.3f})")
            return best_match
        logger.info(f"未找到相似度 >= {similarity_threshold} 的记忆,最高相似度: {best_similarity:.3f}")
        return None
    except Exception as e:
        logger.error(f"查找最相似记忆时出错: {e}")
        return None
def compute_merge_similarity_threshold() -> float:
    """Compute a dynamic similarity threshold for merging memories.

    The fuller the memory chest (current count relative to the configured
    maximum), the lower the threshold, so merging becomes more aggressive
    as capacity is approached or exceeded. (Fixed: the previous docstring
    listed thresholds 0.80–0.45 at breakpoints that did not match the code.)

        < 60%:  0.70 (strict — avoid premature merges)
        < 80%:  0.60
        < 100%: 0.50
        < 150%: 0.40
        < 200%: 0.30
        else:   0.25 (most permissive — speed up convergence)

    Returns:
        float: The merge similarity threshold; 0.70 (conservative) on error.
    """
    try:
        current_count = MemoryChestModel.select().count()
        # Guard against a zero or invalid configured maximum.
        max_count = max(1, int(global_config.memory.max_memory_number))
        percentage = current_count / max_count
        if percentage < 0.6:
            return 0.70
        elif percentage < 0.8:
            return 0.60
        elif percentage < 1.0:
            return 0.50
        elif percentage < 1.5:
            return 0.40
        elif percentage < 2:
            return 0.30
        else:
            return 0.25
    except Exception:
        # Fall back to the strictest tier on any failure.
        return 0.70