feat:记忆遗忘,优化记忆提取

This commit is contained in:
SengokuCola
2025-11-12 01:29:11 +08:00
parent 2d6eba7da1
commit 012e0460e5
14 changed files with 464 additions and 1359 deletions

View File

@@ -6,40 +6,12 @@
import json
import re
from difflib import SequenceMatcher
from typing import List, Tuple, Optional
from src.common.database.database_model import MemoryChest as MemoryChestModel
from src.common.logger import get_logger
from json_repair import repair_json
from src.config.config import global_config
logger = get_logger("memory_utils")
def get_all_titles(exclude_locked: bool = False) -> list[str]:
    """Return the titles of all memories in the memory chest.

    Args:
        exclude_locked: When True, skip titles of locked memories.
            Defaults to False.

    Returns:
        list[str]: All (optionally unlocked-only) memory titles; an empty
        list on any database error.
    """
    try:
        titles = []
        for memory in MemoryChestModel.select():
            # Ignore records without a title, and locked ones when requested.
            if not memory.title:
                continue
            if exclude_locked and memory.locked:
                continue
            titles.append(memory.title)
        return titles
    except Exception as e:
        # Fixed: was print(); use the module logger for consistency with
        # every other function in this module.
        logger.error(f"获取记忆标题时出错: {e}")
        return []
def parse_md_json(json_text: str) -> list[str]:
"""从Markdown格式的内容中提取JSON对象和推理内容"""
json_objects = []
@@ -134,259 +106,3 @@ def preprocess_text(text: str) -> str:
logger.error(f"预处理文本时出错: {e}")
return text
def fuzzy_find_memory_by_title(target_title: str, similarity_threshold: float = 0.9) -> List[Tuple[str, str, float]]:
    """Fuzzy-search stored memories by title.

    Args:
        target_title: Title to match against.
        similarity_threshold: Minimum similarity to include a memory
            (default 0.9).

    Returns:
        List[Tuple[str, str, float]]: Matches as (title, content, similarity),
        sorted by similarity in descending order; empty on error.
    """
    try:
        scored = [
            (record.title, record.content, score)
            for record in MemoryChestModel.select()
            if (score := calculate_similarity(target_title, record.title)) >= similarity_threshold
        ]
        # Best match first.
        return sorted(scored, key=lambda item: item[2], reverse=True)
    except Exception as e:
        logger.error(f"模糊查找记忆时出错: {e}")
        return []
def find_best_matching_memory(target_title: str, similarity_threshold: float = 0.9) -> Optional[Tuple[str, str, float]]:
    """Return the single best fuzzy title match, if any.

    Args:
        target_title: Title to match against.
        similarity_threshold: Minimum similarity required.

    Returns:
        Optional[Tuple[str, str, float]]: (title, content, similarity) of the
        best match, or None when nothing reaches the threshold.
    """
    try:
        candidates = fuzzy_find_memory_by_title(target_title, similarity_threshold)
        if not candidates:
            logger.info(f"未找到相似度 >= {similarity_threshold} 的记忆")
            return None
        # Results are already sorted descending by similarity; take the head.
        return candidates[0]
    except Exception as e:
        logger.error(f"查找最佳匹配记忆时出错: {e}")
        return None
def check_title_exists_fuzzy(target_title: str, similarity_threshold: float = 0.9) -> bool:
    """Check whether a similar title already exists (fuzzy match).

    Args:
        target_title: Title to look for.
        similarity_threshold: Minimum similarity (default 0.9 — a high bar
            to avoid false positives).

    Returns:
        bool: True when at least one similar title exists; False otherwise
        or on error.
    """
    try:
        hits = fuzzy_find_memory_by_title(target_title, similarity_threshold)
        if hits:
            logger.info(f"发现相似标题: '{hits[0][0]}' (相似度: {hits[0][2]:.3f})")
            return True
        logger.debug("未发现相似标题")
        return False
    except Exception as e:
        logger.error(f"检查标题是否存在时出错: {e}")
        return False
def get_memories_by_chat_id_weighted(target_chat_id: str, same_chat_weight: float = 0.95, other_chat_weight: float = 0.05) -> List[Tuple[str, str, str]]:
    """Sample unlocked memories, heavily favoring the given chat.

    All unlocked memories belonging to ``target_chat_id`` are always included;
    a small number of memories from other chats is mixed in, sized by the
    ``other_chat_weight`` / ``same_chat_weight`` ratio. NOTE(review): the
    weights act only as a ratio for the mix-in count, not as independent
    sampling probabilities.

    Args:
        target_chat_id: Chat ID whose memories should dominate the sample.
        same_chat_weight: Weight for same-chat memories (default 0.95).
        other_chat_weight: Weight for other-chat memories (default 0.05).

    Returns:
        List[Tuple[str, str, str]]: Selected memories as
        (title, content, chat_id). Empty when no memory belongs to
        ``target_chat_id`` or on error.
    """
    import random  # fixed: hoisted out of the sampling branch so the import is unconditional

    try:
        # Partition unlocked, titled memories by chat ownership.
        same_chat_memories = []
        other_chat_memories = []
        for memory in MemoryChestModel.select():
            if memory.title and not memory.locked:  # locked memories are never sampled
                if memory.chat_id == target_chat_id:
                    same_chat_memories.append((memory.title, memory.content, memory.chat_id))
                else:
                    other_chat_memories.append((memory.title, memory.content, memory.chat_id))

        # Without any same-chat memory there is nothing to anchor the sample on.
        if not same_chat_memories:
            logger.warning(f"未找到chat_id为 '{target_chat_id}' 的记忆")
            return []

        total_same = len(same_chat_memories)
        total_other = len(other_chat_memories)

        # Mix-in count for other-chat memories: at least 1 (when any exist),
        # scaled by the weight ratio, capped at the number available.
        if total_other > 0:
            other_sample_count = max(1, min(total_other, int(total_same * other_chat_weight / same_chat_weight)))
        else:
            other_sample_count = 0

        # Same-chat memories are always taken in full (they carry the high weight).
        selected_memories = list(same_chat_memories)

        if other_sample_count > 0 and total_other > 0:
            other_selected = random.sample(other_chat_memories, min(other_sample_count, total_other))
            selected_memories.extend(other_selected)

        logger.info(f"加权抽样结果: 同chat_id记忆 {len(same_chat_memories)}个,其他chat_id记忆 {min(other_sample_count, total_other)}个")
        return selected_memories
    except Exception as e:
        logger.error(f"按chat_id加权抽样记忆时出错: {e}")
        return []
def get_memory_titles_by_chat_id_weighted(target_chat_id: str, same_chat_weight: float = 0.95, other_chat_weight: float = 0.05) -> List[str]:
    """Return only the titles from a chat-weighted memory sample.

    Thin wrapper over get_memories_by_chat_id_weighted used when merging
    memories: the contents are not needed, only the candidate titles.

    Args:
        target_chat_id: Chat ID whose memories should dominate the sample.
        same_chat_weight: Weight for same-chat memories (default 0.95).
        other_chat_weight: Weight for other-chat memories (default 0.05).

    Returns:
        List[str]: Titles of the sampled memories; empty on error.
    """
    try:
        sampled = get_memories_by_chat_id_weighted(target_chat_id, same_chat_weight, other_chat_weight)
        return [title for title, _content, _chat in sampled]
    except Exception as e:
        logger.error(f"按chat_id加权抽样记忆标题时出错: {e}")
        return []
def find_most_similar_memory_by_chat_id(target_title: str, target_chat_id: str, similarity_threshold: float = 0.5) -> Optional[Tuple[str, str, float]]:
    """Find the most similar memory within a specific chat.

    Args:
        target_title: Title to compare against.
        target_chat_id: Only memories with this chat ID are considered.
        similarity_threshold: Minimum similarity to accept (default 0.5).
            (Fixed: docstring previously claimed 0.7, contradicting the
            signature default.)

    Returns:
        Optional[Tuple[str, str, float]]: Most similar memory as
        (title, content, similarity), or None when nothing qualifies.
    """
    try:
        # Collect unlocked, titled memories belonging to the target chat.
        same_chat_memories = [
            (memory.title, memory.content)
            for memory in MemoryChestModel.select()
            if memory.title and not memory.locked and memory.chat_id == target_chat_id
        ]
        if not same_chat_memories:
            logger.warning(f"未找到chat_id为 '{target_chat_id}' 的记忆")
            return None

        # Track the single best candidate above all others.
        best_match = None
        best_similarity = 0.0
        for title, content in same_chat_memories:
            # Skip an exact (whitespace-insensitive) self match.
            if title.strip() == target_title.strip():
                continue
            similarity = calculate_similarity(target_title, title)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = (title, content, similarity)

        if best_match and best_similarity >= similarity_threshold:
            logger.info(f"找到最相似记忆: '{best_match[0]}' (相似度: {best_similarity:.3f})")
            return best_match
        logger.info(f"未找到相似度 >= {similarity_threshold} 的记忆,最高相似度: {best_similarity:.3f}")
        return None
    except Exception as e:
        logger.error(f"查找最相似记忆时出错: {e}")
        return None
def compute_merge_similarity_threshold() -> float:
    """Compute a dynamic similarity threshold for merging memories.

    The fuller the memory chest (current count relative to the configured
    maximum), the lower the threshold, so merging becomes more aggressive
    as capacity is approached or exceeded. (Fixed: the previous docstring
    listed thresholds 0.80–0.45 at breakpoints that did not match the code.)

        < 60%:  0.70 (strict — avoid premature merges)
        < 80%:  0.60
        < 100%: 0.50
        < 150%: 0.40
        < 200%: 0.30
        else:   0.25 (most permissive — speed up convergence)

    Returns:
        float: The merge similarity threshold; 0.70 (conservative) on error.
    """
    try:
        current_count = MemoryChestModel.select().count()
        # Guard against a zero or invalid configured maximum.
        max_count = max(1, int(global_config.memory.max_memory_number))
        percentage = current_count / max_count
        if percentage < 0.6:
            return 0.70
        elif percentage < 0.8:
            return 0.60
        elif percentage < 1.0:
            return 0.50
        elif percentage < 1.5:
            return 0.40
        elif percentage < 2:
            return 0.30
        else:
            return 0.25
    except Exception:
        # Fall back to the strictest tier on any failure.
        return 0.70