# -*- coding: utf-8 -*-
"""
Utility helpers for the memory system.

Provides title listing, extraction of JSON embedded in Markdown, text
similarity scoring, fuzzy title lookup, and chat-id-weighted memory sampling.
"""

import json
import random
import re
from difflib import SequenceMatcher
from typing import List, Tuple, Optional

from src.common.database.database_model import MemoryChest as MemoryChestModel
from src.common.logger import get_logger
# NOTE(review): repair_json is not referenced in this module; the import is
# kept in case other modules rely on it being importable from here.
from json_repair import repair_json  # noqa: F401

logger = get_logger("memory_utils")

# Patterns are compiled once at import time because they are reused per call.
_JSON_FENCE_RE = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)
_LEADING_SLASHES_RE = re.compile(r"^//\s*", re.MULTILINE)
_LINE_COMMENT_RE = re.compile(r"//[^\n]*")
_BLOCK_COMMENT_RE = re.compile(r"/\*.*?\*/", re.DOTALL)
_NON_WORD_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")

# Minimum score assigned when one preprocessed text contains the other.
_CONTAINMENT_FLOOR = 0.8


def get_all_titles(exclude_locked: bool = False) -> list[str]:
    """
    Return the titles of all records in the memory store.

    Args:
        exclude_locked: When True, records whose ``locked`` attribute is
            truthy are left out. Defaults to False.

    Returns:
        list[str]: Every non-empty title found; an empty list if the query
        raises (the error is logged).
    """
    try:
        return [
            memory.title
            for memory in MemoryChestModel.select()
            # Records without a title are always skipped; locked ones only
            # when the caller asked for that.
            if memory.title and not (exclude_locked and memory.locked)
        ]
    except Exception as e:
        logger.error(f"获取记忆标题时出错: {e}")
        return []


def _load_json_block(block: str):
    """
    Parse the text of one fenced JSON block.

    The raw text is tried first so that valid JSON containing ``//`` or
    ``/*`` inside string values (for example URLs) is not damaged by comment
    stripping. Only if that fails are ``//`` line comments and ``/* */``
    block comments removed and the parse retried.

    Returns:
        The decoded object, or None when the block is empty.

    Raises:
        json.JSONDecodeError: If the block cannot be parsed even after
            comment removal.
    """
    raw = block.strip()
    if not raw:
        return None
    try:
        return json.loads(raw)
    except ValueError:
        # Fall through to the comment-tolerant path below.
        pass

    cleaned = _LINE_COMMENT_RE.sub("", block)
    cleaned = _BLOCK_COMMENT_RE.sub("", cleaned).strip()
    if not cleaned:
        return None
    return json.loads(cleaned)


def parse_md_json(json_text: str) -> Tuple[List[dict], str]:
    """
    Extract JSON objects and the preceding reasoning text from Markdown.

    Every ```` ```json ... ``` ```` fenced block is parsed. A block holding a
    dict contributes that dict; a block holding a list contributes each of
    its dict items. Blocks that fail to parse are logged and skipped.

    Args:
        json_text: Markdown-formatted text, possibly containing fenced JSON.

    Returns:
        Tuple[List[dict], str]: ``(json_objects, reasoning_content)`` where
        ``reasoning_content`` is the text before the first ```` ```json ````
        fence with leading ``//`` markers removed from each line. It is an
        empty string when there is no fenced block or nothing precedes it.
    """
    if not json_text:
        return [], ""

    blocks = _JSON_FENCE_RE.findall(json_text)

    reasoning_content = ""
    if blocks:
        first_json_pos = json_text.find("```json")
        if first_json_pos > 0:
            reasoning_content = json_text[:first_json_pos].strip()
            # Drop comment markers that prefix reasoning lines.
            reasoning_content = _LEADING_SLASHES_RE.sub("", reasoning_content).strip()

    json_objects = []
    for block in blocks:
        try:
            parsed = _load_json_block(block)
        except Exception as e:
            logger.warning(f"解析JSON块失败: {e}, 块内容: {block[:100]}...")
            continue

        if isinstance(parsed, dict):
            json_objects.append(parsed)
        elif isinstance(parsed, list):
            json_objects.extend(item for item in parsed if isinstance(item, dict))

    return json_objects, reasoning_content


def calculate_similarity(text1: str, text2: str) -> float:
    """
    Compute a similarity score between two texts.

    Both texts are normalised with ``preprocess_text`` and compared with
    ``difflib.SequenceMatcher``. If one normalised text contains the other,
    the score is raised to at least 0.8. The containment boost is applied
    only when both normalised texts are non-empty, because the empty string
    is a substring of every string and would otherwise match anything.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: Score in the range 0-1; 0.0 if an error occurs (logged).
    """
    try:
        left = preprocess_text(text1)
        right = preprocess_text(text2)

        similarity = SequenceMatcher(None, left, right).ratio()

        if left and right and (left in right or right in left):
            similarity = max(similarity, _CONTAINMENT_FLOOR)

        return similarity
    except Exception as e:
        logger.error(f"计算相似度时出错: {e}")
        return 0.0


def preprocess_text(text: str) -> str:
    """
    Normalise text to make matching more tolerant.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, and collapses whitespace runs to one space.

    Args:
        text: Original text.

    Returns:
        str: The normalised text. If normalisation raises (for example when
        ``text`` is not a string) the error is logged and the input is
        returned unchanged.
    """
    try:
        normalised = text.lower()
        normalised = _NON_WORD_RE.sub("", normalised)
        return _WHITESPACE_RE.sub(" ", normalised).strip()
    except Exception as e:
        logger.error(f"预处理文本时出错: {e}")
        return text


def fuzzy_find_memory_by_title(target_title: str, similarity_threshold: float = 0.9) -> List[Tuple[str, str, float]]:
    """
    Find memories whose title is similar to the given title.

    Args:
        target_title: Title to search for.
        similarity_threshold: Minimum similarity for a match. Defaults to 0.9.

    Returns:
        List[Tuple[str, str, float]]: Matches as
        ``(title, content, similarity_score)``, sorted by descending score.
        An empty list if an error occurs (logged).
    """
    try:
        matches = []
        for memory in MemoryChestModel.select():
            # Untitled records cannot match a title; skipping them avoids
            # pushing None through the text-normalisation path.
            if not memory.title:
                continue
            similarity = calculate_similarity(target_title, memory.title)
            if similarity >= similarity_threshold:
                matches.append((memory.title, memory.content, similarity))

        matches.sort(key=lambda item: item[2], reverse=True)
        return matches
    except Exception as e:
        logger.error(f"模糊查找记忆时出错: {e}")
        return []


def find_best_matching_memory(target_title: str, similarity_threshold: float = 0.9) -> Optional[Tuple[str, str, float]]:
    """
    Find the single best fuzzy title match.

    Args:
        target_title: Title to search for.
        similarity_threshold: Minimum similarity for a match. Defaults to 0.9.

    Returns:
        Optional[Tuple[str, str, float]]: ``(title, content, similarity)`` of
        the highest-scoring match, or None when nothing reaches the
        threshold or an error occurs.
    """
    try:
        matches = fuzzy_find_memory_by_title(target_title, similarity_threshold)
        if not matches:
            logger.info(f"未找到相似度 >= {similarity_threshold} 的记忆")
            return None
        # The list is already sorted by descending similarity.
        return matches[0]
    except Exception as e:
        logger.error(f"查找最佳匹配记忆时出错: {e}")
        return None


def check_title_exists_fuzzy(target_title: str, similarity_threshold: float = 0.9) -> bool:
    """
    Check whether a similar title already exists (fuzzy match).

    Args:
        target_title: Title to check.
        similarity_threshold: Minimum similarity for a match. Defaults to 0.9;
            a high value reduces false positives.

    Returns:
        bool: True if at least one stored title reaches the threshold,
        otherwise False (also False when an error occurs).
    """
    try:
        matches = fuzzy_find_memory_by_title(target_title, similarity_threshold)
        exists = len(matches) > 0

        if exists:
            logger.info(f"发现相似标题: '{matches[0][0]}' (相似度: {matches[0][2]:.3f})")
        else:
            logger.debug("未发现相似标题")

        return exists
    except Exception as e:
        logger.error(f"检查标题是否存在时出错: {e}")
        return False


def get_memories_by_chat_id_weighted(target_chat_id: str, same_chat_weight: float = 0.95, other_chat_weight: float = 0.05) -> List[Tuple[str, str, str]]:
    """
    Select memories for a chat, plus a small random sample from other chats.

    All titled, unlocked memories of ``target_chat_id`` are always included.
    If other chats have eligible memories, a random sample of them is added
    whose size is ``int(total_same * other_chat_weight / same_chat_weight)``,
    clamped to at least 1 and at most the number available.

    Args:
        target_chat_id: Chat id whose memories are preferred.
        same_chat_weight: Weight of same-chat memories. Defaults to 0.95.
        other_chat_weight: Weight of other-chat memories. Defaults to 0.05.

    Returns:
        List[Tuple[str, str, str]]: Selected memories as
        ``(title, content, chat_id)``. An empty list when the target chat has
        no eligible memories or when an error occurs (logged). One such error
        is a zero ``same_chat_weight`` while other-chat memories exist.
    """
    try:
        same_chat_memories = []
        other_chat_memories = []

        for memory in MemoryChestModel.select():
            # Untitled and locked memories are never eligible.
            if not memory.title or memory.locked:
                continue
            record = (memory.title, memory.content, memory.chat_id)
            if memory.chat_id == target_chat_id:
                same_chat_memories.append(record)
            else:
                other_chat_memories.append(record)

        if not same_chat_memories:
            logger.warning(f"未找到chat_id为 '{target_chat_id}' 的记忆")
            return []

        total_same = len(same_chat_memories)
        total_other = len(other_chat_memories)

        if total_other > 0:
            # Size the other-chat sample by the weight ratio, keeping it
            # between 1 and the number of other-chat memories available.
            other_sample_count = max(1, min(total_other, int(total_same * other_chat_weight / same_chat_weight)))
        else:
            other_sample_count = 0

        # Same-chat memories are all kept; only other-chat ones are sampled.
        selected_memories = list(same_chat_memories)

        if other_sample_count > 0 and total_other > 0:
            other_selected = random.sample(other_chat_memories, min(other_sample_count, total_other))
            selected_memories.extend(other_selected)

        logger.info(f"加权抽样结果: 同chat_id记忆 {len(same_chat_memories)} 条,其他chat_id记忆 {min(other_sample_count, total_other)} 条")

        return selected_memories
    except Exception as e:
        logger.error(f"按chat_id加权抽样记忆时出错: {e}")
        return []


def get_memory_titles_by_chat_id_weighted(target_chat_id: str, same_chat_weight: float = 0.95, other_chat_weight: float = 0.05) -> List[str]:
    """
    Return only the titles of the chat-id-weighted memory selection.

    Delegates to ``get_memories_by_chat_id_weighted`` and keeps the first
    element (the title) of each selected tuple.

    Args:
        target_chat_id: Chat id whose memories are preferred.
        same_chat_weight: Weight of same-chat memories. Defaults to 0.95.
        other_chat_weight: Weight of other-chat memories. Defaults to 0.05.

    Returns:
        List[str]: Titles of the selected memories; an empty list if an
        error occurs (logged).
    """
    try:
        memories = get_memories_by_chat_id_weighted(target_chat_id, same_chat_weight, other_chat_weight)
        return [memory[0] for memory in memories]
    except Exception as e:
        logger.error(f"按chat_id加权抽样记忆标题时出错: {e}")
        return []


def find_most_similar_memory_by_chat_id(target_title: str, target_chat_id: str, similarity_threshold: float = 0.5) -> Optional[Tuple[str, str, float]]:
    """
    Find the most similar memory within a single chat.

    Only titled, unlocked memories of ``target_chat_id`` are considered, and
    a memory whose stripped title equals the stripped target title is
    skipped so that a title never matches itself.

    Args:
        target_title: Title to compare against.
        target_chat_id: Chat id to search in.
        similarity_threshold: Minimum similarity for a match. Defaults to 0.5.

    Returns:
        Optional[Tuple[str, str, float]]: ``(title, content, similarity)`` of
        the best candidate if its score reaches the threshold, otherwise
        None (also None when the chat has no eligible memories or an error
        occurs).
    """
    try:
        same_chat_memories = [
            (memory.title, memory.content)
            for memory in MemoryChestModel.select()
            if memory.title and not memory.locked and memory.chat_id == target_chat_id
        ]

        if not same_chat_memories:
            logger.warning(f"未找到chat_id为 '{target_chat_id}' 的记忆")
            return None

        best_match = None
        best_similarity = 0.0
        wanted = target_title.strip()

        for title, content in same_chat_memories:
            # Skip the target title itself.
            if title.strip() == wanted:
                continue

            similarity = calculate_similarity(target_title, title)
            # Strict comparison: the first candidate wins ties and a
            # zero-score candidate is never selected.
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = (title, content, similarity)

        if best_match and best_similarity >= similarity_threshold:
            logger.info(f"找到最相似记忆: '{best_match[0]}' (相似度: {best_similarity:.3f})")
            return best_match

        logger.info(f"未找到相似度 >= {similarity_threshold} 的记忆,最高相似度: {best_similarity:.3f}")
        return None
    except Exception as e:
        logger.error(f"查找最相似记忆时出错: {e}")
        return None