import json
import asyncio
import random
from collections import OrderedDict
from typing import List, Dict, Optional, Callable

from json_repair import repair_json

# NOTE(review): every query below uses peewee idioms (Jargon.select().where(...),
# Jargon.create(...), fn.LOWER(col).contains(...)). Importing `func` from
# sqlalchemy as `fn` looks wrong for a peewee model — confirm whether this
# should be `from peewee import fn`.
from sqlalchemy import func as fn

from src.common.logger import get_logger
from src.common.database.database_model import Jargon
from src.llm_models.utils_model import LLMRequest
from src.config.config import model_config, global_config
from src.chat.message_receive.chat_manager import chat_manager as _chat_manager
from src.prompt.prompt_manager import prompt_manager
from src.bw_learner.learner_utils import (
    parse_chat_id_list,
    chat_id_list_contains,
    update_chat_id_list,
)

logger = get_logger("jargon")


def _is_single_char_jargon(content: str) -> bool:
    """Return True if *content* is a single-character jargon candidate.

    Single CJK ideographs, single ASCII letters and single digits are
    considered too ambiguous to track, so callers use this to skip them.

    Args:
        content: Candidate jargon text.

    Returns:
        bool: True when content is exactly one CJK char, ASCII letter or digit.
    """
    if not content or len(content) != 1:
        return False
    char = content[0]
    return (
        "\u4e00" <= char <= "\u9fff"  # CJK unified ideograph
        or "a" <= char <= "z"  # lowercase ASCII letter
        or "A" <= char <= "Z"  # uppercase ASCII letter
        or "0" <= char <= "9"  # ASCII digit
    )


def _parse_raw_content_list(raw) -> list:
    """Normalize a stored raw_content value into a plain list.

    The column may hold a JSON-encoded list, an already-decoded list, a bare
    scalar, or unparseable legacy text; all of these are coerced to a list
    (possibly empty) without raising.
    """
    if not raw:
        return []
    try:
        parsed = json.loads(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, list):
            return parsed
        return [parsed] if parsed else []
    except (json.JSONDecodeError, TypeError):
        # Legacy / malformed value: keep it as a one-element list.
        return [raw] if raw else []


def _should_infer_meaning(jargon_obj: Jargon) -> bool:
    """Decide whether a meaning inference should run for this jargon entry.

    Inference triggers when ``count`` reaches one of the thresholds
    2, 4, 8, 12, 24, 60, 100 (see ``thresholds`` below).  ``count`` must also
    exceed ``last_inference_count`` so that a process restart does not
    re-trigger the same threshold.  Once ``is_complete`` is set, no further
    inference ever runs.
    """
    # All inference rounds finished — never infer again.
    if jargon_obj.is_complete:
        return False

    count = jargon_obj.count or 0
    last_inference = jargon_obj.last_inference_count or 0

    # Threshold list: 2, 4, 8, 12, 24, 60, 100. Keep in sync with the
    # count special-cases (24, 60, 100) inside JargonMiner.infer_meaning.
    thresholds = [2, 4, 8, 12, 24, 60, 100]
    if count < thresholds[0]:
        return False
    # Nothing new since the last inference decision.
    if count <= last_inference:
        return False

    # Find the first threshold above the last inference point.
    next_threshold = None
    for threshold in thresholds:
        if threshold > last_inference:
            next_threshold = threshold
            break
    # Past the final threshold (100): no further inference expected.
    if next_threshold is None:
        return False

    # Infer once count has reached (or passed) that next threshold.
    return count >= next_threshold


class JargonMiner:
    """Per-chat collector that stores jargon candidates and infers meanings.

    One miner is bound to a single ``chat_id``; it persists candidates to the
    ``Jargon`` table, keeps a small LRU cache of recently seen entries, and
    asynchronously triggers LLM-based meaning inference at count thresholds.
    """

    def __init__(self, chat_id: str) -> None:
        self.chat_id = chat_id
        # Extraction model (kept for callers that run extraction through us).
        self.llm = LLMRequest(
            model_set=model_config.model_task_config.utils,
            request_type="jargon.extract",
        )
        # Inference model used by infer_meaning().
        self.llm_inference = LLMRequest(
            model_set=model_config.model_task_config.utils,
            request_type="jargon.inference",
        )
        # Resolve the stream display name once, so log lines don't have to.
        stream_name = _chat_manager.get_session_name(self.chat_id)
        self.stream_name = stream_name or self.chat_id
        # LRU cache of recently extracted jargon contents.
        self.cache_limit = 50
        self.cache: OrderedDict[str, None] = OrderedDict()
        # Guards extraction against concurrent execution.
        self._extraction_lock = asyncio.Lock()

    def _add_to_cache(self, content: str) -> None:
        """Add an extracted jargon string to the LRU cache."""
        if not content:
            return
        key = content.strip()
        if not key:
            return
        # Single-character jargon (one CJK char, letter or digit) is not cached.
        if _is_single_char_jargon(key):
            return
        if key in self.cache:
            self.cache.move_to_end(key)
        else:
            self.cache[key] = None
            # Evict the least recently used entry when over the limit.
            if len(self.cache) > self.cache_limit:
                self.cache.popitem(last=False)

    def get_cached_jargons(self) -> List[str]:
        """Return all cached jargon strings, oldest first."""
        return list(self.cache.keys())

    async def _infer_meaning_by_id(self, jargon_id: int) -> None:
        """Reload the jargon row by primary key and run inference on it."""
        try:
            jargon_obj = Jargon.get_by_id(jargon_id)
            # Re-check is_complete: it may have been set while this task was queued.
            if jargon_obj.is_complete:
                logger.debug(f"jargon {jargon_obj.content} 已完成所有推断,跳过")
                return
            await self.infer_meaning(jargon_obj)
        except Exception as e:
            logger.error(f"通过ID推断jargon失败: {e}")

    @staticmethod
    def _parse_json_response(response: str) -> Optional[dict]:
        """Parse an LLM response into a dict, repairing malformed JSON if needed.

        Returns None when the parsed value is not a dict.
        Raises on input that cannot be parsed even after repair.
        """
        resp = response.strip()
        if resp.startswith("{") and resp.endswith("}"):
            parsed = json.loads(resp)
        else:
            repaired = repair_json(resp)
            parsed = json.loads(repaired) if isinstance(repaired, str) else repaired
        return parsed if isinstance(parsed, dict) else None

    async def infer_meaning(self, jargon_obj: Jargon) -> None:
        """Run the three-step meaning inference for one jargon row.

        Step 1 infers the meaning from the stored usage contexts, step 2 from
        the bare content only, and step 3 asks the LLM to compare both: if the
        two inferences agree, the term is judged NOT to be jargon (its meaning
        is guessable without context).  Results, ``last_inference_count`` and
        ``is_complete`` (at count >= 100) are persisted on the row.
        """
        try:
            content = jargon_obj.content
            # Normalize the stored raw_content column into a list of contexts.
            raw_content_list = _parse_raw_content_list(jargon_obj.raw_content or "")
            if not raw_content_list:
                logger.warning(f"jargon {content} 没有raw_content,跳过推断")
                return

            current_count = jargon_obj.count or 0
            previous_meaning = jargon_obj.meaning or ""

            # At counts 24 and 60, randomly drop half of the stored contexts to
            # keep the prompt size bounded (at least one context is kept).
            if current_count in [24, 60] and len(raw_content_list) > 1:
                keep_count = max(1, len(raw_content_list) // 2)
                raw_content_list = random.sample(raw_content_list, keep_count)
                logger.info(
                    f"jargon {content} count={current_count},随机移除后剩余 {len(raw_content_list)} 个raw_content项目"
                )

            # --- Step 1: infer from content + usage contexts -------------------
            raw_content_text = "\n".join(raw_content_list)

            # At counts 24, 60 and 100, feed the previous inference back into the
            # prompt as reference material.
            previous_meaning_section = ""
            previous_meaning_instruction = ""
            if current_count in [24, 60, 100] and previous_meaning:
                previous_meaning_section = f"\n**上一次推断的含义(仅供参考)**\n{previous_meaning}"
                previous_meaning_instruction = (
                    "- 请参考上一次推断的含义,结合新的上下文信息,给出更准确或更新的推断结果"
                )

            prompt1_template = prompt_manager.get_prompt("jargon_inference_with_context")
            prompt1_template.add_context("bot_name", global_config.bot.nickname)
            prompt1_template.add_context("content", str(content))
            prompt1_template.add_context("raw_content_list", raw_content_text)
            prompt1_template.add_context("previous_meaning_section", previous_meaning_section)
            prompt1_template.add_context("previous_meaning_instruction", previous_meaning_instruction)
            prompt1 = await prompt_manager.render_prompt(prompt1_template)

            response1, _ = await self.llm_inference.generate_response_async(prompt1, temperature=0.3)
            if not response1:
                logger.warning(f"jargon {content} 推断1失败:无响应")
                return

            try:
                inference1 = self._parse_json_response(response1)
                if inference1 is None:
                    logger.warning(f"jargon {content} 推断1结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 推断1解析失败: {e}")
                return

            # "no_info" or an empty meaning means the model could not infer yet.
            no_info = inference1.get("no_info", False)
            raw_meaning = inference1.get("meaning", "")
            # The LLM may return a non-string here; treat that as "no meaning".
            meaning1 = raw_meaning.strip() if isinstance(raw_meaning, str) else ""
            if no_info or not meaning1:
                logger.info(f"jargon {content} 推断1表示信息不足无法推断,放弃本次推断,待下次更新")
                # Record this attempt so the same threshold is not retried.
                jargon_obj.last_inference_count = jargon_obj.count or 0
                jargon_obj.save()
                return

            # --- Step 2: infer from the bare content only ----------------------
            prompt2_template = prompt_manager.get_prompt("jargon_inference_content_only")
            prompt2_template.add_context("content", str(content))
            prompt2 = await prompt_manager.render_prompt(prompt2_template)

            response2, _ = await self.llm_inference.generate_response_async(prompt2, temperature=0.3)
            if not response2:
                logger.warning(f"jargon {content} 推断2失败:无响应")
                return

            try:
                inference2 = self._parse_json_response(response2)
                if inference2 is None:
                    logger.warning(f"jargon {content} 推断2结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 推断2解析失败: {e}")
                return

            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon {content} 推断2提示词: {prompt2}")
                logger.info(f"jargon {content} 推断2结果: {response2}")
                logger.info(f"jargon {content} 推断1提示词: {prompt1}")
                logger.info(f"jargon {content} 推断1结果: {response1}")
            else:
                logger.debug(f"jargon {content} 推断2提示词: {prompt2}")
                logger.debug(f"jargon {content} 推断2结果: {response2}")
                logger.debug(f"jargon {content} 推断1提示词: {prompt1}")
                logger.debug(f"jargon {content} 推断1结果: {response1}")

            # --- Step 3: compare the two inferences ----------------------------
            prompt3_template = prompt_manager.get_prompt("jargon_compare_inference")
            prompt3_template.add_context("inference1", json.dumps(inference1, ensure_ascii=False))
            prompt3_template.add_context("inference2", json.dumps(inference2, ensure_ascii=False))
            prompt3 = await prompt_manager.render_prompt(prompt3_template)

            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon {content} 比较提示词: {prompt3}")

            response3, _ = await self.llm_inference.generate_response_async(prompt3, temperature=0.3)
            if not response3:
                logger.warning(f"jargon {content} 比较失败:无响应")
                return

            try:
                comparison = self._parse_json_response(response3)
                if comparison is None:
                    logger.warning(f"jargon {content} 比较结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 比较解析失败: {e}")
                return

            # Similar inferences -> meaning is guessable without context -> not jargon.
            is_similar = comparison.get("is_similar", False)
            is_jargon = not is_similar

            jargon_obj.is_jargon = is_jargon
            if is_jargon:
                # Prefer the context-based inference (step 1) as the stored meaning.
                jargon_obj.meaning = inference1.get("meaning", "")
            else:
                # Not jargon: clear any previously stored meaning.
                jargon_obj.meaning = ""

            # Record the count at which this decision was made (restart-safe).
            jargon_obj.last_inference_count = jargon_obj.count or 0
            # At count >= 100 mark the entry complete; no more inference rounds.
            if (jargon_obj.count or 0) >= 100:
                jargon_obj.is_complete = True
            jargon_obj.save()

            logger.debug(
                f"jargon {content} 推断完成: is_jargon={is_jargon}, meaning={jargon_obj.meaning}, last_inference_count={jargon_obj.last_inference_count}, is_complete={jargon_obj.is_complete}"
            )

            # Always report the outcome in a human-readable form.
            if is_jargon:
                meaning = jargon_obj.meaning or "无详细说明"
                if jargon_obj.is_global:
                    logger.info(f"[黑话]{content}的含义是 {meaning}")
                else:
                    logger.info(f"[{self.stream_name}]{content}的含义是 {meaning}")
            else:
                logger.info(f"[{self.stream_name}]{content} 不是黑话")
        except Exception as e:
            logger.error(f"jargon推断失败: {e}")
            import traceback

            traceback.print_exc()

    async def process_extracted_entries(
        self, entries: List[Dict[str, List[str]]], person_name_filter: Optional[Callable[[str], bool]] = None
    ) -> None:
        """Persist jargon entries already extracted elsewhere (e.g. expression_learner).

        Entries are merged by content, deduplicated, then either update an
        existing ``Jargon`` row (incrementing its count and merging contexts)
        or create a new one.  Threshold-crossing rows get an async inference
        task scheduled.

        Args:
            entries: List of dicts shaped like ``{"content": "...", "raw_content": [...]}``.
            person_name_filter: Optional predicate; entries whose content it
                matches (person names) are skipped.
        """
        if not entries:
            return
        try:
            # Merge raw_content lists of entries sharing the same content,
            # preserving first-seen order.
            merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
            for entry in entries:
                content_key = entry["content"]
                # Skip entries that look like person names.
                if person_name_filter and person_name_filter(content_key):
                    logger.info(f"process_extracted_entries 跳过包含人物名称的黑话: {content_key}")
                    continue
                raw_list = entry.get("raw_content", []) or []
                if content_key in merged_entries:
                    merged_entries[content_key]["raw_content"].extend(raw_list)
                else:
                    merged_entries[content_key] = {
                        "content": content_key,
                        "raw_content": list(raw_list),
                    }

            # Deduplicate each merged raw_content list (order-preserving).
            uniq_entries = []
            for merged_entry in merged_entries.values():
                raw_content_list = merged_entry["raw_content"]
                if raw_content_list:
                    merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
                uniq_entries.append(merged_entry)

            saved = 0
            updated = 0
            for entry in uniq_entries:
                content = entry["content"]
                raw_content_list = entry["raw_content"]  # already a list
                try:
                    # All rows whose content matches.
                    query = Jargon.select().where(Jargon.content == content)

                    # Pick the row this chat may update.
                    matched_obj = None
                    for obj in query:
                        if global_config.expression.all_global_jargon:
                            # all_global on: any content match is acceptable.
                            matched_obj = obj
                            break
                        # all_global off: row must already track this chat_id.
                        chat_id_list = parse_chat_id_list(obj.chat_id)
                        if chat_id_list_contains(chat_id_list, self.chat_id):
                            matched_obj = obj
                            break

                    if matched_obj:
                        obj = matched_obj
                        try:
                            obj.count = (obj.count or 0) + 1
                        except Exception:
                            # Defensive: a corrupt count column resets to 1.
                            obj.count = 1

                        # Merge stored contexts with the new ones, deduplicated.
                        existing_raw_content = _parse_raw_content_list(obj.raw_content)
                        merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
                        obj.raw_content = json.dumps(merged_list, ensure_ascii=False)

                        # Bump this chat's counter in the per-chat list.
                        chat_id_list = parse_chat_id_list(obj.chat_id)
                        updated_chat_id_list = update_chat_id_list(chat_id_list, self.chat_id, increment=1)
                        obj.chat_id = json.dumps(updated_chat_id_list, ensure_ascii=False)

                        # all_global on: force the row global; off: leave is_global as-is.
                        if global_config.expression.all_global_jargon:
                            obj.is_global = True
                        obj.save()

                        # Schedule inference if a count threshold was crossed;
                        # the task reloads the row by id so data stays fresh.
                        if _should_infer_meaning(obj):
                            jargon_id = obj.id
                            asyncio.create_task(self._infer_meaning_by_id(jargon_id))
                        updated += 1
                    else:
                        # No matching row: create one. all_global decides the
                        # default global flag for new rows.
                        is_global_new = bool(global_config.expression.all_global_jargon)
                        # New-format chat_id list: [[chat_id, count]].
                        chat_id_json = json.dumps([[self.chat_id, 1]], ensure_ascii=False)
                        Jargon.create(
                            content=content,
                            raw_content=json.dumps(raw_content_list, ensure_ascii=False),
                            chat_id=chat_id_json,
                            is_global=is_global_new,
                            count=1,
                        )
                        saved += 1
                except Exception as e:
                    logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
                    continue
                finally:
                    # Cache the content whether or not persistence succeeded.
                    self._add_to_cache(content)

            # Always report what was extracted, in a human-readable form.
            if uniq_entries:
                jargon_str = ",".join(entry["content"] for entry in uniq_entries)
                logger.info(f"[{self.stream_name}]疑似黑话: {jargon_str}")
            if saved or updated:
                logger.debug(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,chat_id={self.chat_id}")
        except Exception as e:
            logger.error(f"处理已提取的黑话条目失败: {e}")


class JargonMinerManager:
    """Registry handing out one JargonMiner per chat_id."""

    def __init__(self) -> None:
        self._miners: dict[str, JargonMiner] = {}

    def get_miner(self, chat_id: str) -> JargonMiner:
        """Return the miner for *chat_id*, creating it on first use."""
        if chat_id not in self._miners:
            self._miners[chat_id] = JargonMiner(chat_id)
        return self._miners[chat_id]


miner_manager = JargonMinerManager()


def search_jargon(
    keyword: str, chat_id: Optional[str] = None, limit: int = 10, case_sensitive: bool = False, fuzzy: bool = True
) -> List[Dict[str, str]]:
    """Search stored jargon, optionally case-insensitive and fuzzy.

    Args:
        keyword: Search keyword (leading/trailing whitespace is ignored).
        chat_id: Optional chat ID.
            - With all_global on: ignored; all is_global=True rows are searched.
            - With all_global off: results are restricted to rows that are
              global or whose chat_id list contains this chat.
        limit: Maximum number of results, default 10.
        case_sensitive: Match case exactly when True (default False).
        fuzzy: Use substring (LIKE) matching when True (default exact when False).

    Returns:
        List[Dict[str, str]]: Dicts with "content" and "meaning" keys; rows
        without a meaning are excluded.
    """
    if not keyword or not keyword.strip():
        return []

    keyword = keyword.strip()

    # Select full rows; chat_id and meaning filtering happen in Python below.
    query = Jargon.select()

    # Build the content-match condition.
    if case_sensitive:
        if fuzzy:
            search_condition = Jargon.content.contains(keyword)
        else:
            search_condition = Jargon.content == keyword
    else:
        # Case-insensitive matching via LOWER() on both sides.
        if fuzzy:
            search_condition = fn.LOWER(Jargon.content).contains(keyword.lower())
        else:
            search_condition = fn.LOWER(Jargon.content) == keyword.lower()
    query = query.where(search_condition)

    if global_config.expression.all_global_jargon:
        # all_global on: only globally shared rows, regardless of chat_id.
        query = query.where(Jargon.is_global)
    # all_global off: chat_id filtering is done in Python so both old and new
    # chat_id storage formats are handled by parse_chat_id_list.

    # Highest-frequency entries first.
    query = query.order_by(Jargon.count.desc())
    # Over-fetch because the Python-side filters below may discard rows.
    query = query.limit(limit * 2)

    results = []
    for jargon in query:
        # Per-chat visibility check (only when all_global is off).
        if chat_id and not global_config.expression.all_global_jargon:
            chat_id_list = parse_chat_id_list(jargon.chat_id)
            if not jargon.is_global and not chat_id_list_contains(chat_id_list, chat_id):
                continue
        # Only rows with a non-empty meaning are returned.
        if not jargon.meaning or jargon.meaning.strip() == "":
            continue
        results.append({"content": jargon.content or "", "meaning": jargon.meaning or ""})
        if len(results) >= limit:
            break
    return results