feat：复用jargon和expression的部分代码，代码层面合并，合并配置项

缓解bot重复学习自身表达的问题缓解单字黑话推断时消耗过高的问题修复count过高时推断过长的问题移除表达方式学习强度配置
2025-12-07 14:28:30 +08:00
parent 717b18be1e
commit 2e31fa2055
20 changed files with 587 additions and 469 deletions
--- a/src/bw_learner/jargon_explainer.py
+++ b/src/bw_learner/jargon_explainer.py
@@ -0,0 +1,360 @@
+import re
+import time
+from typing import List, Dict, Optional, Any
+
+from src.common.logger import get_logger
+from src.common.database.database_model import Jargon
+from src.llm_models.utils_model import LLMRequest
+from src.config.config import model_config, global_config
+from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
+from src.bw_learner.jargon_miner import search_jargon
+from src.bw_learner.learner_utils import is_bot_message, contains_bot_self_name, parse_chat_id_list, chat_id_list_contains
+
+logger = get_logger("jargon")
+
+
+def _init_explainer_prompts() -> None:
+    """初始化黑话解释器相关的prompt"""
+    # Prompt：概括黑话解释结果
+    summarize_prompt_str = """上下文聊天内容:
+{chat_context}
+
+在上下文中提取到的黑话及其含义:
+{jargon_explanations}
+
+请根据上述信息，对黑话解释进行概括和整理。
+- 如果上下文中有黑话出现，请简要说明这些黑话在上下文中的使用情况
+- 将所有黑话解释整理成简洁、易读的一段话
+- 输出格式要自然，适合作为回复参考信息
+请输出概括后的黑话解释（直接输出一段平文本，不要标题，无特殊格式或markdown格式，不要使用JSON格式）：
+"""
+    Prompt(summarize_prompt_str, "jargon_explainer_summarize_prompt")
+
+
+_init_explainer_prompts()
+
+
+class JargonExplainer:
+    """黑话解释器，用于在回复前识别和解释上下文中的黑话"""
+
+    def __init__(self, chat_id: str) -> None:
+        self.chat_id = chat_id
+        self.llm = LLMRequest(
+            model_set=model_config.model_task_config.utils,
+            request_type="jargon.explain",
+        )
+
+    def match_jargon_from_messages(self, messages: List[Any]) -> List[Dict[str, str]]:
+        """
+        通过直接匹配数据库中的jargon字符串来提取黑话
+
+        Args:
+            messages: 消息列表
+
+        Returns:
+            List[Dict[str, str]]: 提取到的黑话列表，每个元素包含content
+        """
+        start_time = time.time()
+
+        if not messages:
+            return []
+
+        # 收集所有消息的文本内容
+        message_texts: List[str] = []
+        for msg in messages:
+            # 跳过机器人自己的消息
+            if is_bot_message(msg):
+                continue
+
+            msg_text = (
+                getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or ""
+            ).strip()
+            if msg_text:
+                message_texts.append(msg_text)
+
+        if not message_texts:
+            return []
+
+        # 合并所有消息文本
+        combined_text = " ".join(message_texts)
+
+        # 查询所有有meaning的jargon记录
+        query = Jargon.select().where((Jargon.meaning.is_null(False)) & (Jargon.meaning != ""))
+
+        # 根据all_global配置决定查询逻辑
+        if global_config.expression.all_global_jargon:
+            # 开启all_global：只查询is_global=True的记录
+            query = query.where(Jargon.is_global)
+        else:
+            # 关闭all_global：查询is_global=True或chat_id列表包含当前chat_id的记录
+            # 这里先查询所有，然后在Python层面过滤
+            pass
+
+        # 按count降序排序，优先匹配出现频率高的
+        query = query.order_by(Jargon.count.desc())
+
+        # 执行查询并匹配
+        matched_jargon: Dict[str, Dict[str, str]] = {}
+        query_time = time.time()
+
+        for jargon in query:
+            content = jargon.content or ""
+            if not content or not content.strip():
+                continue
+
+            # 跳过包含机器人昵称的词条
+            if contains_bot_self_name(content):
+                continue
+
+            # 检查chat_id（如果all_global=False）
+            if not global_config.expression.all_global_jargon:
+                if jargon.is_global:
+                    # 全局黑话，包含
+                    pass
+                else:
+                    # 检查chat_id列表是否包含当前chat_id
+                    chat_id_list = parse_chat_id_list(jargon.chat_id)
+                    if not chat_id_list_contains(chat_id_list, self.chat_id):
+                        continue
+
+            # 在文本中查找匹配（大小写不敏感）
+            pattern = re.escape(content)
+            # 使用单词边界或中文字符边界来匹配，避免部分匹配
+            # 对于中文，使用Unicode字符类；对于英文，使用单词边界
+            if re.search(r"[\u4e00-\u9fff]", content):
+                # 包含中文，使用更宽松的匹配
+                search_pattern = pattern
+            else:
+                # 纯英文/数字，使用单词边界
+                search_pattern = r"\b" + pattern + r"\b"
+
+            if re.search(search_pattern, combined_text, re.IGNORECASE):
+                # 找到匹配，记录（去重）
+                if content not in matched_jargon:
+                    matched_jargon[content] = {"content": content}
+
+        match_time = time.time()
+        total_time = match_time - start_time
+        query_duration = query_time - start_time
+        match_duration = match_time - query_time
+
+        logger.debug(
+            f"黑话匹配完成: 查询耗时 {query_duration:.3f}s, 匹配耗时 {match_duration:.3f}s, "
+            f"总耗时 {total_time:.3f}s, 匹配到 {len(matched_jargon)} 个黑话"
+        )
+
+        return list(matched_jargon.values())
+
+    async def explain_jargon(self, messages: List[Any], chat_context: str) -> Optional[str]:
+        """
+        解释上下文中的黑话
+
+        Args:
+            messages: 消息列表
+            chat_context: 聊天上下文的文本表示
+
+        Returns:
+            Optional[str]: 黑话解释的概括文本，如果没有黑话则返回None
+        """
+        if not messages:
+            return None
+
+        # 直接匹配方式：从数据库中查询jargon并在消息中匹配
+        jargon_entries = self.match_jargon_from_messages(messages)
+
+        if not jargon_entries:
+            return None
+
+        # 去重（按content）
+        unique_jargon: Dict[str, Dict[str, str]] = {}
+        for entry in jargon_entries:
+            content = entry["content"]
+            if content not in unique_jargon:
+                unique_jargon[content] = entry
+
+        jargon_list = list(unique_jargon.values())
+        logger.info(f"从上下文中提取到 {len(jargon_list)} 个黑话: {[j['content'] for j in jargon_list]}")
+
+        # 查询每个黑话的含义
+        jargon_explanations: List[str] = []
+        for entry in jargon_list:
+            content = entry["content"]
+
+            # 根据是否开启全局黑话，决定查询方式
+            if global_config.expression.all_global_jargon:
+                # 开启全局黑话：查询所有is_global=True的记录
+                results = search_jargon(
+                    keyword=content,
+                    chat_id=None,  # 不指定chat_id，查询全局黑话
+                    limit=1,
+                    case_sensitive=False,
+                    fuzzy=False,  # 精确匹配
+                )
+            else:
+                # 关闭全局黑话：优先查询当前聊天或全局的黑话
+                results = search_jargon(
+                    keyword=content,
+                    chat_id=self.chat_id,
+                    limit=1,
+                    case_sensitive=False,
+                    fuzzy=False,  # 精确匹配
+                )
+
+            if results and len(results) > 0:
+                meaning = results[0].get("meaning", "").strip()
+                if meaning:
+                    jargon_explanations.append(f"- {content}: {meaning}")
+                else:
+                    logger.info(f"黑话 {content} 没有找到含义")
+            else:
+                logger.info(f"黑话 {content} 未在数据库中找到")
+
+        if not jargon_explanations:
+            logger.info("没有找到任何黑话的含义，跳过解释")
+            return None
+
+        # 拼接所有黑话解释
+        explanations_text = "\n".join(jargon_explanations)
+
+        # 使用LLM概括黑话解释
+        summarize_prompt = await global_prompt_manager.format_prompt(
+            "jargon_explainer_summarize_prompt",
+            chat_context=chat_context,
+            jargon_explanations=explanations_text,
+        )
+
+        summary, _ = await self.llm.generate_response_async(summarize_prompt, temperature=0.3)
+        if not summary:
+            # 如果LLM概括失败，直接返回原始解释
+            return f"上下文中的黑话解释：\n{explanations_text}"
+
+        summary = summary.strip()
+        if not summary:
+            return f"上下文中的黑话解释：\n{explanations_text}"
+
+        return summary
+
+
+async def explain_jargon_in_context(chat_id: str, messages: List[Any], chat_context: str) -> Optional[str]:
+    """
+    解释上下文中的黑话（便捷函数）
+
+    Args:
+        chat_id: 聊天ID
+        messages: 消息列表
+        chat_context: 聊天上下文的文本表示
+
+    Returns:
+        Optional[str]: 黑话解释的概括文本，如果没有黑话则返回None
+    """
+    explainer = JargonExplainer(chat_id)
+    return await explainer.explain_jargon(messages, chat_context)
+
+
+def match_jargon_from_text(chat_text: str, chat_id: str) -> List[str]:
+    """直接在聊天文本中匹配已知的jargon，返回出现过的黑话列表
+
+    Args:
+        chat_text: 要匹配的聊天文本
+        chat_id: 聊天ID
+
+    Returns:
+        List[str]: 匹配到的黑话列表
+    """
+    if not chat_text or not chat_text.strip():
+        return []
+
+    query = Jargon.select().where((Jargon.meaning.is_null(False)) & (Jargon.meaning != ""))
+    if global_config.expression.all_global_jargon:
+        query = query.where(Jargon.is_global)
+
+    query = query.order_by(Jargon.count.desc())
+
+    matched: Dict[str, None] = {}
+
+    for jargon in query:
+        content = (jargon.content or "").strip()
+        if not content:
+            continue
+
+        if not global_config.expression.all_global_jargon and not jargon.is_global:
+            chat_id_list = parse_chat_id_list(jargon.chat_id)
+            if not chat_id_list_contains(chat_id_list, chat_id):
+                continue
+
+        pattern = re.escape(content)
+        if re.search(r"[\u4e00-\u9fff]", content):
+            search_pattern = pattern
+        else:
+            search_pattern = r"\b" + pattern + r"\b"
+
+        if re.search(search_pattern, chat_text, re.IGNORECASE):
+            matched[content] = None
+
+    logger.info(f"匹配到 {len(matched)} 个黑话")
+
+    return list(matched.keys())
+
+
+async def retrieve_concepts_with_jargon(concepts: List[str], chat_id: str) -> str:
+    """对概念列表进行jargon检索
+
+    Args:
+        concepts: 概念列表
+        chat_id: 聊天ID
+
+    Returns:
+        str: 检索结果字符串
+    """
+    if not concepts:
+        return ""
+
+    results = []
+    exact_matches = []  # 收集所有精确匹配的概念
+    for concept in concepts:
+        concept = concept.strip()
+        if not concept:
+            continue
+
+        # 先尝试精确匹配
+        jargon_results = search_jargon(keyword=concept, chat_id=chat_id, limit=10, case_sensitive=False, fuzzy=False)
+
+        is_fuzzy_match = False
+
+        # 如果精确匹配未找到，尝试模糊搜索
+        if not jargon_results:
+            jargon_results = search_jargon(keyword=concept, chat_id=chat_id, limit=10, case_sensitive=False, fuzzy=True)
+            is_fuzzy_match = True
+
+        if jargon_results:
+            # 找到结果
+            if is_fuzzy_match:
+                # 模糊匹配
+                output_parts = [f"未精确匹配到'{concept}'"]
+                for result in jargon_results:
+                    found_content = result.get("content", "").strip()
+                    meaning = result.get("meaning", "").strip()
+                    if found_content and meaning:
+                        output_parts.append(f"找到 '{found_content}' 的含义为：{meaning}")
+                results.append("，".join(output_parts))
+                logger.info(f"在jargon库中找到匹配（模糊搜索）: {concept}，找到{len(jargon_results)}条结果")
+            else:
+                # 精确匹配
+                output_parts = []
+                for result in jargon_results:
+                    meaning = result.get("meaning", "").strip()
+                    if meaning:
+                        output_parts.append(f"'{concept}' 为黑话或者网络简写，含义为：{meaning}")
+                results.append("；".join(output_parts) if len(output_parts) > 1 else output_parts[0])
+                exact_matches.append(concept)  # 收集精确匹配的概念，稍后统一打印
+        else:
+            # 未找到，不返回占位信息，只记录日志
+            logger.info(f"在jargon库中未找到匹配: {concept}")
+
+    # 合并所有精确匹配的日志
+    if exact_matches:
+        logger.info(f"找到黑话: {', '.join(exact_matches)}，共找到{len(exact_matches)}条结果")
+
+    if results:
+        return "【概念检索结果】\n" + "\n".join(results) + "\n"
+    return ""