test：合并express提取和Jargon提取

2025-12-07 16:24:13 +08:00
parent 2e31fa2055
commit b03e245817
5 changed files with 273 additions and 25 deletions
--- a/src/bw_learner/expression_learner.py
+++ b/src/bw_learner/expression_learner.py
@@ -3,7 +3,7 @@ import json
 import os
 import re
 import asyncio
-from typing import List, Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any, Dict
 from src.common.logger import get_logger
 from src.common.database.database_model import Expression
 from src.llm_models.utils_model import LLMRequest
@@ -13,7 +13,8 @@ from src.chat.utils.chat_message_builder import (
 )
 from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
 from src.chat.message_receive.chat_stream import get_chat_manager
-from src.bw_learner.learner_utils import filter_message_content, is_bot_message
+from src.bw_learner.learner_utils import filter_message_content, is_bot_message, build_context_paragraph, contains_bot_self_name
+from src.bw_learner.jargon_miner import miner_manager
 from json_repair import repair_json


@@ -24,7 +25,8 @@ logger = get_logger("expressor")

 def init_prompt() -> None:
    learn_style_prompt = """{chat_str}
-你的名字是{bot_name},现在请你请从上面这段群聊中用户的语言风格和说话方式
+你的名字是{bot_name},现在请你完成两个提取任务
+任务1：请从上面这段群聊中用户的语言风格和说话方式
 1. 只考虑文字，不要考虑表情包和图片
 2. 不要总结SELF的发言
 3. 不要涉及具体的人名，也不要涉及具体名词
@@ -33,19 +35,39 @@ def init_prompt() -> None:
 注意：总结成如下格式的规律，总结的内容要详细，但具有概括性：
 例如：当"AAAAA"时，可以"BBBBB", AAAAA代表某个场景，不超过20个字。BBBBB代表对应的语言风格，特定句式或表达方式，不超过20个字。

-请严格以 JSON 数组的形式输出结果，每个元素为一个对象，结构如下（注意字段名）：
+任务2：请从上面这段聊天内容中提取"可能是黑话"的候选项（黑话/俚语/网络缩写/口头禅）。
+- 必须为对话中真实出现过的短词或短语
+- 必须是你无法理解含义的词语，没有明确含义的词语，请不要选择有明确含义，或者含义清晰的词语
+- 排除：人名、@、表情包/图片中的内容、纯标点、常规功能词（如的、了、呢、啊等）
+- 每个词条长度建议 2-8 个字符（不强制），尽量短小
+- 请你提取出可能的黑话，最多30个黑话，请尽量提取所有
+
+黑话必须为以下几种类型：
+- 由字母构成的，汉语拼音首字母的简写词，例如：nb、yyds、xswl
+- 英文词语的缩写，用英文字母概括一个词汇或含义，例如：CPU、GPU、API
+- 中文词语的缩写，用几个汉字概括一个词汇或含义，例如：社死、内卷
+
+输出要求：
+将表达方式，语言风格和黑话以 JSON 数组输出，每个元素为一个对象，结构如下（注意字段名）：
+
 [
  {{"situation": "AAAAA", "style": "BBBBB", "source_id": "3"}},
  {{"situation": "CCCC", "style": "DDDD", "source_id": "7"}}
  {{"situation": "对某件事表示十分惊叹", "style": "使用 我嘞个xxxx", "source_id": "[消息编号]"}},
  {{"situation": "表示讽刺的赞同，不讲道理", "style": "对对对", "source_id": "[消息编号]"}},
  {{"situation": "当涉及游戏相关时，夸赞，略带戏谑意味", "style": "使用 这么强！", "source_id": "[消息编号]"}},
+  {{"content": "词条", "source_id": "12"}},
+  {{"content": "词条2", "source_id": "5"}}
 ]

 其中：
+表达方式条目：
 - situation：表示“在什么情境下”的简短概括（不超过20个字）
 - style：表示对应的语言风格或常用表达（不超过20个字）
 - source_id：该表达方式对应的“来源行编号”，即上方聊天记录中方括号里的数字（例如 [3]），请只输出数字本身，不要包含方括号
+黑话jargon条目：
+- content:表示黑话的内容
+- source_id：该黑话对应的“来源行编号”，即上方聊天记录中方括号里的数字（例如 [3]），请只输出数字本身，不要包含方括号

 现在请你输出 JSON：
 """
@@ -104,13 +126,25 @@ class ExpressionLearner:
            logger.error(f"学习表达方式失败,模型生成出错: {e}")
            return None

-        # 解析 LLM 返回的表达方式列表（包含来源行编号）
-        expressions: List[Tuple[str, str, str]] = self.parse_expression_response(response)
+        # 解析 LLM 返回的表达方式列表和黑话列表（包含来源行编号）
+        expressions: List[Tuple[str, str, str]]
+        jargon_entries: List[Tuple[str, str]]  # (content, source_id)
+        expressions, jargon_entries = self.parse_expression_response(response)
        expressions = self._filter_self_reference_styles(expressions)
+        
+        # 处理黑话条目，路由到 jargon_miner（即使没有表达方式也要处理黑话）
+        if jargon_entries:
+            await self._process_jargon_entries(jargon_entries, random_msg)
+        
+        # 如果没有表达方式，直接返回
        if not expressions:
            logger.info("过滤后没有可用的表达方式（style 与机器人名称重复）")
-            return None
-        # logger.debug(f"学习{type_str}的response: {response}")
+            return []
+        
+        logger.info(f"学习的prompt: {prompt}")
+        logger.info(f"学习的expressions: {expressions}")
+        logger.info(f"学习的jargon_entries: {jargon_entries}")
+        logger.info(f"学习的response: {response}")

        # 直接根据 source_id 在 random_msg 中溯源，获取 context
        filtered_expressions: List[Tuple[str, str, str]] = []  # (situation, style, context)
@@ -173,18 +207,24 @@ class ExpressionLearner:

        return learnt_expressions

-    def parse_expression_response(self, response: str) -> List[Tuple[str, str, str]]:
+    def parse_expression_response(self, response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
        """
-        解析 LLM 返回的表达风格总结 JSON，提取 (situation, style, source_id) 元组列表。
+        解析 LLM 返回的表达风格总结和黑话 JSON，提取两个列表。

        期望的 JSON 结构：
        [
-          {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},
+          {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},  // 表达方式
+          {"content": "词条", "source_id": "12"},  // 黑话
          ...
        ]
+
+        Returns:
+            Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
+                第一个列表是表达方式 (situation, style, source_id)
+                第二个列表是黑话 (content, source_id)
        """
        if not response:
-            return []
+            return [], []

        raw = response.strip()

@@ -200,7 +240,8 @@ class ExpressionLearner:
            raw = raw.strip()

        parsed = None
-        expressions: List[Tuple[str, str, str]] = []
+        expressions: List[Tuple[str, str, str]] = []  # (situation, style, source_id)
+        jargon_entries: List[Tuple[str, str]] = []  # (content, source_id)

        try:
            # 优先尝试直接解析
@@ -292,15 +333,23 @@ class ExpressionLearner:
        for item in parsed_list:
            if not isinstance(item, dict):
                continue
+            
+            # 检查是否是表达方式条目（有 situation 和 style）
            situation = str(item.get("situation", "")).strip()
            style = str(item.get("style", "")).strip()
            source_id = str(item.get("source_id", "")).strip()
-            if not situation or not style or not source_id:
-                # 三个字段必须同时存在
-                continue
-            expressions.append((situation, style, source_id))
+            
+            if situation and style and source_id:
+                # 表达方式条目
+                expressions.append((situation, style, source_id))
+            elif item.get("content"):
+                # 黑话条目（有 content 字段）
+                content = str(item.get("content", "")).strip()
+                source_id = str(item.get("source_id", "")).strip()
+                if content and source_id:
+                    jargon_entries.append((content, source_id))

-        return expressions
+        return expressions, jargon_entries

    def _filter_self_reference_styles(self, expressions: List[Tuple[str, str, str]]) -> List[Tuple[str, str, str]]:
        """
@@ -438,6 +487,66 @@ class ExpressionLearner:
            logger.error(f"概括表达情境失败: {e}")
        return None

+    async def _process_jargon_entries(self, jargon_entries: List[Tuple[str, str]], messages: List[Any]) -> None:
+        """
+        Filter jargon entries extracted by the expression learner and route them to this chat's jargon miner.
+        
+        Args:
+            jargon_entries: Jargon entries, each a (content, source_id) tuple; source_id is a 1-based line number.
+            messages: Message list that source_id indexes into; also used to build each entry's context.
+        """
+        if not jargon_entries or not messages:
+            return
+        
+        # Look up the jargon miner registered for this chat.
+        jargon_miner = miner_manager.get_miner(self.chat_id)
+        
+        # Build entry dicts; the format is meant to match the one used in jargon_miner.run_once.
+        entries: List[Dict[str, List[str]]] = []  # NOTE(review): "content" is stored as a str, not List[str]
+        
+        for content, source_id in jargon_entries:
+            content = content.strip()
+            if not content:
+                continue
+            
+            # Skip candidates that contain the bot's own nickname/alias.
+            if contains_bot_self_name(content):
+                logger.info(f"跳过包含机器人昵称/别名的黑话: {content}")
+                continue
+            
+            # Validate source_id. NOTE(review): str.isdigit() also accepts e.g. superscript digits that int() rejects.
+            source_id_str = (source_id or "").strip()
+            if not source_id_str.isdigit():
+                logger.warning(f"黑话条目 source_id 无效: content={content}, source_id={source_id_str}")
+                continue
+            
+            # build_anonymous_messages numbers lines from 1, so convert to a 0-based index into messages.
+            line_index = int(source_id_str) - 1
+            if line_index < 0 or line_index >= len(messages):
+                logger.warning(f"黑话条目 source_id 超出范围: content={content}, source_id={source_id_str}")
+                continue
+            
+            # Skip entries whose source message was sent by the bot itself.
+            target_msg = messages[line_index]
+            if is_bot_message(target_msg):
+                logger.info(f"跳过引用机器人自身消息的黑话: content={content}, source_id={source_id_str}")
+                continue
+            
+            # Build the context paragraph for the source message; entries with an empty context are dropped.
+            context_paragraph = build_context_paragraph(messages, line_index)
+            if not context_paragraph:
+                logger.warning(f"黑话条目上下文为空: content={content}, source_id={source_id_str}")
+                continue
+            
+            entries.append({"content": content, "raw_content": [context_paragraph]})
+        
+        if not entries:
+            return
+        
+        # Hand the surviving entries to the jargon miner for processing.
+        await jargon_miner.process_extracted_entries(entries)
+
+
 init_prompt()