将所有必要内容完全迁移后删除原文件

2026-03-12 09:16:51 +08:00
parent 71a288983f
commit e303fbeb6b
6 changed files with 271 additions and 1619 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -10,6 +10,7 @@
    - 对于不同文件夹下的模块导入，使用绝对导入。这些导入应该以`from src`开头，并且按照**不发生import错误的前提下**，尽量使得第二层的文件夹名称相同的导入放在一起；第二层文件夹名称排列随机。
 3. 标准库和第三方库的导入应该放在本地模块导入的前面。
 4. 各个导入块之间应该使用一个空行进行分隔。
 5. 对于现有的代码，如果导入顺序不符合上述规范，在重构代码时应该调整导入顺序以符合规范。
 # 代码规范
 ## 注释规范
--- a/src/bw_learner/expression_learner_old.py
+++ b/src/bw_learner/expression_learner_old.py
@@ -1,596 +0,0 @@
 import time
 import json
 import os
 import re
 import asyncio
 from typing import List, Optional, Tuple, Any, Dict
 from src.common.logger import get_logger
 from src.common.database.database_model import Expression
 from src.llm_models.utils_model import LLMRequest
 from src.config.config import model_config, global_config
 from src.chat.utils.chat_message_builder import (
    build_anonymous_messages,
 )
 from src.prompt.prompt_manager import prompt_manager
 from src.chat.message_receive.chat_stream import get_chat_manager
 from src.bw_learner.learner_utils_old import (
    filter_message_content,
    is_bot_message,
    build_context_paragraph,
    contains_bot_self_name,
    calculate_similarity,
    parse_expression_response,
 )
 from src.bw_learner.jargon_miner_old import miner_manager
 from src.bw_learner.expression_auto_check_task import (
    single_expression_check,
 )
 # MAX_EXPRESSION_COUNT = 300
 logger = get_logger("expressor")
 class ExpressionLearner:
    def __init__(self, chat_id: str) -> None:
        self.express_learn_model: LLMRequest = LLMRequest(
            model_set=model_config.model_task_config.utils, request_type="expression.learner"
        )
        self.summary_model: LLMRequest = LLMRequest(
            model_set=model_config.model_task_config.tool_use, request_type="expression.summary"
        )
        self.check_model: Optional[LLMRequest] = None  # 检查用的 LLM 实例，延迟初始化
        self.chat_id = chat_id
        self.chat_stream = get_chat_manager().get_stream(chat_id)
        self.chat_name = get_chat_manager().get_stream_name(chat_id) or chat_id
        # 学习锁，防止并发执行学习任务
        self._learning_lock = asyncio.Lock()
    async def learn_and_store(
        self,
        messages: List[Any],
    ) -> Optional[List[Tuple[str, str, str]]]:
        """
        学习并存储表达方式
        Args:
            messages: 外部传入的消息列表（必需）
            num: 学习数量
            timestamp_start: 学习开始的时间戳，如果为None则使用self.last_learning_time
        """
        if not messages:
            return None
        random_msg = messages
        # 学习用（开启行编号，便于溯源）
        random_msg_str: str = await build_anonymous_messages(random_msg, show_ids=True)
        prompt_template = prompt_manager.get_prompt("learn_style")
        prompt_template.add_context("bot_name", global_config.bot.nickname)
        prompt_template.add_context("chat_str", random_msg_str)
        prompt = await prompt_manager.render_prompt(prompt_template)
        # print(f"random_msg_str:{random_msg_str}")
        # logger.info(f"学习{type_str}的prompt: {prompt}")
        try:
            response, _ = await self.express_learn_model.generate_response_async(prompt, temperature=0.3)
        except Exception as e:
            logger.error(f"学习表达方式失败,模型生成出错: {e}")
            return None
        # 解析 LLM 返回的表达方式列表和黑话列表（包含来源行编号）
        expressions: List[Tuple[str, str, str]]
        jargon_entries: List[Tuple[str, str]]  # (content, source_id)
        expressions, jargon_entries = parse_expression_response(response)
        # 从缓存中检查 jargon 是否出现在 messages 中
        cached_jargon_entries = self._check_cached_jargons_in_messages(random_msg)
        if cached_jargon_entries:
            # 合并缓存中的 jargon 条目（去重：如果 content 已存在则跳过）
            existing_contents = {content for content, _ in jargon_entries}
            for content, source_id in cached_jargon_entries:
                if content not in existing_contents:
                    jargon_entries.append((content, source_id))
                    existing_contents.add(content)
                    logger.info(f"从缓存中检查到黑话: {content}")
        # 检查表达方式数量，如果超过10个则放弃本次表达学习
        if len(expressions) > 20:
            logger.info(f"表达方式提取数量超过10个（实际{len(expressions)}个），放弃本次表达学习")
            expressions = []
        # 检查黑话数量，如果超过30个则放弃本次黑话学习
        if len(jargon_entries) > 30:
            logger.info(f"黑话提取数量超过30个（实际{len(jargon_entries)}个），放弃本次黑话学习")
            jargon_entries = []
        # 处理黑话条目，路由到 jargon_miner（即使没有表达方式也要处理黑话）
        if jargon_entries:
            await self._process_jargon_entries(jargon_entries, random_msg)
        # 如果没有表达方式，直接返回
        if not expressions:
            logger.info("解析后没有可用的表达方式")
            return []
        logger.info(f"学习的prompt: {prompt}")
        logger.info(f"学习的expressions: {expressions}")
        logger.info(f"学习的jargon_entries: {jargon_entries}")
        logger.info(f"学习的response: {response}")
        # 过滤表达方式，根据 source_id 溯源并应用各种过滤规则
        learnt_expressions = self._filter_expressions(expressions, random_msg)
        if learnt_expressions is None:
            logger.info("没有学习到表达风格")
            return []
        # 展示学到的表达方式
        learnt_expressions_str = ""
        for situation, style in learnt_expressions:
            learnt_expressions_str += f"{situation}->{style}\n"
        logger.info(f"在 {self.chat_name} 学习到表达风格:\n{learnt_expressions_str}")
        current_time = time.time()
        # 存储到数据库 Expression 表
        for situation, style in learnt_expressions:
            await self._upsert_expression_record(
                situation=situation,
                style=style,
                current_time=current_time,
            )
        return learnt_expressions
    def _filter_expressions(
        self,
        expressions: List[Tuple[str, str, str]],
        messages: List[Any],
    ) -> List[Tuple[str, str, str]]:
        """
        过滤表达方式，移除不符合条件的条目
        Args:
            expressions: 表达方式列表，每个元素是 (situation, style, source_id)
            messages: 原始消息列表，用于溯源和验证
        Returns:
            过滤后的表达方式列表，每个元素是 (situation, style, context)
        """
        filtered_expressions: List[Tuple[str, str, str]] = []  # (situation, style, context)
        # 准备机器人名称集合（用于过滤 style 与机器人名称重复的表达）
        banned_names = set()
        bot_nickname = (global_config.bot.nickname or "").strip()
        if bot_nickname:
            banned_names.add(bot_nickname)
        alias_names = global_config.bot.alias_names or []
        for alias in alias_names:
            alias = alias.strip()
            if alias:
                banned_names.add(alias)
        banned_casefold = {name.casefold() for name in banned_names if name}
        for situation, style, source_id in expressions:
            source_id_str = (source_id or "").strip()
            if not source_id_str.isdigit():
                # 无效的来源行编号，跳过
                continue
            line_index = int(source_id_str) - 1  # build_anonymous_messages 的编号从 1 开始
            if line_index < 0 or line_index >= len(messages):
                # 超出范围，跳过
                continue
            # 当前行的原始内容
            current_msg = messages[line_index]
            # 过滤掉从bot自己发言中提取到的表达方式
            if is_bot_message(current_msg):
                continue
            context = filter_message_content(current_msg.processed_plain_text or "")
            if not context:
                continue
            # 过滤掉包含 SELF 的内容（不学习）
            if "SELF" in (situation or "") or "SELF" in (style or "") or "SELF" in context:
                logger.info(f"跳过包含 SELF 的表达方式: situation={situation}, style={style}, source_id={source_id}")
                continue
            # 过滤掉 style 与机器人名称/昵称重复的表达
            normalized_style = (style or "").strip()
            if normalized_style and normalized_style.casefold() in banned_casefold:
                logger.debug(
                    f"跳过 style 与机器人名称重复的表达方式: situation={situation}, style={style}, source_id={source_id}"
                )
                continue
            # 过滤掉包含 "表情：" 或 "表情:" 的内容
            if (
                "表情：" in (situation or "")
                or "表情:" in (situation or "")
                or "表情：" in (style or "")
                or "表情:" in (style or "")
                or "表情：" in context
                or "表情:" in context
            ):
                logger.info(f"跳过包含表情标记的表达方式: situation={situation}, style={style}, source_id={source_id}")
                continue
            # 过滤掉包含 "[图片" 的内容
            if "[图片" in (situation or "") or "[图片" in (style or "") or "[图片" in context:
                logger.info(f"跳过包含图片标记的表达方式: situation={situation}, style={style}, source_id={source_id}")
                continue
            filtered_expressions.append((situation, style))
        return filtered_expressions
    async def _upsert_expression_record(
        self,
        situation: str,
        style: str,
        current_time: float,
    ) -> None:
        # 检查是否有相似的 situation（相似度 >= 0.75，检查 content_list）
        # 完全匹配（相似度 == 1.0）和相似匹配（相似度 >= 0.75）统一处理
        expr_obj, similarity = await self._find_similar_situation_expression(situation, similarity_threshold=0.75)
        if expr_obj:
            # 根据相似度决定是否使用 LLM 总结
            # 完全匹配（相似度 == 1.0）时不总结，相似匹配时总结
            use_llm_summary = similarity < 1.0
            await self._update_existing_expression(
                expr_obj=expr_obj,
                situation=situation,
                current_time=current_time,
                use_llm_summary=use_llm_summary,
            )
            return
        # 没有找到匹配的记录，创建新记录
        await self._create_expression_record(
            situation=situation,
            style=style,
            current_time=current_time,
        )
    async def _create_expression_record(
        self,
        situation: str,
        style: str,
        current_time: float,
    ) -> None:
        content_list = [situation]
        # 创建新记录时，直接使用原始的 situation，不进行总结
        formatted_situation = situation
        Expression.create(
            situation=formatted_situation,
            style=style,
            content_list=json.dumps(content_list, ensure_ascii=False),
            count=1,
            last_active_time=current_time,
            chat_id=self.chat_id,
            create_date=current_time,
        )
    async def _update_existing_expression(
        self,
        expr_obj: Expression,
        situation: str,
        current_time: float,
        use_llm_summary: bool = True,
    ) -> None:
        """
        更新现有 Expression 记录（situation 完全匹配或相似的情况）
        将新的 situation 添加到 content_list，不合并 style
        Args:
            use_llm_summary: 是否使用 LLM 进行总结，完全匹配时为 False，相似匹配时为 True
        """
        # 更新 content_list（添加新的 situation）
        content_list = self._parse_content_list(expr_obj.content_list)
        content_list.append(situation)
        expr_obj.content_list = json.dumps(content_list, ensure_ascii=False)
        # 更新其他字段
        expr_obj.count = (expr_obj.count or 0) + 1
        expr_obj.checked = False  # count 增加时重置 checked 为 False
        expr_obj.last_active_time = current_time
        if use_llm_summary:
            # 相似匹配时，使用 LLM 重新组合 situation
            new_situation = await self._compose_situation_text(
                content_list=content_list,
                fallback=expr_obj.situation,
            )
            expr_obj.situation = new_situation
        expr_obj.save()
        # count 增加后，立即进行一次检查
        await self._check_expression_immediately(expr_obj)
    def _parse_content_list(self, stored_list: Optional[str]) -> List[str]:
        if not stored_list:
            return []
        try:
            data = json.loads(stored_list)
        except json.JSONDecodeError:
            return []
        return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else []
    async def _find_similar_situation_expression(
        self, situation: str, similarity_threshold: float = 0.75
    ) -> Tuple[Optional[Expression], float]:
        """
        查找具有相似 situation 的 Expression 记录
        检查 content_list 中的每一项
        Args:
            situation: 要查找的 situation
            similarity_threshold: 相似度阈值，默认 0.75
        Returns:
            Tuple[Optional[Expression], float]:
                - 找到的最相似的 Expression 对象，如果没有找到则返回 None
                - 相似度值（如果找到匹配，范围在 similarity_threshold 到 1.0 之间）
        """
        # 查询同一 chat_id 的所有记录
        all_expressions = Expression.select().where(Expression.chat_id == self.chat_id)
        best_match = None
        best_similarity = 0.0
        for expr in all_expressions:
            # 检查 content_list 中的每一项
            content_list = self._parse_content_list(expr.content_list)
            for existing_situation in content_list:
                similarity = calculate_similarity(situation, existing_situation)
                if similarity >= similarity_threshold and similarity > best_similarity:
                    best_similarity = similarity
                    best_match = expr
        if best_match:
            logger.debug(
                f"找到相似的 situation: 相似度={best_similarity:.3f}, 现有='{best_match.situation}', 新='{situation}'"
            )
        return best_match, best_similarity
    async def _compose_situation_text(self, content_list: List[str], fallback: str = "") -> str:
        sanitized = [c.strip() for c in content_list if c.strip()]
        if not sanitized:
            return fallback
        prompt = (
            "请阅读以下多个聊天情境描述，并将它们概括成一句简短的话，"
            "长度不超过20个字，保留共同特点：\n"
            f"{chr(10).join(f'- {s}' for s in sanitized[-10:])}\n只输出概括内容。"
        )
        try:
            summary, _ = await self.summary_model.generate_response_async(prompt, temperature=0.2)
            summary = summary.strip()
            if summary:
                return summary
        except Exception as e:
            logger.error(f"概括表达情境失败: {e}")
        return "/".join(sanitized) if sanitized else fallback
    async def _init_check_model(self) -> None:
        """初始化检查用的 LLM 实例"""
        if self.check_model is None:
            try:
                self.check_model = LLMRequest(
                    model_set=model_config.model_task_config.tool_use, request_type="expression.check"
                )
                logger.debug("检查用 LLM 实例初始化成功")
            except Exception as e:
                logger.error(f"创建检查用 LLM 实例失败: {e}")
    async def _check_expression_immediately(self, expr_obj: Expression) -> None:
        """
        立即检查表达方式（在 count 增加后调用）
        Args:
            expr_obj: 要检查的表达方式对象
        """
        try:
            # 检查是否启用自动检查
            if not global_config.expression.expression_self_reflect:
                logger.debug("表达方式自动检查未启用，跳过立即检查")
                return
            # 初始化检查用的 LLM
            await self._init_check_model()
            if self.check_model is None:
                logger.warning("检查用 LLM 实例初始化失败，跳过立即检查")
                return
            # 执行 LLM 评估
            suitable, reason, error = await single_expression_check(expr_obj.situation, expr_obj.style)
            # 更新数据库
            expr_obj.checked = True
            expr_obj.rejected = not suitable  # 通过则 rejected=False，不通过则 rejected=True
            expr_obj.save()
            status = "通过" if suitable else "不通过"
            logger.info(
                f"表达方式立即检查完成 [ID: {expr_obj.id}] - {status} | "
                f"Situation: {expr_obj.situation[:30]}... | "
                f"Style: {expr_obj.style[:30]}... | "
                f"Reason: {reason[:50] if reason else '无'}..."
            )
            if error:
                logger.warning(f"表达方式立即检查时出现错误 [ID: {expr_obj.id}]: {error}")
        except Exception as e:
            logger.error(f"立即检查表达方式失败 [ID: {expr_obj.id}]: {e}", exc_info=True)
            # 检查失败时，保持 checked=False，等待后续自动检查任务处理
    def _check_cached_jargons_in_messages(self, messages: List[Any]) -> List[Tuple[str, str]]:
        """
        检查缓存中的 jargon 是否出现在 messages 中
        Args:
            messages: 消息列表
        Returns:
            List[Tuple[str, str]]: 匹配到的黑话条目列表，每个元素是 (content, source_id)
        """
        if not messages:
            return []
        # 获取 jargon_miner 实例
        jargon_miner = miner_manager.get_miner(self.chat_id)
        # 获取缓存中的所有 jargon
        cached_jargons = jargon_miner.get_cached_jargons()
        if not cached_jargons:
            return []
        matched_entries: List[Tuple[str, str]] = []
        # 遍历 messages，检查缓存中的 jargon 是否出现
        for i, msg in enumerate(messages):
            # 跳过机器人自己的消息
            if is_bot_message(msg):
                continue
            # 获取消息文本
            msg_text = (getattr(msg, "processed_plain_text", None) or "").strip()
            if not msg_text:
                continue
            # 检查每个缓存中的 jargon 是否出现在消息文本中
            for jargon in cached_jargons:
                if not jargon or not jargon.strip():
                    continue
                jargon_content = jargon.strip()
                # 使用正则匹配，考虑单词边界（类似 jargon_explainer 中的逻辑）
                pattern = re.escape(jargon_content)
                # 对于中文，使用更宽松的匹配；对于英文/数字，使用单词边界
                if re.search(r"[\u4e00-\u9fff]", jargon_content):
                    # 包含中文，使用更宽松的匹配
                    search_pattern = pattern
                else:
                    # 纯英文/数字，使用单词边界
                    search_pattern = r"\b" + pattern + r"\b"
                if re.search(search_pattern, msg_text, re.IGNORECASE):
                    # 找到匹配，构建条目（source_id 从 1 开始，因为 build_anonymous_messages 的编号从 1 开始）
                    source_id = str(i + 1)
                    matched_entries.append((jargon_content, source_id))
        return matched_entries
    async def _process_jargon_entries(self, jargon_entries: List[Tuple[str, str]], messages: List[Any]) -> None:
        """
        处理从 expression learner 提取的黑话条目，路由到 jargon_miner
        Args:
            jargon_entries: 黑话条目列表，每个元素是 (content, source_id)
            messages: 消息列表，用于构建上下文
        """
        if not jargon_entries or not messages:
            return
        # 获取 jargon_miner 实例
        jargon_miner = miner_manager.get_miner(self.chat_id)
        # 构建黑话条目格式，与 jargon_miner.run_once 中的格式一致
        entries: List[Dict[str, List[str]]] = []
        for content, source_id in jargon_entries:
            content = content.strip()
            if not content:
                continue
            # 过滤掉包含 SELF 的黑话，不学习
            if "SELF" in content:
                logger.info(f"跳过包含 SELF 的黑话: {content}")
                continue
            # 检查是否包含机器人名称
            if contains_bot_self_name(content):
                logger.info(f"跳过包含机器人昵称/别名的黑话: {content}")
                continue
            # 解析 source_id
            source_id_str = (source_id or "").strip()
            if not source_id_str.isdigit():
                logger.warning(f"黑话条目 source_id 无效: content={content}, source_id={source_id_str}")
                continue
            # build_anonymous_messages 的编号从 1 开始
            line_index = int(source_id_str) - 1
            if line_index < 0 or line_index >= len(messages):
                logger.warning(f"黑话条目 source_id 超出范围: content={content}, source_id={source_id_str}")
                continue
            # 检查是否是机器人自己的消息
            target_msg = messages[line_index]
            if is_bot_message(target_msg):
                logger.info(f"跳过引用机器人自身消息的黑话: content={content}, source_id={source_id_str}")
                continue
            # 构建上下文段落
            context_paragraph = build_context_paragraph(messages, line_index)
            if not context_paragraph:
                logger.warning(f"黑话条目上下文为空: content={content}, source_id={source_id_str}")
                continue
            entries.append({"content": content, "raw_content": [context_paragraph]})
        if not entries:
            return
        # 调用 jargon_miner 处理这些条目
        await jargon_miner.process_extracted_entries(entries)
 class ExpressionLearnerManager:
    def __init__(self):
        self.expression_learners = {}
        self._ensure_expression_directories()
    def get_expression_learner(self, chat_id: str) -> ExpressionLearner:
        if chat_id not in self.expression_learners:
            self.expression_learners[chat_id] = ExpressionLearner(chat_id)
        return self.expression_learners[chat_id]
    def _ensure_expression_directories(self):
        """
        确保表达方式相关的目录结构存在
        """
        base_dir = os.path.join("data", "expression")
        directories_to_create = [
            base_dir,
            os.path.join(base_dir, "learnt_style"),
            os.path.join(base_dir, "learnt_grammar"),
        ]
        for directory in directories_to_create:
            try:
                os.makedirs(directory, exist_ok=True)
                logger.debug(f"确保目录存在: {directory}")
            except Exception as e:
                logger.error(f"创建目录失败 {directory}: {e}")
 expression_learner_manager = ExpressionLearnerManager()
--- a/src/bw_learner/jargon_explainer.py
+++ b/src/bw_learner/jargon_explainer.py
@@ -0,0 +1,86 @@
 from typing import Optional, Dict, List
 from sqlmodel import select, func as fn
 import json
 from src.common.database.database import get_db_session
 from src.common.database.database_model import Jargon
 from src.common.logger import get_logger
 from src.config.config import global_config
 logger = get_logger("jargon_explainer")
 def search_jargon(
    keyword: str,
    chat_id: Optional[str] = None,
    limit: int = 10,
    case_sensitive: bool = False,
    fuzzy: bool = True,
 ) -> List[Dict[str, str]]:
    """
    搜索 jargon，支持大小写不敏感和模糊搜索
    Args:
        keyword: 搜索关键词
        chat_id: 可选的聊天 ID（session_id）
            - 如果开启了 all_global：此参数被忽略，查询所有 is_global=True 的记录
            - 如果关闭了 all_global：如果提供则优先搜索该聊天或 global 的 jargon
        limit: 返回结果数量限制，默认 10
        case_sensitive: 是否大小写敏感，默认 False（不敏感）
        fuzzy: 是否模糊搜索，默认 True（使用 LIKE 匹配）
    Returns:
        List[Dict[str, str]]: 包含 content, meaning 的字典列表
    """
    if not keyword or not keyword.strip():
        return []
    keyword = keyword.strip()
    # 构建搜索条件
    if case_sensitive:  # 大小写敏感
        search_condition = Jargon.content.contains(keyword) if fuzzy else Jargon.content == keyword  # type: ignore
    else:
        keyword_lower = keyword.lower()
        search_condition = (
            fn.LOWER(Jargon.content).contains(keyword_lower) if fuzzy else fn.LOWER(Jargon.content) == keyword_lower
        )
    # 根据 all_global 配置决定查询逻辑同时，限制结果数量（先多取一些，因为后面可能过滤）
    if global_config.expression.all_global_jargon:
        # 开启 all_global：所有记录都是全局的，查询所有 is_global=True 的记录（无视 chat_id）
        query = select(Jargon).where(search_condition, Jargon.is_global).order_by(Jargon.count.desc()).limit(limit * 2)  # type: ignore
    else:
        # 关闭 all_global：查询所有记录，chat_id 过滤在 Python 层面进行
        query = select(Jargon).where(search_condition).order_by(Jargon.count.desc()).limit(limit * 2)  # type: ignore
    # 执行查询并返回结果
    results: List[Dict[str, str]] = []
    with get_db_session() as session:
        jargons = session.exec(query).all()
        for jargon in jargons:
            # 如果提供了 chat_id 且 all_global=False，需要检查 session_id_dict 是否包含目标 chat_id
            if chat_id and not global_config.expression.all_global_jargon and not jargon.is_global:
                try:  # 解析 session_id_dict
                    session_id_dict = json.loads(jargon.session_id_dict) if jargon.session_id_dict else {}
                except (json.JSONDecodeError, TypeError):
                    session_id_dict = {}
                    logger.warning(
                        f"解析 session_id_dict 失败，jargon_id={jargon.id}，原始数据：{jargon.session_id_dict}"
                    )
                # 检查是否包含目标 chat_id
                if chat_id not in session_id_dict:
                    continue
            # 只返回有 meaning 的记录
            if not jargon.meaning.strip():
                continue
            results.append({"content": jargon.content or "", "meaning": jargon.meaning or ""})
            # 达到限制数量后停止
            if len(results) >= limit:
                break
    return results
--- a/src/bw_learner/jargon_miner_old.py
+++ b/src/bw_learner/jargon_miner_old.py
@@ -1,589 +0,0 @@
 import json
 import asyncio
 import random
 from collections import OrderedDict
 from typing import List, Dict, Optional, Callable
 from json_repair import repair_json
 from sqlalchemy import func as fn
 from src.common.logger import get_logger
 from src.common.database.database_model import Jargon
 from src.llm_models.utils_model import LLMRequest
 from src.config.config import model_config, global_config
 from src.chat.message_receive.chat_stream import get_chat_manager
 from src.prompt.prompt_manager import prompt_manager
 from src.bw_learner.learner_utils_old import (
    parse_chat_id_list,
    chat_id_list_contains,
    update_chat_id_list,
 )
 logger = get_logger("jargon")
 def _is_single_char_jargon(content: str) -> bool:
    """
    判断是否是单字黑话（单个汉字、英文或数字）
    Args:
        content: 词条内容
    Returns:
        bool: 如果是单字黑话返回True，否则返回False
    """
    if not content or len(content) != 1:
        return False
    char = content[0]
    # 判断是否是单个汉字、单个英文字母或单个数字
    return (
        "\u4e00" <= char <= "\u9fff"  # 汉字
        or "a" <= char <= "z"  # 小写字母
        or "A" <= char <= "Z"  # 大写字母
        or "0" <= char <= "9"  # 数字
    )
 def _should_infer_meaning(jargon_obj: Jargon) -> bool:
    """
    判断是否需要进行含义推断
    在 count 达到 3,6, 10, 20, 40, 60, 100 时进行推断
    并且count必须大于last_inference_count，避免重启后重复判定
    如果is_complete为True，不再进行推断
    """
    # 如果已完成所有推断，不再推断
    if jargon_obj.is_complete:
        return False
    count = jargon_obj.count or 0
    last_inference = jargon_obj.last_inference_count or 0
    # 阈值列表：3,6, 10, 20, 40, 60, 100
    thresholds = [2, 4, 8, 12, 24, 60, 100]
    if count < thresholds[0]:
        return False
    # 如果count没有超过上次判定值，不需要判定
    if count <= last_inference:
        return False
    # 找到第一个大于last_inference的阈值
    next_threshold = None
    for threshold in thresholds:
        if threshold > last_inference:
            next_threshold = threshold
            break
    # 如果没有找到下一个阈值，说明已经超过100，不应该再推断
    if next_threshold is None:
        return False
    # 检查count是否达到或超过这个阈值
    return count >= next_threshold
 class JargonMiner:
    def __init__(self, chat_id: str) -> None:
        self.chat_id = chat_id
        self.llm = LLMRequest(
            model_set=model_config.model_task_config.utils,
            request_type="jargon.extract",
        )
        self.llm_inference = LLMRequest(
            model_set=model_config.model_task_config.utils,
            request_type="jargon.inference",
        )
        # 初始化stream_name作为类属性，避免重复提取
        chat_manager = get_chat_manager()
        stream_name = chat_manager.get_stream_name(self.chat_id)
        self.stream_name = stream_name if stream_name else self.chat_id
        self.cache_limit = 50
        self.cache: OrderedDict[str, None] = OrderedDict()
        # 黑话提取锁，防止并发执行
        self._extraction_lock = asyncio.Lock()
    def _add_to_cache(self, content: str) -> None:
        """将提取到的黑话加入缓存，保持LRU语义"""
        if not content:
            return
        key = content.strip()
        if not key:
            return
        # 单字黑话（单个汉字、英文或数字）不记录到缓存
        if _is_single_char_jargon(key):
            return
        if key in self.cache:
            self.cache.move_to_end(key)
        else:
            self.cache[key] = None
            if len(self.cache) > self.cache_limit:
                self.cache.popitem(last=False)
    def get_cached_jargons(self) -> List[str]:
        """获取缓存中的所有黑话列表"""
        return list(self.cache.keys())
    async def _infer_meaning_by_id(self, jargon_id: int) -> None:
        """通过ID加载对象并推断"""
        try:
            jargon_obj = Jargon.get_by_id(jargon_id)
            # 再次检查is_complete，因为可能在异步任务执行时已被标记为完成
            if jargon_obj.is_complete:
                logger.debug(f"jargon {jargon_obj.content} 已完成所有推断，跳过")
                return
            await self.infer_meaning(jargon_obj)
        except Exception as e:
            logger.error(f"通过ID推断jargon失败: {e}")
    async def infer_meaning(self, jargon_obj: Jargon) -> None:
        """
        对jargon进行含义推断
        """
        try:
            content = jargon_obj.content
            raw_content_str = jargon_obj.raw_content or ""
            # 解析raw_content列表
            raw_content_list = []
            if raw_content_str:
                try:
                    raw_content_list = (
                        json.loads(raw_content_str) if isinstance(raw_content_str, str) else raw_content_str
                    )
                    if not isinstance(raw_content_list, list):
                        raw_content_list = [raw_content_list] if raw_content_list else []
                except (json.JSONDecodeError, TypeError):
                    raw_content_list = [raw_content_str] if raw_content_str else []
            if not raw_content_list:
                logger.warning(f"jargon {content} 没有raw_content，跳过推断")
                return
            # 获取当前count和上一次的meaning
            current_count = jargon_obj.count or 0
            previous_meaning = jargon_obj.meaning or ""
            # 当count为24, 60时，随机移除一半的raw_content项目
            if current_count in [24, 60] and len(raw_content_list) > 1:
                # 计算要保留的数量（至少保留1个）
                keep_count = max(1, len(raw_content_list) // 2)
                raw_content_list = random.sample(raw_content_list, keep_count)
                logger.info(
                    f"jargon {content} count={current_count}，随机移除后剩余 {len(raw_content_list)} 个raw_content项目"
                )
            # 步骤1: 基于raw_content和content推断
            raw_content_text = "\n".join(raw_content_list)
            # 当count为24, 60, 100时，在prompt中放入上一次推断出的meaning作为参考
            previous_meaning_section = ""
            previous_meaning_instruction = ""
            if current_count in [24, 60, 100] and previous_meaning:
                previous_meaning_section = f"\n**上一次推断的含义（仅供参考）**\n{previous_meaning}"
                previous_meaning_instruction = (
                    "- 请参考上一次推断的含义，结合新的上下文信息，给出更准确或更新的推断结果"
                )
            prompt1_template = prompt_manager.get_prompt("jargon_inference_with_context")
            prompt1_template.add_context("bot_name", global_config.bot.nickname)
            prompt1_template.add_context("content", str(content))
            prompt1_template.add_context("raw_content_list", raw_content_text)
            prompt1_template.add_context("previous_meaning_section", previous_meaning_section)
            prompt1_template.add_context("previous_meaning_instruction", previous_meaning_instruction)
            prompt1 = await prompt_manager.render_prompt(prompt1_template)
            response1, _ = await self.llm_inference.generate_response_async(prompt1, temperature=0.3)
            if not response1:
                logger.warning(f"jargon {content} 推断1失败：无响应")
                return
            # 解析推断1结果
            inference1 = None
            try:
                resp1 = response1.strip()
                if resp1.startswith("{") and resp1.endswith("}"):
                    inference1 = json.loads(resp1)
                else:
                    repaired = repair_json(resp1)
                    inference1 = json.loads(repaired) if isinstance(repaired, str) else repaired
                if not isinstance(inference1, dict):
                    logger.warning(f"jargon {content} 推断1结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 推断1解析失败: {e}")
                return
            # 检查推断1是否表示信息不足无法推断
            no_info = inference1.get("no_info", False)
            meaning1 = inference1.get("meaning", "").strip()
            if no_info or not meaning1:
                logger.info(f"jargon {content} 推断1表示信息不足无法推断，放弃本次推断，待下次更新")
                # 更新最后一次判定的count值，避免在同一阈值重复尝试
                jargon_obj.last_inference_count = jargon_obj.count or 0
                jargon_obj.save()
                return
            # 步骤2: 仅基于content推断
            prompt2_template = prompt_manager.get_prompt("jargon_inference_content_only")
            prompt2_template.add_context("content", str(content))
            prompt2 = await prompt_manager.render_prompt(prompt2_template)
            response2, _ = await self.llm_inference.generate_response_async(prompt2, temperature=0.3)
            if not response2:
                logger.warning(f"jargon {content} 推断2失败：无响应")
                return
            # 解析推断2结果
            inference2 = None
            try:
                resp2 = response2.strip()
                if resp2.startswith("{") and resp2.endswith("}"):
                    inference2 = json.loads(resp2)
                else:
                    repaired = repair_json(resp2)
                    inference2 = json.loads(repaired) if isinstance(repaired, str) else repaired
                if not isinstance(inference2, dict):
                    logger.warning(f"jargon {content} 推断2结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 推断2解析失败: {e}")
                return
            # logger.info(f"jargon {content} 推断2提示词: {prompt2}")
            # logger.info(f"jargon {content} 推断2结果: {response2}")
            # logger.info(f"jargon {content} 推断1提示词: {prompt1}")
            # logger.info(f"jargon {content} 推断1结果: {response1}")
            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon {content} 推断2提示词: {prompt2}")
                logger.info(f"jargon {content} 推断2结果: {response2}")
                logger.info(f"jargon {content} 推断1提示词: {prompt1}")
                logger.info(f"jargon {content} 推断1结果: {response1}")
            else:
                logger.debug(f"jargon {content} 推断2提示词: {prompt2}")
                logger.debug(f"jargon {content} 推断2结果: {response2}")
                logger.debug(f"jargon {content} 推断1提示词: {prompt1}")
                logger.debug(f"jargon {content} 推断1结果: {response1}")
            # 步骤3: 比较两个推断结果
            prompt3_template = prompt_manager.get_prompt("jargon_compare_inference")
            prompt3_template.add_context("inference1", json.dumps(inference1, ensure_ascii=False))
            prompt3_template.add_context("inference2", json.dumps(inference2, ensure_ascii=False))
            prompt3 = await prompt_manager.render_prompt(prompt3_template)
            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon {content} 比较提示词: {prompt3}")
            response3, _ = await self.llm_inference.generate_response_async(prompt3, temperature=0.3)
            if not response3:
                logger.warning(f"jargon {content} 比较失败：无响应")
                return
            # 解析比较结果
            comparison = None
            try:
                resp3 = response3.strip()
                if resp3.startswith("{") and resp3.endswith("}"):
                    comparison = json.loads(resp3)
                else:
                    repaired = repair_json(resp3)
                    comparison = json.loads(repaired) if isinstance(repaired, str) else repaired
                if not isinstance(comparison, dict):
                    logger.warning(f"jargon {content} 比较结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 比较解析失败: {e}")
                return
            # 判断是否为黑话
            is_similar = comparison.get("is_similar", False)
            is_jargon = not is_similar  # 如果相似，说明不是黑话；如果有差异，说明是黑话
            # 更新数据库记录
            jargon_obj.is_jargon = is_jargon
            jargon_obj.meaning = inference1.get("meaning", "") if is_jargon else ""
            # 更新最后一次判定的count值，避免重启后重复判定
            jargon_obj.last_inference_count = jargon_obj.count or 0
            # 如果count>=100，标记为完成，不再进行推断
            if (jargon_obj.count or 0) >= 100:
                jargon_obj.is_complete = True
            jargon_obj.save()
            logger.debug(
                f"jargon {content} 推断完成: is_jargon={is_jargon}, meaning={jargon_obj.meaning}, last_inference_count={jargon_obj.last_inference_count}, is_complete={jargon_obj.is_complete}"
            )
            # 固定输出推断结果，格式化为可读形式
            if is_jargon:
                # 是黑话，输出格式：[聊天名]xxx的含义是 xxxxxxxxxxx
                meaning = jargon_obj.meaning or "无详细说明"
                is_global = jargon_obj.is_global
                if is_global:
                    logger.info(f"[黑话]{content}的含义是 {meaning}")
                else:
                    logger.info(f"[{self.stream_name}]{content}的含义是 {meaning}")
            else:
                # 不是黑话，输出格式：[聊天名]xxx 不是黑话
                logger.info(f"[{self.stream_name}]{content} 不是黑话")
        except Exception as e:
            logger.error(f"jargon推断失败: {e}")
            import traceback
            traceback.print_exc()
    async def process_extracted_entries(
        self, entries: List[Dict[str, List[str]]], person_name_filter: Optional[Callable[[str], bool]] = None
    ) -> None:
        """
        处理已提取的黑话条目（从 expression_learner 路由过来的）
        Args:
            entries: 黑话条目列表，每个元素格式为 {"content": "...", "raw_content": [...]}
            person_name_filter: 可选的过滤函数，用于检查内容是否包含人物名称
        """
        if not entries:
            return
        try:
            # 去重并合并raw_content（按 content 聚合）
            merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
            for entry in entries:
                content_key = entry["content"]
                # 检查是否包含人物名称
                # logger.info(f"process_extracted_entries 检查是否包含人物名称: {content_key}")
                # logger.info(f"person_name_filter: {person_name_filter}")
                if person_name_filter and person_name_filter(content_key):
                    logger.info(f"process_extracted_entries 跳过包含人物名称的黑话: {content_key}")
                    continue
                raw_list = entry.get("raw_content", []) or []
                if content_key in merged_entries:
                    merged_entries[content_key]["raw_content"].extend(raw_list)
                else:
                    merged_entries[content_key] = {
                        "content": content_key,
                        "raw_content": list(raw_list),
                    }
            uniq_entries = []
            for merged_entry in merged_entries.values():
                raw_content_list = merged_entry["raw_content"]
                if raw_content_list:
                    merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
                uniq_entries.append(merged_entry)
            saved = 0
            updated = 0
            for entry in uniq_entries:
                content = entry["content"]
                raw_content_list = entry["raw_content"]  # 已经是列表
                try:
                    # 查询所有content匹配的记录
                    query = Jargon.select().where(Jargon.content == content)
                    # 查找匹配的记录
                    matched_obj = None
                    for obj in query:
                        if global_config.expression.all_global_jargon:
                            # 开启all_global：所有content匹配的记录都可以
                            matched_obj = obj
                            break
                        else:
                            # 关闭all_global：需要检查chat_id列表是否包含目标chat_id
                            chat_id_list = parse_chat_id_list(obj.chat_id)
                            if chat_id_list_contains(chat_id_list, self.chat_id):
                                matched_obj = obj
                                break
                    if matched_obj:
                        obj = matched_obj
                        try:
                            obj.count = (obj.count or 0) + 1
                        except Exception:
                            obj.count = 1
                        # 合并raw_content列表：读取现有列表，追加新值，去重
                        existing_raw_content = []
                        if obj.raw_content:
                            try:
                                existing_raw_content = (
                                    json.loads(obj.raw_content) if isinstance(obj.raw_content, str) else obj.raw_content
                                )
                                if not isinstance(existing_raw_content, list):
                                    existing_raw_content = [existing_raw_content] if existing_raw_content else []
                            except (json.JSONDecodeError, TypeError):
                                existing_raw_content = [obj.raw_content] if obj.raw_content else []
                        # 合并并去重
                        merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
                        obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
                        # 更新chat_id列表：增加当前chat_id的计数
                        chat_id_list = parse_chat_id_list(obj.chat_id)
                        updated_chat_id_list = update_chat_id_list(chat_id_list, self.chat_id, increment=1)
                        obj.chat_id = json.dumps(updated_chat_id_list, ensure_ascii=False)
                        # 开启all_global时，确保记录标记为is_global=True
                        if global_config.expression.all_global_jargon:
                            obj.is_global = True
                        # 关闭all_global时，保持原有is_global不变（不修改）
                        obj.save()
                        # 检查是否需要推断（达到阈值且超过上次判定值）
                        if _should_infer_meaning(obj):
                            # 异步触发推断，不阻塞主流程
                            # 重新加载对象以确保数据最新
                            jargon_id = obj.id
                            asyncio.create_task(self._infer_meaning_by_id(jargon_id))
                        updated += 1
                    else:
                        # 没找到匹配记录，创建新记录
                        if global_config.expression.all_global_jargon:
                            # 开启all_global：新记录默认为is_global=True
                            is_global_new = True
                        else:
                            # 关闭all_global：新记录is_global=False
                            is_global_new = False
                        # 使用新格式创建chat_id列表：[[chat_id, count]]
                        chat_id_list = [[self.chat_id, 1]]
                        chat_id_json = json.dumps(chat_id_list, ensure_ascii=False)
                        Jargon.create(
                            content=content,
                            raw_content=json.dumps(raw_content_list, ensure_ascii=False),
                            chat_id=chat_id_json,
                            is_global=is_global_new,
                            count=1,
                        )
                        saved += 1
                except Exception as e:
                    logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
                    continue
                finally:
                    self._add_to_cache(content)
            # 固定输出提取的jargon结果，格式化为可读形式（只要有提取结果就输出）
            if uniq_entries:
                # 收集所有提取的jargon内容
                jargon_list = [entry["content"] for entry in uniq_entries]
                jargon_str = ",".join(jargon_list)
                # 输出格式化的结果（使用logger.info会自动应用jargon模块的颜色）
                logger.info(f"[{self.stream_name}]疑似黑话: {jargon_str}")
            if saved or updated:
                logger.debug(f"jargon写入: 新增 {saved} 条，更新 {updated} 条，chat_id={self.chat_id}")
        except Exception as e:
            logger.error(f"处理已提取的黑话条目失败: {e}")
 class JargonMinerManager:
    def __init__(self) -> None:
        self._miners: dict[str, JargonMiner] = {}
    def get_miner(self, chat_id: str) -> JargonMiner:
        if chat_id not in self._miners:
            self._miners[chat_id] = JargonMiner(chat_id)
        return self._miners[chat_id]
 miner_manager = JargonMinerManager()
 def search_jargon(
    keyword: str, chat_id: Optional[str] = None, limit: int = 10, case_sensitive: bool = False, fuzzy: bool = True
 ) -> List[Dict[str, str]]:
    """
    搜索jargon，支持大小写不敏感和模糊搜索
    Args:
        keyword: 搜索关键词
        chat_id: 可选的聊天ID
            - 如果开启了all_global：此参数被忽略，查询所有is_global=True的记录
            - 如果关闭了all_global：如果提供则优先搜索该聊天或global的jargon
        limit: 返回结果数量限制，默认10
        case_sensitive: 是否大小写敏感，默认False（不敏感）
        fuzzy: 是否模糊搜索，默认True（使用LIKE匹配）
    Returns:
        List[Dict[str, str]]: 包含content, meaning的字典列表
    """
    if not keyword or not keyword.strip():
        return []
    keyword = keyword.strip()
    # 构建查询（选择所有需要的字段，以便后续过滤）
    query = Jargon.select()
    # 构建搜索条件
    if case_sensitive:
        # 大小写敏感
        if fuzzy:
            # 模糊搜索
            search_condition = Jargon.content.contains(keyword)
        else:
            # 精确匹配
            search_condition = Jargon.content == keyword
    else:
        # 大小写不敏感
        if fuzzy:
            # 模糊搜索（使用LOWER函数）
            search_condition = fn.LOWER(Jargon.content).contains(keyword.lower())
        else:
            # 精确匹配（使用LOWER函数）
            search_condition = fn.LOWER(Jargon.content) == keyword.lower()
    query = query.where(search_condition)
    # 根据all_global配置决定查询逻辑
    if global_config.expression.all_global_jargon:
        # 开启all_global：所有记录都是全局的，查询所有is_global=True的记录（无视chat_id）
        query = query.where(Jargon.is_global)
    # 注意：对于all_global=False的情况，chat_id过滤在Python层面进行，以便兼容新旧格式
    # 注意：meaning的过滤移到Python层面，因为我们需要先过滤chat_id
    # 按count降序排序，优先返回出现频率高的
    query = query.order_by(Jargon.count.desc())
    # 限制结果数量（先多取一些，因为后面可能过滤）
    query = query.limit(limit * 2)
    # 执行查询并返回结果，过滤chat_id
    results = []
    for jargon in query:
        # 如果提供了chat_id且all_global=False，需要检查chat_id列表是否包含目标chat_id
        if chat_id and not global_config.expression.all_global_jargon:
            chat_id_list = parse_chat_id_list(jargon.chat_id)
            # 如果记录是is_global=True，或者chat_id列表包含目标chat_id，则包含
            if not jargon.is_global and not chat_id_list_contains(chat_id_list, chat_id):
                continue
        # 只返回有meaning的记录
        if not jargon.meaning or jargon.meaning.strip() == "":
            continue
        results.append({"content": jargon.content or "", "meaning": jargon.meaning or ""})
        # 达到限制数量后停止
        if len(results) >= limit:
            break
    return results
--- a/src/bw_learner/learner_utils_old.py
+++ b/src/bw_learner/learner_utils_old.py
@@ -16,77 +16,6 @@ from json_repair import repair_json
 logger = get_logger("learner_utils")
 def filter_message_content(content: Optional[str]) -> str:
    """
    过滤消息内容，移除回复、@、图片等格式
    Args:
        content: 原始消息内容
    Returns:
        str: 过滤后的内容
    """
    if not content:
        return ""
    # 移除以[回复开头、]结尾的部分，包括后面的"，说："部分
    content = re.sub(r"\[回复.*?\]，说：\s*", "", content)
    # 移除@<...>格式的内容
    content = re.sub(r"@<[^>]*>", "", content)
    # 移除[picid:...]格式的图片ID
    content = re.sub(r"\[picid:[^\]]*\]", "", content)
    # 移除[表情包：...]格式的内容
    content = re.sub(r"\[表情包：[^\]]*\]", "", content)
    return content.strip()
 def calculate_similarity(text1: str, text2: str) -> float:
    """
    计算两个文本的相似度，返回0-1之间的值
    使用SequenceMatcher计算相似度
    Args:
        text1: 第一个文本
        text2: 第二个文本
    Returns:
        float: 相似度值，范围0-1
    """
    return difflib.SequenceMatcher(None, text1, text2).ratio()
 def calculate_style_similarity(style1: str, style2: str) -> float:
    """
    计算两个 style 的相似度，返回0-1之间的值
    在计算前会移除"使用"和"句式"这两个词（参考 expression_similarity_analysis.py）
    Args:
        style1: 第一个 style
        style2: 第二个 style
    Returns:
        float: 相似度值，范围0-1
    """
    if not style1 or not style2:
        return 0.0
    # 移除"使用"和"句式"这两个词
    def remove_ignored_words(text: str) -> str:
        """移除需要忽略的词"""
        text = text.replace("使用", "")
        text = text.replace("句式", "")
        return text.strip()
    cleaned_style1 = remove_ignored_words(style1)
    cleaned_style2 = remove_ignored_words(style2)
    # 如果清理后文本为空，返回0
    if not cleaned_style1 or not cleaned_style2:
        return 0.0
    return difflib.SequenceMatcher(None, cleaned_style1, cleaned_style2).ratio()
 def _compute_weights(population: List[Dict]) -> List[float]:
    """
@@ -275,224 +204,224 @@ def contains_bot_self_name(content: str) -> bool:
    return any(name in target for name in candidates)
-def build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
+# def build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
-    """
+#     """
-    构建包含中心消息上下文的段落（前3条+后3条），使用标准的 readable builder 输出
+#     构建包含中心消息上下文的段落（前3条+后3条），使用标准的 readable builder 输出
-    """
+#     """
-    if not messages or center_index < 0 or center_index >= len(messages):
+#     if not messages or center_index < 0 or center_index >= len(messages):
-        return None
+#         return None
-    context_start = max(0, center_index - 3)
+#     context_start = max(0, center_index - 3)
-    context_end = min(len(messages), center_index + 1 + 3)
+#     context_end = min(len(messages), center_index + 1 + 3)
-    context_messages = messages[context_start:context_end]
+#     context_messages = messages[context_start:context_end]
-    if not context_messages:
+#     if not context_messages:
-        return None
+#         return None
-    try:
+#     try:
-        paragraph = build_readable_messages(
+#         paragraph = build_readable_messages(
-            messages=context_messages,
+#             messages=context_messages,
-            replace_bot_name=True,
+#             replace_bot_name=True,
-            timestamp_mode="relative",
+#             timestamp_mode="relative",
-            read_mark=0.0,
+#             read_mark=0.0,
-            truncate=False,
+#             truncate=False,
-            show_actions=False,
+#             show_actions=False,
-            show_pic=True,
+#             show_pic=True,
-            message_id_list=None,
+#             message_id_list=None,
-            remove_emoji_stickers=False,
+#             remove_emoji_stickers=False,
-            pic_single=True,
+#             pic_single=True,
-        )
+#         )
-    except Exception as e:
+#     except Exception as e:
-        logger.warning(f"构建上下文段落失败: {e}")
+#         logger.warning(f"构建上下文段落失败: {e}")
-        return None
+#         return None
-    paragraph = paragraph.strip()
+#     paragraph = paragraph.strip()
-    return paragraph or None
+#     return paragraph or None
-def is_bot_message(msg: Any) -> bool:
+# def is_bot_message(msg: Any) -> bool:
-    """判断消息是否来自机器人自身"""
+#     """判断消息是否来自机器人自身"""
-    if msg is None:
+#     if msg is None:
-        return False
+#         return False
-    bot_config = getattr(global_config, "bot", None)
+#     bot_config = getattr(global_config, "bot", None)
-    if not bot_config:
+#     if not bot_config:
-        return False
+#         return False
-    platform = (
+#     platform = (
-        str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
+#         str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
-        .strip()
+#         .strip()
-        .lower()
+#         .lower()
-    )
+#     )
-    user_id = str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "").strip()
+#     user_id = str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "").strip()
-    if not platform or not user_id:
+#     if not platform or not user_id:
-        return False
+#         return False
-    platform_accounts = {}
+#     platform_accounts = {}
-    try:
+#     try:
-        platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
+#         platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
-    except Exception:
+#     except Exception:
-        platform_accounts = {}
+#         platform_accounts = {}
-    bot_accounts: Dict[str, str] = {}
+#     bot_accounts: Dict[str, str] = {}
-    qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
+#     qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
-    if qq_account:
+#     if qq_account:
-        bot_accounts["qq"] = qq_account
+#         bot_accounts["qq"] = qq_account
-    telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
+#     telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
-    if telegram_account:
+#     if telegram_account:
-        bot_accounts["telegram"] = telegram_account
+#         bot_accounts["telegram"] = telegram_account
-    for plat, account in platform_accounts.items():
+#     for plat, account in platform_accounts.items():
-        if account and plat not in bot_accounts:
+#         if account and plat not in bot_accounts:
-            bot_accounts[plat] = account
+#             bot_accounts[plat] = account
-    bot_account = bot_accounts.get(platform)
+#     bot_account = bot_accounts.get(platform)
-    return bool(bot_account and user_id == bot_account)
+#     return bool(bot_account and user_id == bot_account)
-def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
+# def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
-    """
+#     """
-    解析 LLM 返回的表达风格总结和黑话 JSON，提取两个列表。
+#     解析 LLM 返回的表达风格总结和黑话 JSON，提取两个列表。
-    期望的 JSON 结构：
+#     期望的 JSON 结构：
-    [
+#     [
-        {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},  // 表达方式
+#         {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},  // 表达方式
-        {"content": "词条", "source_id": "12"},  // 黑话
+#         {"content": "词条", "source_id": "12"},  // 黑话
-        ...
+#         ...
-    ]
+#     ]
-    Returns:
+#     Returns:
-        Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
+#         Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
-            第一个列表是表达方式 (situation, style, source_id)
+#             第一个列表是表达方式 (situation, style, source_id)
-            第二个列表是黑话 (content, source_id)
+#             第二个列表是黑话 (content, source_id)
-    """
+#     """
-    if not response:
+#     if not response:
-        return [], []
+#         return [], []
-    raw = response.strip()
+#     raw = response.strip()
-    # 尝试提取 ```json 代码块
+#     # 尝试提取 ```json 代码块
-    json_block_pattern = r"```json\s*(.*?)\s*```"
+#     json_block_pattern = r"```json\s*(.*?)\s*```"
-    match = re.search(json_block_pattern, raw, re.DOTALL)
+#     match = re.search(json_block_pattern, raw, re.DOTALL)
-    if match:
+#     if match:
-        raw = match.group(1).strip()
+#         raw = match.group(1).strip()
-    else:
+#     else:
-        # 去掉可能存在的通用 ``` 包裹
+#         # 去掉可能存在的通用 ``` 包裹
-        raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
+#         raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
-        raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
+#         raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
-        raw = raw.strip()
+#         raw = raw.strip()
-    parsed = None
+#     parsed = None
-    expressions: List[Tuple[str, str, str]] = []  # (situation, style, source_id)
+#     expressions: List[Tuple[str, str, str]] = []  # (situation, style, source_id)
-    jargon_entries: List[Tuple[str, str]] = []  # (content, source_id)
+#     jargon_entries: List[Tuple[str, str]] = []  # (content, source_id)
-    try:
+#     try:
-        # 优先尝试直接解析
+#         # 优先尝试直接解析
-        if raw.startswith("[") and raw.endswith("]"):
+#         if raw.startswith("[") and raw.endswith("]"):
-            parsed = json.loads(raw)
+#             parsed = json.loads(raw)
-        else:
+#         else:
-            repaired = repair_json(raw)
+#             repaired = repair_json(raw)
-            if isinstance(repaired, str):
+#             if isinstance(repaired, str):
-                parsed = json.loads(repaired)
+#                 parsed = json.loads(repaired)
-            else:
+#             else:
-                parsed = repaired
+#                 parsed = repaired
-    except Exception as parse_error:
+#     except Exception as parse_error:
-        # 如果解析失败，尝试修复中文引号问题
+#         # 如果解析失败，尝试修复中文引号问题
-        # 使用状态机方法，在 JSON 字符串值内部将中文引号替换为转义的英文引号
+#         # 使用状态机方法，在 JSON 字符串值内部将中文引号替换为转义的英文引号
-        try:
+#         try:
-            def fix_chinese_quotes_in_json(text):
+#             def fix_chinese_quotes_in_json(text):
-                """使用状态机修复 JSON 字符串值中的中文引号"""
+#                 """使用状态机修复 JSON 字符串值中的中文引号"""
-                result = []
+#                 result = []
-                i = 0
+#                 i = 0
-                in_string = False
+#                 in_string = False
-                escape_next = False
+#                 escape_next = False
-                while i < len(text):
+#                 while i < len(text):
-                    char = text[i]
+#                     char = text[i]
-                    if escape_next:
+#                     if escape_next:
-                        # 当前字符是转义字符后的字符，直接添加
+#                         # 当前字符是转义字符后的字符，直接添加
-                        result.append(char)
+#                         result.append(char)
-                        escape_next = False
+#                         escape_next = False
-                        i += 1
+#                         i += 1
-                        continue
+#                         continue
-                    if char == "\\":
+#                     if char == "\\":
-                        # 转义字符
+#                         # 转义字符
-                        result.append(char)
+#                         result.append(char)
-                        escape_next = True
+#                         escape_next = True
-                        i += 1
+#                         i += 1
-                        continue
+#                         continue
-                    if char == '"' and not escape_next:
+#                     if char == '"' and not escape_next:
-                        # 遇到英文引号，切换字符串状态
+#                         # 遇到英文引号，切换字符串状态
-                        in_string = not in_string
+#                         in_string = not in_string
-                        result.append(char)
+#                         result.append(char)
-                        i += 1
+#                         i += 1
-                        continue
+#                         continue
-                    if in_string:
+#                     if in_string:
-                        # 在字符串值内部，将中文引号替换为转义的英文引号
+#                         # 在字符串值内部，将中文引号替换为转义的英文引号
-                        if char == '"':  # 中文左引号 U+201C
+#                         if char == '"':  # 中文左引号 U+201C
-                            result.append('\\"')
+#                             result.append('\\"')
-                        elif char == '"':  # 中文右引号 U+201D
+#                         elif char == '"':  # 中文右引号 U+201D
-                            result.append('\\"')
+#                             result.append('\\"')
-                        else:
+#                         else:
-                            result.append(char)
+#                             result.append(char)
-                    else:
+#                     else:
-                        # 不在字符串内，直接添加
+#                         # 不在字符串内，直接添加
-                        result.append(char)
+#                         result.append(char)
-                    i += 1
+#                     i += 1
-                return "".join(result)
+#                 return "".join(result)
-            fixed_raw = fix_chinese_quotes_in_json(raw)
+#             fixed_raw = fix_chinese_quotes_in_json(raw)
-            # 再次尝试解析
+#             # 再次尝试解析
-            if fixed_raw.startswith("[") and fixed_raw.endswith("]"):
+#             if fixed_raw.startswith("[") and fixed_raw.endswith("]"):
-                parsed = json.loads(fixed_raw)
+#                 parsed = json.loads(fixed_raw)
-            else:
+#             else:
-                repaired = repair_json(fixed_raw)
+#                 repaired = repair_json(fixed_raw)
-                if isinstance(repaired, str):
+#                 if isinstance(repaired, str):
-                    parsed = json.loads(repaired)
+#                     parsed = json.loads(repaired)
-                else:
+#                 else:
-                    parsed = repaired
+#                     parsed = repaired
-        except Exception as fix_error:
+#         except Exception as fix_error:
-            logger.error(f"解析表达风格 JSON 失败，初始错误: {type(parse_error).__name__}: {str(parse_error)}")
+#             logger.error(f"解析表达风格 JSON 失败，初始错误: {type(parse_error).__name__}: {str(parse_error)}")
-            logger.error(f"修复中文引号后仍失败，错误: {type(fix_error).__name__}: {str(fix_error)}")
+#             logger.error(f"修复中文引号后仍失败，错误: {type(fix_error).__name__}: {str(fix_error)}")
-            logger.error(f"解析表达风格 JSON 失败，原始响应：{response}")
+#             logger.error(f"解析表达风格 JSON 失败，原始响应：{response}")
-            logger.error(f"处理后的 JSON 字符串（前500字符）：{raw[:500]}")
+#             logger.error(f"处理后的 JSON 字符串（前500字符）：{raw[:500]}")
-            return [], []
+#             return [], []
-    if isinstance(parsed, dict):
+#     if isinstance(parsed, dict):
-        parsed_list = [parsed]
+#         parsed_list = [parsed]
-    elif isinstance(parsed, list):
+#     elif isinstance(parsed, list):
-        parsed_list = parsed
+#         parsed_list = parsed
-    else:
+#     else:
-        logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
+#         logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
-        return [], []
+#         return [], []
-    for item in parsed_list:
+#     for item in parsed_list:
-        if not isinstance(item, dict):
+#         if not isinstance(item, dict):
-            continue
+#             continue
-        # 检查是否是表达方式条目（有 situation 和 style）
+#         # 检查是否是表达方式条目（有 situation 和 style）
-        situation = str(item.get("situation", "")).strip()
+#         situation = str(item.get("situation", "")).strip()
-        style = str(item.get("style", "")).strip()
+#         style = str(item.get("style", "")).strip()
-        source_id = str(item.get("source_id", "")).strip()
+#         source_id = str(item.get("source_id", "")).strip()
-        if situation and style and source_id:
+#         if situation and style and source_id:
-            # 表达方式条目
+#             # 表达方式条目
-            expressions.append((situation, style, source_id))
+#             expressions.append((situation, style, source_id))
-        elif item.get("content"):
+#         elif item.get("content"):
-            # 黑话条目（有 content 字段）
+#             # 黑话条目（有 content 字段）
-            content = str(item.get("content", "")).strip()
+#             content = str(item.get("content", "")).strip()
-            source_id = str(item.get("source_id", "")).strip()
+#             source_id = str(item.get("source_id", "")).strip()
-            if content and source_id:
+#             if content and source_id:
-                jargon_entries.append((content, source_id))
+#                 jargon_entries.append((content, source_id))
-    return expressions, jargon_entries
+#     return expressions, jargon_entries
--- a/src/bw_learner/message_recorder_old.py
+++ b/src/bw_learner/message_recorder_old.py
@@ -1,179 +0,0 @@
 import time
 import asyncio
 from typing import List, Any
 from src.common.logger import get_logger
 from src.chat.message_receive.chat_manager import chat_manager as _chat_manager
 from src.chat.utils.chat_message_builder import get_raw_msg_by_timestamp_with_chat_inclusive
 from src.chat.utils.common_utils import TempMethodsExpression
 from src.bw_learner.expression_learner_old import expression_learner_manager
 from src.bw_learner.jargon_miner_old import miner_manager
 logger = get_logger("bw_learner")
 class MessageRecorder:
    """
    统一的消息记录器，负责管理时间窗口和消息提取，并将消息分发给 expression_learner 和 jargon_miner
    """
    def __init__(self, chat_id: str) -> None:
        self.chat_id = chat_id
        self.chat_stream = _chat_manager.get_session_by_session_id(chat_id)
        self.chat_name = _chat_manager.get_session_name(chat_id) or chat_id
        # 维护每个chat的上次提取时间
        self.last_extraction_time: float = time.time()
        # 提取锁，防止并发执行
        self._extraction_lock = asyncio.Lock()
        # 获取 expression 和 jargon 的配置参数
        self._init_parameters()
        # 获取 expression_learner 和 jargon_miner 实例
        self.expression_learner = expression_learner_manager.get_expression_learner(chat_id)
        self.jargon_miner = miner_manager.get_miner(chat_id)
    def _init_parameters(self) -> None:
        """初始化提取参数"""
        # 获取 expression 配置
        _, self.enable_expression_learning, self.enable_jargon_learning = (
            TempMethodsExpression.get_expression_config_for_chat(self.chat_id)
        )
        self.min_messages_for_extraction = 30
        self.min_extraction_interval = 60
        logger.debug(
            f"MessageRecorder 初始化: chat_id={self.chat_id}, "
            f"min_messages={self.min_messages_for_extraction}, "
            f"min_interval={self.min_extraction_interval}"
        )
    def should_trigger_extraction(self) -> bool:
        """
        检查是否应该触发消息提取
        Returns:
            bool: 是否应该触发提取
        """
        # 检查时间间隔
        time_diff = time.time() - self.last_extraction_time
        if time_diff < self.min_extraction_interval:
            return False
        # 检查消息数量
        recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive(
            chat_id=self.chat_id,
            timestamp_start=self.last_extraction_time,
            timestamp_end=time.time(),
        )
        if not recent_messages or len(recent_messages) < self.min_messages_for_extraction:
            return False
        return True
    async def extract_and_distribute(self) -> None:
        """
        提取消息并分发给 expression_learner 和 jargon_miner
        """
        # 使用异步锁防止并发执行
        async with self._extraction_lock:
            # 在锁内检查，避免并发触发
            if not self.should_trigger_extraction():
                return
            # 检查 chat_stream 是否存在
            if not self.chat_stream:
                return
            # 记录本次提取的时间窗口，避免重复提取
            extraction_start_time = self.last_extraction_time
            extraction_end_time = time.time()
            # 立即更新提取时间，防止并发触发
            self.last_extraction_time = extraction_end_time
            try:
                # logger.info(f"在聊天流 {self.chat_name} 开始统一消息提取和分发")
                # 拉取提取窗口内的消息
                messages = get_raw_msg_by_timestamp_with_chat_inclusive(
                    chat_id=self.chat_id,
                    timestamp_start=extraction_start_time,
                    timestamp_end=extraction_end_time,
                )
                if not messages:
                    logger.debug(f"聊天流 {self.chat_name} 没有新消息，跳过提取")
                    return
                # 按时间排序，确保顺序一致
                messages = sorted(messages, key=lambda msg: msg.time or 0)
                logger.info(
                    f"聊天流 {self.chat_name} 提取到 {len(messages)} 条消息，"
                    f"时间窗口: {extraction_start_time:.2f} - {extraction_end_time:.2f}"
                )
                # 触发 expression_learner 和 jargon_miner 的处理
                if self.enable_expression_learning:
                    asyncio.create_task(self._trigger_expression_learning(messages))
            except Exception as e:
                logger.error(f"为聊天流 {self.chat_name} 提取和分发消息失败: {e}")
                import traceback
                traceback.print_exc()
                # 即使失败也保持时间戳更新，避免频繁重试
    async def _trigger_expression_learning(self, messages: List[Any]) -> None:
        """
        触发 expression 学习，使用指定的消息列表
        Args:
            timestamp_start: 开始时间戳
            timestamp_end: 结束时间戳
            messages: 消息列表
        """
        try:
            # 传递消息给 ExpressionLearner（必需参数）
            learnt_style = await self.expression_learner.learn_and_store(messages=messages)
            if learnt_style:
                logger.info(f"聊天流 {self.chat_name} 表达学习完成")
            else:
                logger.debug(f"聊天流 {self.chat_name} 表达学习未获得有效结果")
        except Exception as e:
            logger.error(f"为聊天流 {self.chat_name} 触发表达学习失败: {e}")
            import traceback
            traceback.print_exc()
 class MessageRecorderManager:
    """MessageRecorder 管理器"""
    def __init__(self) -> None:
        self._recorders: dict[str, MessageRecorder] = {}
    def get_recorder(self, chat_id: str) -> MessageRecorder:
        """获取或创建指定 chat_id 的 MessageRecorder"""
        if chat_id not in self._recorders:
            self._recorders[chat_id] = MessageRecorder(chat_id)
        return self._recorders[chat_id]
 # 全局管理器实例
 recorder_manager = MessageRecorderManager()
 async def extract_and_distribute_messages(chat_id: str) -> None:
    """
    统一的消息提取和分发入口函数
    Args:
        chat_id: 聊天流ID
    """
    recorder = recorder_manager.get_recorder(chat_id)
    await recorder.extract_and_distribute()