feat：新增自动表达优化功能，优化表达方式的提取

2025-12-27 17:20:11 +08:00
parent ba9b9d26a2
commit 99665e7918
14 changed files with 1177 additions and 837 deletions
--- a/src/bw_learner/expression_auto_check_task.py
+++ b/src/bw_learner/expression_auto_check_task.py
@@ -0,0 +1,235 @@
+"""
+表达方式自动检查定时任务
+
+功能：
+1. 定期随机选取指定数量的表达方式
+2. 使用LLM进行评估
+3. 通过评估的：rejected=0, checked=1
+4. 未通过评估的：rejected=1, checked=1
+"""
+
+import asyncio
+import json
+import random
+import re
+from typing import List, Optional
+
+from src.common.database.database_model import Expression
+from src.common.logger import get_logger
+from src.config.config import global_config
+from src.config.config import model_config
+from src.llm_models.utils_model import LLMRequest
+from src.manager.async_task_manager import AsyncTask
+
+logger = get_logger("expression_auto_check_task")
+
+
+def create_evaluation_prompt(situation: str, style: str) -> str:
+    """
+    Build the prompt that asks the LLM to judge one expression.
+
+    Args:
+        situation: The usage condition / scenario, interpolated verbatim.
+        style: The expression or speaking style, interpolated verbatim.
+
+    Returns:
+        The prompt text. It instructs the model to answer with a JSON object
+        holding a boolean "suitable" and a string "reason".
+    """
+    # The doubled braces below are literal braces in the f-string (the JSON skeleton).
+    return f"""请评估以下表达方式或语言风格以及使用条件或使用情景是否合适：
+使用条件或使用情景：{situation}
+表达方式或言语风格：{style}
+
+请从以下方面进行评估：
+1. 表达方式或言语风格 是否与使用条件或使用情景 匹配
+2. 允许部分语法错误或口头化或缺省出现
+3. 表达方式不能太过特指，需要具有泛用性
+4. 一般不涉及具体的人名或名称
+
+请以JSON格式输出评估结果：
+{{
+    "suitable": true/false,
+    "reason": "评估理由（如果不合适，请说明原因）"
+
+}}
+如果合适，suitable设为true；如果不合适，suitable设为false，并在reason中说明原因。
+请严格按照JSON格式输出，不要包含其他内容。"""
+
+# Module-level LLM client; single_expression_check sends its evaluation prompts through it.
+judge_llm = LLMRequest(model_set=model_config.model_task_config.tool_use, request_type="expression_check")
+
+def _parse_evaluation_response(response: str) -> tuple[bool, str]:
+    """
+    Extract ``(suitable, reason)`` from the raw LLM reply.
+
+    Raises:
+        ValueError: If no JSON object can be recovered from the reply.
+        json.JSONDecodeError: If the recovered fragment is still not valid JSON.
+    """
+    if not isinstance(response, str) or not response.strip():
+        raise ValueError("无法从响应中提取JSON格式的评估结果")
+
+    try:
+        evaluation = json.loads(response)
+    except json.JSONDecodeError as e:
+        # The reply may wrap the JSON in prose or code fences; fall back to the
+        # first brace-delimited, non-nested fragment that mentions "suitable".
+        json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL)
+        if json_match is None:
+            raise ValueError("无法从响应中提取JSON格式的评估结果") from e
+        evaluation = json.loads(json_match.group())
+
+    if not isinstance(evaluation, dict):
+        raise ValueError("无法从响应中提取JSON格式的评估结果")
+
+    raw_suitable = evaluation.get("suitable", False)
+    if isinstance(raw_suitable, str):
+        # A quoted "false" is a truthy string in Python; compare the text instead.
+        suitable = raw_suitable.strip().casefold() == "true"
+    else:
+        suitable = bool(raw_suitable)
+
+    reason = evaluation.get("reason", "未提供理由")
+    if not isinstance(reason, str):
+        # Callers slice the reason for logging, so it has to be a string.
+        reason = "未提供理由" if reason is None else str(reason)
+
+    return suitable, reason
+
+
+async def single_expression_check(situation: str, style: str) -> tuple[bool, str, Optional[str]]:
+    """
+    Run one LLM evaluation of a (situation, style) pair.
+
+    Args:
+        situation: The usage condition / scenario of the expression.
+        style: The expression or speaking style to judge.
+
+    Returns:
+        ``(suitable, reason, error)``. ``error`` is ``None`` on success. When
+        anything fails, ``suitable`` is ``False``, ``reason`` describes the
+        failure, and ``error`` is a non-empty description of the exception.
+    """
+    try:
+        prompt = create_evaluation_prompt(situation, style)
+        logger.debug(f"正在评估表达方式: situation={situation}, style={style}")
+
+        # Only the response text is needed; the accompanying metadata is ignored.
+        response, _ = await judge_llm.generate_response_async(
+            prompt=prompt,
+            temperature=0.6,
+            max_tokens=1024,
+        )
+
+        logger.debug(f"LLM响应: {response}")
+
+        suitable, reason = _parse_evaluation_response(response)
+
+        logger.debug(f"评估结果: {'通过' if suitable else '不通过'}")
+        return suitable, reason, None
+
+    except Exception as e:
+        # Some exceptions stringify to "", which callers would read as "no error";
+        # fall back to the exception class name so the error slot is never empty.
+        error_text = str(e) or type(e).__name__
+        logger.error(f"评估表达方式 (situation={situation}, style={style}) 时出错: {error_text}")
+        return False, f"评估过程出错: {error_text}", error_text
+
+
+class ExpressionAutoCheckTask(AsyncTask):
+    """
+    Scheduled task that has the LLM vet randomly chosen, not-yet-checked expressions.
+
+    A judged expression is stored with ``checked=True`` and ``rejected = not suitable``.
+    An expression whose evaluation errored is left untouched so a later run can retry it.
+    """
+
+    def __init__(self):
+        # Only the run interval is fixed here; the batch size is re-read from config on every run.
+        check_interval = global_config.expression.expression_auto_check_interval
+        super().__init__(
+            task_name="Expression Auto Check Task",
+            wait_before_start=60,  # wait 60 seconds after startup before the first check
+            run_interval=check_interval,
+        )
+
+    async def _select_expressions(self, count: int) -> List[Expression]:
+        """
+        Randomly pick up to ``count`` expressions whose ``checked`` flag is false.
+
+        Args:
+            count: Maximum number of expressions to pick.
+
+        Returns:
+            The selected expressions; an empty list when none are unchecked or
+            when selection fails.
+        """
+        try:
+            # Load every unchecked row, then sample in memory.
+            unevaluated_expressions = list(
+                Expression.select().where(~Expression.checked)
+            )
+
+            if not unevaluated_expressions:
+                logger.info("没有未检查的表达方式")
+                return []
+
+            selected_count = min(count, len(unevaluated_expressions))
+            selected = random.sample(unevaluated_expressions, selected_count)
+
+            logger.info(f"从 {len(unevaluated_expressions)} 条未检查表达方式中随机选择了 {selected_count} 条")
+            return selected
+
+        except Exception as e:
+            logger.error(f"选择表达方式时出错: {e}")
+            return []
+
+    async def _check_and_store(self, expression: Expression) -> Optional[bool]:
+        """
+        Evaluate one expression and persist the verdict.
+
+        Args:
+            expression: The expression to evaluate.
+
+        Returns:
+            ``True`` if it passed, ``False`` if it was rejected, ``None`` if the
+            evaluation or the database update failed.
+        """
+        suitable, reason, error = await single_expression_check(
+            expression.situation,
+            expression.style,
+        )
+
+        if error is not None:
+            # A failed evaluation is not a verdict: keep the row unchecked so it
+            # can be picked again instead of being stored as rejected.
+            logger.warning(f"表达方式评估时出现错误 [ID: {expression.id}]: {error}")
+            return None
+
+        try:
+            expression.checked = True
+            expression.rejected = not suitable  # passed -> rejected=0, failed -> rejected=1
+            expression.save()
+        except Exception as e:
+            logger.error(f"更新表达方式状态失败 [ID: {expression.id}]: {e}")
+            return None
+
+        status = "通过" if suitable else "不通过"
+        logger.info(
+            f"表达方式评估完成 [ID: {expression.id}] - {status} | "
+            f"Situation: {expression.situation}... | "
+            f"Style: {expression.style}... | "
+            f"Reason: {str(reason)[:50]}..."
+        )
+        return bool(suitable)
+
+    async def _evaluate_expression(self, expression: Expression) -> bool:
+        """
+        Evaluate a single expression.
+
+        Args:
+            expression: The expression to evaluate.
+
+        Returns:
+            ``True`` if the expression passed; ``False`` if it was rejected or
+            could not be evaluated or saved.
+        """
+        return await self._check_and_store(expression) is True
+
+    async def run(self):
+        """Run one check cycle: pick a batch, evaluate each item, log a summary."""
+        try:
+            # Skip the whole cycle when the feature is switched off.
+            if not global_config.expression.expression_self_reflect:
+                logger.debug("表达方式自动检查未启用，跳过本次执行")
+                return
+
+            check_count = global_config.expression.expression_auto_check_count
+            if check_count <= 0:
+                logger.warning(f"检查数量配置无效: {check_count}，跳过本次执行")
+                return
+
+            logger.info(f"开始执行表达方式自动检查，本次将检查 {check_count} 条")
+
+            expressions = await self._select_expressions(check_count)
+
+            if not expressions:
+                logger.info("没有需要检查的表达方式")
+                return
+
+            passed_count = 0
+            failed_count = 0
+            error_count = 0
+
+            for i, expression in enumerate(expressions, 1):
+                logger.info(f"正在评估 [{i}/{len(expressions)}]: ID={expression.id}")
+
+                outcome = await self._check_and_store(expression)
+                if outcome is None:
+                    error_count += 1
+                elif outcome:
+                    passed_count += 1
+                else:
+                    failed_count += 1
+
+                # Throttle consecutive LLM requests.
+                await asyncio.sleep(0.3)
+
+            logger.info(
+                f"表达方式自动检查完成: 总计 {len(expressions)} 条，"
+                f"通过 {passed_count} 条，不通过 {failed_count} 条，出错 {error_count} 条"
+            )
+
+        except Exception as e:
+            logger.error(f"执行表达方式自动检查任务时出错: {e}", exc_info=True)
+
--- a/src/bw_learner/expression_learner.py
+++ b/src/bw_learner/expression_learner.py
@@ -18,10 +18,13 @@ from src.bw_learner.learner_utils import (
    is_bot_message,
    build_context_paragraph,
    contains_bot_self_name,
-    calculate_style_similarity,
+    calculate_similarity,
+    parse_expression_response,
 )
 from src.bw_learner.jargon_miner import miner_manager
-from json_repair import repair_json
+from src.bw_learner.expression_auto_check_task import (
+    single_expression_check,
+)


 # MAX_EXPRESSION_COUNT = 300
@@ -91,6 +94,7 @@ class ExpressionLearner:
        self.summary_model: LLMRequest = LLMRequest(
            model_set=model_config.model_task_config.utils, request_type="expression.summary"
        )
+        self.check_model: Optional[LLMRequest] = None  # 检查用的 LLM 实例，延迟初始化
        self.chat_id = chat_id
        self.chat_stream = get_chat_manager().get_stream(chat_id)
        self.chat_name = get_chat_manager().get_stream_name(chat_id) or chat_id
@@ -136,11 +140,10 @@ class ExpressionLearner:
        # 解析 LLM 返回的表达方式列表和黑话列表（包含来源行编号）
        expressions: List[Tuple[str, str, str]]
        jargon_entries: List[Tuple[str, str]]  # (content, source_id)
-        expressions, jargon_entries = self.parse_expression_response(response)
-        expressions = self._filter_self_reference_styles(expressions)
+        expressions, jargon_entries = parse_expression_response(response)

        # 检查表达方式数量，如果超过10个则放弃本次表达学习
-        if len(expressions) > 10:
+        if len(expressions) > 20:
            logger.info(f"表达方式提取数量超过10个（实际{len(expressions)}个），放弃本次表达学习")
            expressions = []

@@ -155,7 +158,7 @@ class ExpressionLearner:

        # 如果没有表达方式，直接返回
        if not expressions:
-            logger.info("过滤后没有可用的表达方式（style 与机器人名称重复）")
+            logger.info("解析后没有可用的表达方式")
            return []

        logger.info(f"学习的prompt: {prompt}")
@@ -163,9 +166,60 @@ class ExpressionLearner:
        logger.info(f"学习的jargon_entries: {jargon_entries}")
        logger.info(f"学习的response: {response}")

-        # 直接根据 source_id 在 random_msg 中溯源，获取 context
+        # 过滤表达方式，根据 source_id 溯源并应用各种过滤规则
+        learnt_expressions = self._filter_expressions(expressions, random_msg)
+
+        if learnt_expressions is None:
+            logger.info("没有学习到表达风格")
+            return []
+
+        # 展示学到的表达方式
+        learnt_expressions_str = ""
+        for (situation,style) in learnt_expressions:
+            learnt_expressions_str += f"{situation}->{style}\n"
+        logger.info(f"在 {self.chat_name} 学习到表达风格:\n{learnt_expressions_str}")
+
+        current_time = time.time()
+
+        # 存储到数据库 Expression 表
+        for (situation,style) in learnt_expressions:
+            await self._upsert_expression_record(
+                situation=situation,
+                style=style,
+                current_time=current_time,
+            )
+
+        return learnt_expressions
+
+    def _filter_expressions(
+        self,
+        expressions: List[Tuple[str, str, str]],
+        messages: List[Any],
+    ) -> List[Tuple[str, str, str]]:
+        """
+        过滤表达方式，移除不符合条件的条目
+        
+        Args:
+            expressions: 表达方式列表，每个元素是 (situation, style, source_id)
+            messages: 原始消息列表，用于溯源和验证
+            
+        Returns:
+            Filtered list of expressions; each element is a (situation, style) pair
+        """
        filtered_expressions: List[Tuple[str, str, str]] = []  # (situation, style, context)

+        # 准备机器人名称集合（用于过滤 style 与机器人名称重复的表达）
+        banned_names = set()
+        bot_nickname = (global_config.bot.nickname or "").strip()
+        if bot_nickname:
+            banned_names.add(bot_nickname)
+        alias_names = global_config.bot.alias_names or []
+        for alias in alias_names:
+            alias = alias.strip()
+            if alias:
+                banned_names.add(alias)
+        banned_casefold = {name.casefold() for name in banned_names if name}
+
        for situation, style, source_id in expressions:
            source_id_str = (source_id or "").strip()
            if not source_id_str.isdigit():
@@ -173,12 +227,12 @@ class ExpressionLearner:
                continue

            line_index = int(source_id_str) - 1  # build_anonymous_messages 的编号从 1 开始
-            if line_index < 0 or line_index >= len(random_msg):
+            if line_index < 0 or line_index >= len(messages):
                # 超出范围，跳过
                continue

            # 当前行的原始内容
-            current_msg = random_msg[line_index]
+            current_msg = messages[line_index]

            # 过滤掉从bot自己发言中提取到的表达方式
            if is_bot_message(current_msg):
@@ -195,251 +249,53 @@ class ExpressionLearner:
                )
                continue

-            filtered_expressions.append((situation, style, context))
-
-        learnt_expressions = filtered_expressions
-
-        if learnt_expressions is None:
-            logger.info("没有学习到表达风格")
-            return []
-
-        # 展示学到的表达方式
-        learnt_expressions_str = ""
-        for (
-            situation,
-            style,
-            _context,
-        ) in learnt_expressions:
-            learnt_expressions_str += f"{situation}->{style}\n"
-        logger.info(f"在 {self.chat_name} 学习到表达风格:\n{learnt_expressions_str}")
-
-        current_time = time.time()
-
-        # 存储到数据库 Expression 表
-        for (
-            situation,
-            style,
-            context,
-        ) in learnt_expressions:
-            await self._upsert_expression_record(
-                situation=situation,
-                style=style,
-                context=context,
-                current_time=current_time,
-            )
-
-        return learnt_expressions
-
-    def parse_expression_response(self, response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
-        """
-        解析 LLM 返回的表达风格总结和黑话 JSON，提取两个列表。
-
-        期望的 JSON 结构：
-        [
-          {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},  // 表达方式
-          {"content": "词条", "source_id": "12"},  // 黑话
-          ...
-        ]
-
-        Returns:
-            Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
-                第一个列表是表达方式 (situation, style, source_id)
-                第二个列表是黑话 (content, source_id)
-        """
-        if not response:
-            return [], []
-
-        raw = response.strip()
-
-        # 尝试提取 ```json 代码块
-        json_block_pattern = r"```json\s*(.*?)\s*```"
-        match = re.search(json_block_pattern, raw, re.DOTALL)
-        if match:
-            raw = match.group(1).strip()
-        else:
-            # 去掉可能存在的通用 ``` 包裹
-            raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
-            raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
-            raw = raw.strip()
-
-        parsed = None
-        expressions: List[Tuple[str, str, str]] = []  # (situation, style, source_id)
-        jargon_entries: List[Tuple[str, str]] = []  # (content, source_id)
-
-        try:
-            # 优先尝试直接解析
-            if raw.startswith("[") and raw.endswith("]"):
-                parsed = json.loads(raw)
-            else:
-                repaired = repair_json(raw)
-                if isinstance(repaired, str):
-                    parsed = json.loads(repaired)
-                else:
-                    parsed = repaired
-        except Exception as parse_error:
-            # 如果解析失败，尝试修复中文引号问题
-            # 使用状态机方法，在 JSON 字符串值内部将中文引号替换为转义的英文引号
-            try:
-
-                def fix_chinese_quotes_in_json(text):
-                    """使用状态机修复 JSON 字符串值中的中文引号"""
-                    result = []
-                    i = 0
-                    in_string = False
-                    escape_next = False
-
-                    while i < len(text):
-                        char = text[i]
-
-                        if escape_next:
-                            # 当前字符是转义字符后的字符，直接添加
-                            result.append(char)
-                            escape_next = False
-                            i += 1
-                            continue
-
-                        if char == "\\":
-                            # 转义字符
-                            result.append(char)
-                            escape_next = True
-                            i += 1
-                            continue
-
-                        if char == '"' and not escape_next:
-                            # 遇到英文引号，切换字符串状态
-                            in_string = not in_string
-                            result.append(char)
-                            i += 1
-                            continue
-
-                        if in_string:
-                            # 在字符串值内部，将中文引号替换为转义的英文引号
-                            if char == '"':  # 中文左引号 U+201C
-                                result.append('\\"')
-                            elif char == '"':  # 中文右引号 U+201D
-                                result.append('\\"')
-                            else:
-                                result.append(char)
-                        else:
-                            # 不在字符串内，直接添加
-                            result.append(char)
-
-                        i += 1
-
-                    return "".join(result)
-
-                fixed_raw = fix_chinese_quotes_in_json(raw)
-
-                # 再次尝试解析
-                if fixed_raw.startswith("[") and fixed_raw.endswith("]"):
-                    parsed = json.loads(fixed_raw)
-                else:
-                    repaired = repair_json(fixed_raw)
-                    if isinstance(repaired, str):
-                        parsed = json.loads(repaired)
-                    else:
-                        parsed = repaired
-            except Exception as fix_error:
-                logger.error(f"解析表达风格 JSON 失败，初始错误: {type(parse_error).__name__}: {str(parse_error)}")
-                logger.error(f"修复中文引号后仍失败，错误: {type(fix_error).__name__}: {str(fix_error)}")
-                logger.error(f"解析表达风格 JSON 失败，原始响应：{response}")
-                logger.error(f"处理后的 JSON 字符串（前500字符）：{raw[:500]}")
-                return [], []
-
-        if isinstance(parsed, dict):
-            parsed_list = [parsed]
-        elif isinstance(parsed, list):
-            parsed_list = parsed
-        else:
-            logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
-            return [], []
-
-        for item in parsed_list:
-            if not isinstance(item, dict):
+            # 过滤掉 style 与机器人名称/昵称重复的表达
+            normalized_style = (style or "").strip()
+            if normalized_style and normalized_style.casefold() in banned_casefold:
+                logger.debug(
+                    f"跳过 style 与机器人名称重复的表达方式: situation={situation}, style={style}, source_id={source_id}"
+                )
                continue

-            # 检查是否是表达方式条目（有 situation 和 style）
-            situation = str(item.get("situation", "")).strip()
-            style = str(item.get("style", "")).strip()
-            source_id = str(item.get("source_id", "")).strip()
+            # 过滤掉包含 "表情：" 或 "表情:" 的内容
+            if "表情：" in (situation or "") or "表情:" in (situation or "") or \
+               "表情：" in (style or "") or "表情:" in (style or "") or \
+               "表情：" in context or "表情:" in context:
+                logger.info(
+                    f"跳过包含表情标记的表达方式: situation={situation}, style={style}, source_id={source_id}"
+                )
+                continue

-            if situation and style and source_id:
-                # 表达方式条目
-                expressions.append((situation, style, source_id))
-            elif item.get("content"):
-                # 黑话条目（有 content 字段）
-                content = str(item.get("content", "")).strip()
-                source_id = str(item.get("source_id", "")).strip()
-                if content and source_id:
-                    jargon_entries.append((content, source_id))
+            # 过滤掉包含 "[图片" 的内容
+            if "[图片" in (situation or "") or "[图片" in (style or "") or "[图片" in context:
+                logger.info(
+                    f"跳过包含图片标记的表达方式: situation={situation}, style={style}, source_id={source_id}"
+                )
+                continue

-        return expressions, jargon_entries
+            filtered_expressions.append((situation, style))

-    def _filter_self_reference_styles(self, expressions: List[Tuple[str, str, str]]) -> List[Tuple[str, str, str]]:
-        """
-        过滤掉style与机器人名称/昵称重复的表达
-        """
-        banned_names = set()
-        bot_nickname = (global_config.bot.nickname or "").strip()
-        if bot_nickname:
-            banned_names.add(bot_nickname)
-
-        alias_names = global_config.bot.alias_names or []
-        for alias in alias_names:
-            alias = alias.strip()
-            if alias:
-                banned_names.add(alias)
-
-        banned_casefold = {name.casefold() for name in banned_names if name}
-
-        filtered: List[Tuple[str, str, str]] = []
-        removed_count = 0
-        for situation, style, source_id in expressions:
-            normalized_style = (style or "").strip()
-            if normalized_style and normalized_style.casefold() not in banned_casefold:
-                filtered.append((situation, style, source_id))
-            else:
-                removed_count += 1
-
-        if removed_count:
-            logger.debug(f"已过滤 {removed_count} 条style与机器人名称重复的表达方式")
-
-        return filtered
+        return filtered_expressions

    async def _upsert_expression_record(
        self,
        situation: str,
        style: str,
-        context: str,
        current_time: float,
    ) -> None:
-        # 第一层：检查是否有完全一致的 style（检查 style 字段和 style_list）
-        expr_obj = await self._find_exact_style_match(style)
+        # 检查是否有相似的 situation（相似度 >= 0.75，检查 content_list）
+        # 完全匹配（相似度 == 1.0）和相似匹配（相似度 >= 0.75）统一处理
+        expr_obj, similarity = await self._find_similar_situation_expression(situation, similarity_threshold=0.75)

        if expr_obj:
-            # 找到完全匹配的 style，合并到现有记录（不使用 LLM 总结）
+            # 根据相似度决定是否使用 LLM 总结
+            # 完全匹配（相似度 == 1.0）时不总结，相似匹配时总结
+            use_llm_summary = similarity < 1.0
            await self._update_existing_expression(
                expr_obj=expr_obj,
                situation=situation,
-                style=style,
-                context=context,
                current_time=current_time,
-                use_llm_summary=False,
-            )
-            return
-
-        # 第二层：检查是否有相似的 style（相似度 >= 0.75，检查 style 字段和 style_list）
-        similar_expr_obj = await self._find_similar_style_expression(style, similarity_threshold=0.75)
-
-        if similar_expr_obj:
-            # 找到相似的 style，合并到现有记录（使用 LLM 总结）
-            await self._update_existing_expression(
-                expr_obj=similar_expr_obj,
-                situation=situation,
-                style=style,
-                context=context,
-                current_time=current_time,
-                use_llm_summary=True,
+                use_llm_summary=use_llm_summary,
            )
            return

@@ -447,7 +303,6 @@ class ExpressionLearner:
        await self._create_expression_record(
            situation=situation,
            style=style,
-            context=context,
            current_time=current_time,
        )

@@ -455,7 +310,6 @@ class ExpressionLearner:
        self,
        situation: str,
        style: str,
-        context: str,
        current_time: float,
    ) -> None:
        content_list = [situation]
@@ -466,26 +320,22 @@ class ExpressionLearner:
            situation=formatted_situation,
            style=style,
            content_list=json.dumps(content_list, ensure_ascii=False),
-            style_list=None,  # 新记录初始时 style_list 为空
            count=1,
            last_active_time=current_time,
            chat_id=self.chat_id,
            create_date=current_time,
-            context=context,
        )

    async def _update_existing_expression(
        self,
        expr_obj: Expression,
        situation: str,
-        style: str,
-        context: str,
        current_time: float,
        use_llm_summary: bool = True,
    ) -> None:
        """
-        更新现有 Expression 记录（style 完全匹配或相似的情况）
-        将新的 situation 添加到 content_list，将新的 style 添加到 style_list（如果不同）
+        更新现有 Expression 记录（situation 完全匹配或相似的情况）
+        将新的 situation 添加到 content_list，不合并 style
        
        Args:
            use_llm_summary: 是否使用 LLM 进行总结，完全匹配时为 False，相似匹配时为 True
@@ -495,43 +345,24 @@ class ExpressionLearner:
        content_list.append(situation)
        expr_obj.content_list = json.dumps(content_list, ensure_ascii=False)

-        # 更新 style_list（如果 style 不同，添加到 style_list）
-        style_list = self._parse_style_list(expr_obj.style_list)
-        # 将原有的 style 也加入 style_list（如果还没有的话）
-        if expr_obj.style and expr_obj.style not in style_list:
-            style_list.append(expr_obj.style)
-        # 如果新的 style 不在 style_list 中，添加它
-        if style not in style_list:
-            style_list.append(style)
-        expr_obj.style_list = json.dumps(style_list, ensure_ascii=False)
-
        # 更新其他字段
        expr_obj.count = (expr_obj.count or 0) + 1
+        expr_obj.checked = False  # count 增加时重置 checked 为 False
        expr_obj.last_active_time = current_time
-        expr_obj.context = context

        if use_llm_summary:
-            # 相似匹配时，使用 LLM 重新组合 situation 和 style
+            # 相似匹配时，使用 LLM 重新组合 situation
            new_situation = await self._compose_situation_text(
                content_list=content_list,
-                count=expr_obj.count,
                fallback=expr_obj.situation,
            )
            expr_obj.situation = new_situation

-            new_style = await self._compose_style_text(
-                style_list=style_list,
-                count=expr_obj.count,
-                fallback=expr_obj.style or style,
-            )
-            expr_obj.style = new_style
-        else:
-            # 完全匹配时，不进行 LLM 总结，保持原有的 situation 和 style 不变
-            # 只更新 content_list 和 style_list
-            pass
-
        expr_obj.save()

+        # count 增加后，立即进行一次检查
+        await self._check_expression_immediately(expr_obj)
+
    def _parse_content_list(self, stored_list: Optional[str]) -> List[str]:
        if not stored_list:
            return []
@@ -541,49 +372,19 @@ class ExpressionLearner:
            return []
        return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else []

-    def _parse_style_list(self, stored_list: Optional[str]) -> List[str]:
-        """解析 style_list JSON 字符串为列表，逻辑与 _parse_content_list 相同"""
-        if not stored_list:
-            return []
-        try:
-            data = json.loads(stored_list)
-        except json.JSONDecodeError:
-            return []
-        return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else []
-
-    async def _find_exact_style_match(self, style: str) -> Optional[Expression]:
+    async def _find_similar_situation_expression(self, situation: str, similarity_threshold: float = 0.75) -> Tuple[Optional[Expression], float]:
        """
-        查找具有完全匹配 style 的 Expression 记录
-        只检查 style_list 中的每一项（不检查 style 字段，因为 style 可能是总结后的概括性描述）
+        查找具有相似 situation 的 Expression 记录
+        检查 content_list 中的每一项
        
        Args:
-            style: 要查找的 style
-            
-        Returns:
-            找到的 Expression 对象，如果没有找到则返回 None
-        """
-        # 查询同一 chat_id 的所有记录
-        all_expressions = Expression.select().where(Expression.chat_id == self.chat_id)
-        
-        for expr in all_expressions:
-            # 只检查 style_list 中的每一项
-            style_list = self._parse_style_list(expr.style_list)
-            if style in style_list:
-                return expr
-        
-        return None
-
-    async def _find_similar_style_expression(self, style: str, similarity_threshold: float = 0.75) -> Optional[Expression]:
-        """
-        查找具有相似 style 的 Expression 记录
-        只检查 style_list 中的每一项（不检查 style 字段，因为 style 可能是总结后的概括性描述）
-        
-        Args:
-            style: 要查找的 style
+            situation: 要查找的 situation
            similarity_threshold: 相似度阈值，默认 0.75
            
        Returns:
-            找到的最相似的 Expression 对象，如果没有找到则返回 None
+            Tuple[Optional[Expression], float]: 
+                - 找到的最相似的 Expression 对象，如果没有找到则返回 None
+                - 相似度值（如果找到匹配，范围在 similarity_threshold 到 1.0 之间）
        """
        # 查询同一 chat_id 的所有记录
        all_expressions = Expression.select().where(Expression.chat_id == self.chat_id)
@@ -592,96 +393,28 @@ class ExpressionLearner:
        best_similarity = 0.0
        
        for expr in all_expressions:
-            # 只检查 style_list 中的每一项
-            style_list = self._parse_style_list(expr.style_list)
-            for existing_style in style_list:
-                similarity = calculate_style_similarity(style, existing_style)
+            # 检查 content_list 中的每一项
+            content_list = self._parse_content_list(expr.content_list)
+            for existing_situation in content_list:
+                similarity = calculate_similarity(situation, existing_situation)
                if similarity >= similarity_threshold and similarity > best_similarity:
                    best_similarity = similarity
                    best_match = expr
        
        if best_match:
-            logger.debug(f"找到相似的 style: 相似度={best_similarity:.3f}, 现有='{best_match.style}', 新='{style}'")
+            logger.debug(f"找到相似的 situation: 相似度={best_similarity:.3f}, 现有='{best_match.situation}', 新='{situation}'")
        
-        return best_match
+        return best_match, best_similarity

-    async def _compose_situation_text(self, content_list: List[str], count: int, fallback: str = "") -> str:
+    async def _compose_situation_text(self, content_list: List[str], fallback: str = "") -> str:
        sanitized = [c.strip() for c in content_list if c.strip()]
-        summary = await self._summarize_situations(sanitized)
-        if summary:
-            return summary
-        return "/".join(sanitized) if sanitized else fallback
-
-    async def _compose_style_text(self, style_list: List[str], count: int, fallback: str = "") -> str:
-        """
-        组合 style 文本，如果 style_list 有多个元素则尝试总结
-        """
-        sanitized = [s.strip() for s in style_list if s.strip()]
-        if len(sanitized) > 1:
-            # 只有当有多个 style 时才尝试总结
-            summary = await self._summarize_styles(sanitized)
-            if summary:
-                return summary
-        # 如果只有一个或总结失败，返回第一个或 fallback
-        return sanitized[0] if sanitized else fallback
-
-    async def _summarize_styles(self, styles: List[str]) -> Optional[str]:
-        """总结多个 style，生成一个概括性的 style 描述"""
-        if not styles or len(styles) <= 1:
-            return None
-
-        # 计算输入列表中最长项目的长度
-        max_input_length = max(len(s) for s in styles) if styles else 0
-        max_summary_length = max_input_length * 2
-        
-        # 最多重试3次
-        max_retries = 3
-        retry_count = 0
-        
-        while retry_count < max_retries:
-            # 如果是重试，在 prompt 中强调要更简洁
-            length_hint = f"长度不超过{max_summary_length}个字符，" if retry_count > 0 else "长度不超过20个字，"
-            
-            prompt = (
-                "请阅读以下多个语言风格/表达方式，对其进行总结。"
-                "不要对其进行语义概括，而是尽可能找出其中不变的部分或共同表达，尽量使用原文"
-                f"{length_hint}保留共同特点：\n"
-                f"{chr(10).join(f'- {s}' for s in styles[-10:])}\n只输出概括内容。不要输出其他内容"
-            )
-
-            try:
-                summary, _ = await self.summary_model.generate_response_async(prompt, temperature=0.2)
-                summary = summary.strip()
-                if summary:
-                    # 检查总结长度是否超过限制
-                    if len(summary) <= max_summary_length:
-                        return summary
-                    else:
-                        retry_count += 1
-                        logger.debug(
-                            f"总结长度 {len(summary)} 超过限制 {max_summary_length} "
-                            f"(输入最长项长度: {max_input_length})，重试第 {retry_count} 次"
-                        )
-                        continue
-            except Exception as e:
-                logger.error(f"概括表达风格失败: {e}")
-                return None
-        
-        # 如果重试多次后仍然超过长度，返回 None（不进行总结）
-        logger.warning(
-            f"总结多次后仍超过长度限制，放弃总结。"
-            f"输入最长项长度: {max_input_length}, 最大允许长度: {max_summary_length}"
-        )
-        return None
-
-    async def _summarize_situations(self, situations: List[str]) -> Optional[str]:
-        if not situations:
-            return None
+        if not sanitized:
+            return fallback

        prompt = (
            "请阅读以下多个聊天情境描述，并将它们概括成一句简短的话，"
            "长度不超过20个字，保留共同特点：\n"
-            f"{chr(10).join(f'- {s}' for s in situations[-10:])}\n只输出概括内容。"
+            f"{chr(10).join(f'- {s}' for s in sanitized[-10:])}\n只输出概括内容。"
        )

        try:
@@ -691,7 +424,64 @@ class ExpressionLearner:
                return summary
        except Exception as e:
            logger.error(f"概括表达情境失败: {e}")
-        return None
+        return "/".join(sanitized) if sanitized else fallback
+
+    async def _init_check_model(self) -> None:
+        """Lazily create the LLM client used for expression checks; on failure, log and leave it as None."""
+        if self.check_model is None:
+            try:
+                self.check_model = LLMRequest(
+                    model_set=model_config.model_task_config.tool_use,
+                    request_type="expression.check"
+                )
+                logger.debug("检查用 LLM 实例初始化成功")
+            except Exception as e:
+                logger.error(f"创建检查用 LLM 实例失败: {e}")
+
+    async def _check_expression_immediately(self, expr_obj: Expression) -> None:
+        """
+        Run the LLM suitability check on one expression right away (called after its count grows).
+
+        Args:
+            expr_obj: Expression row to evaluate; its checked/rejected flags are updated and saved.
+        """
+        try:
+            # Feature switch: do nothing unless expression self-reflection is enabled.
+            if not global_config.expression.expression_self_reflect:
+                logger.debug("表达方式自动检查未启用，跳过立即检查")
+                return
+
+            # Lazily create the checker LLM; bail out if that failed.
+            await self._init_check_model()
+            if self.check_model is None:
+                logger.warning("检查用 LLM 实例初始化失败，跳过立即检查")
+                return
+
+            suitable, reason, error = await single_expression_check(
+                expr_obj.situation,
+                expr_obj.style
+            )
+
+            # An evaluation error is not a verdict: persist nothing so checked stays False and a
+            # later check can retry, instead of recording a rejection caused by a failed LLM call.
+            if error:
+                logger.warning(f"表达方式立即检查时出现错误 [ID: {expr_obj.id}]: {error}")
+                return
+
+            expr_obj.checked = True
+            expr_obj.rejected = not suitable  # passed -> rejected=False, failed -> rejected=True
+            expr_obj.save()
+
+            status = "通过" if suitable else "不通过"
+            logger.info(
+                f"表达方式立即检查完成 [ID: {expr_obj.id}] - {status} | "
+                f"Situation: {expr_obj.situation[:30]}... | "
+                f"Style: {expr_obj.style[:30]}... | "
+                f"Reason: {reason[:50] if reason else '无'}..."
+            )
+
+        except Exception as e:
+            logger.error(f"立即检查表达方式失败 [ID: {expr_obj.id}]: {e}", exc_info=True)

    async def _process_jargon_entries(self, jargon_entries: List[Tuple[str, str]], messages: List[Any]) -> None:
        """
--- a/src/bw_learner/expression_reflector.py
+++ b/src/bw_learner/expression_reflector.py
@@ -28,11 +28,11 @@ class ExpressionReflector:
        try:
            logger.debug(f"[Expression Reflection] 开始检查是否需要提问 (stream_id: {self.chat_id})")

-            if not global_config.expression.reflect:
+            if not global_config.expression.expression_self_reflect:
                logger.debug("[Expression Reflection] 表达反思功能未启用，跳过")
                return False

-            operator_config = global_config.expression.reflect_operator_id
+            operator_config = global_config.expression.manual_reflect_operator_id
            if not operator_config:
                logger.debug("[Expression Reflection] Operator ID 未配置，跳过")
                return False
--- a/src/bw_learner/expression_selector.py
+++ b/src/bw_learner/expression_selector.py
@@ -123,9 +123,11 @@ class ExpressionSelector:
            related_chat_ids = self.get_related_chat_ids(chat_id)

            # 查询所有相关chat_id的表达方式，排除 rejected=1 的，且只选择 count > 1 的
-            style_query = Expression.select().where(
-                (Expression.chat_id.in_(related_chat_ids)) & (~Expression.rejected) & (Expression.count > 1)
-            )
+            # 如果 expression_checked_only 为 True，则只选择 checked=True 且 rejected=False 的
+            base_conditions = (Expression.chat_id.in_(related_chat_ids)) & (~Expression.rejected) & (Expression.count > 1)
+            if global_config.expression.expression_checked_only:
+                base_conditions = base_conditions & (Expression.checked)
+            style_query = Expression.select().where(base_conditions)

            style_exprs = [
                {
@@ -202,7 +204,11 @@ class ExpressionSelector:
            related_chat_ids = self.get_related_chat_ids(chat_id)

            # 优化：一次性查询所有相关chat_id的表达方式，排除 rejected=1 的表达
-            style_query = Expression.select().where((Expression.chat_id.in_(related_chat_ids)) & (~Expression.rejected))
+            # 如果 expression_checked_only 为 True，则只选择 checked=True 且 rejected=False 的
+            base_conditions = (Expression.chat_id.in_(related_chat_ids)) & (~Expression.rejected)
+            if global_config.expression.expression_checked_only:
+                base_conditions = base_conditions & (Expression.checked)
+            style_query = Expression.select().where(base_conditions)

            style_exprs = [
                {
@@ -295,7 +301,11 @@ class ExpressionSelector:
            # think_level == 1: 先选高count，再从所有表达方式中随机抽样
            # 1. 获取所有表达方式并分离 count > 1 和 count <= 1 的
            related_chat_ids = self.get_related_chat_ids(chat_id)
-            style_query = Expression.select().where((Expression.chat_id.in_(related_chat_ids)) & (~Expression.rejected))
+            # 如果 expression_checked_only 为 True，则只选择 checked=True 且 rejected=False 的
+            base_conditions = (Expression.chat_id.in_(related_chat_ids)) & (~Expression.rejected)
+            if global_config.expression.expression_checked_only:
+                base_conditions = base_conditions & (Expression.checked)
+            style_query = Expression.select().where(base_conditions)

            all_style_exprs = [
                {
--- a/src/bw_learner/learner_utils.py
+++ b/src/bw_learner/learner_utils.py
@@ -2,8 +2,7 @@ import re
 import difflib
 import random
 import json
-from datetime import datetime
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Tuple

 from src.common.logger import get_logger
 from src.config.config import global_config
@@ -11,6 +10,7 @@ from src.chat.utils.chat_message_builder import (
    build_readable_messages,
 )
 from src.chat.utils.utils import parse_platform_accounts
+from json_repair import repair_json


 logger = get_logger("learner_utils")
@@ -88,33 +88,15 @@ def calculate_style_similarity(style1: str, style2: str) -> float:
    return difflib.SequenceMatcher(None, cleaned_style1, cleaned_style2).ratio()


-def format_create_date(timestamp: float) -> str:
-    """
-    将时间戳格式化为可读的日期字符串
-
-    Args:
-        timestamp: 时间戳
-
-    Returns:
-        str: 格式化后的日期字符串
-    """
-    try:
-        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")
-    except (ValueError, OSError):
-        return "未知时间"
-
-
 def _compute_weights(population: List[Dict]) -> List[float]:
    """
    根据表达的count计算权重，范围限定在1~5之间。
    count越高，权重越高，但最多为基础权重的5倍。
-    如果表达已checked，权重会再乘以3倍。
    """
    if not population:
        return []

    counts = []
-    checked_flags = []
    for item in population:
        count = item.get("count", 1)
        try:
@@ -122,29 +104,19 @@ def _compute_weights(population: List[Dict]) -> List[float]:
        except (TypeError, ValueError):
            count_value = 1.0
        counts.append(max(count_value, 0.0))
-        # 获取checked状态
-        checked = item.get("checked", False)
-        checked_flags.append(bool(checked))

    min_count = min(counts)
    max_count = max(counts)

    if max_count == min_count:
-        base_weights = [1.0 for _ in counts]
+        weights = [1.0 for _ in counts]
    else:
-        base_weights = []
+        weights = []
        for count_value in counts:
            # 线性映射到[1,5]区间
            normalized = (count_value - min_count) / (max_count - min_count)
-            base_weights.append(1.0 + normalized * 4.0)  # 1~5
+            weights.append(1.0 + normalized * 4.0)  # 1~5

-    # 如果checked，权重乘以3
-    weights = []
-    for base_weight, checked in zip(base_weights, checked_flags, strict=False):
-        if checked:
-            weights.append(base_weight * 3.0)
-        else:
-            weights.append(base_weight)
    return weights


@@ -378,3 +350,149 @@ def is_bot_message(msg: Any) -> bool:

    bot_account = bot_accounts.get(platform)
    return bool(bot_account and user_id == bot_account)
+
+
+def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
+    """
+    Parse the LLM's JSON reply (expression styles plus jargon) into two lists.
+
+    Expected JSON structure:
+    [
+        {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},  // expression entry
+        {"content": "term", "source_id": "12"},  // jargon entry
+        ...
+    ]
+
+    Returns:
+        Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
+            First list: expressions as (situation, style, source_id).
+            Second list: jargon entries as (content, source_id).
+    """
+    if not response:
+        return [], []
+
+    raw = response.strip()
+
+    # Prefer the contents of a ```json fenced block when one is present.
+    json_block_pattern = r"```json\s*(.*?)\s*```"
+    match = re.search(json_block_pattern, raw, re.DOTALL)
+    if match:
+        raw = match.group(1).strip()
+    else:
+        # Otherwise strip a generic ``` wrapper if there is one.
+        raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
+        raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
+        raw = raw.strip()
+
+    parsed = None
+    expressions: List[Tuple[str, str, str]] = []  # (situation, style, source_id)
+    jargon_entries: List[Tuple[str, str]] = []  # (content, source_id)
+
+    try:
+        # Parse directly when the text already looks like a JSON array; otherwise repair first.
+        if raw.startswith("[") and raw.endswith("]"):
+            parsed = json.loads(raw)
+        else:
+            repaired = repair_json(raw)
+            if isinstance(repaired, str):
+                parsed = json.loads(repaired)
+            else:
+                parsed = repaired
+    except Exception as parse_error:
+        # Parsing failed: retry after repairing curly (Chinese) quotation marks.
+        # A small state machine rewrites curly quotes inside JSON string values as escaped ASCII quotes.
+        try:
+
+            def fix_chinese_quotes_in_json(text):
+                """Escape curly quotes (U+201C / U+201D) that appear inside JSON string values."""
+                result = []
+                i = 0
+                in_string = False
+                escape_next = False
+
+                while i < len(text):
+                    char = text[i]
+
+                    if escape_next:
+                        # Character following a backslash: copy it verbatim.
+                        result.append(char)
+                        escape_next = False
+                        i += 1
+                        continue
+
+                    if char == "\\":
+                        # Backslash starts an escape sequence.
+                        result.append(char)
+                        escape_next = True
+                        i += 1
+                        continue
+
+                    if char == '"' and not escape_next:
+                        # An unescaped ASCII quote toggles the in-string state.
+                        in_string = not in_string
+                        result.append(char)
+                        i += 1
+                        continue
+
+                    if in_string:
+                        # Inside a string value: turn curly quotes into escaped ASCII quotes.
+                        if char == "\u201c":  # left curly quote U+201C (escape keeps it distinct from ASCII ")
+                            result.append('\\"')
+                        elif char == "\u201d":  # right curly quote U+201D
+                            result.append('\\"')
+                        else:
+                            result.append(char)
+                    else:
+                        # Outside any string: copy verbatim.
+                        result.append(char)
+
+                    i += 1
+
+                return "".join(result)
+
+            fixed_raw = fix_chinese_quotes_in_json(raw)
+
+            # Try parsing again on the repaired text.
+            if fixed_raw.startswith("[") and fixed_raw.endswith("]"):
+                parsed = json.loads(fixed_raw)
+            else:
+                repaired = repair_json(fixed_raw)
+                if isinstance(repaired, str):
+                    parsed = json.loads(repaired)
+                else:
+                    parsed = repaired
+        except Exception as fix_error:
+            logger.error(f"解析表达风格 JSON 失败，初始错误: {type(parse_error).__name__}: {str(parse_error)}")
+            logger.error(f"修复中文引号后仍失败，错误: {type(fix_error).__name__}: {str(fix_error)}")
+            logger.error(f"解析表达风格 JSON 失败，原始响应：{response}")
+            logger.error(f"处理后的 JSON 字符串（前500字符）：{raw[:500]}")
+            return [], []
+
+    if isinstance(parsed, dict):
+        parsed_list = [parsed]
+    elif isinstance(parsed, list):
+        parsed_list = parsed
+    else:
+        logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
+        return [], []
+
+    for item in parsed_list:
+        if not isinstance(item, dict):
+            continue
+
+        # An expression entry needs non-empty situation, style and source_id.
+        situation = str(item.get("situation", "")).strip()
+        style = str(item.get("style", "")).strip()
+        source_id = str(item.get("source_id", "")).strip()
+
+        if situation and style and source_id:
+            # Recognized as an expression entry.
+            expressions.append((situation, style, source_id))
+        elif item.get("content"):
+            # Jargon entry (identified by a non-empty content field).
+            content = str(item.get("content", "")).strip()
+            source_id = str(item.get("source_id", "")).strip()
+            if content and source_id:
+                jargon_entries.append((content, source_id))
+
+    return expressions, jargon_entries
--- a/src/bw_learner/message_recorder.py
+++ b/src/bw_learner/message_recorder.py
@@ -116,20 +116,12 @@ class MessageRecorder:
                    f"时间窗口: {extraction_start_time:.2f} - {extraction_end_time:.2f}"
                )

-                # 分别触发 expression_learner 和 jargon_miner 的处理
-                # 传递提取的消息，避免它们重复获取
-                # 触发 expression 学习（如果启用）
+                # 触发 expression_learner 和 jargon_miner 的处理
                if self.enable_expression_learning:
                    asyncio.create_task(
-                        self._trigger_expression_learning(extraction_start_time, extraction_end_time, messages)
+                        self._trigger_expression_learning(messages)
                    )

-                # 触发 jargon 提取（如果启用），传递消息
-                # if self.enable_jargon_learning:
-                # asyncio.create_task(
-                # self._trigger_jargon_extraction(extraction_start_time, extraction_end_time, messages)
-                # )
-
            except Exception as e:
                logger.error(f"为聊天流 {self.chat_name} 提取和分发消息失败: {e}")
                import traceback
@@ -138,7 +130,7 @@ class MessageRecorder:
                # 即使失败也保持时间戳更新，避免频繁重试

    async def _trigger_expression_learning(
-        self, timestamp_start: float, timestamp_end: float, messages: List[Any]
+        self, messages: List[Any]
    ) -> None:
        """
        触发 expression 学习，使用指定的消息列表
@@ -162,27 +154,6 @@ class MessageRecorder:

            traceback.print_exc()

-    async def _trigger_jargon_extraction(
-        self, timestamp_start: float, timestamp_end: float, messages: List[Any]
-    ) -> None:
-        """
-        触发 jargon 提取，使用指定的消息列表
-
-        Args:
-            timestamp_start: 开始时间戳
-            timestamp_end: 结束时间戳
-            messages: 消息列表
-        """
-        try:
-            # 传递消息给 JargonMiner，避免它重复获取
-            await self.jargon_miner.run_once(messages=messages)
-
-        except Exception as e:
-            logger.error(f"为聊天流 {self.chat_name} 触发黑话提取失败: {e}")
-            import traceback
-
-            traceback.print_exc()
-

 class MessageRecorderManager:
    """MessageRecorder 管理器"""