feat:提高工具调用成功率,移除冗余的描述中参数介绍,增加索引列表的描述,修改prompt,移除timing的wait打断

This commit is contained in:
SengokuCola
2026-04-10 00:45:32 +08:00
parent 0852c38e81
commit fee9341620
17 changed files with 828 additions and 450 deletions

View File

@@ -1,5 +1,5 @@
"""
表达方式自动检查定时任务
表达方式自动检查定时任务
功能:
1. 定期随机选取指定数量的表达方式
@@ -9,52 +9,48 @@
"""
import asyncio
import json
import random
from typing import List
from sqlmodel import select
from src.learners.expression_review_store import get_review_state, set_review_state
from src.common.data_models.llm_service_data_models import LLMGenerationOptions
from src.common.database.database import get_db_session
from src.common.database.database_model import Expression
from src.common.logger import get_logger
from src.config.config import global_config
from src.common.data_models.llm_service_data_models import LLMGenerationOptions
from src.services.llm_service import LLMServiceClient
from src.learners.expression_review_store import get_review_state, set_review_state
from src.learners.expression_utils import parse_evaluation_response
from src.manager.async_task_manager import AsyncTask
from src.services.llm_service import LLMServiceClient
logger = get_logger("expressor")
def create_evaluation_prompt(situation: str, style: str) -> str:
"""
创建评估提示词
创建评估提示词
Args:
situation: 情
situation: 情
style: 风格
Returns:
评估提示词
"""
# 基础评估标准
base_criteria = [
"表达方式或言语风格 是否与使用条件或使用情景 匹配",
"允许部分语法错误或口化或缺省出现",
"表达方式或言语风格是否与使用条件或使用情景匹配",
"允许部分语法错误或口化或缺省出现",
"表达方式不能太过特指,需要具有泛用性",
"一般不涉及具体的人名或名称",
]
# 从配置中获取额外的自定义标准
custom_criteria = global_config.expression.expression_auto_check_custom_criteria
# 合并所有评估标准
all_criteria = base_criteria.copy()
if custom_criteria:
all_criteria.extend(custom_criteria)
# 构建评估标准列表字符串
criteria_list = "\n".join([f"{i + 1}. {criterion}" for i, criterion in enumerate(all_criteria)])
prompt = f"""请评估以下表达方式或语言风格以及使用条件或使用情景是否合适:
@@ -64,14 +60,13 @@ def create_evaluation_prompt(situation: str, style: str) -> str:
请从以下方面进行评估:
{criteria_list}
请以JSON格式输出评估结果
请以 JSON 格式输出评估结果:
{{
"suitable": true/false,
"reason": "评估理由(如果不合适,请说明原因)"
}}
如果合适suitable设为true如果不合适suitable设为false并在reason中说明原因。
请严格按照JSON格式输出不要包含其他内容。"""
如果合适suitable 设为 true如果不合适suitable 设为 false并在 reason 中说明原因。
请严格按照 JSON 格式输出,不要包含其他内容。"""
return prompt
@@ -81,10 +76,10 @@ judge_llm = LLMServiceClient(task_name="utils", request_type="expression_check")
async def single_expression_check(situation: str, style: str) -> tuple[bool, str, str | None]:
"""
执行单次LLM评估
执行单次 LLM 评估
Args:
situation: 情
situation: 情
style: 风格
Returns:
@@ -101,20 +96,10 @@ async def single_expression_check(situation: str, style: str) -> tuple[bool, str
response = generation_result.response
logger.debug(f"LLM响应: {response}")
# 解析JSON响应
try:
evaluation = json.loads(response)
except json.JSONDecodeError as e:
import re
evaluation = parse_evaluation_response(response)
json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL)
if json_match:
evaluation = json.loads(json_match.group())
else:
raise ValueError("无法从响应中提取JSON格式的评估结果") from e
suitable = evaluation.get("suitable", False)
reason = evaluation.get("reason", "未提供理由")
suitable = bool(evaluation.get("suitable", False))
reason = str(evaluation.get("reason", "未提供理由"))
logger.debug(f"评估结果: {'通过' if suitable else '不通过'}")
return suitable, reason, None
@@ -125,20 +110,19 @@ async def single_expression_check(situation: str, style: str) -> tuple[bool, str
class ExpressionAutoCheckTask(AsyncTask):
"""表达方式自动检查定时任务"""
"""表达方式自动检查定时任务"""
def __init__(self):
# 从配置中获取检查间隔和一次检查数量
check_interval = global_config.expression.expression_auto_check_interval
super().__init__(
task_name="Expression Auto Check Task",
wait_before_start=60, # 启动后等待60秒再开始第一次检查
wait_before_start=60,
run_interval=check_interval,
)
async def _select_expressions(self, count: int) -> List[Expression]:
"""
随机选择指定数量的未检查表达方式
随机选择指定数量的未检查表达方式
Args:
count: 需要选择的数量
@@ -158,11 +142,12 @@ class ExpressionAutoCheckTask(AsyncTask):
logger.info("没有未检查的表达方式")
return []
# 随机选择指定数量
selected_count = min(count, len(unevaluated_expressions))
selected = random.sample(unevaluated_expressions, selected_count)
logger.info(f"{len(unevaluated_expressions)} 条未检查表达方式中随机选择了 {selected_count}")
logger.info(
f"{len(unevaluated_expressions)} 条未检查表达方式中随机选择了 {selected_count}"
)
return selected
except Exception as e:
@@ -171,35 +156,35 @@ class ExpressionAutoCheckTask(AsyncTask):
async def _evaluate_expression(self, expression: Expression) -> bool:
"""
评估单个表达方式
评估单个表达方式
Args:
expression: 要评估的表达方式
Returns:
True表示通过False表示不通过
True 表示通过False 表示不通过
"""
suitable, reason, error = await single_expression_check(
expression.situation,
expression.style,
)
# 更新数据库
try:
set_review_state(expression.id, True, not suitable, "ai")
status = "通过" if suitable else "不通过"
# 保留这段注释,方便后续需要时恢复更详细的审核日志。
# logger.info(
# f"表达方式评估完成 [ID: {expression.id}] - {status} | "
# f"Situation: {expression.situation}... | "
# f"Style: {expression.style}... | "
# f"Reason: {reason[:50]}..."
# f"表达方式评估完成 [ID: {expression.id}] - {status} | "
# f"Situation: {expression.situation}... | "
# f"Style: {expression.style}... | "
# f"Reason: {reason[:50]}..."
# )
if error:
logger.warning(f"表达方式评估时出现错误 [ID: {expression.id}]: {error}")
logger.debug(f"表达方式 [ID: {expression.id}] 评估完成: {status}, reason={reason}")
return suitable
except Exception as e:
@@ -207,9 +192,8 @@ class ExpressionAutoCheckTask(AsyncTask):
return False
async def run(self):
"""执行检查任务"""
"""执行检查任务"""
try:
# 检查是否启用自动检查
if not global_config.expression.expression_self_reflect:
logger.debug("表达方式自动检查未启用,跳过本次执行")
return
@@ -221,26 +205,22 @@ class ExpressionAutoCheckTask(AsyncTask):
logger.info(f"开始执行表达方式自动检查,本次将检查 {check_count}")
# 选择要检查的表达方式
expressions = await self._select_expressions(check_count)
if not expressions:
logger.info("没有需要检查的表达方式")
return
# 逐个评估
passed_count = 0
failed_count = 0
for i, expression in enumerate(expressions, 1):
logger.debug(f"正在评估 [{i}/{len(expressions)}]: ID={expression.id}")
for index, expression in enumerate(expressions, 1):
logger.debug(f"正在评估 [{index}/{len(expressions)}]: ID={expression.id}")
if await self._evaluate_expression(expression):
passed_count += 1
else:
failed_count += 1
# 避免请求过快
await asyncio.sleep(0.3)
logger.info(

View File

@@ -1,14 +1,14 @@
from json_repair import repair_json
from typing import Any, List, Optional, Tuple
import json
import re
from typing import Any, Dict, List, Optional, Tuple
from json_repair import repair_json
from src.config.config import global_config
from src.common.data_models.llm_service_data_models import LLMGenerationOptions
from src.services.llm_service import LLMServiceClient
from src.prompt.prompt_manager import prompt_manager
from src.common.logger import get_logger
from src.config.config import global_config
from src.prompt.prompt_manager import prompt_manager
from src.services.llm_service import LLMServiceClient
logger = get_logger("expression_utils")
@@ -16,17 +16,7 @@ judge_llm = LLMServiceClient(task_name="utils", request_type="expression_check")
def _normalize_repair_json_result(repaired_result: Any) -> str:
"""将 repair_json 的返回值规范化为 JSON 字符串。
Args:
repaired_result: `repair_json` 的返回值,可能是字符串或带附加信息的元组。
Returns:
str: 可供 `json.loads` 继续解析的 JSON 字符串。
Raises:
TypeError: 当返回值无法规范化为字符串时抛出。
"""
"""`repair_json` 的返回结果统一转换为字符串。"""
if isinstance(repaired_result, str):
return repaired_result
if isinstance(repaired_result, tuple) and repaired_result:
@@ -37,22 +27,121 @@ def _normalize_repair_json_result(repaired_result: Any) -> str:
raise TypeError(f"repair_json 返回了无法处理的结果类型: {type(repaired_result)}")
def _strip_markdown_code_fence(text: str) -> str:
"""移除 LLM 可能附带的 Markdown 代码块包裹。"""
raw = text.strip()
if match := re.search(r"```json\s*(.*?)\s*```", raw, re.DOTALL):
return match[1].strip()
raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
return raw.strip()
def _extract_json_object_candidate(text: str) -> str:
"""尽量从文本中提取首个 JSON 对象片段。"""
start_index = text.find("{")
end_index = text.rfind("}")
if start_index != -1 and end_index != -1 and start_index < end_index:
return text[start_index : end_index + 1].strip()
return text.strip()
def _extract_reason_from_text(text: str) -> Optional[str]:
"""从格式不完整的 JSON 文本中兜底提取 reason 字段。"""
reason_key_match = re.search(r'["“”]?reason["“”]?\s*:\s*', text, re.IGNORECASE)
if reason_key_match is None:
return None
value_text = text[reason_key_match.end() :].strip()
if not value_text:
return None
if value_text.endswith("}"):
value_text = value_text[:-1].rstrip()
if value_text.endswith(","):
value_text = value_text[:-1].rstrip()
if not value_text:
return None
if value_text[0] in {'"', "'", "", "", "", ""}:
value_text = value_text[1:]
while value_text and value_text[-1] in {'"', "'", "", "", "", ""}:
value_text = value_text[:-1].rstrip()
return value_text.strip() or None
def _normalize_reason_text(reason: Any) -> str:
"""清理解析后 reason 中残留的包裹引号。"""
normalized_reason = str(reason).strip()
if len(normalized_reason) >= 2 and normalized_reason[0] == normalized_reason[-1]:
if normalized_reason[0] in {'"', "'", "", "", "", ""}:
normalized_reason = normalized_reason[1:-1].strip()
if normalized_reason.endswith('"') and normalized_reason.count('"') % 2 == 1:
normalized_reason = normalized_reason[:-1].rstrip()
if normalized_reason.endswith("'") and normalized_reason.count("'") % 2 == 1:
normalized_reason = normalized_reason[:-1].rstrip()
if normalized_reason.endswith('"') and not normalized_reason.startswith('"'):
normalized_reason = normalized_reason[:-1].rstrip()
if normalized_reason.endswith("'") and not normalized_reason.startswith("'"):
normalized_reason = normalized_reason[:-1].rstrip()
return normalized_reason
def parse_evaluation_response(response: str) -> Dict[str, Any]:
    """Parse an expression-evaluation reply, tolerating malformed JSON.

    Tries, in order: the fence-stripped text, then its first `{...}` slice;
    each candidate is parsed as-is and, on failure, re-parsed after escaping
    CJK quotes inside string values. As a last resort the ``suitable`` and
    ``reason`` fields are scraped from the raw text with regexes.

    Args:
        response: Raw LLM response text, possibly fenced in Markdown.

    Returns:
        The parsed dict (for the regex fallback, exactly the keys
        "suitable" (bool) and "reason" (str)).

    Raises:
        ValueError: If the response is empty or no evaluation can be
            extracted at all.
    """
    raw = _strip_markdown_code_fence(response)
    if not raw:
        raise ValueError("LLM 响应为空")
    # Candidate texts to attempt parsing, most complete first.
    parse_candidates = [raw]
    json_candidate = _extract_json_object_candidate(raw)
    if json_candidate and json_candidate not in parse_candidates:
        parse_candidates.append(json_candidate)
    for candidate in parse_candidates:
        parsed = _try_parse(candidate)
        if isinstance(parsed, dict):
            if "reason" in parsed:
                # Strip stray wrapping quotes the model sometimes emits.
                parsed["reason"] = _normalize_reason_text(parsed["reason"])
            return parsed
        # Retry after escaping CJK quotes that break JSON string values.
        fixed_candidate = fix_chinese_quotes_in_json(candidate)
        if fixed_candidate != candidate:
            parsed = _try_parse(fixed_candidate)
            if isinstance(parsed, dict):
                if "reason" in parsed:
                    parsed["reason"] = _normalize_reason_text(parsed["reason"])
                return parsed
    # Fallback: scrape the two fields directly from the (near-)JSON text.
    suitable_match = re.search(r'["“”]?suitable["“”]?\s*:\s*(true|false)', raw, re.IGNORECASE)
    reason = _extract_reason_from_text(json_candidate or raw)
    if suitable_match is None or reason is None:
        raise ValueError(f"无法解析 LLM 响应为评估结果 JSON: {response}")
    return {
        "suitable": suitable_match.group(1).lower() == "true",
        "reason": _normalize_reason_text(reason),
    }
async def check_expression_suitability(situation: str, style: str) -> Tuple[bool, str, Optional[str]]:
"""
执行单次LLM评估
执行单次 LLM 评估
Args:
situation: 情
situation: 情
style: 风格
Returns:
(suitable, reason, error) 元组,如果出错则 suitable 为 Falseerror 包含错误信息
"""
# 构建评估提示词
# 基础评估标准
base_criteria = [
"表达方式或言语风格是否与使用条件或使用情景匹配",
"允许部分语法错误或口化或缺省出现",
"允许部分语法错误或口化或缺省出现",
"表达方式不能太过特指,需要具有泛用性",
"一般不涉及具体的人名或名称",
]
@@ -60,7 +149,6 @@ async def check_expression_suitability(situation: str, style: str) -> Tuple[bool
if custom_criteria := global_config.expression.expression_auto_check_custom_criteria:
base_criteria.extend(custom_criteria)
# 构建评估标准列表字符串
criteria_list = "\n".join([f"{i + 1}. {criterion}" for i, criterion in enumerate(base_criteria)])
prompt_template = prompt_manager.get_prompt("expression_evaluation")
@@ -81,18 +169,13 @@ async def check_expression_suitability(situation: str, style: str) -> Tuple[bool
logger.debug(f"评估结果: {response}")
try:
evaluation = json.loads(response)
except json.JSONDecodeError:
try:
response_repaired = _normalize_repair_json_result(repair_json(response))
evaluation = json.loads(response_repaired)
except Exception as e:
raise ValueError(f"无法解析LLM响应为JSON: {response}") from e
evaluation = parse_evaluation_response(response)
except Exception as e:
return False, f"评估表达方式时发生错误: {e}", str(e)
try:
suitable = evaluation.get("suitable", False)
reason = evaluation.get("reason", "未提供理由")
suitable = bool(evaluation.get("suitable", False))
reason = _normalize_reason_text(evaluation.get("reason", "未提供理由"))
logger.debug(f"评估结果: {'通过' if suitable else '不通过'}")
return suitable, reason, None
except Exception as e:
@@ -100,69 +183,48 @@ async def check_expression_suitability(situation: str, style: str) -> Tuple[bool
def fix_chinese_quotes_in_json(text: str) -> str:
    """Escape CJK double quotes that appear inside JSON string values.

    Walks *text* with a small state machine that tracks whether the current
    position is inside a double-quoted JSON string (honouring backslash
    escapes). Curly double quotes found inside a string value are replaced
    with an escaped ASCII quote (\\") so the result can be parsed by
    ``json.loads``; all other characters are copied verbatim.

    Args:
        text: Near-JSON text that may contain CJK quotes inside values.

    Returns:
        The text with in-string CJK double quotes escaped.
    """
    pieces: list[str] = []
    inside_string = False
    escaped = False
    for ch in text:
        if escaped:
            # Character following a backslash: copy as-is, end the escape.
            pieces.append(ch)
            escaped = False
            continue
        if ch == "\\":
            pieces.append(ch)
            escaped = True
            continue
        if ch == '"':
            # Unescaped ASCII quote toggles the in-string state.
            inside_string = not inside_string
            pieces.append(ch)
            continue
        if inside_string and ch in ("\u201c", "\u201d"):
            # NOTE(review): the rendered source lost these non-ASCII
            # literals; restored as the CJK curly double quotes the
            # docstring describes — confirm against the repository.
            pieces.append('\\"')
        else:
            pieces.append(ch)
    return "".join(pieces)
def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
"""
解析 LLM 返回的表达风格总结和黑话 JSON提取两个列表。
期望的 JSON 结构:
[
{"situation": "AAAAA", "style": "BBBBB", "source_id": "3"}, // 表达方式
{"content": "词条", "source_id": "12"}, // 黑话
...
]
解析 LLM 返回的表达方式总结和黑话 JSON提取两个列表。
Returns:
Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
第一个列表是表达方式 (situation, style, source_id)
第二个列表是黑话 (content, source_id)
第一个列表是表达方式 (situation, style, source_id)
第二个列表是黑话 (content, source_id)
"""
if not response:
return [], []
raw = response.strip()
if match := re.search(r"```json\s*(.*?)\s*```", raw, re.DOTALL):
raw = match[1].strip()
else:
# 去掉可能存在的通用 ``` 包裹
raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
raw = raw.strip()
raw = _strip_markdown_code_fence(response)
parsed = _try_parse(raw)
if parsed is None:
@@ -180,22 +242,21 @@ def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]]
logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
return [], []
expressions: List[Tuple[str, str, str]] = [] # (situation, style, source_id)
jargon_entries: List[Tuple[str, str]] = [] # (content, source_id)
expressions: List[Tuple[str, str, str]] = []
jargon_entries: List[Tuple[str, str]] = []
for item in parsed_list:
if not isinstance(item, dict):
continue
# 检查是否是表达方式条目(有 situation 和 style
situation = str(item.get("situation", "")).strip()
style = str(item.get("style", "")).strip()
source_id = str(item.get("source_id", "")).strip()
if situation and style and source_id:
# 表达方式条目
expressions.append((situation, style, source_id))
continue
content = str(item.get("content", "")).strip()
if content and source_id:
jargon_entries.append((content, source_id))
@@ -204,25 +265,16 @@ def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]]
def is_single_char_jargon(content: str) -> bool:
    """Return True if *content* is a one-character jargon entry.

    A single CJK ideograph (U+4E00..U+9FFF), ASCII letter, or ASCII digit
    qualifies; empty strings, multi-character strings, and characters outside
    those ranges do not.

    Args:
        content: The jargon entry text to classify.

    Returns:
        bool: True for a single qualifying character, False otherwise.
    """
    if not content or len(content) != 1:
        return False
    char = content[0]
    return (
        "\u4e00" <= char <= "\u9fff"  # CJK ideograph
        or "a" <= char <= "z"  # lowercase ASCII letter
        or "A" <= char <= "Z"  # uppercase ASCII letter
        or "0" <= char <= "9"  # ASCII digit
    )