fix：正常获取Bot自身发送的消息id

2026-04-05 13:05:18 +08:00
parent ead90cbdf3
commit 18d48e0145
8 changed files with 1147 additions and 46 deletions
--- a/scripts/test_model_tool_call_params.py
+++ b/scripts/test_model_tool_call_params.py
@@ -0,0 +1,845 @@
+from argparse import ArgumentParser, Namespace
+from contextlib import contextmanager
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Sequence
+
+import asyncio
+import json
+import sys
+import time
+
+
+# Put the repository root (the parent of this script's directory) on sys.path
+# so the `src.*` imports below resolve when the script is run directly.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from src.common.data_models.llm_service_data_models import LLMServiceRequest, LLMServiceResult  # noqa: E402
+from src.config.config import config_manager  # noqa: E402
+from src.config.model_configs import APIProvider, ModelInfo, TaskConfig  # noqa: E402
+from src.llm_models.payload_content.tool_option import ToolCall  # noqa: E402
+from src.services.llm_service import generate  # noqa: E402
+from src.services.service_task_resolver import get_available_models  # noqa: E402
+
+
+# Task names left out of the default selection when no --task filter is given.
+DEFAULT_SKIP_TASKS = {"embedding", "voice"}
+
+
+@dataclass(slots=True)
+class ToolCallCase:
+    """A single tool-call argument test case.
+
+    Bundles the tool definition offered to the model with the exact arguments
+    the model is asked to send back.
+    """
+
+    name: str
+    description: str
+    tool_definition: Dict[str, Any]
+    expected_arguments: Dict[str, Any]
+
+    def _definition_body(self) -> Dict[str, Any]:
+        """Return the mapping that holds the tool's ``name`` and ``parameters``.
+
+        For a ``{"type": "function", "function": {...}}`` definition this is
+        the nested ``function`` mapping; otherwise it is the definition itself.
+        A missing or non-dict ``function`` entry yields an empty mapping
+        instead of raising.
+        """
+        if self.tool_definition.get("type") != "function":
+            return self.tool_definition
+        function_definition = self.tool_definition.get("function")
+        return function_definition if isinstance(function_definition, dict) else {}
+
+    @property
+    def tool_name(self) -> str:
+        """Return the tool name, or an empty string when none is declared."""
+        return str(self._definition_body().get("name", "") or "")
+
+    @property
+    def parameters_schema(self) -> Dict[str, Any]:
+        """Return the parameter schema, or an empty dict when it is not a dict."""
+        parameters = self._definition_body().get("parameters", {})
+        return parameters if isinstance(parameters, dict) else {}
+
+    def build_messages(self) -> List[Dict[str, Any]]:
+        """Build the system/user message pair that asks for the tool call.
+
+        The expected arguments are embedded in the user prompt as pretty
+        printed JSON so the model can copy them verbatim.
+        """
+        expected_json = json.dumps(self.expected_arguments, ensure_ascii=False, indent=2)
+        system_prompt = (
+            "你正在执行严格的工具调用参数兼容性测试。"
+            "你必须通过工具调用响应，不能输出自然语言，不能解释，不能补充额外字段。"
+        )
+        user_prompt = (
+            f"请立刻调用工具 `{self.tool_name}`。\n"
+            "参数必须与下面 JSON 完全一致，键名、值、布尔类型、整数类型、浮点数、数组顺序和对象结构都不能改变。\n"
+            "不要输出任何解释文本，只返回工具调用。\n"
+            f"{expected_json}"
+        )
+        return [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+
+
+@dataclass(slots=True)
+class ProbeTarget:
+    """A single model to be probed."""
+
+    # Task the model is temporarily pinned to while the probe request runs.
+    task_name: str
+    # Model name as it appears in the model configuration.
+    model_name: str
+    # Name of the provider configuration the model refers to.
+    provider_name: str
+    # Copied from the provider configuration's `client_type`.
+    client_type: str
+    # Copied from the provider configuration's `tool_argument_parse_mode`.
+    tool_argument_parse_mode: str
+
+
+@dataclass(slots=True)
+class ProbeResult:
+    """Outcome of one probe request (one model, one case, one attempt)."""
+
+    # Task the request was issued under.
+    task_name: str
+    # Model the probe intended to hit.
+    target_model_name: str
+    # Model name reported by the completion.
+    actual_model_name: str
+    provider_name: str
+    client_type: str
+    tool_argument_parse_mode: str
+    # Name of the ToolCallCase that was run.
+    case_name: str
+    # Attempt counter, starting at 1.
+    attempt: int
+    # True when `errors` is empty.
+    success: bool
+    # Wall-clock duration measured with time.perf_counter() around the request.
+    elapsed_seconds: float
+    errors: List[str]
+    warnings: List[str]
+    # Text and reasoning taken from the completion.
+    response_text: str
+    reasoning_text: str
+    # Tool calls in the serialized form produced by _serialize_tool_calls().
+    tool_calls: List[Dict[str, Any]]
+
+
+def _ensure_utf8_console() -> None:
+    """Best-effort switch of stdout and stderr to UTF-8.
+
+    Streams without a ``reconfigure`` method are skipped, and any failure is
+    swallowed on purpose: a console that cannot be reconfigured must not stop
+    the script.
+    """
+    try:
+        for stream in (sys.stdout, sys.stderr):
+            if hasattr(stream, "reconfigure"):
+                stream.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+
+def _build_function_tool(name: str, description: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
+    """Wrap a name, description and parameter schema as an OpenAI-style function tool."""
+    function_spec: Dict[str, Any] = {
+        "name": name,
+        "description": description,
+        "parameters": parameters,
+    }
+    return {"type": "function", "function": function_spec}
+
+
+def _build_default_cases() -> List[ToolCallCase]:
+    """Build the built-in cases: one flat scalar case and one nested case.
+
+    Key order inside every dict is kept stable because the expected arguments
+    are dumped into the prompt and the schemas are sent as the tool definition.
+    """
+
+    def field(type_name: str, description: str, **extra: Any) -> Dict[str, Any]:
+        # Emits "type", then "description", then any extra schema keywords.
+        return {"type": type_name, "description": description, **extra}
+
+    def closed_object(properties: Dict[str, Any], required: List[str]) -> Dict[str, Any]:
+        # Object keywords shared by every schema here: no extra fields allowed.
+        return {"properties": properties, "required": required, "additionalProperties": False}
+
+    simple_schema = {
+        "type": "object",
+        **closed_object(
+            {
+                "request_id": field("string", "请求 ID"),
+                "count": field("integer", "数量"),
+                "enabled": field("boolean", "是否启用"),
+                "mode": field("string", "模式", enum=["strict", "loose"]),
+                "ratio": field("number", "比例"),
+            },
+            ["request_id", "count", "enabled", "mode", "ratio"],
+        ),
+    }
+    simple_expected = {
+        "request_id": "probe-simple-001",
+        "count": 7,
+        "enabled": True,
+        "mode": "strict",
+        "ratio": 2.5,
+    }
+
+    entry_schema = {
+        "type": "object",
+        **closed_object(
+            {
+                "count": field("integer", "数量"),
+                "name": field("string", "名称"),
+            },
+            ["count", "name"],
+        ),
+    }
+    profile_schema = field(
+        "object",
+        "配置对象",
+        **closed_object(
+            {
+                "channel": field("string", "渠道"),
+                "priority": field("integer", "优先级"),
+            },
+            ["channel", "priority"],
+        ),
+    )
+    nested_schema = {
+        "type": "object",
+        **closed_object(
+            {
+                "request_id": field("string", "请求 ID"),
+                "notify": field("boolean", "是否通知"),
+                "profile": profile_schema,
+                "tags": field("array", "标签列表", items={"type": "string"}),
+                "items": field("array", "条目列表", items=entry_schema),
+            },
+            ["request_id", "notify", "profile", "tags", "items"],
+        ),
+    }
+    nested_expected = {
+        "request_id": "probe-nested-001",
+        "notify": False,
+        "profile": {"channel": "stable", "priority": 2},
+        "tags": ["alpha", "beta", "gamma"],
+        "items": [
+            {"count": 2, "name": "apple"},
+            {"count": 5, "name": "banana"},
+        ],
+    }
+
+    # (case name, case description, tool name, tool description, schema, expected arguments)
+    blueprints = [
+        ("simple", "标量参数类型校验", "record_simple_probe", "记录简单参数探测结果", simple_schema, simple_expected),
+        ("nested", "嵌套对象与数组参数校验", "record_nested_probe", "记录嵌套参数探测结果", nested_schema, nested_expected),
+    ]
+    return [
+        ToolCallCase(
+            name=case_name,
+            description=case_description,
+            tool_definition=_build_function_tool(
+                name=tool_name,
+                description=tool_description,
+                parameters=schema,
+            ),
+            expected_arguments=expected,
+        )
+        for case_name, case_description, tool_name, tool_description, schema, expected in blueprints
+    ]
+
+
+def _parse_multi_value_args(raw_values: Sequence[str] | None) -> List[str]:
+    """Flatten repeated and/or comma-separated CLI values into one list.
+
+    Each raw value is split on commas; pieces are stripped and empty pieces
+    are dropped. ``None`` yields an empty list.
+    """
+    return [
+        piece.strip()
+        for raw_value in (raw_values or [])
+        for piece in str(raw_value).split(",")
+        if piece.strip()
+    ]
+
+
+def _build_model_map() -> Dict[str, ModelInfo]:
+    """Index the configured models by their name."""
+    models_by_name: Dict[str, ModelInfo] = {}
+    for model_info in config_manager.get_model_config().models:
+        models_by_name[model_info.name] = model_info
+    return models_by_name
+
+
+def _build_provider_map() -> Dict[str, APIProvider]:
+    """Index the configured API providers by their name."""
+    providers_by_name: Dict[str, APIProvider] = {}
+    for provider_info in config_manager.get_model_config().api_providers:
+        providers_by_name[provider_info.name] = provider_info
+    return providers_by_name
+
+
+def _pick_default_task_name(task_names: Sequence[str]) -> str:
+    """Pick the default task: ``utils`` when present, otherwise the first name.
+
+    Raises:
+        ValueError: if ``task_names`` is empty.
+    """
+    if not task_names:
+        raise ValueError("当前没有可用的任务配置")
+    return "utils" if "utils" in task_names else str(task_names[0])
+
+
+def _resolve_targets(task_filters: Sequence[str], model_filters: Sequence[str], fallback_task: str) -> List[ProbeTarget]:
+    """Resolve the command-line filters into the list of models to probe.
+
+    Args:
+        task_filters: Task names to restrict to. When empty, every available
+            task not listed in ``DEFAULT_SKIP_TASKS`` is selected.
+        model_filters: Model names to probe. When empty, every model listed by
+            the selected tasks is probed, in first-seen order.
+        fallback_task: Task used for a model that none of the selected tasks
+            lists. If it is not an available task, a default is picked from
+            the selected tasks instead.
+
+    Returns:
+        One ``ProbeTarget`` per distinct model name, in input order.
+
+    Raises:
+        ValueError: if no tasks are available or selected, or if a task,
+            model, or the provider a model refers to is not configured.
+    """
+    available_tasks = get_available_models()
+    model_map = _build_model_map()
+    provider_map = _build_provider_map()
+
+    if not available_tasks:
+        raise ValueError("未找到任何可用的模型任务配置")
+
+    if task_filters:
+        for task_name in task_filters:
+            if task_name not in available_tasks:
+                raise ValueError(f"未找到任务 `{task_name}`")
+        selected_task_names = list(task_filters)
+    else:
+        selected_task_names = [
+            task_name
+            for task_name in available_tasks
+            if task_name not in DEFAULT_SKIP_TASKS
+        ]
+
+    if not selected_task_names:
+        raise ValueError("没有可用于 tool call 测试的任务，请显式通过 --task 指定")
+
+    default_task_name = fallback_task if fallback_task in available_tasks else _pick_default_task_name(selected_task_names)
+
+    if model_filters:
+        candidate_names = iter(model_filters)
+    else:
+        candidate_names = (
+            model_name
+            for task_name in selected_task_names
+            for model_name in available_tasks[task_name].model_list
+        )
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    model_names = list(dict.fromkeys(candidate_names))
+
+    resolved_targets: List[ProbeTarget] = []
+    for model_name in model_names:
+        model_info = model_map.get(model_name)
+        if model_info is None:
+            raise ValueError(f"未找到模型 `{model_name}`")
+
+        # Report a dangling provider reference like every other config
+        # problem instead of surfacing a bare KeyError.
+        provider_info = provider_map.get(model_info.api_provider)
+        if provider_info is None:
+            raise ValueError(
+                f"模型 `{model_name}` 引用的 Provider `{model_info.api_provider}` 未在配置中定义"
+            )
+
+        # First selected task that lists the model, else the fallback task.
+        target_task_name = next(
+            (
+                task_name
+                for task_name in selected_task_names
+                if model_name in available_tasks[task_name].model_list
+            ),
+            default_task_name,
+        )
+        resolved_targets.append(
+            ProbeTarget(
+                task_name=target_task_name,
+                model_name=model_name,
+                provider_name=provider_info.name,
+                client_type=provider_info.client_type,
+                tool_argument_parse_mode=provider_info.tool_argument_parse_mode,
+            )
+        )
+
+    return resolved_targets
+
+
+@contextmanager
+def _pin_task_to_model(task_name: str, model_name: str) -> Iterator[None]:
+    """Temporarily restrict a task to one model.
+
+    While the context is active the task's ``model_list`` holds only
+    ``model_name`` and its ``selection_strategy`` is ``"balance"``. Both are
+    restored on exit, even if the body raises.
+
+    Raises:
+        ValueError: if ``task_name`` does not resolve to a ``TaskConfig``.
+    """
+    task_config = getattr(config_manager.get_model_config().model_task_config, task_name, None)
+    if not isinstance(task_config, TaskConfig):
+        raise ValueError(f"未找到任务 `{task_name}` 对应的配置")
+
+    # Snapshot first so the finally clause can put everything back.
+    saved_state = (list(task_config.model_list), task_config.selection_strategy)
+    task_config.model_list = [model_name]
+    task_config.selection_strategy = "balance"
+    try:
+        yield
+    finally:
+        task_config.model_list, task_config.selection_strategy = saved_state
+
+
+def _serialize_tool_calls(tool_calls: List[ToolCall] | None) -> List[Dict[str, Any]]:
+    """Convert tool calls into plain dicts for printing and the JSON report.
+
+    Arguments are copied into a new dict when possible. Arguments that cannot
+    be converted (for example a raw, unparsed string) are kept as the string
+    itself, or as ``repr()`` for other types, so the caller can still report
+    "arguments were not parsed" instead of crashing here.
+
+    Returns:
+        One ``{"id", "function": {"name", "arguments"}}`` dict per tool call;
+        an empty list for ``None`` or an empty input.
+    """
+    serialized: List[Dict[str, Any]] = []
+    for tool_call in tool_calls or []:
+        raw_arguments = tool_call.args
+        try:
+            arguments: Any = dict(raw_arguments or {})
+        except (TypeError, ValueError):
+            # Not a mapping: keep something printable and JSON-serializable.
+            arguments = raw_arguments if isinstance(raw_arguments, str) else repr(raw_arguments)
+        serialized.append(
+            {
+                "id": tool_call.call_id,
+                "function": {
+                    "name": tool_call.func_name,
+                    "arguments": arguments,
+                },
+            }
+        )
+    return serialized
+
+
+def _is_integer_value(value: Any) -> bool:
+    """Return True for ``int`` values, treating ``bool`` as not an integer."""
+    if isinstance(value, bool):
+        return False
+    return isinstance(value, int)
+
+
+def _is_number_value(value: Any) -> bool:
+    """Return True for ``int`` or ``float`` values, treating ``bool`` as not a number."""
+    if isinstance(value, bool):
+        return False
+    return isinstance(value, (int, float))
+
+
+def _schema_type(schema: Dict[str, Any]) -> str:
+    """Return the schema's type name.
+
+    Uses the stripped ``type`` value when it is non-empty. Otherwise a schema
+    that has ``properties`` or ``required`` counts as ``"object"``, and
+    anything else yields an empty string.
+    """
+    declared_type = str(schema.get("type", "") or "").strip()
+    if declared_type:
+        return declared_type
+    looks_like_object = any(keyword in schema for keyword in ("properties", "required"))
+    return "object" if looks_like_object else ""
+
+
+def _validate_schema(schema: Dict[str, Any], actual_value: Any, path: str = "args") -> List[str]:
+    """Validate a value against a simplified JSON Schema.
+
+    Supports ``enum``, the scalar types ``string`` / ``integer`` / ``number``
+    / ``boolean``, ``array`` with ``items``, and ``object`` with
+    ``properties``, ``required`` and ``additionalProperties``. Schemas of any
+    other type only get the ``enum`` check.
+
+    Returns:
+        Error messages, each prefixed with the path of the offending value;
+        empty when the value conforms.
+    """
+    problems: List[str] = []
+    kind = _schema_type(schema)
+    actual_type_name = type(actual_value).__name__
+
+    if "enum" in schema and actual_value not in schema["enum"]:
+        problems.append(f"{path} 枚举值不合法，期望属于 {schema['enum']}，实际为 {actual_value!r}")
+
+    # Scalar types share one shape: a single predicate, then stop.
+    scalar_checks = {
+        "string": lambda candidate: isinstance(candidate, str),
+        "integer": _is_integer_value,
+        "number": _is_number_value,
+        "boolean": lambda candidate: isinstance(candidate, bool),
+    }
+    if kind in scalar_checks:
+        if not scalar_checks[kind](actual_value):
+            problems.append(f"{path} 类型错误，期望 {kind}，实际为 {actual_type_name}")
+        return problems
+
+    if kind == "array":
+        if not isinstance(actual_value, list):
+            problems.append(f"{path} 类型错误，期望 array，实际为 {actual_type_name}")
+        elif isinstance(schema.get("items"), dict):
+            for position, element in enumerate(actual_value):
+                problems.extend(_validate_schema(schema["items"], element, f"{path}[{position}]"))
+        return problems
+
+    if kind != "object":
+        return problems
+
+    if not isinstance(actual_value, dict):
+        problems.append(f"{path} 类型错误，期望 object，实际为 {actual_type_name}")
+        return problems
+
+    declared_properties = schema.get("properties", {})
+    problems.extend(
+        f"{path}.{required_name} 缺少必填字段"
+        for required_name in map(str, schema.get("required", []))
+        if required_name not in actual_value
+    )
+
+    # Policy for keys not listed under "properties": False forbids them, a
+    # dict is the schema they must satisfy, anything else lets them through.
+    extra_policy = schema.get("additionalProperties", True)
+    for key, value in actual_value.items():
+        key_path = f"{path}.{key}"
+        property_schema = declared_properties.get(key)
+        if isinstance(property_schema, dict):
+            problems.extend(_validate_schema(property_schema, value, key_path))
+        elif extra_policy is False:
+            problems.append(f"{key_path} 是未定义字段")
+        elif isinstance(extra_policy, dict):
+            problems.extend(_validate_schema(extra_policy, value, key_path))
+    return problems
+
+
+def _compare_expected_values(expected_value: Any, actual_value: Any, path: str = "args") -> List[str]:
+    """Recursively check that the actual value equals the expected value.
+
+    Dicts are compared key by key (missing and extra keys are reported), lists
+    element by element plus a length check, and scalars by type-aware
+    equality: a bool never matches an int, an expected int needs an actual
+    int, and an expected float accepts any non-bool number of equal value.
+
+    Returns:
+        Error messages prefixed with the path of each mismatch; empty when
+        the values are identical.
+    """
+    if isinstance(expected_value, dict):
+        if not isinstance(actual_value, dict):
+            return [f"{path} 值不一致，期望 object，实际为 {type(actual_value).__name__}"]
+
+        wanted_keys, present_keys = set(expected_value), set(actual_value)
+        findings = [f"{path}.{key} 缺少期望字段" for key in sorted(wanted_keys - present_keys)]
+        findings += [f"{path}.{key} 出现了额外字段" for key in sorted(present_keys - wanted_keys)]
+        for key in sorted(wanted_keys & present_keys):
+            findings += _compare_expected_values(expected_value[key], actual_value[key], f"{path}.{key}")
+        return findings
+
+    if isinstance(expected_value, list):
+        if not isinstance(actual_value, list):
+            return [f"{path} 值不一致，期望 array，实际为 {type(actual_value).__name__}"]
+
+        findings = []
+        if len(expected_value) != len(actual_value):
+            findings.append(f"{path} 列表长度不一致，期望 {len(expected_value)}，实际 {len(actual_value)}")
+        # zip stops at the shorter list; the length mismatch is reported above.
+        for position, pair in enumerate(zip(expected_value, actual_value)):
+            findings += _compare_expected_values(pair[0], pair[1], f"{path}[{position}]")
+        return findings
+
+    # Scalars: bool must be tested before int because bool subclasses int.
+    if isinstance(expected_value, bool):
+        matches = isinstance(actual_value, bool) and actual_value is expected_value
+    elif _is_integer_value(expected_value):
+        matches = _is_integer_value(actual_value) and not (actual_value != expected_value)
+    elif isinstance(expected_value, float):
+        matches = _is_number_value(actual_value) and not (float(actual_value) != expected_value)
+    else:
+        matches = not (expected_value != actual_value)
+
+    if matches:
+        return []
+    return [f"{path} 值不一致，期望 {expected_value!r}，实际 {actual_value!r}"]
+
+
+def _pick_tool_call(tool_calls: List[ToolCall], expected_tool_name: str) -> ToolCall:
+    """Return the first call named ``expected_tool_name``, else the first call."""
+    same_name = next(
+        (candidate for candidate in tool_calls if candidate.func_name == expected_tool_name),
+        None,
+    )
+    return tool_calls[0] if same_name is None else same_name
+
+
+def _validate_service_result(
+    service_result: LLMServiceResult,
+    target: ProbeTarget,
+    case: ToolCallCase,
+) -> tuple[List[str], List[str], List[Dict[str, Any]]]:
+    """Check a service-layer result against the probe's expectations.
+
+    Verifies, in order: the request succeeded, the reported model matches the
+    target, exactly one tool call came back, its name matches the case, and
+    its arguments satisfy both the schema and the exact expected values.
+
+    Returns:
+        ``(errors, warnings, serialized_tool_calls)``. Warnings only note
+        natural-language text returned alongside or instead of a tool call.
+    """
+    problems: List[str] = []
+    notes: List[str] = []
+    completion = service_result.completion
+    serialized_calls = _serialize_tool_calls(completion.tool_calls)
+    outcome = (problems, notes, serialized_calls)
+
+    if not service_result.success:
+        problems.append(service_result.error or completion.response or "请求失败但未返回错误信息")
+        return outcome
+
+    reported_model = completion.model_name
+    if reported_model and reported_model != target.model_name:
+        problems.append(
+            f"实际命中的模型为 `{reported_model}`，与目标模型 `{target.model_name}` 不一致"
+        )
+
+    returned_calls = completion.tool_calls or []
+    if not returned_calls:
+        problems.append("模型未返回 tool_calls")
+        if completion.response.strip():
+            notes.append("模型返回了自然语言文本而不是工具调用")
+        return outcome
+
+    call_count = len(returned_calls)
+    if call_count != 1:
+        problems.append(f"返回了 {call_count} 个 tool_calls，预期为 1 个")
+
+    chosen_call = _pick_tool_call(returned_calls, case.tool_name)
+    if chosen_call.func_name != case.tool_name:
+        problems.append(
+            f"工具名不一致，期望 `{case.tool_name}`，实际 `{chosen_call.func_name}`"
+        )
+
+    received_arguments = chosen_call.args
+    if not isinstance(received_arguments, dict):
+        problems.append("工具参数未被解析为对象")
+        return outcome
+
+    # Schema conformance first, then exact equality with the expected values.
+    problems += _validate_schema(case.parameters_schema, received_arguments)
+    problems += _compare_expected_values(case.expected_arguments, received_arguments)
+
+    if completion.response.strip():
+        notes.append("模型同时返回了自然语言文本")
+    return outcome
+
+
+async def _run_single_probe(
+    target: ProbeTarget,
+    case: ToolCallCase,
+    attempt: int,
+    max_tokens: int,
+    temperature: float,
+) -> ProbeResult:
+    """Send one probe request for a model and case, then validate the reply.
+
+    The target's task is pinned to the target model for the duration of the
+    request. The elapsed time covers pinning plus the request itself.
+    """
+    probe_request = LLMServiceRequest(
+        task_name=target.task_name,
+        request_type=f"tool_call_param_probe.{case.name}.attempt_{attempt}",
+        prompt=case.build_messages(),
+        tool_options=[case.tool_definition],
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+
+    clock_start = time.perf_counter()
+    with _pin_task_to_model(target.task_name, target.model_name):
+        reply = await generate(probe_request)
+    duration = time.perf_counter() - clock_start
+
+    problems, notes, serialized_calls = _validate_service_result(reply, target, case)
+    completion = reply.completion
+    return ProbeResult(
+        task_name=target.task_name,
+        target_model_name=target.model_name,
+        actual_model_name=completion.model_name,
+        provider_name=target.provider_name,
+        client_type=target.client_type,
+        tool_argument_parse_mode=target.tool_argument_parse_mode,
+        case_name=case.name,
+        attempt=attempt,
+        success=not problems,
+        elapsed_seconds=duration,
+        errors=problems,
+        warnings=notes,
+        response_text=completion.response,
+        reasoning_text=completion.reasoning,
+        tool_calls=serialized_calls,
+    )
+
+
+def _print_targets(targets: Sequence[ProbeTarget]) -> None:
+    """Print a numbered list of the models about to be probed."""
+    print("待测试目标：")
+    for position, target in enumerate(targets, start=1):
+        columns = [
+            f"{position}. model={target.model_name}",
+            f"task={target.task_name}",
+            f"provider={target.provider_name}",
+            f"client={target.client_type}",
+            f"tool_argument_parse_mode={target.tool_argument_parse_mode}",
+        ]
+        print(" | ".join(columns))
+
+
+def _print_available_targets() -> None:
+    """Print every available task with its models, then every configured model.
+
+    Each model line says whether any available task references that model.
+    """
+    available_tasks = get_available_models()
+    model_map = _build_model_map()
+
+    print("当前可用任务：")
+    referenced_models: set[str] = set()
+    for task_name, task_config in available_tasks.items():
+        print(f"- {task_name}: {list(task_config.model_list)}")
+        referenced_models.update(task_config.model_list)
+
+    print("\n当前配置中的模型：")
+    for model_name, model_info in model_map.items():
+        if model_name in referenced_models:
+            referenced_mark = "已被任务引用"
+        else:
+            referenced_mark = "未被任务引用"
+        print(
+            f"- {model_name}: provider={model_info.api_provider}, "
+            f"identifier={model_info.model_identifier}, {referenced_mark}"
+        )
+
+
+def _select_cases(case_filters: Sequence[str]) -> List[ToolCallCase]:
+    """Return the default cases named in ``case_filters``, or all when empty.
+
+    Raises:
+        ValueError: if a requested case name is not a default case.
+    """
+    catalog: Dict[str, ToolCallCase] = {}
+    for case in _build_default_cases():
+        catalog[case.name] = case
+
+    if not case_filters:
+        return list(catalog.values())
+
+    picked: List[ToolCallCase] = []
+    for wanted_name in case_filters:
+        match = catalog.get(wanted_name)
+        if match is None:
+            raise ValueError(f"未知测试用例 `{wanted_name}`，可选值: {', '.join(sorted(catalog))}")
+        picked.append(match)
+    return picked
+
+
+def _print_single_result(result: ProbeResult, show_response: bool) -> None:
+    """Print the outcome of one probe.
+
+    Emits a PASS/FAIL header, then errors, warnings and the tool calls as
+    JSON. The response text is printed only when ``show_response`` is set and
+    the text is not blank.
+    """
+    verdict = "PASS" if result.success else "FAIL"
+    header_columns = [
+        f"[{verdict}] model={result.target_model_name}",
+        f"task={result.task_name}",
+        f"case={result.case_name}",
+        f"attempt={result.attempt}",
+        f"elapsed={result.elapsed_seconds:.2f}s",
+    ]
+    print(" | ".join(header_columns))
+
+    # Looping over an empty list prints nothing, so no emptiness guard is needed.
+    for error in result.errors:
+        print(f"  ERROR: {error}")
+    for warning in result.warnings:
+        print(f"  WARN: {warning}")
+
+    if result.tool_calls:
+        print(f"  tool_calls: {json.dumps(result.tool_calls, ensure_ascii=False)}")
+    if show_response and result.response_text.strip():
+        print(f"  response: {result.response_text}")
+
+
+def _build_summary(results: Sequence[ProbeResult]) -> Dict[str, Any]:
+    """Summarize probe results.
+
+    Returns:
+        A dict with ``total``, ``passed`` and ``failed`` counts plus
+        ``failed_items``: one entry (model, case, attempt, errors) per failed
+        result, in input order.
+    """
+    failed_items: List[Dict[str, Any]] = []
+    for result in results:
+        if result.success:
+            continue
+        failed_items.append(
+            {
+                "model_name": result.target_model_name,
+                "case_name": result.case_name,
+                "attempt": result.attempt,
+                "errors": list(result.errors),
+            }
+        )
+
+    total_count = len(results)
+    failed_count = len(failed_items)
+    return {
+        "total": total_count,
+        "passed": total_count - failed_count,
+        "failed": failed_count,
+        "failed_items": failed_items,
+    }
+
+
+def _write_json_report(json_out: str, results: Sequence[ProbeResult]) -> None:
+    """Write the summary and all results to ``json_out`` as UTF-8 JSON.
+
+    The path is user-expanded and resolved; missing parent directories are
+    created. The final path is printed afterwards.
+    """
+    report_path = Path(json_out).expanduser().resolve()
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+
+    generated_at = time.strftime("%Y-%m-%d %H:%M:%S")
+    report = {
+        "generated_at": generated_at,
+        "summary": _build_summary(results),
+        "results": list(map(asdict, results)),
+    }
+    report_text = json.dumps(report, ensure_ascii=False, indent=2)
+    report_path.write_text(report_text, encoding="utf-8")
+    print(f"\n结果已写入: {report_path}")
+
+
+async def _run_probes(args: Namespace) -> List[ProbeResult]:
+    """Run every selected case against every resolved target, sequentially.
+
+    Iteration order is target, then attempt (1..``args.repeat``), then case.
+    Each result is printed as soon as it is available.
+    """
+    requested_tasks = _parse_multi_value_args(args.task)
+    requested_models = _parse_multi_value_args(args.model)
+    requested_cases = _parse_multi_value_args(args.case)
+
+    cases = _select_cases(requested_cases)
+    targets = _resolve_targets(requested_tasks, requested_models, args.fallback_task)
+
+    _print_targets(targets)
+    print("")
+
+    collected: List[ProbeResult] = []
+    attempts = range(1, args.repeat + 1)
+    for target in targets:
+        for attempt in attempts:
+            for case in cases:
+                print(
+                    f"开始测试: model={target.model_name}, task={target.task_name}, "
+                    f"case={case.name}, attempt={attempt}"
+                )
+                outcome = await _run_single_probe(
+                    target=target,
+                    case=case,
+                    attempt=attempt,
+                    max_tokens=args.max_tokens,
+                    temperature=args.temperature,
+                )
+                _print_single_result(outcome, args.show_response)
+                print("")
+                collected.append(outcome)
+    return collected
+
+
+def _build_parser() -> ArgumentParser:
+    """Build the command-line argument parser for the probe script."""
+    parser = ArgumentParser(
+        description=(
+            "测试 config/model_config.toml 中不同模型的 tool call 参数兼容性。\n"
+            "默认会测试所有非 voice / embedding 任务中引用到的模型。"
+        )
+    )
+
+    # One (flag, keyword arguments) row per option, in help-output order.
+    option_table = [
+        (
+            "--task",
+            {
+                "action": "append",
+                "help": "指定任务名，可重复传入，或使用逗号分隔多个值，例如 --task utils --task planner",
+            },
+        ),
+        (
+            "--model",
+            {
+                "action": "append",
+                "help": "指定模型名，可重复传入，或使用逗号分隔多个值，例如 --model qwen3.6-plus",
+            },
+        ),
+        (
+            "--case",
+            {
+                "action": "append",
+                "help": "指定测试用例名，可选 simple、nested；不传则运行全部默认用例",
+            },
+        ),
+        (
+            "--repeat",
+            {
+                "type": int,
+                "default": 1,
+                "help": "每个模型每个用例重复测试次数，默认 1",
+            },
+        ),
+        (
+            "--max-tokens",
+            {
+                "type": int,
+                "default": 512,
+                "help": "单次测试的最大输出 token 数，默认 512",
+            },
+        ),
+        (
+            "--temperature",
+            {
+                "type": float,
+                "default": 0.0,
+                "help": "单次测试温度，默认 0.0 以尽量提高稳定性",
+            },
+        ),
+        (
+            "--fallback-task",
+            {
+                "default": "utils",
+                "help": "当指定模型未被任何已选任务引用时，用于挂载该模型的任务名，默认 utils",
+            },
+        ),
+        (
+            "--json-out",
+            {
+                "help": "可选，将结果写入指定 JSON 文件",
+            },
+        ),
+        (
+            "--list-targets",
+            {
+                "action": "store_true",
+                "help": "仅打印当前任务与模型映射，不发起网络请求",
+            },
+        ),
+        (
+            "--show-response",
+            {
+                "action": "store_true",
+                "help": "打印模型返回的自然语言文本内容",
+            },
+        ),
+    ]
+    for flag, options in option_table:
+        parser.add_argument(flag, **options)
+    return parser
+
+
+def main() -> int:
+    """Script entry point.
+
+    Parses the command line, runs the probes (or only lists targets), prints a
+    summary and optionally writes the JSON report.
+
+    Returns:
+        0 when targets were only listed or every probe passed, otherwise 1.
+    """
+    _ensure_utf8_console()
+    parser = _build_parser()
+    args = parser.parse_args()
+
+    # parser.error() prints the message and exits, so these never fall through.
+    if args.repeat < 1:
+        parser.error("--repeat 必须大于等于 1")
+    if args.max_tokens < 1:
+        parser.error("--max-tokens 必须大于等于 1")
+
+    if args.list_targets:
+        _print_available_targets()
+        return 0
+
+    results = asyncio.run(_run_probes(args))
+    summary = _build_summary(results)
+    failures = summary["failed_items"]
+
+    print("测试摘要：")
+    print(
+        f"total={summary['total']} | passed={summary['passed']} | failed={summary['failed']}"
+    )
+    if failures:
+        print("失败明细：")
+        for failure in failures:
+            print(
+                f"- model={failure['model_name']} | case={failure['case_name']} | "
+                f"attempt={failure['attempt']} | errors={failure['errors']}"
+            )
+
+    if args.json_out:
+        _write_json_report(args.json_out, results)
+
+    if summary["failed"] == 0:
+        return 0
+    return 1
+
+
+# Run main() only when executed as a script; its return value becomes the
+# process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())