feat: improve the configuration for multimodal and non-multimodal replyers

SengokuCola
2026-04-11 19:30:23 +08:00
parent c0230fc313
commit d9b3440169
12 changed files with 150 additions and 44 deletions

View File

@@ -33,3 +33,34 @@ def test_legacy_learning_list_with_numeric_fourth_column_is_migrated():
"enable_jargon_learning": False,
},
]
def test_visual_multimodal_replyer_is_migrated_to_replyer_mode() -> None:
payload = {
"visual": {
"multimodal_replyer": True,
}
}
result = try_migrate_legacy_bot_config_dict(payload)
assert result.migrated is True
assert "visual.multimodal_replyer_moved_to_visual.replyer_mode" in result.reason
assert result.data["visual"]["replyer_mode"] == "multimodal"
assert "multimodal_replyer" not in result.data["visual"]
def test_chat_replyer_generator_type_is_migrated_to_replyer_mode() -> None:
payload = {
"chat": {
"replyer_generator_type": "legacy",
},
"visual": {},
}
result = try_migrate_legacy_bot_config_dict(payload)
assert result.migrated is True
assert "chat.replyer_generator_type_moved_to_visual.replyer_mode" in result.reason
assert result.data["visual"]["replyer_mode"] == "text"
assert "replyer_generator_type" not in result.data["chat"]

View File

@@ -1,10 +1,8 @@
from datetime import datetime
from typing import Any, Callable, Optional
from src.chat.message_receive.chat_manager import BotChatSession
from src.common.prompt_i18n import load_prompt
from src.config.config import global_config
from src.maisaka.context_messages import SessionBackedMessage
from src.services.llm_service import LLMServiceClient
from .maisaka_generator_base import BaseMaisakaReplyGenerator
@@ -26,9 +24,6 @@ class MaisakaReplyGenerator(BaseMaisakaReplyGenerator):
request_type=request_type,
llm_client_cls=llm_client_cls or LLMServiceClient,
load_prompt_func=load_prompt_func or load_prompt,
enable_visual_message=(
global_config.visual.multimodal_replyer
if enable_visual_message is None
else enable_visual_message
),
enable_visual_message=enable_visual_message,
replyer_mode=global_config.visual.replyer_mode,
)

View File

@@ -1,7 +1,7 @@
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Tuple
import random
@@ -23,6 +23,7 @@ from src.common.data_models.reply_generation_data_models import (
from src.common.logger import get_logger
from src.common.utils.utils_session import SessionUtils
from src.config.config import global_config
from src.config.model_configs import ModelInfo
from src.core.types import ActionInfo
from src.llm_models.payload_content.message import Message, MessageBuilder, RoleType
from src.maisaka.context_messages import (
@@ -59,13 +60,15 @@ class BaseMaisakaReplyGenerator:
request_type: str = "maisaka_replyer",
llm_client_cls: Any,
load_prompt_func: Callable[..., str],
enable_visual_message: bool,
enable_visual_message: Optional[bool],
replyer_mode: Literal["text", "multimodal", "auto"],
) -> None:
self.chat_stream = chat_stream
self.request_type = request_type
self._llm_client_cls = llm_client_cls
self._load_prompt = load_prompt_func
self._enable_visual_message = enable_visual_message
self._replyer_mode = replyer_mode
self.express_model = llm_client_cls(
task_name="replyer",
request_type=request_type,
@@ -265,8 +268,9 @@ class BaseMaisakaReplyGenerator:
def _build_visual_user_message(
self,
message: SessionBackedMessage,
enable_visual_message: bool,
) -> Optional[Message]:
if not self._enable_visual_message:
if not enable_visual_message:
return None
raw_message = clone_message_sequence(message.raw_message)
@@ -283,7 +287,11 @@ class BaseMaisakaReplyGenerator:
)
return visual_message.to_llm_message()
def _build_history_messages(self, chat_history: List[LLMContextMessage]) -> List[Message]:
def _build_history_messages(
self,
chat_history: List[LLMContextMessage],
enable_visual_message: bool,
) -> List[Message]:
bot_nickname = global_config.bot.nickname.strip() or "Bot"
default_user_name = global_config.maisaka.cli_user_name.strip() or "User"
messages: List[Message] = []
@@ -300,7 +308,7 @@ class BaseMaisakaReplyGenerator:
)
continue
visual_message = self._build_visual_user_message(message)
visual_message = self._build_visual_user_message(message, enable_visual_message)
if visual_message is not None:
messages.append(visual_message)
continue
@@ -337,6 +345,7 @@ class BaseMaisakaReplyGenerator:
reply_reason: str,
expression_habits: str = "",
stream_id: Optional[str] = None,
enable_visual_message: bool = False,
) -> List[Message]:
messages: List[Message] = []
system_prompt = self._build_system_prompt(
@@ -348,10 +357,21 @@ class BaseMaisakaReplyGenerator:
instruction = self._build_reply_instruction()
messages.append(MessageBuilder().set_role(RoleType.System).add_text_content(system_prompt).build())
messages.extend(self._build_history_messages(chat_history))
messages.extend(self._build_history_messages(chat_history, enable_visual_message))
messages.append(MessageBuilder().set_role(RoleType.User).add_text_content(instruction).build())
return messages
def _resolve_enable_visual_message(self, model_info: Optional[ModelInfo] = None) -> bool:
if self._enable_visual_message is not None:
return self._enable_visual_message
if self._replyer_mode == "multimodal":
if model_info is not None and not model_info.visual:
raise ValueError(f"replyer_mode=multimodal但模型 '{model_info.name}' 未开启 visual无法使用多模态 replyer")
return True
if self._replyer_mode == "text":
return False
return bool(model_info.visual) if model_info is not None else False
def _resolve_session_id(self, stream_id: Optional[str]) -> str:
if stream_id:
return stream_id
@@ -494,7 +514,19 @@ class BaseMaisakaReplyGenerator:
show_replyer_prompt = bool(getattr(global_config.debug, "show_replyer_prompt", False))
show_replyer_reasoning = bool(getattr(global_config.debug, "show_replyer_reasoning", False))
def message_factory(_client: object) -> List[Message]:
def message_factory(_client: object, model_info: Optional[ModelInfo] = None) -> List[Message]:
nonlocal prompt_ms, prompt_preview, request_messages
prompt_started_at = time.perf_counter()
request_messages = self._build_request_messages(
chat_history=filtered_history,
reply_message=reply_message,
reply_reason=reply_reason or "",
expression_habits=merged_expression_habits,
stream_id=stream_id,
enable_visual_message=self._resolve_enable_visual_message(model_info),
)
prompt_ms = round((time.perf_counter() - prompt_started_at) * 1000, 2)
prompt_preview = PromptCLIVisualizer._build_prompt_dump_text(request_messages)
return request_messages
result.completion.request_prompt = prompt_preview
@@ -531,6 +563,8 @@ class BaseMaisakaReplyGenerator:
)
return finalize(False)
result.completion.request_prompt = prompt_preview
result.request_messages = serialize_prompt_messages(request_messages)
llm_ms = round((time.perf_counter() - llm_started_at) * 1000, 2)
response_text = (generation_result.response or "").strip()
result.success = bool(response_text)
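
The resolution order introduced here is: an explicit enable_visual_message argument wins, then replyer_mode, with "auto" deferring to the selected model's visual flag. A standalone sketch of that precedence (the free function and parameter names are illustrative, mirroring _resolve_enable_visual_message above):

from typing import Literal, Optional

def resolve_visual(
    explicit: Optional[bool],
    mode: Literal["text", "multimodal", "auto"],
    model_visual: Optional[bool],
) -> bool:
    if explicit is not None:      # caller override always wins
        return explicit
    if mode == "multimodal":      # demands a vision-capable model
        if model_visual is False:
            raise ValueError("replyer_mode=multimodal requires a visual-capable model")
        return True
    if mode == "text":            # never attach images
        return False
    return bool(model_visual)     # "auto": follow the selected model

assert resolve_visual(None, "auto", True) is True
assert resolve_visual(None, "auto", None) is False
assert resolve_visual(False, "multimodal", True) is False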

View File

@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Any, Dict, Optional
from typing import Any, Dict, Optional
from src.chat.message_receive.chat_manager import BotChatSession, chat_manager as _chat_manager
from src.config.config import global_config
@@ -6,10 +6,6 @@ from src.common.logger import get_logger
from .maisaka_generator import MaisakaReplyGenerator
if TYPE_CHECKING:
from src.chat.replyer.group_generator import DefaultReplyer
from src.chat.replyer.private_generator import PrivateReplyer
logger = get_logger("ReplyerManager")
@@ -22,7 +18,7 @@ class ReplyerManager:
@staticmethod
def _get_maisaka_generator_type() -> str:
"""返回当前配置下 Maisaka replyer 的消息模式。"""
return "multimodal" if global_config.visual.multimodal_replyer else "legacy"
return global_config.visual.replyer_mode
def get_replyer(
self,
@@ -30,7 +26,7 @@ class ReplyerManager:
chat_id: Optional[str] = None,
request_type: str = "replyer",
replyer_type: str = "default",
) -> Optional["DefaultReplyer | PrivateReplyer | Any"]:
) -> Optional[MaisakaReplyGenerator]:
"""按会话和 replyer 类型获取实例。"""
stream_id = chat_stream.session_id if chat_stream else chat_id
if not stream_id:

View File

@@ -14,7 +14,6 @@ from src.llm_models.payload_content.resp_format import RespFormat
from src.llm_models.payload_content.tool_option import ToolCall, ToolDefinitionInput
if TYPE_CHECKING:
from src.llm_models.model_client.base_client import BaseClient
from src.llm_models.payload_content.message import Message
@@ -24,7 +23,7 @@ PromptMessage: TypeAlias = Dict[str, Any]
PromptInput: TypeAlias = str | List[PromptMessage]
"""统一的提示输入类型。"""
MessageFactory: TypeAlias = Callable[["BaseClient"], List["Message"]]
MessageFactory: TypeAlias = Callable[..., List["Message"]]
"""统一的消息工厂类型。"""

View File

@@ -54,7 +54,7 @@ CONFIG_DIR: Path = PROJECT_ROOT / "config"
BOT_CONFIG_PATH: Path = (CONFIG_DIR / "bot_config.toml").resolve().absolute()
MODEL_CONFIG_PATH: Path = (CONFIG_DIR / "model_config.toml").resolve().absolute()
MMC_VERSION: str = "1.0.0"
CONFIG_VERSION: str = "8.5.5"
CONFIG_VERSION: str = "8.6.0"
MODEL_CONFIG_VERSION: str = "1.14.0"
logger = get_logger("config")

View File

@@ -253,6 +253,8 @@ def _migrate_target_item_list(parent: dict[str, Any], key: str) -> bool:
raw = _as_list(parent.get(key))
if raw is None:
return False
if not raw:
return False
if raw and all(isinstance(i, dict) for i in raw):
return False
targets: list[dict[str, str]] = []
@@ -285,18 +287,18 @@ def _migrate_extra_prompt_list(exp: dict[str, Any], key: str) -> bool:
return True
def _parse_multimodal_replyer(v: Any) -> Optional[bool]:
def _parse_replyer_mode(v: Any) -> Optional[str]:
"""兼容旧 replyer_generator_type 到布尔开关的迁移。"""
if isinstance(v, bool):
return v
return "multimodal" if v else "text"
if not isinstance(v, str):
return None
normalized_value = v.strip().lower()
if normalized_value == "multimodal":
return True
if normalized_value in {"text", "multimodal", "auto"}:
return normalized_value
if normalized_value == "legacy":
return False
return "text"
return None
@@ -403,14 +405,23 @@ def try_migrate_legacy_bot_config_dict(data: dict[str, Any]) -> MigrationResult:
migrated_any = True
reasons.append("chat.multimodal_planner_moved_to_visual.multimodal_planner")
if visual is not None and "multimodal_replyer" in visual:
replyer_mode = _parse_replyer_mode(visual.get("multimodal_replyer"))
if "replyer_mode" not in visual and replyer_mode is not None:
visual["replyer_mode"] = replyer_mode
if "replyer_mode" in visual:
visual.pop("multimodal_replyer", None)
migrated_any = True
reasons.append("visual.multimodal_replyer_moved_to_visual.replyer_mode")
if visual is not None and "replyer_generator_type" in chat:
multimodal_replyer = _parse_multimodal_replyer(chat["replyer_generator_type"])
if "multimodal_replyer" not in visual and multimodal_replyer is not None:
visual["multimodal_replyer"] = multimodal_replyer
if "multimodal_replyer" in visual:
replyer_mode = _parse_replyer_mode(chat["replyer_generator_type"])
if "replyer_mode" not in visual and replyer_mode is not None:
visual["replyer_mode"] = replyer_mode
if "replyer_mode" in visual:
chat.pop("replyer_generator_type", None)
migrated_any = True
reasons.append("chat.replyer_generator_type_moved_to_visual.multimodal_replyer")
reasons.append("chat.replyer_generator_type_moved_to_visual.replyer_mode")
maisaka = _as_dict(data.get("maisaka"))
mem = _as_dict(data.get("memory"))

View File

@@ -152,16 +152,16 @@ class VisualConfig(ConfigBase):
"x-icon": "image",
},
)
"""是否直接输入图片"""
"""是否启用多模态planner"""
multimodal_replyer: bool = Field(
default=False,
replyer_mode: Literal["text", "multimodal", "auto"] = Field(
default="auto",
json_schema_extra={
"x-widget": "switch",
"x-widget": "select",
"x-icon": "git-branch",
},
)
"""是否启用 Maisaka 多模态 replyer 生成器"""
"""回复器模式auto根据模型信息自动选择text为纯文本模式multimodal为多模态模式"""
visual_style: str = Field(
default="请用中文描述这张图片的内容。如果有文字请把文字描述概括出来请留意其主题直观感受输出为一段平文本最多30字请注意不要分点就输出一段文本",

View File

@@ -3,6 +3,7 @@ from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Set, Tuple
import asyncio
import inspect
import random
import re
import time
@@ -910,7 +911,11 @@ class LLMOrchestrator:
model_info, api_provider, client = self._select_model(exclude_models=failed_models_this_request)
message_list = []
if message_factory:
message_list = message_factory(client)
parameter_count = len(inspect.signature(message_factory).parameters)
if parameter_count >= 2:
message_list = message_factory(client, model_info)
else:
message_list = message_factory(client)
try:
request = self._build_client_request(
request_type=request_type,
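
The inspect.signature check keeps existing single-parameter factories working unchanged while newer ones also receive the selected model. A self-contained sketch of the same dispatch (note it counts declared parameters, so a factory taking *args would always receive both arguments):

import inspect
from typing import Any, Callable, List

def call_factory(factory: Callable[..., List[Any]], client: object, model_info: object) -> List[Any]:
    # Legacy factories accept only the client; newer ones also take model_info.
    if len(inspect.signature(factory).parameters) >= 2:
        return factory(client, model_info)
    return factory(client)

legacy = lambda client: ["prompt"]
modern = lambda client, model_info=None: [f"prompt for {model_info}"]
assert call_factory(legacy, object(), "m1") == ["prompt"]
assert call_factory(modern, object(), "m1") == ["prompt for m1"]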

View File

@@ -2,11 +2,11 @@
from datetime import datetime
from io import BytesIO
import math
from random import sample
from typing import Any, Dict, Optional
import asyncio
import math
from PIL import Image as PILImage
from PIL import ImageDraw, ImageFont
@@ -20,12 +20,14 @@ from src.common.logger import get_logger
from src.config.config import global_config
from src.core.tooling import ToolExecutionContext, ToolExecutionResult, ToolInvocation, ToolSpec
from src.llm_models.payload_content.resp_format import RespFormat, RespFormatType
from src.llm_models.payload_content.message import MessageBuilder, RoleType
from src.maisaka.context_messages import (
LLMContextMessage,
ReferenceMessage,
ReferenceMessageType,
SessionBackedMessage,
)
from src.plugin_runtime.hook_payloads import serialize_prompt_messages
from .context import BuiltinToolRuntimeContext
@@ -270,6 +272,7 @@ def _build_send_emoji_prompt_preview(
def _build_send_emoji_monitor_detail(
*,
prompt_text: str = "",
request_messages: Optional[list[dict[str, Any]]] = None,
reasoning_text: str = "",
output_text: str = "",
metrics: Optional[Dict[str, Any]] = None,
@@ -280,6 +283,8 @@ def _build_send_emoji_monitor_detail(
detail: Dict[str, Any] = {}
if prompt_text.strip():
detail["prompt_text"] = prompt_text.strip()
if isinstance(request_messages, list) and request_messages:
detail["request_messages"] = request_messages
if reasoning_text.strip():
detail["reasoning_text"] = reasoning_text.strip()
if output_text.strip():
@@ -394,6 +399,16 @@ async def _select_emoji_with_sub_agent(
grid_columns=grid_columns,
sampled_emojis=sampled_emojis,
)
request_messages = [
MessageBuilder().set_role(RoleType.System).add_text_content(system_prompt).build(),
]
prompt_llm_message = prompt_message.to_llm_message()
if prompt_llm_message is not None:
request_messages.append(prompt_llm_message)
candidate_llm_message = candidate_message.to_llm_message()
if candidate_llm_message is not None:
request_messages.append(candidate_llm_message)
serialized_request_messages = serialize_prompt_messages(request_messages)
selection_started_at = datetime.now()
response = await tool_ctx.runtime.run_sub_agent(
@@ -422,6 +437,7 @@ async def _select_emoji_with_sub_agent(
if selection_metadata is not None:
selection_metadata["monitor_detail"] = _build_send_emoji_monitor_detail(
prompt_text=prompt_preview,
request_messages=serialized_request_messages,
output_text=response.content or "",
metrics=selection_metrics,
extra_sections=[{
@@ -436,6 +452,7 @@ async def _select_emoji_with_sub_agent(
selection_metadata["reason"] = selection.reason.strip()
selection_metadata["monitor_detail"] = _build_send_emoji_monitor_detail(
prompt_text=prompt_preview,
request_messages=serialized_request_messages,
reasoning_text=selection.reason,
output_text=response.content or "",
metrics=selection_metrics,

View File

@@ -440,6 +440,9 @@ class PromptCLIVisualizer:
return (
"<div class='image-card'>"
f"<div class='image-meta'>图片 image/{html.escape(normalized_format)} {html.escape(size_text)}</div>"
f"<a class='image-preview-link' href='{html.escape(file_uri, quote=True)}'>"
f"<img class='image-preview' src='{html.escape(file_uri, quote=True)}' alt='图片预览' />"
"</a>"
f"<div class='image-path'>{html.escape(str(file_path))}</div>"
f"<a class='image-link' href='{html.escape(file_uri, quote=True)}'>打开图片</a>"
"</div>"
@@ -727,6 +730,22 @@ class PromptCLIVisualizer:
font-family: "Cascadia Mono", "JetBrains Mono", "Consolas", monospace;
word-break: break-all;
}}
.image-preview-link {{
display: block;
margin-top: 10px;
}}
.image-preview {{
display: block;
max-width: min(100%, 560px);
max-height: 420px;
width: auto;
height: auto;
border-radius: 12px;
border: 1px solid #dbe4f0;
background: #fff;
box-shadow: 0 8px 20px rgba(15, 23, 42, 0.08);
object-fit: contain;
}}
.image-link {{
display: inline-block;
margin-top: 8px;

View File

@@ -11,8 +11,7 @@ from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING
from rich.traceback import install
from src.chat.message_receive.chat_manager import BotChatSession
from src.chat.replyer.group_generator import DefaultReplyer
from src.chat.replyer.private_generator import PrivateReplyer
from src.chat.replyer.maisaka_generator import MaisakaReplyGenerator
from src.chat.replyer.replyer_manager import replyer_manager
from src.chat.utils.utils import process_llm_response
from src.common.data_models.message_component_data_model import MessageSequence, TextComponent
@@ -38,7 +37,7 @@ def _get_replyer(
chat_stream: Optional[BotChatSession] = None,
chat_id: Optional[str] = None,
request_type: str = "replyer",
) -> Optional[DefaultReplyer | PrivateReplyer]:
) -> Optional[MaisakaReplyGenerator]:
"""获取回复器对象"""
if not chat_id and not chat_stream:
raise ValueError("chat_stream 和 chat_id 不可均为空")