fix:修复非多模态模型意外传入图片的问题

This commit is contained in:
SengokuCola
2026-04-15 11:46:22 +08:00
parent 4729f5acdb
commit 6297c50011
5 changed files with 139 additions and 84 deletions

View File

@@ -13,7 +13,6 @@ from src.chat.message_receive.chat_manager import BotChatSession
from src.chat.message_receive.message import SessionMessage
from src.chat.utils.utils import get_chat_type_and_target_info
from src.cli.console import console
from src.common.data_models.message_component_data_model import MessageSequence, TextComponent
from src.common.data_models.reply_generation_data_models import (
GenerationMetrics,
LLMCompletionResult,
@@ -32,9 +31,10 @@ from src.maisaka.context_messages import (
ReferenceMessage,
SessionBackedMessage,
ToolResultMessage,
build_llm_message_from_context,
)
from src.maisaka.display.prompt_cli_renderer import PromptCLIVisualizer
from src.maisaka.message_adapter import clone_message_sequence, parse_speaker_content
from src.maisaka.message_adapter import parse_speaker_content
from src.plugin_runtime.hook_payloads import serialize_prompt_messages
from .maisaka_expression_selector import maisaka_expression_selector
@@ -253,28 +253,6 @@ class BaseMaisakaReplyGenerator:
def _build_reply_instruction(self) -> str:
    """Return the fixed instruction appended to reply prompts.

    The (Chinese) text tells the model to reply naturally and to output
    only the actual message content — no extra explanations, brackets,
    @-mentions or other markers.
    """
    return "请自然地回复。不要输出多余说明、括号、@ 或额外标记,只输出实际要发送的内容。"
def _build_visual_user_message(
    self,
    message: SessionBackedMessage,
    enable_visual_message: bool,
) -> Optional[Message]:
    """Build an LLM user message that preserves the message's visual components.

    Returns ``None`` when visual messages are disabled, so the caller can
    fall back to the plain-text representation of the message.
    """
    if not enable_visual_message:
        return None
    # presumably clone_message_sequence returns a copy so the cached raw
    # sequence is never mutated — verify against the helper's implementation
    raw_message = clone_message_sequence(message.raw_message)
    if not raw_message.components:
        # Empty sequence: degrade to a single text component built from the
        # already-processed plain text.
        raw_message = MessageSequence([TextComponent(message.processed_plain_text)])
    visual_message = SessionBackedMessage(
        raw_message=raw_message,
        visible_text=message.processed_plain_text,
        timestamp=message.timestamp,
        message_id=message.message_id,
        original_message=message.original_message,
        source_kind=message.source_kind,
    )
    return visual_message.to_llm_message()
def _build_history_messages(
self,
chat_history: List[LLMContextMessage],
@@ -294,12 +272,10 @@ class BaseMaisakaReplyGenerator:
)
continue
visual_message = self._build_visual_user_message(message, enable_visual_message)
if visual_message is not None:
messages.append(visual_message)
continue
llm_message = message.to_llm_message()
llm_message = build_llm_message_from_context(
message,
enable_visual_message=enable_visual_message,
)
if llm_message is not None:
messages.append(llm_message)
continue

View File

@@ -30,9 +30,15 @@ from src.plugin_runtime.host.hook_spec_registry import HookSpec, HookSpecRegistr
from src.services.llm_service import LLMServiceClient
from .builtin_tool import get_builtin_tools
from .context_messages import AssistantMessage, LLMContextMessage, ToolResultMessage
from .context_messages import (
AssistantMessage,
LLMContextMessage,
ToolResultMessage,
build_llm_message_from_context,
)
from .history_utils import drop_orphan_tool_results
from .display.prompt_cli_renderer import PromptCLIVisualizer
from .visual_mode_utils import resolve_enable_visual_planner
TIMING_GATE_TOOL_NAMES = {"continue", "no_reply", "wait"}
@@ -395,6 +401,7 @@ class MaisakaChatLoopService:
self,
selected_history: List[LLMContextMessage],
*,
enable_visual_message: bool,
injected_user_messages: Sequence[str] | None = None,
system_prompt: Optional[str] = None,
) -> List[Message]:
@@ -413,7 +420,10 @@ class MaisakaChatLoopService:
messages.append(system_msg.build())
for msg in selected_history:
llm_message = msg.to_llm_message()
llm_message = build_llm_message_from_context(
msg,
enable_visual_message=enable_visual_message,
)
if llm_message is not None:
messages.append(llm_message)
@@ -475,12 +485,15 @@ class MaisakaChatLoopService:
if not self._prompts_loaded:
await self.ensure_chat_prompt_loaded()
enable_visual_message = self._resolve_enable_visual_message(request_kind)
selected_history, selection_reason = self.select_llm_context_messages(
chat_history,
request_kind=request_kind,
enable_visual_message=enable_visual_message,
)
built_messages = self._build_request_messages(
selected_history,
enable_visual_message=enable_visual_message,
injected_user_messages=injected_user_messages,
)
@@ -602,6 +615,7 @@ class MaisakaChatLoopService:
def select_llm_context_messages(
chat_history: List[LLMContextMessage],
*,
enable_visual_message: Optional[bool] = None,
request_kind: str = "planner",
max_context_size: Optional[int] = None,
) -> tuple[List[LLMContextMessage], str]:
@@ -615,9 +629,21 @@ class MaisakaChatLoopService:
selected_indices: List[int] = []
counted_message_count = 0
active_enable_visual_message = (
enable_visual_message
if enable_visual_message is not None
else MaisakaChatLoopService._resolve_enable_visual_message(request_kind)
)
for index in range(len(filtered_history) - 1, -1, -1):
message = filtered_history[index]
if message.to_llm_message() is None:
if (
build_llm_message_from_context(
message,
enable_visual_message=active_enable_visual_message,
)
is None
):
continue
selected_indices.append(index)
@@ -683,6 +709,12 @@ class MaisakaChatLoopService:
return filtered_history
@staticmethod
def _resolve_enable_visual_message(request_kind: str) -> bool:
    """Decide whether visual (image) content may be sent for *request_kind*.

    Planner-style requests ("planner", "timing_gate") defer to the planner
    visual-mode resolver; every other request kind keeps visuals enabled.
    """
    if request_kind not in {"planner", "timing_gate"}:
        return True
    return resolve_enable_visual_planner()
@staticmethod
def _hide_early_assistant_messages(
selected_history: List[LLMContextMessage],

View File

@@ -40,10 +40,15 @@ def _guess_image_format(image_bytes: bytes) -> Optional[str]:
return None
def _append_emoji_component(builder: MessageBuilder, component: EmojiComponent) -> bool:
def _append_emoji_component(
builder: MessageBuilder,
component: EmojiComponent,
*,
enable_visual_message: bool,
) -> bool:
"""将表情组件追加到 LLM 消息构建器。"""
image_format = _guess_image_format(component.binary_data)
if image_format and component.binary_data:
if enable_visual_message and image_format and component.binary_data:
builder.add_text_content("[消息类型]表情包")
builder.add_image_content(image_format, base64.b64encode(component.binary_data).decode("utf-8"))
return True
@@ -56,10 +61,15 @@ def _append_emoji_component(builder: MessageBuilder, component: EmojiComponent)
return True
def _append_image_component(builder: MessageBuilder, component: ImageComponent) -> bool:
def _append_image_component(
builder: MessageBuilder,
component: ImageComponent,
*,
enable_visual_message: bool,
) -> bool:
"""将图片组件追加到 LLM 消息构建器。"""
image_format = _guess_image_format(component.binary_data)
if image_format and component.binary_data:
if enable_visual_message and image_format and component.binary_data:
builder.add_text_content("[消息类型]图片")
builder.add_image_content(image_format, base64.b64encode(component.binary_data).decode("utf-8"))
return True
@@ -216,6 +226,7 @@ def _build_message_from_sequence(
message_sequence: MessageSequence,
fallback_text: str,
*,
enable_visual_message: bool = True,
tool_call_id: Optional[str] = None,
tool_name: Optional[str] = None,
tool_calls: Optional[list[ToolCall]] = None,
@@ -238,11 +249,25 @@ def _build_message_from_sequence(
continue
if isinstance(component, EmojiComponent):
has_content = _append_emoji_component(builder, component) or has_content
has_content = (
_append_emoji_component(
builder,
component,
enable_visual_message=enable_visual_message,
)
or has_content
)
continue
if isinstance(component, ImageComponent):
has_content = _append_image_component(builder, component) or has_content
has_content = (
_append_image_component(
builder,
component,
enable_visual_message=enable_visual_message,
)
or has_content
)
continue
if isinstance(component, AtComponent):
@@ -297,7 +322,7 @@ class LLMContextMessage(ABC):
return self.__class__.__name__
@abstractmethod
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
"""转换为统一 LLM 消息。"""
def consume_once(self) -> bool:
@@ -328,11 +353,12 @@ class SessionBackedMessage(LLMContextMessage):
def source(self) -> str:
return self.source_kind
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
return _build_message_from_sequence(
RoleType.User,
self.raw_message,
self.processed_plain_text,
enable_visual_message=enable_visual_message,
)
@classmethod
@@ -366,7 +392,8 @@ class ComplexSessionMessage(SessionBackedMessage):
def source(self) -> str:
return f"{self.source_kind}:{self.complex_message_type}"
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([TextComponent(self.prompt_text)])
return _build_message_from_sequence(
RoleType.User,
@@ -426,7 +453,8 @@ class ReferenceMessage(LLMContextMessage):
def source(self) -> str:
return self.reference_type.value
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([TextComponent(self.processed_plain_text)])
return _build_message_from_sequence(RoleType.User, message_sequence, self.processed_plain_text)
@@ -463,7 +491,8 @@ class AssistantMessage(LLMContextMessage):
def source(self) -> str:
return self.source_kind
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([])
if self.content:
message_sequence.text(self.content)
@@ -501,7 +530,8 @@ class ToolResultMessage(LLMContextMessage):
def source(self) -> str:
return self.tool_name or "tool"
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([TextComponent(self.content)])
return _build_message_from_sequence(
RoleType.Tool,
@@ -510,3 +540,13 @@ class ToolResultMessage(LLMContextMessage):
tool_call_id=self.tool_call_id,
tool_name=self.tool_name,
)
def build_llm_message_from_context(
    context_message: LLMContextMessage,
    *,
    enable_visual_message: bool = True,
) -> Optional[Message]:
    """Convert a Maisaka internal context message into a unified LLM message.

    Thin dispatch helper: forwards the visual-message flag to the message's
    own ``to_llm_message`` implementation and returns its result (which may
    be ``None`` for messages that produce no LLM payload).
    """
    return context_message.to_llm_message(
        enable_visual_message=enable_visual_message,
    )

View File

@@ -14,7 +14,7 @@ from src.chat.message_receive.message import SessionMessage
from src.common.data_models.message_component_data_model import EmojiComponent, ImageComponent, MessageSequence
from src.common.logger import get_logger
from src.common.prompt_i18n import load_prompt
from src.config.config import config_manager, global_config
from src.config.config import global_config
from src.core.tooling import ToolExecutionContext, ToolExecutionResult, ToolInvocation, ToolSpec
from src.llm_models.exceptions import ReqAbortException
from src.llm_models.payload_content.tool_option import ToolCall
@@ -43,6 +43,7 @@ from .monitor_events import (
emit_timing_gate_result,
)
from .planner_message_utils import build_planner_user_prefix_from_session_message
from .visual_mode_utils import resolve_enable_visual_planner
if TYPE_CHECKING:
from .runtime import MaisakaHeartFlowChatting
@@ -738,47 +739,10 @@ class MaisakaReasoningEngine:
planner_prefix: str,
) -> MessageSequence:
message_sequence = build_prefixed_message_sequence(message.raw_message, planner_prefix)
if self._resolve_enable_visual_planner():
if resolve_enable_visual_planner():
await self._hydrate_visual_components(message_sequence.components)
return message_sequence
@staticmethod
def _resolve_enable_visual_planner() -> bool:
planner_mode = global_config.visual.planner_mode
planner_task_config = config_manager.get_model_config().model_task_config.planner
models_by_name = {model.name: model for model in config_manager.get_model_config().models}
if planner_mode == "text":
return False
planner_models: list[str] = list(planner_task_config.model_list)
missing_models = [model_name for model_name in planner_models if model_name not in models_by_name]
non_visual_models = [
model_name for model_name in planner_models if model_name in models_by_name and not models_by_name[model_name].visual
]
if planner_mode == "multimodal":
if missing_models:
raise ValueError(
"planner_mode=multimodal但 planner 任务存在未定义的模型:"
f"{', '.join(missing_models)}"
)
if non_visual_models:
raise ValueError(
"planner_mode=multimodal但 planner 任务存在未开启 visual 的模型:"
f"{', '.join(non_visual_models)}"
)
return True
if missing_models:
logger.warning(
"planner_mode=auto 时发现 planner 任务存在未定义模型:"
f"{', '.join(missing_models)},将退化为纯文本 planner"
)
return False
return bool(planner_models) and not non_visual_models
async def _hydrate_visual_components(self, planner_components: list[object]) -> None:
"""在 Maisaka 真正需要图片或表情时,按需回填二进制数据。"""
load_tasks: list[asyncio.Task[None]] = []

View File

@@ -0,0 +1,43 @@
from src.common.logger import get_logger
from src.config.config import config_manager, global_config
logger = get_logger("maisaka_visual_mode")
def resolve_enable_visual_planner() -> bool:
    """Resolve, from planner configuration, whether visual messages are enabled.

    Modes:
      * "text"       — always disabled.
      * "multimodal" — strict: every planner model must exist and be visual,
                       otherwise a ``ValueError`` is raised.
      * otherwise (auto) — enabled only when at least one planner model is
                       configured, all are defined, and all support visuals;
                       undefined models log a warning and disable visuals.
    """
    mode = global_config.visual.planner_mode
    task_config = config_manager.get_model_config().model_task_config.planner
    known_models = {entry.name: entry for entry in config_manager.get_model_config().models}

    if mode == "text":
        return False

    configured: list[str] = list(task_config.model_list)
    undefined = [name for name in configured if name not in known_models]
    text_only = [
        name
        for name in configured
        if name in known_models and not known_models[name].visual
    ]

    if mode == "multimodal":
        # Strict mode: fail loudly on any misconfiguration.
        if undefined:
            raise ValueError(
                "planner_mode=multimodal但 planner 任务存在未定义的模型:"
                f"{', '.join(undefined)}"
            )
        if text_only:
            raise ValueError(
                "planner_mode=multimodal但 planner 任务存在未开启 visual 的模型:"
                f"{', '.join(text_only)}"
            )
        return True

    if undefined:
        logger.warning(
            "planner_mode=auto 时发现 planner 任务存在未定义模型:"
            f"{', '.join(undefined)},将退化为纯文本 planner"
        )
        return False
    return bool(configured) and not text_only