fix:修复非多模态模型意外传入图片的问题

This commit is contained in:
SengokuCola
2026-04-15 11:46:22 +08:00
parent 4729f5acdb
commit 6297c50011
5 changed files with 139 additions and 84 deletions

View File

@@ -30,9 +30,15 @@ from src.plugin_runtime.host.hook_spec_registry import HookSpec, HookSpecRegistr
from src.services.llm_service import LLMServiceClient
from .builtin_tool import get_builtin_tools
from .context_messages import AssistantMessage, LLMContextMessage, ToolResultMessage
from .context_messages import (
AssistantMessage,
LLMContextMessage,
ToolResultMessage,
build_llm_message_from_context,
)
from .history_utils import drop_orphan_tool_results
from .display.prompt_cli_renderer import PromptCLIVisualizer
from .visual_mode_utils import resolve_enable_visual_planner
TIMING_GATE_TOOL_NAMES = {"continue", "no_reply", "wait"}
@@ -395,6 +401,7 @@ class MaisakaChatLoopService:
self,
selected_history: List[LLMContextMessage],
*,
enable_visual_message: bool,
injected_user_messages: Sequence[str] | None = None,
system_prompt: Optional[str] = None,
) -> List[Message]:
@@ -413,7 +420,10 @@ class MaisakaChatLoopService:
messages.append(system_msg.build())
for msg in selected_history:
llm_message = msg.to_llm_message()
llm_message = build_llm_message_from_context(
msg,
enable_visual_message=enable_visual_message,
)
if llm_message is not None:
messages.append(llm_message)
@@ -475,12 +485,15 @@ class MaisakaChatLoopService:
if not self._prompts_loaded:
await self.ensure_chat_prompt_loaded()
enable_visual_message = self._resolve_enable_visual_message(request_kind)
selected_history, selection_reason = self.select_llm_context_messages(
chat_history,
request_kind=request_kind,
enable_visual_message=enable_visual_message,
)
built_messages = self._build_request_messages(
selected_history,
enable_visual_message=enable_visual_message,
injected_user_messages=injected_user_messages,
)
@@ -602,6 +615,7 @@ class MaisakaChatLoopService:
def select_llm_context_messages(
chat_history: List[LLMContextMessage],
*,
enable_visual_message: Optional[bool] = None,
request_kind: str = "planner",
max_context_size: Optional[int] = None,
) -> tuple[List[LLMContextMessage], str]:
@@ -615,9 +629,21 @@ class MaisakaChatLoopService:
selected_indices: List[int] = []
counted_message_count = 0
active_enable_visual_message = (
enable_visual_message
if enable_visual_message is not None
else MaisakaChatLoopService._resolve_enable_visual_message(request_kind)
)
for index in range(len(filtered_history) - 1, -1, -1):
message = filtered_history[index]
if message.to_llm_message() is None:
if (
build_llm_message_from_context(
message,
enable_visual_message=active_enable_visual_message,
)
is None
):
continue
selected_indices.append(index)
@@ -683,6 +709,12 @@ class MaisakaChatLoopService:
return filtered_history
@staticmethod
def _resolve_enable_visual_message(request_kind: str) -> bool:
if request_kind in {"planner", "timing_gate"}:
return resolve_enable_visual_planner()
return True
@staticmethod
def _hide_early_assistant_messages(
selected_history: List[LLMContextMessage],

View File

@@ -40,10 +40,15 @@ def _guess_image_format(image_bytes: bytes) -> Optional[str]:
return None
def _append_emoji_component(builder: MessageBuilder, component: EmojiComponent) -> bool:
def _append_emoji_component(
builder: MessageBuilder,
component: EmojiComponent,
*,
enable_visual_message: bool,
) -> bool:
"""将表情组件追加到 LLM 消息构建器。"""
image_format = _guess_image_format(component.binary_data)
if image_format and component.binary_data:
if enable_visual_message and image_format and component.binary_data:
builder.add_text_content("[消息类型]表情包")
builder.add_image_content(image_format, base64.b64encode(component.binary_data).decode("utf-8"))
return True
@@ -56,10 +61,15 @@ def _append_emoji_component(builder: MessageBuilder, component: EmojiComponent)
return True
def _append_image_component(builder: MessageBuilder, component: ImageComponent) -> bool:
def _append_image_component(
builder: MessageBuilder,
component: ImageComponent,
*,
enable_visual_message: bool,
) -> bool:
"""将图片组件追加到 LLM 消息构建器。"""
image_format = _guess_image_format(component.binary_data)
if image_format and component.binary_data:
if enable_visual_message and image_format and component.binary_data:
builder.add_text_content("[消息类型]图片")
builder.add_image_content(image_format, base64.b64encode(component.binary_data).decode("utf-8"))
return True
@@ -216,6 +226,7 @@ def _build_message_from_sequence(
message_sequence: MessageSequence,
fallback_text: str,
*,
enable_visual_message: bool = True,
tool_call_id: Optional[str] = None,
tool_name: Optional[str] = None,
tool_calls: Optional[list[ToolCall]] = None,
@@ -238,11 +249,25 @@ def _build_message_from_sequence(
continue
if isinstance(component, EmojiComponent):
has_content = _append_emoji_component(builder, component) or has_content
has_content = (
_append_emoji_component(
builder,
component,
enable_visual_message=enable_visual_message,
)
or has_content
)
continue
if isinstance(component, ImageComponent):
has_content = _append_image_component(builder, component) or has_content
has_content = (
_append_image_component(
builder,
component,
enable_visual_message=enable_visual_message,
)
or has_content
)
continue
if isinstance(component, AtComponent):
@@ -297,7 +322,7 @@ class LLMContextMessage(ABC):
return self.__class__.__name__
@abstractmethod
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
"""转换为统一 LLM 消息。"""
def consume_once(self) -> bool:
@@ -328,11 +353,12 @@ class SessionBackedMessage(LLMContextMessage):
def source(self) -> str:
return self.source_kind
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
return _build_message_from_sequence(
RoleType.User,
self.raw_message,
self.processed_plain_text,
enable_visual_message=enable_visual_message,
)
@classmethod
@@ -366,7 +392,8 @@ class ComplexSessionMessage(SessionBackedMessage):
def source(self) -> str:
return f"{self.source_kind}:{self.complex_message_type}"
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([TextComponent(self.prompt_text)])
return _build_message_from_sequence(
RoleType.User,
@@ -426,7 +453,8 @@ class ReferenceMessage(LLMContextMessage):
def source(self) -> str:
return self.reference_type.value
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([TextComponent(self.processed_plain_text)])
return _build_message_from_sequence(RoleType.User, message_sequence, self.processed_plain_text)
@@ -463,7 +491,8 @@ class AssistantMessage(LLMContextMessage):
def source(self) -> str:
return self.source_kind
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([])
if self.content:
message_sequence.text(self.content)
@@ -501,7 +530,8 @@ class ToolResultMessage(LLMContextMessage):
def source(self) -> str:
return self.tool_name or "tool"
def to_llm_message(self) -> Optional[Message]:
def to_llm_message(self, enable_visual_message: bool = True) -> Optional[Message]:
del enable_visual_message
message_sequence = MessageSequence([TextComponent(self.content)])
return _build_message_from_sequence(
RoleType.Tool,
@@ -510,3 +540,13 @@ class ToolResultMessage(LLMContextMessage):
tool_call_id=self.tool_call_id,
tool_name=self.tool_name,
)
def build_llm_message_from_context(
context_message: LLMContextMessage,
*,
enable_visual_message: bool = True,
) -> Optional[Message]:
"""将 Maisaka 内部上下文消息转换为发给 LLM 的统一消息。"""
return context_message.to_llm_message(enable_visual_message=enable_visual_message)

View File

@@ -14,7 +14,7 @@ from src.chat.message_receive.message import SessionMessage
from src.common.data_models.message_component_data_model import EmojiComponent, ImageComponent, MessageSequence
from src.common.logger import get_logger
from src.common.prompt_i18n import load_prompt
from src.config.config import config_manager, global_config
from src.config.config import global_config
from src.core.tooling import ToolExecutionContext, ToolExecutionResult, ToolInvocation, ToolSpec
from src.llm_models.exceptions import ReqAbortException
from src.llm_models.payload_content.tool_option import ToolCall
@@ -43,6 +43,7 @@ from .monitor_events import (
emit_timing_gate_result,
)
from .planner_message_utils import build_planner_user_prefix_from_session_message
from .visual_mode_utils import resolve_enable_visual_planner
if TYPE_CHECKING:
from .runtime import MaisakaHeartFlowChatting
@@ -738,47 +739,10 @@ class MaisakaReasoningEngine:
planner_prefix: str,
) -> MessageSequence:
message_sequence = build_prefixed_message_sequence(message.raw_message, planner_prefix)
if self._resolve_enable_visual_planner():
if resolve_enable_visual_planner():
await self._hydrate_visual_components(message_sequence.components)
return message_sequence
@staticmethod
def _resolve_enable_visual_planner() -> bool:
planner_mode = global_config.visual.planner_mode
planner_task_config = config_manager.get_model_config().model_task_config.planner
models_by_name = {model.name: model for model in config_manager.get_model_config().models}
if planner_mode == "text":
return False
planner_models: list[str] = list(planner_task_config.model_list)
missing_models = [model_name for model_name in planner_models if model_name not in models_by_name]
non_visual_models = [
model_name for model_name in planner_models if model_name in models_by_name and not models_by_name[model_name].visual
]
if planner_mode == "multimodal":
if missing_models:
raise ValueError(
"planner_mode=multimodal但 planner 任务存在未定义的模型:"
f"{', '.join(missing_models)}"
)
if non_visual_models:
raise ValueError(
"planner_mode=multimodal但 planner 任务存在未开启 visual 的模型:"
f"{', '.join(non_visual_models)}"
)
return True
if missing_models:
logger.warning(
"planner_mode=auto 时发现 planner 任务存在未定义模型:"
f"{', '.join(missing_models)},将退化为纯文本 planner"
)
return False
return bool(planner_models) and not non_visual_models
async def _hydrate_visual_components(self, planner_components: list[object]) -> None:
"""在 Maisaka 真正需要图片或表情时,按需回填二进制数据。"""
load_tasks: list[asyncio.Task[None]] = []

View File

@@ -0,0 +1,43 @@
from src.common.logger import get_logger
from src.config.config import config_manager, global_config
logger = get_logger("maisaka_visual_mode")
def resolve_enable_visual_planner() -> bool:
"""根据 planner 配置解析当前是否应启用视觉消息。"""
planner_mode = global_config.visual.planner_mode
planner_task_config = config_manager.get_model_config().model_task_config.planner
models_by_name = {model.name: model for model in config_manager.get_model_config().models}
if planner_mode == "text":
return False
planner_models: list[str] = list(planner_task_config.model_list)
missing_models = [model_name for model_name in planner_models if model_name not in models_by_name]
non_visual_models = [
model_name for model_name in planner_models if model_name in models_by_name and not models_by_name[model_name].visual
]
if planner_mode == "multimodal":
if missing_models:
raise ValueError(
"planner_mode=multimodal但 planner 任务存在未定义的模型:"
f"{', '.join(missing_models)}"
)
if non_visual_models:
raise ValueError(
"planner_mode=multimodal但 planner 任务存在未开启 visual 的模型:"
f"{', '.join(non_visual_models)}"
)
return True
if missing_models:
logger.warning(
"planner_mode=auto 时发现 planner 任务存在未定义模型:"
f"{', '.join(missing_models)},将退化为纯文本 planner"
)
return False
return bool(planner_models) and not non_visual_models