feat:提供原生vlm支持

This commit is contained in:
SengokuCola
2026-03-24 20:57:57 +08:00
parent 6cfc92e66f
commit a5fc4d172d
12 changed files with 447 additions and 59 deletions

View File

@@ -2,13 +2,18 @@
MaiSaka message adapters built on top of the main project's MaiMessage model.
"""
from copy import deepcopy
from datetime import datetime
import re
from io import BytesIO
from typing import Optional
from uuid import uuid4
import base64
import re
from src.common.data_models.mai_message_data_model import MaiMessage, MessageInfo, UserInfo
from src.common.data_models.message_component_data_model import MessageSequence
from PIL import Image as PILImage
from src.common.data_models.mai_message_data_model import GroupInfo, MaiMessage, MessageInfo, UserInfo
from src.common.data_models.message_component_data_model import EmojiComponent, ImageComponent, MessageSequence, TextComponent
from src.config.config import global_config
from src.llm_models.payload_content.message import Message, MessageBuilder, RoleType
from src.llm_models.payload_content.tool_option import ToolCall
@@ -22,7 +27,10 @@ SOURCE_KEY = "maisaka_source"
LLM_ROLE_KEY = "maisaka_llm_role"
TOOL_CALL_ID_KEY = "maisaka_tool_call_id"
TOOL_CALLS_KEY = "maisaka_tool_calls"
SPEAKER_PREFIX_PATTERN = re.compile(r"^\[(?P<speaker>[^\]]+)\](?P<content>.*)$", re.DOTALL)
SPEAKER_PREFIX_PATTERN = re.compile(
r"^(?:(?P<timestamp>\d{2}:\d{2}:\d{2}))?(?:<mid:(?P<message_id>[^>]+)>)?\[(?P<speaker>[^\]]+)\](?P<content>.*)$",
re.DOTALL,
)
def _build_user_info_for_role(role: str) -> UserInfo:
@@ -55,7 +63,7 @@ def _deserialize_tool_call(data: dict) -> ToolCall:
def build_message(
role: str,
content: str,
content: str = "",
*,
message_kind: str = "normal",
source: Optional[str] = None,
@@ -63,6 +71,12 @@ def build_message(
tool_calls: Optional[list[ToolCall]] = None,
timestamp: Optional[datetime] = None,
message_id: Optional[str] = None,
platform: str = MAISAKA_PLATFORM,
session_id: str = MAISAKA_SESSION_ID,
user_info: Optional[UserInfo] = None,
group_info: Optional[GroupInfo] = None,
raw_message: Optional[MessageSequence] = None,
display_text: Optional[str] = None,
) -> MaiMessage:
"""Build a MaiMessage for the Maisaka session history."""
resolved_timestamp = timestamp or datetime.now()
@@ -70,11 +84,11 @@ def build_message(
message = MaiMessage(
message_id=message_id or f"maisaka_{uuid4().hex}",
timestamp=resolved_timestamp,
platform=MAISAKA_PLATFORM,
platform=platform,
)
message.message_info = MessageInfo(
user_info=_build_user_info_for_role(resolved_role),
group_info=None,
user_info=user_info or _build_user_info_for_role(resolved_role),
group_info=group_info,
additional_config={
LLM_ROLE_KEY: resolved_role,
MESSAGE_KIND_KEY: message_kind,
@@ -83,18 +97,26 @@ def build_message(
TOOL_CALLS_KEY: [_serialize_tool_call(tool_call) for tool_call in (tool_calls or [])],
},
)
message.session_id = MAISAKA_SESSION_ID
message.raw_message = MessageSequence([])
if content:
message.session_id = session_id
message.raw_message = raw_message if raw_message is not None else MessageSequence([])
if raw_message is None and content:
message.raw_message.text(content)
message.processed_plain_text = content
message.display_message = content
visible_text = display_text if display_text is not None else content
message.processed_plain_text = visible_text
message.display_message = visible_text
return message
def format_speaker_content(speaker_name: str, content: str) -> str:
def format_speaker_content(
speaker_name: str,
content: str,
timestamp: Optional[datetime] = None,
message_id: Optional[str] = None,
) -> str:
"""Format visible conversation content with an explicit speaker label."""
return f"[{speaker_name}]{content}"
time_prefix = timestamp.strftime("%H:%M:%S") if timestamp is not None else ""
message_id_prefix = f"<mid:{message_id}>" if message_id else ""
return f"{time_prefix}{message_id_prefix}[{speaker_name}]{content}"
def parse_speaker_content(content: str) -> tuple[Optional[str], str]:
@@ -105,6 +127,39 @@ def parse_speaker_content(content: str) -> tuple[Optional[str], str]:
return match.group("speaker"), match.group("content")
def clone_message_sequence(message_sequence: MessageSequence) -> MessageSequence:
"""Create a detached copy of a message sequence."""
return MessageSequence([deepcopy(component) for component in message_sequence.components])
def build_visible_text_from_sequence(message_sequence: MessageSequence) -> str:
"""Extract visible text from a message sequence without forcing image descriptions."""
parts: list[str] = []
for component in message_sequence.components:
if isinstance(component, TextComponent):
parts.append(SPEAKER_PREFIX_PATTERN.sub(r"\g<timestamp>[\g<speaker>]\g<content>", component.text))
continue
if isinstance(component, EmojiComponent):
parts.append("[表情包]")
continue
if isinstance(component, ImageComponent):
parts.append("[图片]")
return "".join(parts)
def _guess_image_format(image_bytes: bytes) -> Optional[str]:
if not image_bytes:
return None
try:
with PILImage.open(BytesIO(image_bytes)) as image:
return image.format.lower() if image.format else None
except Exception:
return None
def get_message_text(message: MaiMessage) -> str:
if message.processed_plain_text is not None:
return message.processed_plain_text
@@ -156,7 +211,6 @@ def remove_last_perception(messages: list[MaiMessage]) -> None:
def to_llm_message(message: MaiMessage) -> Optional[Message]:
role = get_message_role(message)
content = get_message_text(message)
tool_call_id = get_tool_call_id(message)
tool_calls = get_tool_calls(message)
@@ -176,6 +230,28 @@ def to_llm_message(message: MaiMessage) -> Optional[Message]:
builder.set_tool_calls(tool_calls)
if role_type == RoleType.Tool and tool_call_id:
builder.add_tool_call(tool_call_id)
if content:
builder.add_text_content(content)
has_content = False
for component in message.raw_message.components:
if isinstance(component, TextComponent):
if component.text:
builder.add_text_content(component.text)
has_content = True
continue
if isinstance(component, (ImageComponent, EmojiComponent)):
image_format = _guess_image_format(component.binary_data)
if image_format and component.binary_data:
builder.add_image_content(image_format, base64.b64encode(component.binary_data).decode("utf-8"))
has_content = True
continue
if component.content:
builder.add_text_content(component.content)
has_content = True
if not has_content:
content = get_message_text(message)
if content:
builder.add_text_content(content)
return builder.build()