perf: stabilize maisaka prompt cache
@@ -41,6 +41,11 @@ from .display.prompt_cli_renderer import PromptCLIVisualizer
 from .visual_mode_utils import resolve_enable_visual_planner
 
 TIMING_GATE_TOOL_NAMES = {"continue", "no_reply", "wait"}
+REQUEST_TYPE_BY_REQUEST_KIND = {
+    "planner": "maisaka_planner",
+    "timing_gate": "maisaka_timing_gate",
+}
+CONTEXT_SELECTION_CACHE_STABILITY_RATIO = 2.0
 
 
 @dataclass(slots=True)
@@ -212,7 +217,7 @@ class MaisakaChatLoopService:
             self._chat_system_prompt = f"{self._personality_prompt}\n\nYou are a helpful AI assistant."
         else:
             self._chat_system_prompt = chat_system_prompt
-        self._llm_chat = LLMServiceClient(task_name="planner", request_type="maisaka_planner")
+        self._llm_chat_clients: dict[str, LLMServiceClient] = {}
 
     @property
     def personality_prompt(self) -> str:
@@ -220,6 +225,30 @@ class MaisakaChatLoopService:
 
         return self._personality_prompt
 
+    @staticmethod
+    def _resolve_llm_request_type(request_kind: str) -> str:
+        """Resolve the LLM stats request type for a Maisaka request kind."""
+
+        normalized_request_kind = str(request_kind or "").strip()
+        return REQUEST_TYPE_BY_REQUEST_KIND.get(
+            normalized_request_kind,
+            f"maisaka_{normalized_request_kind}" if normalized_request_kind else "maisaka_planner",
+        )
+
+    def _get_llm_chat_client(self, request_kind: str) -> LLMServiceClient:
+        """Get the planner LLM client for the current request type."""
+
+        request_type = self._resolve_llm_request_type(request_kind)
+        llm_client = self._llm_chat_clients.get(request_type)
+        if llm_client is None:
+            llm_client = LLMServiceClient(
+                task_name="planner",
+                request_type=request_type,
+                session_id=self._session_id,
+            )
+            self._llm_chat_clients[request_type] = llm_client
+        return llm_client
+
     @staticmethod
     def _get_runtime_manager() -> Any:
         """Get the plugin runtime manager.
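
The new mapping gives each Maisaka request kind a stable `request_type` label, and `_get_llm_chat_client` keeps one `LLMServiceClient` per label instead of the former shared `self._llm_chat`. A minimal sketch of the resolution rule (the dict and fallback are copied from this diff; the asserted values just follow from the `dict.get` fallback):

    REQUEST_TYPE_BY_REQUEST_KIND = {
        "planner": "maisaka_planner",
        "timing_gate": "maisaka_timing_gate",
    }

    def resolve(request_kind: str) -> str:
        normalized = str(request_kind or "").strip()
        return REQUEST_TYPE_BY_REQUEST_KIND.get(
            normalized,
            f"maisaka_{normalized}" if normalized else "maisaka_planner",
        )

    assert resolve("planner") == "maisaka_planner"
    assert resolve("timing_gate") == "maisaka_timing_gate"
    assert resolve("sub_agent") == "maisaka_sub_agent"  # unknown kinds get a prefixed fallback
    assert resolve("") == "maisaka_planner"             # empty kind falls back to the planner bucket
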
@@ -321,7 +350,13 @@ class MaisakaChatLoopService:
 
     @staticmethod
     def _build_time_block() -> str:
-        """Build the current-time prompt block."""
+        """Build the static time prompt block."""
+
+        return "The current time is provided as a user message at the end of each request."
+
+    @staticmethod
+    def _build_current_time_user_message() -> str:
+        """Build the current-time message appended at the end of the request."""
 
         return f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
 
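
Splitting the time prompt is the cache-relevant part of this hunk: a timestamp embedded in the system prompt changes on every request, which invalidates any prefix-based prompt cache from the very first tokens, while a static sentence plus a trailing user message keeps everything before the tail byte-identical. A toy illustration (hypothetical prompt strings, not the real message layout):

    from datetime import datetime

    def old_prompt(now: datetime) -> list[str]:
        # timestamp baked into the system prompt: differs between requests
        return [f"system: current time is {now:%Y-%m-%d %H:%M:%S}", "user: hi"]

    def new_prompt(now: datetime) -> list[str]:
        # static system text; the timestamp rides along as the final user message
        return [
            "system: the current time is provided as the last user message",
            "user: hi",
            f"user: Current time: {now:%Y-%m-%d %H:%M:%S}",
        ]

    a = datetime(2024, 1, 1, 10, 0, 0)
    b = datetime(2024, 1, 1, 10, 0, 5)
    assert old_prompt(a)[0] != old_prompt(b)[0]      # cacheable prefix breaks immediately
    assert new_prompt(a)[:-1] == new_prompt(b)[:-1]  # everything before the tail is identical
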
@@ -446,7 +481,11 @@ class MaisakaChatLoopService:
             messages.append(llm_message)
 
         normalized_injected_messages: List[Message] = []
-        for injected_message in injected_user_messages or []:
+        final_user_messages = [
+            *(injected_user_messages or []),
+            self._build_current_time_user_message(),
+        ]
+        for injected_message in final_user_messages:
             normalized_message = str(injected_message or "").strip()
             if not normalized_message:
                 continue
@@ -458,31 +497,10 @@ class MaisakaChatLoopService:
             )
 
         if normalized_injected_messages:
-            insertion_index = self._resolve_injected_user_messages_insertion_index(messages)
-            messages[insertion_index:insertion_index] = normalized_injected_messages
+            messages.extend(normalized_injected_messages)
 
         return messages
-
-    @staticmethod
-    def _resolve_injected_user_messages_insertion_index(messages: Sequence[Message]) -> int:
-        """Compute where injected meta user messages are inserted in the request.
-
-        The rule is close to the deferred-attachment one:
-        - scan backward from the tail for the nearest stopping point;
-        - a stopping point is an assistant message or a tool result message;
-        - insert right after it once found;
-        - if there is no stopping point, fall back to just after the system message.
-        """
-
-        for index in range(len(messages) - 1, -1, -1):
-            message = messages[index]
-            if message.role in {RoleType.Assistant, RoleType.Tool}:
-                return index + 1
-
-        if messages and messages[0].role == RoleType.System:
-            return 1
-        return 0
 
     async def chat_loop_step(
         self,
         chat_history: List[LLMContextMessage],
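
Appending the injected meta messages, rather than splicing them in after the last assistant/tool stopping point, serves the same goal: a mid-list insertion shifts every later message and cuts the reusable prefix off at the insertion point, while an append leaves the existing sequence untouched. A toy comparison (hypothetical four-message history):

    def shared_prefix_len(a: list[str], b: list[str]) -> int:
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    history = ["system", "user_1", "assistant_1", "user_2"]
    injected = ["meta_note"]
    # Old behavior: insert after the last assistant/tool "stopping point" (index 2).
    inserted = history[:3] + injected + history[3:]
    # New behavior: append at the tail.
    appended = history + injected
    assert shared_prefix_len(history, inserted) == 3
    assert shared_prefix_len(history, appended) == len(history)
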
@@ -575,7 +593,8 @@ class MaisakaChatLoopService:
             tool_definitions=list(all_tools),
         )
 
-        generation_result = await self._llm_chat.generate_response_with_messages(
+        llm_chat = self._get_llm_chat_client(request_kind)
+        generation_result = await llm_chat.generate_response_with_messages(
             message_factory=message_factory,
             options=LLMGenerationOptions(
                 tool_options=all_tools if all_tools else None,
@@ -654,7 +673,11 @@ class MaisakaChatLoopService:
             chat_history,
             request_kind=request_kind,
         )
-        effective_context_size = max(1, int(max_context_size or global_config.chat.max_context_size))
+        base_context_size = max(1, int(max_context_size or global_config.chat.max_context_size))
+        effective_context_size = max(
+            base_context_size,
+            int(base_context_size * CONTEXT_SELECTION_CACHE_STABILITY_RATIO),
+        )
         selected_indices: List[int] = []
         counted_message_count = 0
 
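
With the widened window, the selector keeps up to `CONTEXT_SELECTION_CACHE_STABILITY_RATIO` times the configured size, so the head of the selected history moves only when older history actually gets trimmed instead of sliding forward on every new message. Worked numbers (assuming a configured size of 30):

    CONTEXT_SELECTION_CACHE_STABILITY_RATIO = 2.0

    max_context_size = 30  # hypothetical configured value
    base_context_size = max(1, int(max_context_size))
    effective_context_size = max(
        base_context_size,
        int(base_context_size * CONTEXT_SELECTION_CACHE_STABILITY_RATIO),
    )
    assert (base_context_size, effective_context_size) == (30, 60)
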
@@ -690,9 +713,11 @@ class MaisakaChatLoopService:
         selected_history, _ = normalize_tool_result_order(selected_history)
         tool_message_count = sum(1 for message in selected_history if isinstance(message, ToolResultMessage))
         normal_message_count = len(selected_history) - tool_message_count
+        stability_text = f"|cache_window {base_context_size}->{effective_context_size}"
         selection_reason = (
             f"Actually sending {len(selected_history)} messages"
             f"|normal {normal_message_count}|tool {tool_message_count}"
+            f"{stability_text}"
         )
         return (
             selected_history,
@@ -3,11 +3,11 @@
 from dataclasses import dataclass
 from math import ceil
 
-from .context_messages import AssistantMessage, LLMContextMessage
+from .context_messages import LLMContextMessage
 from .history_utils import drop_leading_orphan_tool_results, drop_orphan_tool_results, normalize_tool_result_order
 
-EARLY_TRIM_RATIO = 0.3
-TRIM_THRESHOLD_RATIO = 1.2
+TRIM_TARGET_RATIO = 1.0
+TRIM_THRESHOLD_RATIO = 2.0
 
 
 @dataclass(slots=True)
@@ -36,21 +36,16 @@ def process_chat_history_after_cycle(
     compact_removed_count = 0
     trim_threshold = ceil(max_context_size * TRIM_THRESHOLD_RATIO)
     if remaining_context_count > trim_threshold:
-        removed_early_message_count = _remove_early_history_messages(processed_history)
-        processed_history, removed_after_message_trim_count, moved_after_message_trim_count = (
-            _normalize_history_structure(processed_history)
+        target_context_count = max(1, int(max_context_size * TRIM_TARGET_RATIO))
+        removed_early_message_count = _trim_history_to_context_target(
+            processed_history,
+            target_context_count=target_context_count,
         )
-        removed_assistant_thought_count = _remove_early_assistant_thoughts(processed_history)
-        processed_history, removed_after_thought_trim_count, moved_after_thought_trim_count = (
-            _normalize_history_structure(processed_history)
+        processed_history, removed_after_trim_count, moved_after_trim_count = _normalize_history_structure(
+            processed_history
         )
-        compact_removed_count = (
-            removed_early_message_count
-            + removed_after_message_trim_count
-            + removed_assistant_thought_count
-            + removed_after_thought_trim_count
-        )
-        moved_tool_result_count += moved_after_message_trim_count + moved_after_thought_trim_count
+        compact_removed_count = removed_early_message_count + removed_after_trim_count
+        moved_tool_result_count += moved_after_trim_count
 
     remaining_context_count = sum(1 for message in processed_history if message.count_in_context)
     removed_count = normalized_removed_count + compact_removed_count
@@ -78,42 +73,27 @@ def _normalize_history_structure(
     )
 
 
-def _remove_early_history_messages(chat_history: list[LLMContextMessage]) -> int:
-    """Remove the earliest 30% of all history messages."""
+def _trim_history_to_context_target(
+    chat_history: list[LLMContextMessage],
+    *,
+    target_context_count: int,
+) -> int:
+    """Remove the earliest span of history until the normal context message count is within the target."""
 
-    remove_count = int(len(chat_history) * EARLY_TRIM_RATIO)
+    remaining_context_count = sum(1 for message in chat_history if message.count_in_context)
+    if remaining_context_count <= target_context_count:
+        return 0
+
+    remove_count = 0
+    for message in chat_history:
+        remove_count += 1
+        if message.count_in_context:
+            remaining_context_count -= 1
+            if remaining_context_count <= target_context_count:
+                break
+
     if remove_count <= 0:
         return 0
 
     del chat_history[:remove_count]
     return remove_count
-
-
-def _remove_early_assistant_thoughts(chat_history: list[LLMContextMessage]) -> int:
-    """Remove the earliest 30% of non-tool assistant thought content."""
-
-    candidate_indexes = [
-        index
-        for index, message in enumerate(chat_history)
-        if isinstance(message, AssistantMessage)
-        and not message.tool_calls
-        and message.source_kind != "perception"
-        and bool(message.content.strip())
-    ]
-    remove_count = int(len(candidate_indexes) * EARLY_TRIM_RATIO)
-    if remove_count <= 0:
-        return 0
-
-    removed_indexes = set(candidate_indexes[:remove_count])
-    filtered_history: list[LLMContextMessage] = []
-    removed_total = 0
-    for index, message in enumerate(chat_history):
-        if index in removed_indexes:
-            removed_total += 1
-            continue
-        filtered_history.append(message)
-
-    chat_history[:] = filtered_history
-    return removed_total
-
-
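
The single trim-to-target pass replaces the two ratio-based passes (earliest 30% of all messages, then earliest 30% of assistant thoughts). Combined with `TRIM_THRESHOLD_RATIO = 2.0` and `TRIM_TARGET_RATIO = 1.0`, trims become rarer but larger: nothing is dropped until the context grows to twice the configured size, and it is then cut back to exactly that size, so the retained head stays stable between trims. A minimal sketch of the new walk (`Msg` is a toy stand-in; the real `LLMContextMessage` carries more fields):

    from dataclasses import dataclass

    @dataclass
    class Msg:
        count_in_context: bool = True

    def trim_to_target(history: list[Msg], target: int) -> int:
        remaining = sum(1 for m in history if m.count_in_context)
        if remaining <= target:
            return 0
        remove = 0
        for m in history:
            remove += 1
            if m.count_in_context:
                remaining -= 1
                if remaining <= target:
                    break
        if remove <= 0:
            return 0
        del history[:remove]
        return remove

    history = [Msg(), Msg(False), Msg(), Msg(), Msg()]  # 4 counted, 1 uncounted
    assert trim_to_target(history, target=2) == 3       # head removed through the 2nd counted message
    assert sum(1 for m in history if m.count_in_context) == 2
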
@@ -52,7 +52,7 @@ if TYPE_CHECKING:
 
 logger = get_logger("maisaka_reasoning_engine")
 
-TIMING_GATE_CONTEXT_LIMIT = 24
+TIMING_GATE_CONTEXT_DROP_HEAD_RATIO = 0.7
 TIMING_GATE_MAX_TOKENS = 384
 TIMING_GATE_MAX_ATTEMPTS = 3
 TIMING_GATE_TOOL_NAMES = {"continue", "no_reply", "wait"}
@@ -124,7 +124,6 @@ class MaisakaReasoningEngine:
     async def _run_timing_gate_sub_agent(
         self,
         *,
-        context_message_limit: int,
         system_prompt: str,
         tool_definitions: list[dict[str, Any]],
     ) -> Any:
@@ -134,7 +133,10 @@ class MaisakaReasoningEngine:
         """
 
         return await self._runtime.run_sub_agent(
-            context_message_limit=context_message_limit,
+            context_message_limit=self._runtime._max_context_size,
+            drop_head_context_count=int(
+                self._runtime._max_context_size * TIMING_GATE_CONTEXT_DROP_HEAD_RATIO,
+            ),
             system_prompt=system_prompt,
             request_kind="timing_gate",
             interrupt_flag=None,
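
The timing gate previously selected its own fixed 24-message window, a tail that slid on every turn; it now reuses the planner's full selection and drops a fixed share of the head, so its context is always a suffix of the planner's window. The arithmetic for a few hypothetical `_max_context_size` values (in the real code the drop counts only `count_in_context` messages):

    TIMING_GATE_CONTEXT_DROP_HEAD_RATIO = 0.7

    for max_context_size in (24, 40, 80):  # hypothetical planner window sizes
        drop_head = int(max_context_size * TIMING_GATE_CONTEXT_DROP_HEAD_RATIO)
        kept_tail = max_context_size - drop_head
        print(max_context_size, drop_head, kept_tail)
    # 24 -> drop 16, keep 8; 40 -> drop 28, keep 12; 80 -> drop 56, keep 24
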
@@ -255,7 +257,6 @@ class MaisakaReasoningEngine:
         invalid_tool_text = ""
         for attempt_index in range(TIMING_GATE_MAX_ATTEMPTS):
             response = await self._run_timing_gate_sub_agent(
-                context_message_limit=TIMING_GATE_CONTEXT_LIMIT,
                 system_prompt=self._build_timing_gate_system_prompt(),
                 tool_definitions=get_timing_tools(),
             )
@@ -45,6 +45,7 @@ from .context_messages import (
 from .display.display_utils import build_tool_call_summary_lines, format_token_count
 from .display.prompt_cli_renderer import PromptCLIVisualizer
 from .display.stage_status_board import remove_stage_status, update_stage_status
+from .history_utils import drop_leading_orphan_tool_results
 from .reasoning_engine import MaisakaReasoningEngine
 from .reply_effect import ReplyEffectTracker
 from .reply_effect.image_utils import extract_visual_attachments_from_sequence
@@ -583,6 +584,7 @@ class MaisakaHeartFlowChatting:
         self,
         *,
         context_message_limit: int,
+        drop_head_context_count: int = 0,
         system_prompt: str,
         request_kind: str = "sub_agent",
         extra_messages: Optional[Sequence[LLMContextMessage]] = None,
@@ -598,7 +600,10 @@ class MaisakaHeartFlowChatting:
             request_kind=request_kind,
             max_context_size=context_message_limit,
         )
-        sub_agent_history = list(selected_history)
+        sub_agent_history = self._drop_head_context_messages(
+            selected_history,
+            drop_head_context_count,
+        )
         if extra_messages:
             sub_agent_history.extend(list(extra_messages))
 
@@ -616,6 +621,31 @@ class MaisakaHeartFlowChatting:
             tool_definitions=[] if tool_definitions is None else tool_definitions,
         )
 
+    @staticmethod
+    def _drop_head_context_messages(
+        chat_history: Sequence[LLMContextMessage],
+        drop_context_count: int,
+    ) -> list[LLMContextMessage]:
+        """Drop the given number of normal context messages from the head of the selected context."""
+
+        if drop_context_count <= 0:
+            return list(chat_history)
+
+        first_kept_index = 0
+        dropped_context_count = 0
+        while (
+            first_kept_index < len(chat_history)
+            and dropped_context_count < drop_context_count
+        ):
+            message = chat_history[first_kept_index]
+            if message.count_in_context:
+                dropped_context_count += 1
+            first_kept_index += 1
+
+        trimmed_history = list(chat_history[first_kept_index:])
+        trimmed_history, _ = drop_leading_orphan_tool_results(trimmed_history)
+        return trimmed_history
+
     async def _run_reply_effect_judge(self, prompt: str) -> str:
         """Run the one-off LLM review used by the reply-effect observer."""
 
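
`_drop_head_context_messages` walks from the head until it has skipped the requested number of counted (`count_in_context`) messages, keeps the rest, and then strips leading orphan tool results so the window never opens with a tool result whose originating call was dropped. A sketch of the counting walk (toy dicts in place of `LLMContextMessage`; the orphan cleanup is left to `drop_leading_orphan_tool_results`):

    def first_kept_index(history: list[dict], drop_context_count: int) -> int:
        index = dropped = 0
        while index < len(history) and dropped < drop_context_count:
            if history[index]["count_in_context"]:
                dropped += 1
            index += 1
        return index

    history = [
        {"count_in_context": True},
        {"count_in_context": False},  # e.g. a message excluded from the context budget
        {"count_in_context": True},
        {"count_in_context": True},
    ]
    # Dropping 2 counted messages also sweeps along the uncounted one sitting between them.
    assert first_kept_index(history, drop_context_count=2) == 3
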