feat(plugin-runtime): add plugin isolation IPC infrastructure

- Protocol layer: Envelope model with Pydantic schema, MsgPack/JSON codecs, unified error codes
- Transport layer: cross-platform IPC abstraction with 4-byte length-prefixed framing (UDS + TCP fallback)
- Host: RPC server, policy engine, circuit breaker, capability service, supervisor with hot-reload
- Runner: RPC client, plugin loader, process entry point
- Tests: 16 passing tests covering protocol, transport, host, and E2E handshake
This commit is contained in:
DrSmoothl
2026-03-06 02:01:30 +08:00
parent 10d5c81268
commit 61dc15a513
22 changed files with 2695 additions and 1 deletions

View File

@@ -0,0 +1 @@
# Host 端 - Supervisor、RPC Server、策略引擎、路由

View File

@@ -0,0 +1,108 @@
"""能力服务层
Host 端实现的能力服务,处理来自插件的 cap.* 请求。
每个能力方法被注册到 RPC Server接收 Runner 转发的请求并执行实际操作。
"""
from typing import Any, Callable, Awaitable
import logging
from src.plugin_runtime.protocol.envelope import (
CapabilityRequestPayload,
CapabilityResponsePayload,
Envelope,
)
from src.plugin_runtime.protocol.errors import ErrorCode, RPCError
from src.plugin_runtime.host.policy_engine import PolicyEngine
logger = logging.getLogger("plugin_runtime.host.capability_service")
# 能力实现函数类型: (plugin_id, capability, args) -> result
CapabilityImpl = Callable[[str, str, dict[str, Any]], Awaitable[Any]]
class CapabilityService:
"""能力服务
负责:
1. 注册能力实现
2. 接收插件的能力调用请求
3. 通过策略引擎校验权限和限流
4. 执行实际操作并返回结果
"""
def __init__(self, policy_engine: PolicyEngine):
self._policy = policy_engine
# capability_name -> implementation
self._implementations: dict[str, CapabilityImpl] = {}
def register_capability(self, name: str, impl: CapabilityImpl) -> None:
"""注册一个能力实现
Args:
name: 能力名称,如 "send.text", "db.query", "llm.generate"
impl: 实现函数
"""
self._implementations[name] = impl
logger.debug(f"注册能力实现: {name}")
async def handle_capability_request(self, envelope: Envelope) -> Envelope:
"""处理能力调用请求(作为 RPC Server 的 method handler
从 envelope 中提取 capability 名称和参数,
校验权限后调用对应实现。
"""
plugin_id = envelope.plugin_id
try:
req = CapabilityRequestPayload.model_validate(envelope.payload)
except Exception as e:
return envelope.make_error_response(
ErrorCode.E_BAD_PAYLOAD.value,
f"能力调用 payload 格式错误: {e}",
)
capability = req.capability
# 1. 权限校验
allowed, reason = self._policy.check_capability(plugin_id, capability)
if not allowed:
return envelope.make_error_response(
ErrorCode.E_CAPABILITY_DENIED.value,
reason,
)
# 2. 限流校验
allowed, reason = self._policy.check_rate_limit(plugin_id)
if not allowed:
return envelope.make_error_response(
ErrorCode.E_BACKPRESSURE.value,
reason,
)
# 3. 查找实现
impl = self._implementations.get(capability)
if impl is None:
return envelope.make_error_response(
ErrorCode.E_METHOD_NOT_ALLOWED.value,
f"未注册的能力: {capability}",
)
# 4. 执行
try:
result = await impl(plugin_id, capability, req.args)
resp_payload = CapabilityResponsePayload(success=True, result=result)
return envelope.make_response(payload=resp_payload.model_dump())
except RPCError as e:
return envelope.make_error_response(e.code.value, e.message, e.details)
except Exception as e:
logger.error(f"能力 {capability} 执行异常: {e}", exc_info=True)
return envelope.make_error_response(
ErrorCode.E_CAPABILITY_FAILED.value,
str(e),
)
def list_capabilities(self) -> list[str]:
"""列出所有已注册的能力"""
return list(self._implementations.keys())

View File

@@ -0,0 +1,105 @@
"""熔断器
为每个插件提供熔断保护,连续失败超过阈值后临时禁用。
支持指数退避恢复。
"""
from enum import Enum
import time
class CircuitState(str, Enum):
CLOSED = "closed" # 正常工作
OPEN = "open" # 熔断(拒绝所有调用)
HALF_OPEN = "half_open" # 探测恢复
class CircuitBreaker:
"""单个插件的熔断器"""
def __init__(
self,
failure_threshold: int = 5,
recovery_timeout_sec: float = 30.0,
max_recovery_timeout_sec: float = 300.0,
):
self.failure_threshold = failure_threshold
self.base_recovery_timeout = recovery_timeout_sec
self.max_recovery_timeout = max_recovery_timeout_sec
self._state = CircuitState.CLOSED
self._failure_count = 0
self._last_failure_time = 0.0
self._consecutive_opens = 0 # 用于指数退避
@property
def state(self) -> CircuitState:
if self._state == CircuitState.OPEN:
# 检查是否可以进入半开状态
elapsed = time.monotonic() - self._last_failure_time
recovery_timeout = min(
self.base_recovery_timeout * (2 ** self._consecutive_opens),
self.max_recovery_timeout,
)
if elapsed >= recovery_timeout:
self._state = CircuitState.HALF_OPEN
return self._state
def allow_request(self) -> bool:
"""是否允许通过请求"""
state = self.state
if state == CircuitState.CLOSED:
return True
if state == CircuitState.HALF_OPEN:
return True # 允许一次试探
return False # OPEN 状态拒绝
def record_success(self) -> None:
"""记录一次成功调用"""
if self._state == CircuitState.HALF_OPEN:
# 半开状态成功 -> 关闭熔断
self._state = CircuitState.CLOSED
self._failure_count = 0
self._consecutive_opens = 0
elif self._state == CircuitState.CLOSED:
self._failure_count = 0
def record_failure(self) -> None:
"""记录一次失败调用"""
self._failure_count += 1
self._last_failure_time = time.monotonic()
if self._state == CircuitState.HALF_OPEN:
# 半开状态失败 -> 重新开启熔断
self._state = CircuitState.OPEN
self._consecutive_opens += 1
elif self._failure_count >= self.failure_threshold:
self._state = CircuitState.OPEN
self._consecutive_opens += 1
def reset(self) -> None:
"""重置熔断器"""
self._state = CircuitState.CLOSED
self._failure_count = 0
self._consecutive_opens = 0
class CircuitBreakerRegistry:
"""熔断器注册表,为每个插件维护独立的熔断器"""
def __init__(self, **default_kwargs):
self._breakers: dict[str, CircuitBreaker] = {}
self._default_kwargs = default_kwargs
def get(self, plugin_id: str) -> CircuitBreaker:
if plugin_id not in self._breakers:
self._breakers[plugin_id] = CircuitBreaker(**self._default_kwargs)
return self._breakers[plugin_id]
def remove(self, plugin_id: str) -> None:
self._breakers.pop(plugin_id, None)
def reset_all(self) -> None:
for breaker in self._breakers.values():
breaker.reset()

View File

@@ -0,0 +1,125 @@
"""策略引擎
负责能力授权校验、限流、配额管理。
每个插件在 manifest 中声明能力需求Host 启动时签发能力令牌。
"""
from dataclasses import dataclass, field
import time
@dataclass
class CapabilityToken:
"""能力令牌
描述某个插件在当前会话中被授予的能力和资源限制。
"""
plugin_id: str
generation: int
capabilities: set[str] = field(default_factory=set)
qps_limit: int = 20
burst_limit: int = 50
daily_token_limit: int = 200000
max_payload_kb: int = 256
# 运行时统计
_call_count: int = field(default=0, init=False, repr=False)
_window_start: float = field(default_factory=time.monotonic, init=False, repr=False)
_window_calls: int = field(default=0, init=False, repr=False)
class PolicyEngine:
"""策略引擎
管理所有插件的能力令牌,提供授权校验与限流决策。
"""
def __init__(self):
# plugin_id -> CapabilityToken
self._tokens: dict[str, CapabilityToken] = {}
def register_plugin(
self,
plugin_id: str,
generation: int,
capabilities: list[str],
limits: dict | None = None,
) -> CapabilityToken:
"""为插件签发能力令牌"""
limits = limits or {}
token = CapabilityToken(
plugin_id=plugin_id,
generation=generation,
capabilities=set(capabilities),
qps_limit=limits.get("qps", 20),
burst_limit=limits.get("burst", 50),
daily_token_limit=limits.get("daily_tokens", 200000),
max_payload_kb=limits.get("max_payload_kb", 256),
)
self._tokens[plugin_id] = token
return token
def revoke_plugin(self, plugin_id: str) -> None:
"""撤销插件的能力令牌"""
self._tokens.pop(plugin_id, None)
def check_capability(self, plugin_id: str, capability: str) -> tuple[bool, str]:
"""检查插件是否有权调用某项能力
Returns:
(allowed, reason)
"""
token = self._tokens.get(plugin_id)
if token is None:
return False, f"插件 {plugin_id} 未注册能力令牌"
if capability not in token.capabilities:
return False, f"插件 {plugin_id} 未获授权能力: {capability}"
return True, ""
def check_rate_limit(self, plugin_id: str) -> tuple[bool, str]:
"""检查插件是否超过调用频率限制(滑动窗口)
Returns:
(allowed, reason)
"""
token = self._tokens.get(plugin_id)
if token is None:
return False, f"插件 {plugin_id} 未注册"
now = time.monotonic()
elapsed = now - token._window_start
# 每秒重置窗口
if elapsed >= 1.0:
token._window_start = now
token._window_calls = 0
token._window_calls += 1
if token._window_calls > token.burst_limit:
return False, f"插件 {plugin_id} 超过突发限制 ({token.burst_limit}/s)"
return True, ""
def check_payload_size(self, plugin_id: str, payload_size_bytes: int) -> tuple[bool, str]:
"""检查 payload 大小是否在限制内"""
token = self._tokens.get(plugin_id)
if token is None:
return False, f"插件 {plugin_id} 未注册"
max_bytes = token.max_payload_kb * 1024
if payload_size_bytes > max_bytes:
return False, f"payload 大小 {payload_size_bytes} 超过限制 {max_bytes}"
return True, ""
def get_token(self, plugin_id: str) -> CapabilityToken | None:
"""获取插件的能力令牌"""
return self._tokens.get(plugin_id)
def list_plugins(self) -> list[str]:
"""列出所有已注册的插件"""
return list(self._tokens.keys())

View File

@@ -0,0 +1,357 @@
"""Host 端 RPC Server
负责:
1. 监听 Runner 连接
2. 处理握手runner.hello
3. 分发调用请求给 Runner / 处理 Runner 的能力调用
4. 请求-响应关联与超时管理
"""
from typing import Any, Callable, Awaitable
import asyncio
import logging
import secrets
from src.plugin_runtime.protocol.codec import Codec, create_codec
from src.plugin_runtime.protocol.envelope import (
PROTOCOL_VERSION,
MIN_SDK_VERSION,
MAX_SDK_VERSION,
Envelope,
HelloPayload,
HelloResponsePayload,
MessageType,
RequestIdGenerator,
)
from src.plugin_runtime.protocol.errors import ErrorCode, RPCError
from src.plugin_runtime.transport.base import Connection, TransportServer
logger = logging.getLogger("plugin_runtime.host.rpc_server")
# RPC 方法处理器类型
MethodHandler = Callable[[Envelope], Awaitable[Envelope]]
class RPCServer:
"""Host 端 RPC 服务器
管理与 Runner 的 IPC 连接,处理双向 RPC 调用。
"""
def __init__(
self,
transport: TransportServer,
session_token: str | None = None,
codec: Codec | None = None,
send_queue_size: int = 128,
):
self._transport = transport
self._session_token = session_token or secrets.token_hex(32)
self._codec = codec or create_codec()
self._send_queue_size = send_queue_size
self._id_gen = RequestIdGenerator()
self._connection: Connection | None = None # 当前活跃的 Runner 连接
self._runner_id: str | None = None
self._runner_generation: int = 0
# 方法处理器注册表
self._method_handlers: dict[str, MethodHandler] = {}
# 等待响应的 pending 请求: request_id -> Future
self._pending_requests: dict[int, asyncio.Future] = {}
# 发送队列(背压控制)
self._send_queue: asyncio.Queue | None = None
# 运行状态
self._running = False
self._tasks: list[asyncio.Task] = []
@property
def session_token(self) -> str:
return self._session_token
@property
def is_connected(self) -> bool:
return self._connection is not None and not self._connection.is_closed
def register_method(self, method: str, handler: MethodHandler) -> None:
"""注册 RPC 方法处理器"""
self._method_handlers[method] = handler
async def start(self) -> None:
"""启动 RPC 服务器"""
self._running = True
self._send_queue = asyncio.Queue(maxsize=self._send_queue_size)
await self._transport.start(self._handle_connection)
logger.info(f"RPC Server 已启动,监听地址: {self._transport.get_address()}")
async def stop(self) -> None:
"""停止 RPC 服务器"""
self._running = False
# 取消所有 pending 请求
for req_id, future in self._pending_requests.items():
if not future.done():
future.set_exception(RPCError(ErrorCode.E_TIMEOUT, "服务器关闭"))
self._pending_requests.clear()
# 取消后台任务
for task in self._tasks:
task.cancel()
self._tasks.clear()
# 关闭连接
if self._connection:
await self._connection.close()
self._connection = None
await self._transport.stop()
logger.info("RPC Server 已停止")
async def send_request(
self,
method: str,
plugin_id: str = "",
payload: dict[str, Any] | None = None,
timeout_ms: int = 30000,
) -> Envelope:
"""向 Runner 发送 RPC 请求并等待响应
Args:
method: RPC 方法名
plugin_id: 目标插件 ID
payload: 请求数据
timeout_ms: 超时时间(ms)
Returns:
响应 Envelope
Raises:
RPCError: 调用失败
"""
if not self.is_connected:
raise RPCError(ErrorCode.E_PLUGIN_CRASHED, "Runner 未连接")
request_id = self._id_gen.next()
envelope = Envelope(
request_id=request_id,
message_type=MessageType.REQUEST,
method=method,
plugin_id=plugin_id,
generation=self._runner_generation,
timeout_ms=timeout_ms,
payload=payload or {},
)
# 背压检查
if self._send_queue and self._send_queue.full():
raise RPCError(ErrorCode.E_BACKPRESSURE, "发送队列已满")
# 注册 pending future
loop = asyncio.get_event_loop()
future: asyncio.Future[Envelope] = loop.create_future()
self._pending_requests[request_id] = future
try:
# 发送请求
data = self._codec.encode_envelope(envelope)
await self._connection.send_frame(data)
# 等待响应
timeout_sec = timeout_ms / 1000.0
response = await asyncio.wait_for(future, timeout=timeout_sec)
return response
except asyncio.TimeoutError:
self._pending_requests.pop(request_id, None)
raise RPCError(ErrorCode.E_TIMEOUT, f"请求 {method} 超时 ({timeout_ms}ms)")
except Exception as e:
self._pending_requests.pop(request_id, None)
if isinstance(e, RPCError):
raise
raise RPCError(ErrorCode.E_UNKNOWN, str(e))
async def send_event(self, method: str, plugin_id: str = "", payload: dict[str, Any] | None = None) -> None:
"""向 Runner 发送单向事件(不等待响应)"""
if not self.is_connected:
return
request_id = self._id_gen.next()
envelope = Envelope(
request_id=request_id,
message_type=MessageType.EVENT,
method=method,
plugin_id=plugin_id,
generation=self._runner_generation,
payload=payload or {},
)
data = self._codec.encode_envelope(envelope)
await self._connection.send_frame(data)
# ─── 内部方法 ──────────────────────────────────────────────
async def _handle_connection(self, conn: Connection) -> None:
"""处理新的 Runner 连接"""
logger.info("收到 Runner 连接")
# 第一条消息必须是 runner.hello 握手
try:
handshake_ok = await self._handle_handshake(conn)
if not handshake_ok:
await conn.close()
return
except Exception as e:
logger.error(f"握手失败: {e}")
await conn.close()
return
# 握手成功,保存连接
self._connection = conn
logger.info(f"Runner 握手成功: runner_id={self._runner_id}, generation={self._runner_generation}")
# 启动消息接收循环
try:
await self._recv_loop(conn)
except Exception as e:
logger.error(f"连接异常断开: {e}")
finally:
self._connection = None
self._runner_id = None
async def _handle_handshake(self, conn: Connection) -> bool:
"""处理 runner.hello 握手"""
# 接收握手请求
data = await asyncio.wait_for(conn.recv_frame(), timeout=10.0)
envelope = self._codec.decode_envelope(data)
if envelope.method != "runner.hello":
logger.error(f"期望 runner.hello收到 {envelope.method}")
error_resp = envelope.make_error_response(
ErrorCode.E_PROTOCOL_MISMATCH.value,
"首条消息必须为 runner.hello",
)
await conn.send_frame(self._codec.encode_envelope(error_resp))
return False
# 解析握手 payload
hello = HelloPayload.model_validate(envelope.payload)
# 校验会话令牌
if hello.session_token != self._session_token:
logger.error("会话令牌不匹配")
resp_payload = HelloResponsePayload(
accepted=False,
reason="会话令牌无效",
)
resp = envelope.make_response(payload=resp_payload.model_dump())
await conn.send_frame(self._codec.encode_envelope(resp))
return False
# 校验 SDK 版本
if not self._check_sdk_version(hello.sdk_version):
logger.error(f"SDK 版本不兼容: {hello.sdk_version}")
resp_payload = HelloResponsePayload(
accepted=False,
reason=f"SDK 版本 {hello.sdk_version} 不在支持范围 [{MIN_SDK_VERSION}, {MAX_SDK_VERSION}]",
)
resp = envelope.make_response(payload=resp_payload.model_dump())
await conn.send_frame(self._codec.encode_envelope(resp))
return False
# 握手成功
self._runner_id = hello.runner_id
self._runner_generation += 1
resp_payload = HelloResponsePayload(
accepted=True,
host_version=PROTOCOL_VERSION,
assigned_generation=self._runner_generation,
)
resp = envelope.make_response(payload=resp_payload.model_dump())
await conn.send_frame(self._codec.encode_envelope(resp))
return True
async def _recv_loop(self, conn: Connection) -> None:
"""消息接收主循环"""
while self._running and not conn.is_closed:
try:
data = await conn.recv_frame()
except (asyncio.IncompleteReadError, ConnectionError):
logger.info("Runner 连接已断开")
break
except Exception as e:
logger.error(f"接收帧失败: {e}")
break
try:
envelope = self._codec.decode_envelope(data)
except Exception as e:
logger.error(f"解码消息失败: {e}")
continue
# 分发消息
if envelope.is_response():
self._handle_response(envelope)
elif envelope.is_request():
# 异步处理请求Runner 发来的能力调用)
task = asyncio.create_task(self._handle_request(envelope, conn))
self._tasks.append(task)
task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
elif envelope.is_event():
task = asyncio.create_task(self._handle_event(envelope))
self._tasks.append(task)
task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
def _handle_response(self, envelope: Envelope) -> None:
"""处理来自 Runner 的响应"""
future = self._pending_requests.pop(envelope.request_id, None)
if future and not future.done():
if envelope.error:
future.set_exception(RPCError.from_dict(envelope.error))
else:
future.set_result(envelope)
async def _handle_request(self, envelope: Envelope, conn: Connection) -> None:
"""处理来自 Runner 的请求(通常是能力调用 cap.*"""
handler = self._method_handlers.get(envelope.method)
if handler is None:
error_resp = envelope.make_error_response(
ErrorCode.E_METHOD_NOT_ALLOWED.value,
f"未注册的方法: {envelope.method}",
)
await conn.send_frame(self._codec.encode_envelope(error_resp))
return
try:
response = await handler(envelope)
await conn.send_frame(self._codec.encode_envelope(response))
except RPCError as e:
error_resp = envelope.make_error_response(e.code.value, e.message, e.details)
await conn.send_frame(self._codec.encode_envelope(error_resp))
except Exception as e:
logger.error(f"处理请求 {envelope.method} 异常: {e}", exc_info=True)
error_resp = envelope.make_error_response(ErrorCode.E_UNKNOWN.value, str(e))
await conn.send_frame(self._codec.encode_envelope(error_resp))
async def _handle_event(self, envelope: Envelope) -> None:
"""处理来自 Runner 的事件"""
handler = self._method_handlers.get(envelope.method)
if handler:
try:
await handler(envelope)
except Exception as e:
logger.error(f"处理事件 {envelope.method} 异常: {e}", exc_info=True)
@staticmethod
def _check_sdk_version(sdk_version: str) -> bool:
"""检查 SDK 版本是否在支持范围内"""
try:
sdk_parts = [int(x) for x in sdk_version.split(".")]
min_parts = [int(x) for x in MIN_SDK_VERSION.split(".")]
max_parts = [int(x) for x in MAX_SDK_VERSION.split(".")]
return min_parts <= sdk_parts <= max_parts
except (ValueError, AttributeError):
return False

View File

@@ -0,0 +1,315 @@
"""Supervisor - 插件生命周期管理
负责:
1. 拉起 Runner 子进程
2. 健康检查
3. 熔断与恢复
4. 代码热重载generation 切换)
5. 优雅关停
"""
from typing import Any
import asyncio
import logging
import os
import sys
from src.plugin_runtime.host.capability_service import CapabilityService
from src.plugin_runtime.host.circuit_breaker import CircuitBreakerRegistry
from src.plugin_runtime.host.policy_engine import PolicyEngine
from src.plugin_runtime.host.rpc_server import RPCServer
from src.plugin_runtime.protocol.envelope import (
Envelope,
HealthPayload,
RegisterComponentsPayload,
ShutdownPayload,
)
from src.plugin_runtime.protocol.errors import ErrorCode, RPCError
from src.plugin_runtime.transport.factory import create_transport_server
logger = logging.getLogger("plugin_runtime.host.supervisor")
class PluginSupervisor:
"""插件 Supervisor
Host 端的核心管理器,负责整个插件 Runner 进程的生命周期。
"""
def __init__(
self,
plugin_dirs: list[str] | None = None,
socket_path: str | None = None,
health_check_interval_sec: float = 30.0,
use_json_codec: bool = False,
):
self._plugin_dirs = plugin_dirs or []
self._health_interval = health_check_interval_sec
# 基础设施
self._transport = create_transport_server(socket_path=socket_path)
self._policy = PolicyEngine()
self._breakers = CircuitBreakerRegistry()
self._capability_service = CapabilityService(self._policy)
# 编解码
from src.plugin_runtime.protocol.codec import create_codec
codec = create_codec(use_json=use_json_codec)
self._rpc_server = RPCServer(
transport=self._transport,
codec=codec,
)
# Runner 子进程
self._runner_process: asyncio.subprocess.Process | None = None
self._runner_generation: int = 0
# 已注册的插件组件信息
self._registered_plugins: dict[str, RegisterComponentsPayload] = {}
# 后台任务
self._health_task: asyncio.Task | None = None
self._running = False
# 注册内部 RPC 方法
self._register_internal_methods()
@property
def policy_engine(self) -> PolicyEngine:
return self._policy
@property
def capability_service(self) -> CapabilityService:
return self._capability_service
@property
def rpc_server(self) -> RPCServer:
return self._rpc_server
async def start(self) -> None:
"""启动 Supervisor
1. 启动 RPC Server
2. 拉起 Runner 子进程
3. 启动健康检查
"""
self._running = True
# 启动 RPC Server
await self._rpc_server.start()
# 拉起 Runner 进程
await self._spawn_runner()
# 启动健康检查
self._health_task = asyncio.create_task(self._health_check_loop())
logger.info("PluginSupervisor 已启动")
async def stop(self) -> None:
"""停止 Supervisor"""
self._running = False
# 停止健康检查
if self._health_task:
self._health_task.cancel()
self._health_task = None
# 优雅关停 Runner
await self._shutdown_runner()
# 停止 RPC Server
await self._rpc_server.stop()
logger.info("PluginSupervisor 已停止")
async def invoke_plugin(
self,
method: str,
plugin_id: str,
component_name: str,
args: dict[str, Any] | None = None,
timeout_ms: int = 30000,
) -> Envelope:
"""调用插件组件
由主进程业务逻辑调用,通过 RPC 转发给 Runner。
"""
# 熔断检查
breaker = self._breakers.get(plugin_id)
if not breaker.allow_request():
raise RPCError(ErrorCode.E_PLUGIN_CRASHED, f"插件 {plugin_id} 已被熔断")
try:
response = await self._rpc_server.send_request(
method=method,
plugin_id=plugin_id,
payload={
"component_name": component_name,
"args": args or {},
},
timeout_ms=timeout_ms,
)
breaker.record_success()
return response
except RPCError:
breaker.record_failure()
raise
async def reload_plugins(self, reason: str = "manual") -> None:
"""热重载所有插件(进程级 generation 切换)
1. 拉起新 Runner
2. 等待新 Runner 完成注册和健康检查
3. 关停旧 Runner
"""
logger.info(f"开始热重载插件,原因: {reason}")
# 保存旧进程引用
old_process = self._runner_process
# 拉起新 Runner
await self._spawn_runner()
# 等待新 Runner 连接并完成握手
for _ in range(30): # 最多等待 30 秒
if self._rpc_server.is_connected:
break
await asyncio.sleep(1.0)
else:
logger.error("新 Runner 连接超时,回滚")
# 回滚:终止新进程
if self._runner_process and self._runner_process != old_process:
self._runner_process.terminate()
self._runner_process = old_process
return
# 健康检查
try:
resp = await self._rpc_server.send_request("plugin.health", timeout_ms=5000)
health = HealthPayload.model_validate(resp.payload)
if not health.healthy:
raise RPCError(ErrorCode.E_PLUGIN_CRASHED, "新 Runner 健康检查失败")
except Exception as e:
logger.error(f"新 Runner 健康检查失败: {e},回滚")
if self._runner_process and self._runner_process != old_process:
self._runner_process.terminate()
self._runner_process = old_process
return
# 关停旧 Runner
if old_process and old_process.returncode is None:
try:
old_process.terminate()
await asyncio.wait_for(old_process.wait(), timeout=10.0)
except asyncio.TimeoutError:
old_process.kill()
logger.info("热重载完成")
# ─── 内部方法 ──────────────────────────────────────────────
def _register_internal_methods(self) -> None:
"""注册 Host 端的 RPC 方法处理器"""
# Runner -> Host 的能力调用统一走 capability_service
self._rpc_server.register_method("cap.request", self._capability_service.handle_capability_request)
# 插件注册
self._rpc_server.register_method("plugin.register_components", self._handle_register_components)
async def _handle_register_components(self, envelope: Envelope) -> Envelope:
"""处理插件组件注册请求"""
try:
reg = RegisterComponentsPayload.model_validate(envelope.payload)
except Exception as e:
return envelope.make_error_response(ErrorCode.E_BAD_PAYLOAD.value, str(e))
# 记录注册信息
self._registered_plugins[reg.plugin_id] = reg
# 在策略引擎中注册插件
self._policy.register_plugin(
plugin_id=reg.plugin_id,
generation=envelope.generation,
capabilities=reg.capabilities_required,
)
logger.info(
f"插件 {reg.plugin_id} v{reg.plugin_version} 注册成功,"
f"组件数: {len(reg.components)}, 能力需求: {reg.capabilities_required}"
)
return envelope.make_response(payload={"accepted": True})
async def _spawn_runner(self) -> None:
"""拉起 Runner 子进程"""
runner_module = "src.plugin_runtime.runner.runner_main"
address = self._transport.get_address()
token = self._rpc_server.session_token
env = os.environ.copy()
env["MAIBOT_IPC_ADDRESS"] = address
env["MAIBOT_SESSION_TOKEN"] = token
env["MAIBOT_PLUGIN_DIRS"] = os.pathsep.join(self._plugin_dirs)
self._runner_process = await asyncio.create_subprocess_exec(
sys.executable, "-m", runner_module,
env=env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
self._runner_generation += 1
logger.info(f"Runner 子进程已启动: pid={self._runner_process.pid}, generation={self._runner_generation}")
async def _shutdown_runner(self) -> None:
"""优雅关停 Runner"""
if not self._runner_process or self._runner_process.returncode is not None:
return
# 发送 prepare_shutdown
try:
if self._rpc_server.is_connected:
shutdown_payload = ShutdownPayload(reason="host_shutdown", drain_timeout_ms=5000)
await self._rpc_server.send_request(
"plugin.prepare_shutdown",
payload=shutdown_payload.model_dump(),
timeout_ms=5000,
)
await self._rpc_server.send_request(
"plugin.shutdown",
payload=shutdown_payload.model_dump(),
timeout_ms=5000,
)
except Exception as e:
logger.warning(f"发送关停命令失败: {e}")
# 等待进程退出
try:
await asyncio.wait_for(self._runner_process.wait(), timeout=10.0)
except asyncio.TimeoutError:
logger.warning("Runner 未在超时内退出,强制终止")
self._runner_process.kill()
await self._runner_process.wait()
async def _health_check_loop(self) -> None:
"""周期性健康检查"""
while self._running:
await asyncio.sleep(self._health_interval)
if not self._rpc_server.is_connected:
logger.warning("Runner 未连接,跳过健康检查")
continue
try:
resp = await self._rpc_server.send_request("plugin.health", timeout_ms=5000)
health = HealthPayload.model_validate(resp.payload)
if not health.healthy:
logger.warning(f"Runner 健康检查异常: {health}")
except RPCError as e:
logger.error(f"健康检查失败: {e}")
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"健康检查异常: {e}")