feat(plugin-runtime): add plugin isolation IPC infrastructure
- Protocol layer: Envelope model with Pydantic schema, MsgPack/JSON codecs, unified error codes - Transport layer: cross-platform IPC abstraction with 4-byte length-prefixed framing (UDS + TCP fallback) - Host: RPC server, policy engine, circuit breaker, capability service, supervisor with hot-reload - Runner: RPC client, plugin loader, process entry point - Tests: 16 passing tests covering protocol, transport, host, and E2E handshake
This commit is contained in:
1
src/plugin_runtime/host/__init__.py
Normal file
1
src/plugin_runtime/host/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Host 端 - Supervisor、RPC Server、策略引擎、路由
|
||||
108
src/plugin_runtime/host/capability_service.py
Normal file
108
src/plugin_runtime/host/capability_service.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""能力服务层
|
||||
|
||||
Host 端实现的能力服务,处理来自插件的 cap.* 请求。
|
||||
每个能力方法被注册到 RPC Server,接收 Runner 转发的请求并执行实际操作。
|
||||
"""
|
||||
|
||||
from typing import Any, Callable, Awaitable
|
||||
|
||||
import logging
|
||||
|
||||
from src.plugin_runtime.protocol.envelope import (
|
||||
CapabilityRequestPayload,
|
||||
CapabilityResponsePayload,
|
||||
Envelope,
|
||||
)
|
||||
from src.plugin_runtime.protocol.errors import ErrorCode, RPCError
|
||||
from src.plugin_runtime.host.policy_engine import PolicyEngine
|
||||
|
||||
logger = logging.getLogger("plugin_runtime.host.capability_service")
|
||||
|
||||
# 能力实现函数类型: (plugin_id, capability, args) -> result
|
||||
CapabilityImpl = Callable[[str, str, dict[str, Any]], Awaitable[Any]]
|
||||
|
||||
|
||||
class CapabilityService:
|
||||
"""能力服务
|
||||
|
||||
负责:
|
||||
1. 注册能力实现
|
||||
2. 接收插件的能力调用请求
|
||||
3. 通过策略引擎校验权限和限流
|
||||
4. 执行实际操作并返回结果
|
||||
"""
|
||||
|
||||
def __init__(self, policy_engine: PolicyEngine):
|
||||
self._policy = policy_engine
|
||||
# capability_name -> implementation
|
||||
self._implementations: dict[str, CapabilityImpl] = {}
|
||||
|
||||
def register_capability(self, name: str, impl: CapabilityImpl) -> None:
|
||||
"""注册一个能力实现
|
||||
|
||||
Args:
|
||||
name: 能力名称,如 "send.text", "db.query", "llm.generate"
|
||||
impl: 实现函数
|
||||
"""
|
||||
self._implementations[name] = impl
|
||||
logger.debug(f"注册能力实现: {name}")
|
||||
|
||||
async def handle_capability_request(self, envelope: Envelope) -> Envelope:
|
||||
"""处理能力调用请求(作为 RPC Server 的 method handler)
|
||||
|
||||
从 envelope 中提取 capability 名称和参数,
|
||||
校验权限后调用对应实现。
|
||||
"""
|
||||
plugin_id = envelope.plugin_id
|
||||
|
||||
try:
|
||||
req = CapabilityRequestPayload.model_validate(envelope.payload)
|
||||
except Exception as e:
|
||||
return envelope.make_error_response(
|
||||
ErrorCode.E_BAD_PAYLOAD.value,
|
||||
f"能力调用 payload 格式错误: {e}",
|
||||
)
|
||||
|
||||
capability = req.capability
|
||||
|
||||
# 1. 权限校验
|
||||
allowed, reason = self._policy.check_capability(plugin_id, capability)
|
||||
if not allowed:
|
||||
return envelope.make_error_response(
|
||||
ErrorCode.E_CAPABILITY_DENIED.value,
|
||||
reason,
|
||||
)
|
||||
|
||||
# 2. 限流校验
|
||||
allowed, reason = self._policy.check_rate_limit(plugin_id)
|
||||
if not allowed:
|
||||
return envelope.make_error_response(
|
||||
ErrorCode.E_BACKPRESSURE.value,
|
||||
reason,
|
||||
)
|
||||
|
||||
# 3. 查找实现
|
||||
impl = self._implementations.get(capability)
|
||||
if impl is None:
|
||||
return envelope.make_error_response(
|
||||
ErrorCode.E_METHOD_NOT_ALLOWED.value,
|
||||
f"未注册的能力: {capability}",
|
||||
)
|
||||
|
||||
# 4. 执行
|
||||
try:
|
||||
result = await impl(plugin_id, capability, req.args)
|
||||
resp_payload = CapabilityResponsePayload(success=True, result=result)
|
||||
return envelope.make_response(payload=resp_payload.model_dump())
|
||||
except RPCError as e:
|
||||
return envelope.make_error_response(e.code.value, e.message, e.details)
|
||||
except Exception as e:
|
||||
logger.error(f"能力 {capability} 执行异常: {e}", exc_info=True)
|
||||
return envelope.make_error_response(
|
||||
ErrorCode.E_CAPABILITY_FAILED.value,
|
||||
str(e),
|
||||
)
|
||||
|
||||
def list_capabilities(self) -> list[str]:
|
||||
"""列出所有已注册的能力"""
|
||||
return list(self._implementations.keys())
|
||||
105
src/plugin_runtime/host/circuit_breaker.py
Normal file
105
src/plugin_runtime/host/circuit_breaker.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""熔断器
|
||||
|
||||
为每个插件提供熔断保护,连续失败超过阈值后临时禁用。
|
||||
支持指数退避恢复。
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
import time
|
||||
|
||||
|
||||
class CircuitState(str, Enum):
|
||||
CLOSED = "closed" # 正常工作
|
||||
OPEN = "open" # 熔断(拒绝所有调用)
|
||||
HALF_OPEN = "half_open" # 探测恢复
|
||||
|
||||
|
||||
class CircuitBreaker:
|
||||
"""单个插件的熔断器"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
failure_threshold: int = 5,
|
||||
recovery_timeout_sec: float = 30.0,
|
||||
max_recovery_timeout_sec: float = 300.0,
|
||||
):
|
||||
self.failure_threshold = failure_threshold
|
||||
self.base_recovery_timeout = recovery_timeout_sec
|
||||
self.max_recovery_timeout = max_recovery_timeout_sec
|
||||
|
||||
self._state = CircuitState.CLOSED
|
||||
self._failure_count = 0
|
||||
self._last_failure_time = 0.0
|
||||
self._consecutive_opens = 0 # 用于指数退避
|
||||
|
||||
@property
|
||||
def state(self) -> CircuitState:
|
||||
if self._state == CircuitState.OPEN:
|
||||
# 检查是否可以进入半开状态
|
||||
elapsed = time.monotonic() - self._last_failure_time
|
||||
recovery_timeout = min(
|
||||
self.base_recovery_timeout * (2 ** self._consecutive_opens),
|
||||
self.max_recovery_timeout,
|
||||
)
|
||||
if elapsed >= recovery_timeout:
|
||||
self._state = CircuitState.HALF_OPEN
|
||||
return self._state
|
||||
|
||||
def allow_request(self) -> bool:
|
||||
"""是否允许通过请求"""
|
||||
state = self.state
|
||||
if state == CircuitState.CLOSED:
|
||||
return True
|
||||
if state == CircuitState.HALF_OPEN:
|
||||
return True # 允许一次试探
|
||||
return False # OPEN 状态拒绝
|
||||
|
||||
def record_success(self) -> None:
|
||||
"""记录一次成功调用"""
|
||||
if self._state == CircuitState.HALF_OPEN:
|
||||
# 半开状态成功 -> 关闭熔断
|
||||
self._state = CircuitState.CLOSED
|
||||
self._failure_count = 0
|
||||
self._consecutive_opens = 0
|
||||
elif self._state == CircuitState.CLOSED:
|
||||
self._failure_count = 0
|
||||
|
||||
def record_failure(self) -> None:
|
||||
"""记录一次失败调用"""
|
||||
self._failure_count += 1
|
||||
self._last_failure_time = time.monotonic()
|
||||
|
||||
if self._state == CircuitState.HALF_OPEN:
|
||||
# 半开状态失败 -> 重新开启熔断
|
||||
self._state = CircuitState.OPEN
|
||||
self._consecutive_opens += 1
|
||||
elif self._failure_count >= self.failure_threshold:
|
||||
self._state = CircuitState.OPEN
|
||||
self._consecutive_opens += 1
|
||||
|
||||
def reset(self) -> None:
|
||||
"""重置熔断器"""
|
||||
self._state = CircuitState.CLOSED
|
||||
self._failure_count = 0
|
||||
self._consecutive_opens = 0
|
||||
|
||||
|
||||
class CircuitBreakerRegistry:
|
||||
"""熔断器注册表,为每个插件维护独立的熔断器"""
|
||||
|
||||
def __init__(self, **default_kwargs):
|
||||
self._breakers: dict[str, CircuitBreaker] = {}
|
||||
self._default_kwargs = default_kwargs
|
||||
|
||||
def get(self, plugin_id: str) -> CircuitBreaker:
|
||||
if plugin_id not in self._breakers:
|
||||
self._breakers[plugin_id] = CircuitBreaker(**self._default_kwargs)
|
||||
return self._breakers[plugin_id]
|
||||
|
||||
def remove(self, plugin_id: str) -> None:
|
||||
self._breakers.pop(plugin_id, None)
|
||||
|
||||
def reset_all(self) -> None:
|
||||
for breaker in self._breakers.values():
|
||||
breaker.reset()
|
||||
125
src/plugin_runtime/host/policy_engine.py
Normal file
125
src/plugin_runtime/host/policy_engine.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""策略引擎
|
||||
|
||||
负责能力授权校验、限流、配额管理。
|
||||
每个插件在 manifest 中声明能力需求,Host 启动时签发能力令牌。
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import time
|
||||
|
||||
|
||||
@dataclass
|
||||
class CapabilityToken:
|
||||
"""能力令牌
|
||||
|
||||
描述某个插件在当前会话中被授予的能力和资源限制。
|
||||
"""
|
||||
plugin_id: str
|
||||
generation: int
|
||||
capabilities: set[str] = field(default_factory=set)
|
||||
qps_limit: int = 20
|
||||
burst_limit: int = 50
|
||||
daily_token_limit: int = 200000
|
||||
max_payload_kb: int = 256
|
||||
|
||||
# 运行时统计
|
||||
_call_count: int = field(default=0, init=False, repr=False)
|
||||
_window_start: float = field(default_factory=time.monotonic, init=False, repr=False)
|
||||
_window_calls: int = field(default=0, init=False, repr=False)
|
||||
|
||||
|
||||
class PolicyEngine:
|
||||
"""策略引擎
|
||||
|
||||
管理所有插件的能力令牌,提供授权校验与限流决策。
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# plugin_id -> CapabilityToken
|
||||
self._tokens: dict[str, CapabilityToken] = {}
|
||||
|
||||
def register_plugin(
|
||||
self,
|
||||
plugin_id: str,
|
||||
generation: int,
|
||||
capabilities: list[str],
|
||||
limits: dict | None = None,
|
||||
) -> CapabilityToken:
|
||||
"""为插件签发能力令牌"""
|
||||
limits = limits or {}
|
||||
token = CapabilityToken(
|
||||
plugin_id=plugin_id,
|
||||
generation=generation,
|
||||
capabilities=set(capabilities),
|
||||
qps_limit=limits.get("qps", 20),
|
||||
burst_limit=limits.get("burst", 50),
|
||||
daily_token_limit=limits.get("daily_tokens", 200000),
|
||||
max_payload_kb=limits.get("max_payload_kb", 256),
|
||||
)
|
||||
self._tokens[plugin_id] = token
|
||||
return token
|
||||
|
||||
def revoke_plugin(self, plugin_id: str) -> None:
|
||||
"""撤销插件的能力令牌"""
|
||||
self._tokens.pop(plugin_id, None)
|
||||
|
||||
def check_capability(self, plugin_id: str, capability: str) -> tuple[bool, str]:
|
||||
"""检查插件是否有权调用某项能力
|
||||
|
||||
Returns:
|
||||
(allowed, reason)
|
||||
"""
|
||||
token = self._tokens.get(plugin_id)
|
||||
if token is None:
|
||||
return False, f"插件 {plugin_id} 未注册能力令牌"
|
||||
|
||||
if capability not in token.capabilities:
|
||||
return False, f"插件 {plugin_id} 未获授权能力: {capability}"
|
||||
|
||||
return True, ""
|
||||
|
||||
def check_rate_limit(self, plugin_id: str) -> tuple[bool, str]:
|
||||
"""检查插件是否超过调用频率限制(滑动窗口)
|
||||
|
||||
Returns:
|
||||
(allowed, reason)
|
||||
"""
|
||||
token = self._tokens.get(plugin_id)
|
||||
if token is None:
|
||||
return False, f"插件 {plugin_id} 未注册"
|
||||
|
||||
now = time.monotonic()
|
||||
elapsed = now - token._window_start
|
||||
|
||||
# 每秒重置窗口
|
||||
if elapsed >= 1.0:
|
||||
token._window_start = now
|
||||
token._window_calls = 0
|
||||
|
||||
token._window_calls += 1
|
||||
|
||||
if token._window_calls > token.burst_limit:
|
||||
return False, f"插件 {plugin_id} 超过突发限制 ({token.burst_limit}/s)"
|
||||
|
||||
return True, ""
|
||||
|
||||
def check_payload_size(self, plugin_id: str, payload_size_bytes: int) -> tuple[bool, str]:
|
||||
"""检查 payload 大小是否在限制内"""
|
||||
token = self._tokens.get(plugin_id)
|
||||
if token is None:
|
||||
return False, f"插件 {plugin_id} 未注册"
|
||||
|
||||
max_bytes = token.max_payload_kb * 1024
|
||||
if payload_size_bytes > max_bytes:
|
||||
return False, f"payload 大小 {payload_size_bytes} 超过限制 {max_bytes}"
|
||||
|
||||
return True, ""
|
||||
|
||||
def get_token(self, plugin_id: str) -> CapabilityToken | None:
|
||||
"""获取插件的能力令牌"""
|
||||
return self._tokens.get(plugin_id)
|
||||
|
||||
def list_plugins(self) -> list[str]:
|
||||
"""列出所有已注册的插件"""
|
||||
return list(self._tokens.keys())
|
||||
357
src/plugin_runtime/host/rpc_server.py
Normal file
357
src/plugin_runtime/host/rpc_server.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""Host 端 RPC Server
|
||||
|
||||
负责:
|
||||
1. 监听 Runner 连接
|
||||
2. 处理握手(runner.hello)
|
||||
3. 分发调用请求给 Runner / 处理 Runner 的能力调用
|
||||
4. 请求-响应关联与超时管理
|
||||
"""
|
||||
|
||||
from typing import Any, Callable, Awaitable
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import secrets
|
||||
|
||||
from src.plugin_runtime.protocol.codec import Codec, create_codec
|
||||
from src.plugin_runtime.protocol.envelope import (
|
||||
PROTOCOL_VERSION,
|
||||
MIN_SDK_VERSION,
|
||||
MAX_SDK_VERSION,
|
||||
Envelope,
|
||||
HelloPayload,
|
||||
HelloResponsePayload,
|
||||
MessageType,
|
||||
RequestIdGenerator,
|
||||
)
|
||||
from src.plugin_runtime.protocol.errors import ErrorCode, RPCError
|
||||
from src.plugin_runtime.transport.base import Connection, TransportServer
|
||||
|
||||
logger = logging.getLogger("plugin_runtime.host.rpc_server")
|
||||
|
||||
# RPC 方法处理器类型
|
||||
MethodHandler = Callable[[Envelope], Awaitable[Envelope]]
|
||||
|
||||
|
||||
class RPCServer:
|
||||
"""Host 端 RPC 服务器
|
||||
|
||||
管理与 Runner 的 IPC 连接,处理双向 RPC 调用。
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
transport: TransportServer,
|
||||
session_token: str | None = None,
|
||||
codec: Codec | None = None,
|
||||
send_queue_size: int = 128,
|
||||
):
|
||||
self._transport = transport
|
||||
self._session_token = session_token or secrets.token_hex(32)
|
||||
self._codec = codec or create_codec()
|
||||
self._send_queue_size = send_queue_size
|
||||
|
||||
self._id_gen = RequestIdGenerator()
|
||||
self._connection: Connection | None = None # 当前活跃的 Runner 连接
|
||||
self._runner_id: str | None = None
|
||||
self._runner_generation: int = 0
|
||||
|
||||
# 方法处理器注册表
|
||||
self._method_handlers: dict[str, MethodHandler] = {}
|
||||
|
||||
# 等待响应的 pending 请求: request_id -> Future
|
||||
self._pending_requests: dict[int, asyncio.Future] = {}
|
||||
|
||||
# 发送队列(背压控制)
|
||||
self._send_queue: asyncio.Queue | None = None
|
||||
|
||||
# 运行状态
|
||||
self._running = False
|
||||
self._tasks: list[asyncio.Task] = []
|
||||
|
||||
@property
|
||||
def session_token(self) -> str:
|
||||
return self._session_token
|
||||
|
||||
@property
|
||||
def is_connected(self) -> bool:
|
||||
return self._connection is not None and not self._connection.is_closed
|
||||
|
||||
def register_method(self, method: str, handler: MethodHandler) -> None:
|
||||
"""注册 RPC 方法处理器"""
|
||||
self._method_handlers[method] = handler
|
||||
|
||||
async def start(self) -> None:
|
||||
"""启动 RPC 服务器"""
|
||||
self._running = True
|
||||
self._send_queue = asyncio.Queue(maxsize=self._send_queue_size)
|
||||
await self._transport.start(self._handle_connection)
|
||||
logger.info(f"RPC Server 已启动,监听地址: {self._transport.get_address()}")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""停止 RPC 服务器"""
|
||||
self._running = False
|
||||
|
||||
# 取消所有 pending 请求
|
||||
for req_id, future in self._pending_requests.items():
|
||||
if not future.done():
|
||||
future.set_exception(RPCError(ErrorCode.E_TIMEOUT, "服务器关闭"))
|
||||
self._pending_requests.clear()
|
||||
|
||||
# 取消后台任务
|
||||
for task in self._tasks:
|
||||
task.cancel()
|
||||
self._tasks.clear()
|
||||
|
||||
# 关闭连接
|
||||
if self._connection:
|
||||
await self._connection.close()
|
||||
self._connection = None
|
||||
|
||||
await self._transport.stop()
|
||||
logger.info("RPC Server 已停止")
|
||||
|
||||
async def send_request(
|
||||
self,
|
||||
method: str,
|
||||
plugin_id: str = "",
|
||||
payload: dict[str, Any] | None = None,
|
||||
timeout_ms: int = 30000,
|
||||
) -> Envelope:
|
||||
"""向 Runner 发送 RPC 请求并等待响应
|
||||
|
||||
Args:
|
||||
method: RPC 方法名
|
||||
plugin_id: 目标插件 ID
|
||||
payload: 请求数据
|
||||
timeout_ms: 超时时间(ms)
|
||||
|
||||
Returns:
|
||||
响应 Envelope
|
||||
|
||||
Raises:
|
||||
RPCError: 调用失败
|
||||
"""
|
||||
if not self.is_connected:
|
||||
raise RPCError(ErrorCode.E_PLUGIN_CRASHED, "Runner 未连接")
|
||||
|
||||
request_id = self._id_gen.next()
|
||||
envelope = Envelope(
|
||||
request_id=request_id,
|
||||
message_type=MessageType.REQUEST,
|
||||
method=method,
|
||||
plugin_id=plugin_id,
|
||||
generation=self._runner_generation,
|
||||
timeout_ms=timeout_ms,
|
||||
payload=payload or {},
|
||||
)
|
||||
|
||||
# 背压检查
|
||||
if self._send_queue and self._send_queue.full():
|
||||
raise RPCError(ErrorCode.E_BACKPRESSURE, "发送队列已满")
|
||||
|
||||
# 注册 pending future
|
||||
loop = asyncio.get_event_loop()
|
||||
future: asyncio.Future[Envelope] = loop.create_future()
|
||||
self._pending_requests[request_id] = future
|
||||
|
||||
try:
|
||||
# 发送请求
|
||||
data = self._codec.encode_envelope(envelope)
|
||||
await self._connection.send_frame(data)
|
||||
|
||||
# 等待响应
|
||||
timeout_sec = timeout_ms / 1000.0
|
||||
response = await asyncio.wait_for(future, timeout=timeout_sec)
|
||||
return response
|
||||
except asyncio.TimeoutError:
|
||||
self._pending_requests.pop(request_id, None)
|
||||
raise RPCError(ErrorCode.E_TIMEOUT, f"请求 {method} 超时 ({timeout_ms}ms)")
|
||||
except Exception as e:
|
||||
self._pending_requests.pop(request_id, None)
|
||||
if isinstance(e, RPCError):
|
||||
raise
|
||||
raise RPCError(ErrorCode.E_UNKNOWN, str(e))
|
||||
|
||||
async def send_event(self, method: str, plugin_id: str = "", payload: dict[str, Any] | None = None) -> None:
|
||||
"""向 Runner 发送单向事件(不等待响应)"""
|
||||
if not self.is_connected:
|
||||
return
|
||||
|
||||
request_id = self._id_gen.next()
|
||||
envelope = Envelope(
|
||||
request_id=request_id,
|
||||
message_type=MessageType.EVENT,
|
||||
method=method,
|
||||
plugin_id=plugin_id,
|
||||
generation=self._runner_generation,
|
||||
payload=payload or {},
|
||||
)
|
||||
data = self._codec.encode_envelope(envelope)
|
||||
await self._connection.send_frame(data)
|
||||
|
||||
# ─── 内部方法 ──────────────────────────────────────────────
|
||||
|
||||
async def _handle_connection(self, conn: Connection) -> None:
|
||||
"""处理新的 Runner 连接"""
|
||||
logger.info("收到 Runner 连接")
|
||||
|
||||
# 第一条消息必须是 runner.hello 握手
|
||||
try:
|
||||
handshake_ok = await self._handle_handshake(conn)
|
||||
if not handshake_ok:
|
||||
await conn.close()
|
||||
return
|
||||
except Exception as e:
|
||||
logger.error(f"握手失败: {e}")
|
||||
await conn.close()
|
||||
return
|
||||
|
||||
# 握手成功,保存连接
|
||||
self._connection = conn
|
||||
logger.info(f"Runner 握手成功: runner_id={self._runner_id}, generation={self._runner_generation}")
|
||||
|
||||
# 启动消息接收循环
|
||||
try:
|
||||
await self._recv_loop(conn)
|
||||
except Exception as e:
|
||||
logger.error(f"连接异常断开: {e}")
|
||||
finally:
|
||||
self._connection = None
|
||||
self._runner_id = None
|
||||
|
||||
async def _handle_handshake(self, conn: Connection) -> bool:
|
||||
"""处理 runner.hello 握手"""
|
||||
# 接收握手请求
|
||||
data = await asyncio.wait_for(conn.recv_frame(), timeout=10.0)
|
||||
envelope = self._codec.decode_envelope(data)
|
||||
|
||||
if envelope.method != "runner.hello":
|
||||
logger.error(f"期望 runner.hello,收到 {envelope.method}")
|
||||
error_resp = envelope.make_error_response(
|
||||
ErrorCode.E_PROTOCOL_MISMATCH.value,
|
||||
"首条消息必须为 runner.hello",
|
||||
)
|
||||
await conn.send_frame(self._codec.encode_envelope(error_resp))
|
||||
return False
|
||||
|
||||
# 解析握手 payload
|
||||
hello = HelloPayload.model_validate(envelope.payload)
|
||||
|
||||
# 校验会话令牌
|
||||
if hello.session_token != self._session_token:
|
||||
logger.error("会话令牌不匹配")
|
||||
resp_payload = HelloResponsePayload(
|
||||
accepted=False,
|
||||
reason="会话令牌无效",
|
||||
)
|
||||
resp = envelope.make_response(payload=resp_payload.model_dump())
|
||||
await conn.send_frame(self._codec.encode_envelope(resp))
|
||||
return False
|
||||
|
||||
# 校验 SDK 版本
|
||||
if not self._check_sdk_version(hello.sdk_version):
|
||||
logger.error(f"SDK 版本不兼容: {hello.sdk_version}")
|
||||
resp_payload = HelloResponsePayload(
|
||||
accepted=False,
|
||||
reason=f"SDK 版本 {hello.sdk_version} 不在支持范围 [{MIN_SDK_VERSION}, {MAX_SDK_VERSION}]",
|
||||
)
|
||||
resp = envelope.make_response(payload=resp_payload.model_dump())
|
||||
await conn.send_frame(self._codec.encode_envelope(resp))
|
||||
return False
|
||||
|
||||
# 握手成功
|
||||
self._runner_id = hello.runner_id
|
||||
self._runner_generation += 1
|
||||
|
||||
resp_payload = HelloResponsePayload(
|
||||
accepted=True,
|
||||
host_version=PROTOCOL_VERSION,
|
||||
assigned_generation=self._runner_generation,
|
||||
)
|
||||
resp = envelope.make_response(payload=resp_payload.model_dump())
|
||||
await conn.send_frame(self._codec.encode_envelope(resp))
|
||||
|
||||
return True
|
||||
|
||||
async def _recv_loop(self, conn: Connection) -> None:
|
||||
"""消息接收主循环"""
|
||||
while self._running and not conn.is_closed:
|
||||
try:
|
||||
data = await conn.recv_frame()
|
||||
except (asyncio.IncompleteReadError, ConnectionError):
|
||||
logger.info("Runner 连接已断开")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"接收帧失败: {e}")
|
||||
break
|
||||
|
||||
try:
|
||||
envelope = self._codec.decode_envelope(data)
|
||||
except Exception as e:
|
||||
logger.error(f"解码消息失败: {e}")
|
||||
continue
|
||||
|
||||
# 分发消息
|
||||
if envelope.is_response():
|
||||
self._handle_response(envelope)
|
||||
elif envelope.is_request():
|
||||
# 异步处理请求(Runner 发来的能力调用)
|
||||
task = asyncio.create_task(self._handle_request(envelope, conn))
|
||||
self._tasks.append(task)
|
||||
task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
|
||||
elif envelope.is_event():
|
||||
task = asyncio.create_task(self._handle_event(envelope))
|
||||
self._tasks.append(task)
|
||||
task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None)
|
||||
|
||||
def _handle_response(self, envelope: Envelope) -> None:
|
||||
"""处理来自 Runner 的响应"""
|
||||
future = self._pending_requests.pop(envelope.request_id, None)
|
||||
if future and not future.done():
|
||||
if envelope.error:
|
||||
future.set_exception(RPCError.from_dict(envelope.error))
|
||||
else:
|
||||
future.set_result(envelope)
|
||||
|
||||
async def _handle_request(self, envelope: Envelope, conn: Connection) -> None:
|
||||
"""处理来自 Runner 的请求(通常是能力调用 cap.*)"""
|
||||
handler = self._method_handlers.get(envelope.method)
|
||||
if handler is None:
|
||||
error_resp = envelope.make_error_response(
|
||||
ErrorCode.E_METHOD_NOT_ALLOWED.value,
|
||||
f"未注册的方法: {envelope.method}",
|
||||
)
|
||||
await conn.send_frame(self._codec.encode_envelope(error_resp))
|
||||
return
|
||||
|
||||
try:
|
||||
response = await handler(envelope)
|
||||
await conn.send_frame(self._codec.encode_envelope(response))
|
||||
except RPCError as e:
|
||||
error_resp = envelope.make_error_response(e.code.value, e.message, e.details)
|
||||
await conn.send_frame(self._codec.encode_envelope(error_resp))
|
||||
except Exception as e:
|
||||
logger.error(f"处理请求 {envelope.method} 异常: {e}", exc_info=True)
|
||||
error_resp = envelope.make_error_response(ErrorCode.E_UNKNOWN.value, str(e))
|
||||
await conn.send_frame(self._codec.encode_envelope(error_resp))
|
||||
|
||||
async def _handle_event(self, envelope: Envelope) -> None:
|
||||
"""处理来自 Runner 的事件"""
|
||||
handler = self._method_handlers.get(envelope.method)
|
||||
if handler:
|
||||
try:
|
||||
await handler(envelope)
|
||||
except Exception as e:
|
||||
logger.error(f"处理事件 {envelope.method} 异常: {e}", exc_info=True)
|
||||
|
||||
@staticmethod
|
||||
def _check_sdk_version(sdk_version: str) -> bool:
|
||||
"""检查 SDK 版本是否在支持范围内"""
|
||||
try:
|
||||
sdk_parts = [int(x) for x in sdk_version.split(".")]
|
||||
min_parts = [int(x) for x in MIN_SDK_VERSION.split(".")]
|
||||
max_parts = [int(x) for x in MAX_SDK_VERSION.split(".")]
|
||||
return min_parts <= sdk_parts <= max_parts
|
||||
except (ValueError, AttributeError):
|
||||
return False
|
||||
315
src/plugin_runtime/host/supervisor.py
Normal file
315
src/plugin_runtime/host/supervisor.py
Normal file
@@ -0,0 +1,315 @@
|
||||
"""Supervisor - 插件生命周期管理
|
||||
|
||||
负责:
|
||||
1. 拉起 Runner 子进程
|
||||
2. 健康检查
|
||||
3. 熔断与恢复
|
||||
4. 代码热重载(generation 切换)
|
||||
5. 优雅关停
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from src.plugin_runtime.host.capability_service import CapabilityService
|
||||
from src.plugin_runtime.host.circuit_breaker import CircuitBreakerRegistry
|
||||
from src.plugin_runtime.host.policy_engine import PolicyEngine
|
||||
from src.plugin_runtime.host.rpc_server import RPCServer
|
||||
from src.plugin_runtime.protocol.envelope import (
|
||||
Envelope,
|
||||
HealthPayload,
|
||||
RegisterComponentsPayload,
|
||||
ShutdownPayload,
|
||||
)
|
||||
from src.plugin_runtime.protocol.errors import ErrorCode, RPCError
|
||||
from src.plugin_runtime.transport.factory import create_transport_server
|
||||
|
||||
logger = logging.getLogger("plugin_runtime.host.supervisor")
|
||||
|
||||
|
||||
class PluginSupervisor:
|
||||
"""插件 Supervisor
|
||||
|
||||
Host 端的核心管理器,负责整个插件 Runner 进程的生命周期。
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
plugin_dirs: list[str] | None = None,
|
||||
socket_path: str | None = None,
|
||||
health_check_interval_sec: float = 30.0,
|
||||
use_json_codec: bool = False,
|
||||
):
|
||||
self._plugin_dirs = plugin_dirs or []
|
||||
self._health_interval = health_check_interval_sec
|
||||
|
||||
# 基础设施
|
||||
self._transport = create_transport_server(socket_path=socket_path)
|
||||
self._policy = PolicyEngine()
|
||||
self._breakers = CircuitBreakerRegistry()
|
||||
self._capability_service = CapabilityService(self._policy)
|
||||
|
||||
# 编解码
|
||||
from src.plugin_runtime.protocol.codec import create_codec
|
||||
codec = create_codec(use_json=use_json_codec)
|
||||
|
||||
self._rpc_server = RPCServer(
|
||||
transport=self._transport,
|
||||
codec=codec,
|
||||
)
|
||||
|
||||
# Runner 子进程
|
||||
self._runner_process: asyncio.subprocess.Process | None = None
|
||||
self._runner_generation: int = 0
|
||||
|
||||
# 已注册的插件组件信息
|
||||
self._registered_plugins: dict[str, RegisterComponentsPayload] = {}
|
||||
|
||||
# 后台任务
|
||||
self._health_task: asyncio.Task | None = None
|
||||
self._running = False
|
||||
|
||||
# 注册内部 RPC 方法
|
||||
self._register_internal_methods()
|
||||
|
||||
@property
|
||||
def policy_engine(self) -> PolicyEngine:
|
||||
return self._policy
|
||||
|
||||
@property
|
||||
def capability_service(self) -> CapabilityService:
|
||||
return self._capability_service
|
||||
|
||||
@property
|
||||
def rpc_server(self) -> RPCServer:
|
||||
return self._rpc_server
|
||||
|
||||
async def start(self) -> None:
|
||||
"""启动 Supervisor
|
||||
|
||||
1. 启动 RPC Server
|
||||
2. 拉起 Runner 子进程
|
||||
3. 启动健康检查
|
||||
"""
|
||||
self._running = True
|
||||
|
||||
# 启动 RPC Server
|
||||
await self._rpc_server.start()
|
||||
|
||||
# 拉起 Runner 进程
|
||||
await self._spawn_runner()
|
||||
|
||||
# 启动健康检查
|
||||
self._health_task = asyncio.create_task(self._health_check_loop())
|
||||
|
||||
logger.info("PluginSupervisor 已启动")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""停止 Supervisor"""
|
||||
self._running = False
|
||||
|
||||
# 停止健康检查
|
||||
if self._health_task:
|
||||
self._health_task.cancel()
|
||||
self._health_task = None
|
||||
|
||||
# 优雅关停 Runner
|
||||
await self._shutdown_runner()
|
||||
|
||||
# 停止 RPC Server
|
||||
await self._rpc_server.stop()
|
||||
|
||||
logger.info("PluginSupervisor 已停止")
|
||||
|
||||
async def invoke_plugin(
|
||||
self,
|
||||
method: str,
|
||||
plugin_id: str,
|
||||
component_name: str,
|
||||
args: dict[str, Any] | None = None,
|
||||
timeout_ms: int = 30000,
|
||||
) -> Envelope:
|
||||
"""调用插件组件
|
||||
|
||||
由主进程业务逻辑调用,通过 RPC 转发给 Runner。
|
||||
"""
|
||||
# 熔断检查
|
||||
breaker = self._breakers.get(plugin_id)
|
||||
if not breaker.allow_request():
|
||||
raise RPCError(ErrorCode.E_PLUGIN_CRASHED, f"插件 {plugin_id} 已被熔断")
|
||||
|
||||
try:
|
||||
response = await self._rpc_server.send_request(
|
||||
method=method,
|
||||
plugin_id=plugin_id,
|
||||
payload={
|
||||
"component_name": component_name,
|
||||
"args": args or {},
|
||||
},
|
||||
timeout_ms=timeout_ms,
|
||||
)
|
||||
breaker.record_success()
|
||||
return response
|
||||
except RPCError:
|
||||
breaker.record_failure()
|
||||
raise
|
||||
|
||||
async def reload_plugins(self, reason: str = "manual") -> None:
|
||||
"""热重载所有插件(进程级 generation 切换)
|
||||
|
||||
1. 拉起新 Runner
|
||||
2. 等待新 Runner 完成注册和健康检查
|
||||
3. 关停旧 Runner
|
||||
"""
|
||||
logger.info(f"开始热重载插件,原因: {reason}")
|
||||
|
||||
# 保存旧进程引用
|
||||
old_process = self._runner_process
|
||||
|
||||
# 拉起新 Runner
|
||||
await self._spawn_runner()
|
||||
|
||||
# 等待新 Runner 连接并完成握手
|
||||
for _ in range(30): # 最多等待 30 秒
|
||||
if self._rpc_server.is_connected:
|
||||
break
|
||||
await asyncio.sleep(1.0)
|
||||
else:
|
||||
logger.error("新 Runner 连接超时,回滚")
|
||||
# 回滚:终止新进程
|
||||
if self._runner_process and self._runner_process != old_process:
|
||||
self._runner_process.terminate()
|
||||
self._runner_process = old_process
|
||||
return
|
||||
|
||||
# 健康检查
|
||||
try:
|
||||
resp = await self._rpc_server.send_request("plugin.health", timeout_ms=5000)
|
||||
health = HealthPayload.model_validate(resp.payload)
|
||||
if not health.healthy:
|
||||
raise RPCError(ErrorCode.E_PLUGIN_CRASHED, "新 Runner 健康检查失败")
|
||||
except Exception as e:
|
||||
logger.error(f"新 Runner 健康检查失败: {e},回滚")
|
||||
if self._runner_process and self._runner_process != old_process:
|
||||
self._runner_process.terminate()
|
||||
self._runner_process = old_process
|
||||
return
|
||||
|
||||
# 关停旧 Runner
|
||||
if old_process and old_process.returncode is None:
|
||||
try:
|
||||
old_process.terminate()
|
||||
await asyncio.wait_for(old_process.wait(), timeout=10.0)
|
||||
except asyncio.TimeoutError:
|
||||
old_process.kill()
|
||||
|
||||
logger.info("热重载完成")
|
||||
|
||||
# ─── 内部方法 ──────────────────────────────────────────────
|
||||
|
||||
def _register_internal_methods(self) -> None:
|
||||
"""注册 Host 端的 RPC 方法处理器"""
|
||||
# Runner -> Host 的能力调用统一走 capability_service
|
||||
self._rpc_server.register_method("cap.request", self._capability_service.handle_capability_request)
|
||||
# 插件注册
|
||||
self._rpc_server.register_method("plugin.register_components", self._handle_register_components)
|
||||
|
||||
async def _handle_register_components(self, envelope: Envelope) -> Envelope:
|
||||
"""处理插件组件注册请求"""
|
||||
try:
|
||||
reg = RegisterComponentsPayload.model_validate(envelope.payload)
|
||||
except Exception as e:
|
||||
return envelope.make_error_response(ErrorCode.E_BAD_PAYLOAD.value, str(e))
|
||||
|
||||
# 记录注册信息
|
||||
self._registered_plugins[reg.plugin_id] = reg
|
||||
|
||||
# 在策略引擎中注册插件
|
||||
self._policy.register_plugin(
|
||||
plugin_id=reg.plugin_id,
|
||||
generation=envelope.generation,
|
||||
capabilities=reg.capabilities_required,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"插件 {reg.plugin_id} v{reg.plugin_version} 注册成功,"
|
||||
f"组件数: {len(reg.components)}, 能力需求: {reg.capabilities_required}"
|
||||
)
|
||||
|
||||
return envelope.make_response(payload={"accepted": True})
|
||||
|
||||
async def _spawn_runner(self) -> None:
|
||||
"""拉起 Runner 子进程"""
|
||||
runner_module = "src.plugin_runtime.runner.runner_main"
|
||||
address = self._transport.get_address()
|
||||
token = self._rpc_server.session_token
|
||||
|
||||
env = os.environ.copy()
|
||||
env["MAIBOT_IPC_ADDRESS"] = address
|
||||
env["MAIBOT_SESSION_TOKEN"] = token
|
||||
env["MAIBOT_PLUGIN_DIRS"] = os.pathsep.join(self._plugin_dirs)
|
||||
|
||||
self._runner_process = await asyncio.create_subprocess_exec(
|
||||
sys.executable, "-m", runner_module,
|
||||
env=env,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
|
||||
self._runner_generation += 1
|
||||
logger.info(f"Runner 子进程已启动: pid={self._runner_process.pid}, generation={self._runner_generation}")
|
||||
|
||||
async def _shutdown_runner(self) -> None:
|
||||
"""优雅关停 Runner"""
|
||||
if not self._runner_process or self._runner_process.returncode is not None:
|
||||
return
|
||||
|
||||
# 发送 prepare_shutdown
|
||||
try:
|
||||
if self._rpc_server.is_connected:
|
||||
shutdown_payload = ShutdownPayload(reason="host_shutdown", drain_timeout_ms=5000)
|
||||
await self._rpc_server.send_request(
|
||||
"plugin.prepare_shutdown",
|
||||
payload=shutdown_payload.model_dump(),
|
||||
timeout_ms=5000,
|
||||
)
|
||||
await self._rpc_server.send_request(
|
||||
"plugin.shutdown",
|
||||
payload=shutdown_payload.model_dump(),
|
||||
timeout_ms=5000,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"发送关停命令失败: {e}")
|
||||
|
||||
# 等待进程退出
|
||||
try:
|
||||
await asyncio.wait_for(self._runner_process.wait(), timeout=10.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Runner 未在超时内退出,强制终止")
|
||||
self._runner_process.kill()
|
||||
await self._runner_process.wait()
|
||||
|
||||
async def _health_check_loop(self) -> None:
|
||||
"""周期性健康检查"""
|
||||
while self._running:
|
||||
await asyncio.sleep(self._health_interval)
|
||||
|
||||
if not self._rpc_server.is_connected:
|
||||
logger.warning("Runner 未连接,跳过健康检查")
|
||||
continue
|
||||
|
||||
try:
|
||||
resp = await self._rpc_server.send_request("plugin.health", timeout_ms=5000)
|
||||
health = HealthPayload.model_validate(resp.payload)
|
||||
if not health.healthy:
|
||||
logger.warning(f"Runner 健康检查异常: {health}")
|
||||
except RPCError as e:
|
||||
logger.error(f"健康检查失败: {e}")
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"健康检查异常: {e}")
|
||||
Reference in New Issue
Block a user