后端: 1. 服务级 outbox 基础设施全量落地——新增 service route / service catalog / route registry,重构 outbox engine、repository、event bus 和 model,按 `event_type -> service -> table/topic/group` 统一写入与投递,保留 `agent` 兼容壳但不再依赖共享 outbox 2. Kafka 投递、消费与启动装配同步切换——更新 kafka config、consumer、envelope,接入服务级 topic 与 consumer group,并同步调整 mysql 初始化、start/main/router 装配,保证各服务 relay / consumer 独立装配 3. 业务事件处理器按服务归属重接新 bus——`active-scheduler` 触发链路,以及 `agent` / `memory` / `notification` / `task` 相关 outbox handler 统一切到新路由注册与服务目录,避免新流量回流共享表 4. 同步更新《微服务四步迁移与第二阶段并行开发计划》,把阶段 1 改成当前基线并补齐结构图、阶段快照、风险回退和多代理执行口径
376 lines
12 KiB
Go
package service
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
"strings"
|
||
"time"
|
||
|
||
activegraph "github.com/LoveLosita/smartflow/backend/active_scheduler/graph"
|
||
activepreview "github.com/LoveLosita/smartflow/backend/active_scheduler/preview"
|
||
"github.com/LoveLosita/smartflow/backend/active_scheduler/trigger"
|
||
"github.com/LoveLosita/smartflow/backend/dao"
|
||
kafkabus "github.com/LoveLosita/smartflow/backend/infra/kafka"
|
||
outboxinfra "github.com/LoveLosita/smartflow/backend/infra/outbox"
|
||
"github.com/LoveLosita/smartflow/backend/model"
|
||
sharedevents "github.com/LoveLosita/smartflow/backend/shared/events"
|
||
"github.com/google/uuid"
|
||
"gorm.io/gorm"
|
||
"gorm.io/gorm/clause"
|
||
)
|
||
|
||
const (
	// triggerErrorCodePayloadMismatch marks triggers whose event payload
	// disagrees with the persisted trigger row (see buildPayloadMismatchReason).
	triggerErrorCodePayloadMismatch = "payload_mismatch"
	// triggerErrorCodeWorkerFailed is the catch-all code recorded when the
	// background worker fails while processing a trigger.
	triggerErrorCodeWorkerFailed = "worker_failed"
)
|
||
|
||
// TriggerWorkflowService orchestrates the phase-four
// trigger -> dry-run -> preview -> notification pipeline.
//
// Responsibility boundaries:
//  1. It only advances the background state machine of active-schedule
//     triggers; it does not start the outbox worker.
//  2. Dry-run and candidate selection reuse the standalone active_scheduler
//     modules instead of pushing active-scheduling logic into newAgent.
//  3. Notification only publishes "requested" events; it never talks to the
//     real Feishu provider directly.
type TriggerWorkflowService struct {
	activeDAO   *dao.ActiveScheduleDAO // persisted trigger/preview rows; required
	graphRunner *activegraph.Runner    // dry-run + constrained selection; required
	outbox      *outboxinfra.Repository // transactional event outbox; required
	kafkaCfg    kafkabus.Config        // delivery settings (MaxRetry is read here)
	// agentDAO / sessionDAO are not set by the constructors in this file;
	// presumably injected via TriggerWorkflowOption during migration — confirm.
	agentDAO   *dao.AgentDAO
	sessionDAO *dao.ActiveScheduleSessionDAO
	clock      func() time.Time // injectable clock for deterministic tests
}
|
||
|
||
// NewTriggerWorkflowService builds the trigger orchestration service from the
// required dependencies only. See NewTriggerWorkflowServiceWithOptions for
// migration-period optional capabilities.
func NewTriggerWorkflowService(
	activeDAO *dao.ActiveScheduleDAO,
	graphRunner *activegraph.Runner,
	outboxRepo *outboxinfra.Repository,
	kafkaCfg kafkabus.Config,
) (*TriggerWorkflowService, error) {
	// Delegate so dependency validation lives in exactly one place.
	return NewTriggerWorkflowServiceWithOptions(activeDAO, graphRunner, outboxRepo, kafkaCfg)
}
|
||
|
||
// NewTriggerWorkflowServiceWithOptions 创建主动调度 trigger 编排服务,并允许注入迁移期可选能力。
|
||
func NewTriggerWorkflowServiceWithOptions(
|
||
activeDAO *dao.ActiveScheduleDAO,
|
||
graphRunner *activegraph.Runner,
|
||
outboxRepo *outboxinfra.Repository,
|
||
kafkaCfg kafkabus.Config,
|
||
opts ...TriggerWorkflowOption,
|
||
) (*TriggerWorkflowService, error) {
|
||
if activeDAO == nil {
|
||
return nil, errors.New("active schedule dao 不能为空")
|
||
}
|
||
if graphRunner == nil {
|
||
return nil, errors.New("active scheduler graph runner 不能为空")
|
||
}
|
||
if outboxRepo == nil {
|
||
return nil, errors.New("outbox repository 不能为空")
|
||
}
|
||
svc := &TriggerWorkflowService{
|
||
activeDAO: activeDAO,
|
||
graphRunner: graphRunner,
|
||
outbox: outboxRepo,
|
||
kafkaCfg: kafkaCfg,
|
||
clock: time.Now,
|
||
}
|
||
for _, opt := range opts {
|
||
if opt != nil {
|
||
opt(svc)
|
||
}
|
||
}
|
||
return svc, nil
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) SetClock(clock func() time.Time) {
|
||
if s != nil && clock != nil {
|
||
s.clock = clock
|
||
}
|
||
}
|
||
|
||
// TriggerWorkflowOption injects an optional capability into the trigger
// orchestration service at construction time.
type TriggerWorkflowOption func(*TriggerWorkflowService)
|
||
|
||
// ProcessTriggeredInTx advances the trigger main chain inside the outbox
// consumer transaction.
//
// Step-by-step:
//  1. Lock the trigger row first so that, under concurrent workers, only one
//     transaction can advance a given trigger.
//  2. Flip the status to processing so troubleshooting can tell the message
//     has already been consumed.
//  3. Reuse the active scheduler graph for dry-run + constrained selection;
//     if a preview already exists, reuse it to avoid duplicate writes.
//  4. After the preview succeeds, write back the trigger status and enqueue
//     the notification.requested outbox event in the same transaction.
//  5. Any failure returns an error; the outer handler records the failed
//     state and the outbox retry takes over.
func (s *TriggerWorkflowService) ProcessTriggeredInTx(
	ctx context.Context,
	tx *gorm.DB,
	payload sharedevents.ActiveScheduleTriggeredPayload,
) error {
	// Guard against a half-constructed service (e.g. zero value).
	if s == nil || s.activeDAO == nil || s.graphRunner == nil || s.outbox == nil {
		return errors.New("trigger workflow service 未初始化")
	}
	if tx == nil {
		return errors.New("gorm tx 不能为空")
	}
	if err := payload.Validate(); err != nil {
		return err
	}

	// Snapshot the time once so every field written below is consistent.
	now := s.now()
	triggerRow, err := s.lockTrigger(ctx, tx, payload.TriggerID)
	if err != nil {
		return err
	}

	txDAO := s.activeDAO.WithTx(tx)
	// Already terminal (preview_generated/closed/skipped/rejected): idempotent no-op.
	if completed, err := s.tryFinishByTerminalStatus(ctx, txDAO, *triggerRow); err != nil || completed {
		return err
	}
	// Payload disagrees with the stored row: reject and stop here.
	if handled, err := s.tryRejectMismatchedPayload(ctx, txDAO, *triggerRow, payload, now); err != nil || handled {
		return err
	}

	// Mark processing and clear stale error fields from previous attempts.
	if err := txDAO.UpdateTriggerFields(ctx, triggerRow.ID, map[string]any{
		"status":          model.ActiveScheduleTriggerStatusProcessing,
		"processed_at":    &now,
		"last_error_code": nil,
		"last_error":      nil,
	}); err != nil {
		return err
	}

	// Reuse an existing preview if one was already written (e.g. a retry after
	// a partial failure) instead of creating a duplicate.
	existingPreview, err := txDAO.GetPreviewByTriggerID(ctx, triggerRow.ID)
	switch {
	case err == nil:
		return s.finishWithExistingPreview(ctx, txDAO, *triggerRow, *existingPreview, now)
	case errors.Is(err, gorm.ErrRecordNotFound):
		// No preview yet: fall through and create a new one.
	default:
		return err
	}

	// Run dry-run + constrained selection through the scheduler graph.
	domainTrigger := buildDomainTriggerFromModel(*triggerRow, payload)
	graphResult, err := s.graphRunner.Run(ctx, domainTrigger)
	if err != nil {
		return err
	}
	if graphResult == nil || graphResult.DryRunData == nil {
		return errors.New("active scheduler graph 返回空结果")
	}
	dryRunData := graphResult.DryRunData
	// Nothing to offer, or the decision says neither notify nor persist a
	// preview: close the trigger without producing a preview.
	if len(dryRunData.Candidates) == 0 {
		return s.markClosedWithoutPreview(ctx, txDAO, triggerRow.ID, now)
	}
	if !dryRunData.Observation.Decision.ShouldNotify && !dryRunData.Observation.Decision.ShouldWritePreview {
		return s.markClosedWithoutPreview(ctx, txDAO, triggerRow.ID, now)
	}

	// Persist the preview in the same transaction via the preview service.
	previewService, err := activepreview.NewService(txDAO)
	if err != nil {
		return err
	}
	previewResp, err := previewService.CreatePreview(ctx, activepreview.CreatePreviewRequest{
		ActiveContext:       dryRunData.Context,
		Observation:         dryRunData.Observation,
		Candidates:          dryRunData.Candidates,
		TriggerID:           triggerRow.ID,
		GeneratedAt:         now,
		SelectedCandidateID: graphResult.SelectionResult.SelectedCandidateID,
		ExplanationText:     graphResult.SelectionResult.ExplanationText,
		NotificationSummary: graphResult.SelectionResult.NotificationSummary,
		FallbackUsed:        graphResult.SelectionResult.FallbackUsed,
	})
	if err != nil {
		return err
	}

	// Record the successful preview on the trigger row.
	previewID := previewResp.Detail.PreviewID
	if err = txDAO.UpdateTriggerFields(ctx, triggerRow.ID, map[string]any{
		"status":          model.ActiveScheduleTriggerStatusPreviewGenerated,
		"preview_id":      &previewID,
		"completed_at":    &now,
		"last_error_code": nil,
		"last_error":      nil,
	}); err != nil {
		return err
	}

	// Preview persisted but no notification wanted: done.
	if !dryRunData.Observation.Decision.ShouldNotify {
		return nil
	}

	// 1. Before the offline notification goes out, pre-warm the assistant
	//    conversation and active-schedule session the user will land in.
	// 2. This commits in the same transaction as the preview / notification
	//    outbox rows, so we never end up in the broken state of "Feishu
	//    message delivered but the conversation is blank".
	if err := s.bootstrapActiveScheduleConversationInTx(ctx, tx, *triggerRow, previewResp.Detail, graphResult.SelectionResult, now); err != nil {
		return err
	}

	// Enqueue the notification.requested event via the transactional outbox.
	notificationPayload := BuildFeishuRequestedPayload(
		*triggerRow,
		previewID,
		previewResp.Detail.Notification,
		now,
	)
	return EnqueueNotificationFeishuRequestedInTx(ctx, s.outbox.WithTx(tx), s.kafkaCfg.MaxRetry, notificationPayload)
}
|
||
|
||
// MarkTriggerFailedBestEffort 在事务外补记 trigger failed 状态,供 outbox retry 前排障。
|
||
//
|
||
// 职责边界:
|
||
// 1. 只做 best-effort 状态回写,不能影响外层对原始错误的返回;
|
||
// 2. 不负责错误分类,当前统一记为 worker_failed;
|
||
// 3. 失败时静默返回,让真正的重试仍由 outbox 状态机负责。
|
||
func (s *TriggerWorkflowService) MarkTriggerFailedBestEffort(ctx context.Context, triggerID string, err error) {
|
||
if s == nil || s.activeDAO == nil || strings.TrimSpace(triggerID) == "" {
|
||
return
|
||
}
|
||
message := ""
|
||
if err != nil {
|
||
message = err.Error()
|
||
}
|
||
_ = s.activeDAO.UpdateTriggerFields(ctx, triggerID, map[string]any{
|
||
"status": model.ActiveScheduleTriggerStatusFailed,
|
||
"last_error_code": triggerErrorCodeWorkerFailed,
|
||
"last_error": &message,
|
||
})
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) lockTrigger(ctx context.Context, tx *gorm.DB, triggerID string) (*model.ActiveScheduleTrigger, error) {
|
||
var row model.ActiveScheduleTrigger
|
||
err := tx.WithContext(ctx).
|
||
Clauses(clause.Locking{Strength: "UPDATE"}).
|
||
Where("id = ?", triggerID).
|
||
First(&row).Error
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return &row, nil
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) tryFinishByTerminalStatus(
|
||
ctx context.Context,
|
||
txDAO *dao.ActiveScheduleDAO,
|
||
row model.ActiveScheduleTrigger,
|
||
) (bool, error) {
|
||
switch row.Status {
|
||
case model.ActiveScheduleTriggerStatusPreviewGenerated,
|
||
model.ActiveScheduleTriggerStatusClosed,
|
||
model.ActiveScheduleTriggerStatusSkipped,
|
||
model.ActiveScheduleTriggerStatusRejected:
|
||
return true, nil
|
||
case model.ActiveScheduleTriggerStatusPending,
|
||
model.ActiveScheduleTriggerStatusProcessing,
|
||
model.ActiveScheduleTriggerStatusFailed:
|
||
return false, nil
|
||
default:
|
||
// 1. 遇到未知状态时,不直接报错中断,而是继续按 processing 流程推进。
|
||
// 2. 这样可以兼容迁移期历史脏数据,避免单条异常阻塞整批消费。
|
||
// 3. 真实状态最终会被下面的 UpdateTriggerFields 覆盖为 processing。
|
||
return false, nil
|
||
}
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) tryRejectMismatchedPayload(
|
||
ctx context.Context,
|
||
txDAO *dao.ActiveScheduleDAO,
|
||
row model.ActiveScheduleTrigger,
|
||
payload sharedevents.ActiveScheduleTriggeredPayload,
|
||
now time.Time,
|
||
) (bool, error) {
|
||
mismatchReason := buildPayloadMismatchReason(row, payload)
|
||
if mismatchReason == "" {
|
||
return false, nil
|
||
}
|
||
if err := txDAO.UpdateTriggerFields(ctx, row.ID, map[string]any{
|
||
"status": model.ActiveScheduleTriggerStatusRejected,
|
||
"last_error_code": triggerErrorCodePayloadMismatch,
|
||
"last_error": &mismatchReason,
|
||
"completed_at": &now,
|
||
}); err != nil {
|
||
return false, err
|
||
}
|
||
return true, nil
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) finishWithExistingPreview(
|
||
ctx context.Context,
|
||
txDAO *dao.ActiveScheduleDAO,
|
||
triggerRow model.ActiveScheduleTrigger,
|
||
previewRow model.ActiveSchedulePreview,
|
||
now time.Time,
|
||
) error {
|
||
previewID := previewRow.ID
|
||
return txDAO.UpdateTriggerFields(ctx, triggerRow.ID, map[string]any{
|
||
"status": model.ActiveScheduleTriggerStatusPreviewGenerated,
|
||
"preview_id": &previewID,
|
||
"completed_at": &now,
|
||
"last_error_code": nil,
|
||
"last_error": nil,
|
||
})
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) markClosedWithoutPreview(
|
||
ctx context.Context,
|
||
txDAO *dao.ActiveScheduleDAO,
|
||
triggerID string,
|
||
now time.Time,
|
||
) error {
|
||
return txDAO.UpdateTriggerFields(ctx, triggerID, map[string]any{
|
||
"status": model.ActiveScheduleTriggerStatusClosed,
|
||
"completed_at": &now,
|
||
"last_error_code": nil,
|
||
"last_error": nil,
|
||
})
|
||
}
|
||
|
||
func (s *TriggerWorkflowService) now() time.Time {
|
||
if s == nil || s.clock == nil {
|
||
return time.Now()
|
||
}
|
||
return s.clock()
|
||
}
|
||
|
||
func buildDomainTriggerFromModel(
|
||
row model.ActiveScheduleTrigger,
|
||
payload sharedevents.ActiveScheduleTriggeredPayload,
|
||
) trigger.ActiveScheduleTrigger {
|
||
mockNow := row.MockNow
|
||
if mockNow == nil && payload.MockNow != nil {
|
||
mockNow = payload.MockNow
|
||
}
|
||
traceID := strings.TrimSpace(row.TraceID)
|
||
if traceID == "" {
|
||
traceID = strings.TrimSpace(payload.TraceID)
|
||
}
|
||
if traceID == "" {
|
||
traceID = "trace_active_trigger_" + uuid.NewString()
|
||
}
|
||
return trigger.ActiveScheduleTrigger{
|
||
TriggerID: row.ID,
|
||
UserID: row.UserID,
|
||
TriggerType: trigger.TriggerType(row.TriggerType),
|
||
Source: trigger.Source(row.Source),
|
||
TargetType: trigger.TargetType(row.TargetType),
|
||
TargetID: row.TargetID,
|
||
FeedbackID: row.FeedbackID,
|
||
IdempotencyKey: row.IdempotencyKey,
|
||
MockNow: mockNow,
|
||
IsMockTime: row.IsMockTime || payload.IsMockTime,
|
||
RequestedAt: row.RequestedAt,
|
||
TraceID: traceID,
|
||
}
|
||
}
|
||
|
||
func buildPayloadMismatchReason(row model.ActiveScheduleTrigger, payload sharedevents.ActiveScheduleTriggeredPayload) string {
|
||
switch {
|
||
case row.UserID != payload.UserID:
|
||
return fmt.Sprintf("trigger 事件 user_id 不一致: row=%d payload=%d", row.UserID, payload.UserID)
|
||
case row.TriggerType != payload.TriggerType:
|
||
return fmt.Sprintf("trigger 事件 trigger_type 不一致: row=%s payload=%s", row.TriggerType, payload.TriggerType)
|
||
case row.TargetType != payload.TargetType:
|
||
return fmt.Sprintf("trigger 事件 target_type 不一致: row=%s payload=%s", row.TargetType, payload.TargetType)
|
||
case row.TargetID != payload.TargetID:
|
||
return fmt.Sprintf("trigger 事件 target_id 不一致: row=%d payload=%d", row.TargetID, payload.TargetID)
|
||
default:
|
||
return ""
|
||
}
|
||
}
|