Version: 0.9.65.dev.260503
Backend: 1. Close out phases 1.5/1.6 for llm-service / rag-service: unify the model egress and the retrieval-infrastructure entry point, and retire the legacy backend/infra/llm and backend/infra/rag implementations; 2. Update the affected call chains and the microservice migration plan docs to match.
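The change itself is mechanical: every call site swaps the infrallm alias for backend/infra/llm to the llmservice alias for backend/services/llm, and the type and helper names (Client, GenerateOptions, ThinkingMode, StreamReader, ParseJSONObject) carry over unchanged. A minimal sketch of the before/after pattern, using only names that appear in the hunks below; the wrapper function routeOnce is illustrative, not code from this commit:

// Before: model calls went through the legacy infra package.
//   import infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
//   reader, err := client.Stream(ctx, messages, infrallm.GenerateOptions{...})
//
// After: backend/services/llm is the single model egress.
import (
    "context"

    llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
    "github.com/cloudwego/eino/schema"
)

// routeOnce shows the migrated call shape: same method, same options, new alias.
func routeOnce(ctx context.Context, client *llmservice.Client, messages []*schema.Message) error {
    reader, err := client.Stream(ctx, messages, llmservice.GenerateOptions{
        Temperature: 0.7,
        Thinking:    llmservice.ThinkingModeDisabled,
    })
    if err != nil {
        return err
    }
    defer func() { _ = reader.Close() }()
    // ... consume the stream, e.g. via a ChunkEmitter ...
    return nil
}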
@@ -5,10 +5,10 @@ import (
 	"strings"
 	"time"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
 	newagenttools "github.com/LoveLosita/smartflow/backend/newAgent/tools"
 	schedule "github.com/LoveLosita/smartflow/backend/newAgent/tools/schedule"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )
 
@@ -71,10 +71,10 @@ type PersistVisibleMessageFunc func(ctx context.Context, state *CommonState, msg
 // 2. Chat/Plan/Execute/Deliver may each attach a different client, or initially share a single client;
 // 3. ChunkEmitter is the single sink for SSE output: stage notices, body text, tool events, confirmation requests, and so on.
 type AgentGraphDeps struct {
-	ChatClient    *infrallm.Client
-	PlanClient    *infrallm.Client
-	ExecuteClient *infrallm.Client
-	DeliverClient *infrallm.Client
+	ChatClient    *llmservice.Client
+	PlanClient    *llmservice.Client
+	ExecuteClient *llmservice.Client
+	DeliverClient *llmservice.Client
 	ChunkEmitter  *newagentstream.ChunkEmitter
 	StateStore    AgentStateStore
 	ToolRegistry  *newagenttools.ToolRegistry

@@ -141,7 +141,7 @@ func (d *AgentGraphDeps) EnsureChunkEmitter() *newagentstream.ChunkEmitter {
 }
 
 // ResolveChatClient returns the model client usable by the chat stage.
-func (d *AgentGraphDeps) ResolveChatClient() *infrallm.Client {
+func (d *AgentGraphDeps) ResolveChatClient() *llmservice.Client {
 	if d == nil {
 		return nil
 	}

@@ -154,7 +154,7 @@ func (d *AgentGraphDeps) ResolveChatClient() *infrallm.Client {
 // 1. Prefer an explicitly injected PlanClient;
 // 2. if none was injected separately, fall back to the ChatClient;
 // 3. so the skeleton phase can run on a single client first, splitting strategist / worker later as needed.
-func (d *AgentGraphDeps) ResolvePlanClient() *infrallm.Client {
+func (d *AgentGraphDeps) ResolvePlanClient() *llmservice.Client {
 	if d == nil {
 		return nil
 	}
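The resolver body beyond the nil guard is outside this hunk's context; reconstructed from the numbered comments above, the fallback presumably looks like this (a sketch, not code from this commit):

// Sketch only: field names come from AgentGraphDeps above; the body is
// inferred from the comments ("prefer PlanClient, fall back to ChatClient").
func (d *AgentGraphDeps) ResolvePlanClient() *llmservice.Client {
    if d == nil {
        return nil
    }
    if d.PlanClient != nil {
        return d.PlanClient // explicitly injected strategist client wins
    }
    return d.ChatClient // skeleton phase: reuse the single shared client
}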
@@ -165,7 +165,7 @@ func (d *AgentGraphDeps) ResolvePlanClient() *infrallm.Client {
 }
 
 // ResolveExecuteClient returns the model client usable by the execute stage.
-func (d *AgentGraphDeps) ResolveExecuteClient() *infrallm.Client {
+func (d *AgentGraphDeps) ResolveExecuteClient() *llmservice.Client {
 	if d == nil {
 		return nil
 	}

@@ -179,7 +179,7 @@ func (d *AgentGraphDeps) ResolveExecuteClient() *infrallm.Client {
 }
 
 // ResolveDeliverClient returns the model client usable by the deliver stage.
-func (d *AgentGraphDeps) ResolveDeliverClient() *infrallm.Client {
+func (d *AgentGraphDeps) ResolveDeliverClient() *llmservice.Client {
 	if d == nil {
 		return nil
 	}
@@ -11,11 +11,11 @@ import (
 	"github.com/cloudwego/eino/schema"
 	"github.com/google/uuid"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentrouter "github.com/LoveLosita/smartflow/backend/newAgent/router"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 )
 
 const (

@@ -50,7 +50,7 @@ type ChatNodeInput struct {
 	UserInput             string
 	ConfirmAction         string
 	ResumeInteractionID   string
-	Client                *infrallm.Client
+	Client                *llmservice.Client
 	ChunkEmitter          *newagentstream.ChunkEmitter
 	CompactionStore       newagentmodel.CompactionStore // context-compaction persistence
 	PersistVisibleMessage newagentmodel.PersistVisibleMessageFunc

@@ -107,9 +107,9 @@ func RunChatNode(ctx context.Context, input ChatNodeInput) error {
 	})
 	logNodeLLMContext(chatStageName, "routing", flowState, messages)
 
-	reader, err := input.Client.Stream(ctx, messages, infrallm.GenerateOptions{
+	reader, err := input.Client.Stream(ctx, messages, llmservice.GenerateOptions{
 		Temperature: 0.7,
-		Thinking:    infrallm.ThinkingModeDisabled,
+		Thinking:    llmservice.ThinkingModeDisabled,
 		Metadata: map[string]any{
 			"stage": chatStageName,
 			"phase": "routing",

@@ -172,7 +172,7 @@ func isExecuteLoopClosedMarker(msg *schema.Message) bool {
 // 3. Control-code parse timeout or abnormal stream end → fall back to plan.
 func streamAndDispatch(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	parser *newagentrouter.StreamRouteParser,
 	input ChatNodeInput,
 	emitter *newagentstream.ChunkEmitter,

@@ -292,7 +292,7 @@ func resolveEffectiveThinking(mode string, route newagentmodel.ChatRoute, decisi
 // 2. thinking=true: close the routing stream and issue a second streaming call with thinking enabled.
 func handleDirectReplyStream(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	input ChatNodeInput,
 	emitter *newagentstream.ChunkEmitter,
 	conversationContext *newagentmodel.ConversationContext,

@@ -309,7 +309,7 @@ func handleDirectReplyStream(
 // handleThinkingReplyStream handles replies that require thinking: close the routing stream → second streaming call with thinking.
 func handleThinkingReplyStream(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	input ChatNodeInput,
 	emitter *newagentstream.ChunkEmitter,
 	conversationContext *newagentmodel.ConversationContext,

@@ -327,10 +327,10 @@ func handleThinkingReplyStream(
 		StatusBlockID: chatStatusBlockID,
 	})
 	logNodeLLMContext(chatStageName, "direct_reply_thinking", flowState, deepMessages)
-	deepReader, err := input.Client.Stream(ctx, deepMessages, infrallm.GenerateOptions{
+	deepReader, err := input.Client.Stream(ctx, deepMessages, llmservice.GenerateOptions{
 		Temperature: 0.5,
 		MaxTokens:   2000,
-		Thinking:    infrallm.ThinkingModeEnabled,
+		Thinking:    llmservice.ThinkingModeEnabled,
 		Metadata: map[string]any{
 			"stage": chatStageName,
 			"phase": "direct_reply_thinking",

@@ -363,7 +363,7 @@ func handleThinkingReplyStream(
 // handleDirectReplyContinueStream handles no-thinking small talk: continue on the same stream.
 func handleDirectReplyContinueStream(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	input ChatNodeInput,
 	emitter *newagentstream.ChunkEmitter,
 	conversationContext *newagentmodel.ConversationContext,

@@ -419,7 +419,7 @@ func handleDirectReplyContinueStream(
 // 2. Push a lightweight status notification;
 // 3. set the flow state and enter Execute or RoughBuild.
 func handleRouteExecuteStream(
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	emitter *newagentstream.ChunkEmitter,
 	flowState *newagentmodel.CommonState,
 	decision *newagentmodel.ChatRoutingDecision,

@@ -674,7 +674,7 @@ func isExplicitNoRefineAfterRoughBuildRequest(userInput string) bool {
 // 4. Write the full reply into history.
 func handleDeepAnswerStream(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	input ChatNodeInput,
 	emitter *newagentstream.ChunkEmitter,
 	conversationContext *newagentmodel.ConversationContext,

@@ -685,9 +685,9 @@ func handleDeepAnswerStream(
 	_ = reader.Close()
 
 	// 2. Second streaming call.
-	thinkingOpt := infrallm.ThinkingModeDisabled
+	thinkingOpt := llmservice.ThinkingModeDisabled
 	if effectiveThinking {
-		thinkingOpt = infrallm.ThinkingModeEnabled
+		thinkingOpt = llmservice.ThinkingModeEnabled
 	}
 	deepMessages := newagentprompt.BuildDeepAnswerMessages(flowState, conversationContext, input.UserInput)
 	deepMessages = compactUnifiedMessagesIfNeeded(ctx, deepMessages, UnifiedCompactInput{

@@ -699,7 +699,7 @@ func handleDeepAnswerStream(
 		StatusBlockID: chatStatusBlockID,
 	})
 	logNodeLLMContext(chatStageName, "deep_answer", flowState, deepMessages)
-	deepReader, err := input.Client.Stream(ctx, deepMessages, infrallm.GenerateOptions{
+	deepReader, err := input.Client.Stream(ctx, deepMessages, llmservice.GenerateOptions{
 		Temperature: 0.5,
 		MaxTokens:   2000,
 		Thinking:    thinkingOpt,

@@ -741,7 +741,7 @@ func handleDeepAnswerStream(
 
 // handleRoutePlanStream handles the plan route: push a status confirmation → set PhasePlanning.
 func handleRoutePlanStream(
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	emitter *newagentstream.ChunkEmitter,
 	flowState *newagentmodel.CommonState,
 	effectiveThinking bool,
@@ -9,10 +9,10 @@ import (
 
 	"github.com/cloudwego/eino/schema"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 )
 
 const (

@@ -31,7 +31,7 @@ const (
 type DeliverNodeInput struct {
 	RuntimeState        *newagentmodel.AgentRuntimeState
 	ConversationContext *newagentmodel.ConversationContext
-	Client              *infrallm.Client
+	Client              *llmservice.Client
 	ChunkEmitter        *newagentstream.ChunkEmitter
 	ThinkingEnabled     bool // whether thinking is on, injected from agent.thinking.deliver in config.yaml
 	CompactionStore     newagentmodel.CompactionStore // context-compaction persistence

@@ -128,7 +128,7 @@ func RunDeliverNode(ctx context.Context, input DeliverNodeInput) error {
 // - streamed: true means the text was already truly streamed to the frontend via EmitStreamAssistantText, so the caller must not pseudo-stream it again.
 func generateDeliverSummary(
 	ctx context.Context,
-	client *infrallm.Client,
+	client *llmservice.Client,
 	flowState *newagentmodel.CommonState,
 	conversationContext *newagentmodel.ConversationContext,
 	thinkingEnabled bool,

@@ -162,7 +162,7 @@ func generateDeliverSummary(
 	reader, err := client.Stream(
 		ctx,
 		messages,
-		infrallm.GenerateOptions{
+		llmservice.GenerateOptions{
 			Temperature: 0.5,
 			MaxTokens:   800,
 			Thinking:    resolveThinkingMode(thinkingEnabled),
@@ -8,11 +8,11 @@ import (
 	"log"
 	"strings"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentrouter "github.com/LoveLosita/smartflow/backend/newAgent/router"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
 	newagenttools "github.com/LoveLosita/smartflow/backend/newAgent/tools"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 	"github.com/google/uuid"
 )

@@ -38,7 +38,7 @@ func collectExecuteDecisionFromLLM(
 	reader, err := input.Client.Stream(
 		ctx,
 		messages,
-		infrallm.GenerateOptions{
+		llmservice.GenerateOptions{
 			Temperature: 1.0,
 			MaxTokens:   131072,
 			Thinking:    newagentshared.ResolveThinkingMode(input.ThinkingEnabled),

@@ -123,7 +123,7 @@ func collectExecuteDecisionFromLLM(
 		return nil, nil
 	}
 
-	decision, parseErr := infrallm.ParseJSONObject[newagentmodel.ExecuteDecision](result.DecisionJSON)
+	decision, parseErr := llmservice.ParseJSONObject[newagentmodel.ExecuteDecision](result.DecisionJSON)
 	if parseErr != nil {
 		log.Printf(
 			"[DEBUG] execute LLM JSON parse failed chat=%s round=%d json=%s raw=%s",
@@ -5,12 +5,12 @@ import (
 	"fmt"
 	newagentshared "github.com/LoveLosita/smartflow/backend/newAgent/shared"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
 	newagenttools "github.com/LoveLosita/smartflow/backend/newAgent/tools"
 	"github.com/LoveLosita/smartflow/backend/newAgent/tools/schedule"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 )
 
 const (

@@ -29,7 +29,7 @@ type ExecuteNodeInput struct {
 	RuntimeState        *newagentmodel.AgentRuntimeState
 	ConversationContext *newagentmodel.ConversationContext
 	UserInput           string
-	Client              *infrallm.Client
+	Client              *llmservice.Client
 	ChunkEmitter        *newagentstream.ChunkEmitter
 	ResumeNode          string
 	ToolRegistry        *newagenttools.ToolRegistry
@@ -10,11 +10,11 @@ import (
 
 	"github.com/google/uuid"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentrouter "github.com/LoveLosita/smartflow/backend/newAgent/router"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )

@@ -34,7 +34,7 @@ type PlanNodeInput struct {
 	RuntimeState        *newagentmodel.AgentRuntimeState
 	ConversationContext *newagentmodel.ConversationContext
 	UserInput           string
-	Client              *infrallm.Client
+	Client              *llmservice.Client
 	ChunkEmitter        *newagentstream.ChunkEmitter
 	ResumeNode          string
 	AlwaysExecute       bool // when true, the plan is auto-confirmed after generation and the confirm node is skipped

@@ -87,7 +87,7 @@ func RunPlanNode(ctx context.Context, input PlanNodeInput) error {
 	reader, err := input.Client.Stream(
 		ctx,
 		messages,
-		infrallm.GenerateOptions{
+		llmservice.GenerateOptions{
 			Temperature: 0.2,
 			// Set the cap explicitly; relying on the framework default (4096) can truncate long decisions.
 			// Note: the current model API caps max_tokens at 131072; exceeding it returns a 400.

@@ -149,7 +149,7 @@ func RunPlanNode(ctx context.Context, input PlanNodeInput) error {
 		return fmt.Errorf("plan parsing failed, raw output=%s", result.RawBuffer)
 	}
 
-	decision, parseErr := infrallm.ParseJSONObject[newagentmodel.PlanDecision](result.DecisionJSON)
+	decision, parseErr := llmservice.ParseJSONObject[newagentmodel.PlanDecision](result.DecisionJSON)
 	if parseErr != nil {
 		return fmt.Errorf("plan decision JSON parse failed: %w (raw=%s)", parseErr, result.RawBuffer)
 	}
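The max_tokens comments in the hunk above pin both ends of the token budget; a hypothetical guard expressing the same rule (clampMaxTokens does not appear in this commit) could look like:

// Hypothetical helper: keep MaxTokens explicit and within the provider
// ceiling, so long plan decisions are neither truncated by the framework
// default of 4096 nor rejected by the model API with HTTP 400.
const providerMaxTokens = 131072

func clampMaxTokens(requested int) int {
    if requested <= 0 || requested > providerMaxTokens {
        return providerMaxTokens
    }
    return requested
}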
@@ -390,9 +390,9 @@ func buildPinnedPlanText(steps []newagentmodel.PlanStep) string {
 
 // resolveThinkingMode maps the config boolean to the corresponding ThinkingMode.
 // Used uniformly by the plan / execute / deliver nodes.
-func resolveThinkingMode(enabled bool) infrallm.ThinkingMode {
+func resolveThinkingMode(enabled bool) llmservice.ThinkingMode {
 	if enabled {
-		return infrallm.ThinkingModeEnabled
+		return llmservice.ThinkingModeEnabled
 	}
-	return infrallm.ThinkingModeDisabled
+	return llmservice.ThinkingModeDisabled
 }
@@ -8,13 +8,13 @@ import (
 	"strings"
 	"time"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	taskmodel "github.com/LoveLosita/smartflow/backend/model"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentrouter "github.com/LoveLosita/smartflow/backend/newAgent/router"
 	newagentshared "github.com/LoveLosita/smartflow/backend/newAgent/shared"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )

@@ -30,7 +30,7 @@ type QuickTaskNodeInput struct {
 	RuntimeState          *newagentmodel.AgentRuntimeState
 	ConversationContext   *newagentmodel.ConversationContext
 	UserInput             string
-	Client                *infrallm.Client
+	Client                *llmservice.Client
 	ChunkEmitter          *newagentstream.ChunkEmitter
 	QuickTaskDeps         newagentmodel.QuickTaskDeps
 	PersistVisibleMessage newagentmodel.PersistVisibleMessageFunc

@@ -77,7 +77,7 @@ func RunQuickTaskNode(ctx context.Context, input QuickTaskNodeInput) error {
 	messages := newagentprompt.BuildQuickTaskMessagesSimple(input.UserInput)
 
 	// 2. Truly stream the LLM call.
-	reader, err := input.Client.Stream(ctx, messages, infrallm.GenerateOptions{
+	reader, err := input.Client.Stream(ctx, messages, llmservice.GenerateOptions{
 		Temperature: 0.3,
 		MaxTokens:   512,
 	})

@@ -130,7 +130,7 @@ func RunQuickTaskNode(ctx context.Context, input QuickTaskNodeInput) error {
 	// Parse the JSON.
 	log.Printf("[DEBUG] quick_task: raw LLM decision JSON chat=%s json=%s", flowState.ConversationID, result.DecisionJSON)
 	var parseErr error
-	decision, parseErr = infrallm.ParseJSONObject[quickTaskDecision](result.DecisionJSON)
+	decision, parseErr = llmservice.ParseJSONObject[quickTaskDecision](result.DecisionJSON)
 	if parseErr != nil {
 		log.Printf("[DEBUG] quick_task: JSON parse failed chat=%s json=%s", flowState.ConversationID, result.DecisionJSON)
 		if result.RawBuffer != "" {
@@ -6,11 +6,11 @@ import (
 	"fmt"
 	"log"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
 	"github.com/LoveLosita/smartflow/backend/pkg"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )

@@ -22,7 +22,7 @@ import (
 // 3. StageName and StatusBlockID distinguish the log source and the SSE status push.
 type UnifiedCompactInput struct {
 	// Client is used to call the LLM that compacts msg1/msg2.
-	Client *infrallm.Client
+	Client *llmservice.Client
 	// CompactionStore persists the compaction summary and token stats; nil skips persistence.
 	CompactionStore newagentmodel.CompactionStore
 	// FlowState provides locating info such as userID / chatID / roundUsed.
@@ -4,7 +4,7 @@ import (
 	"context"
 	"fmt"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )

@@ -24,7 +24,7 @@ const compactMsg1SystemPrompt = `You are a conversation compaction assistant. Your task
 // A non-empty existingSummary means an old summary already exists and must be merged into the compaction.
 func CompactMsg1(
 	ctx context.Context,
-	client *infrallm.Client,
+	client *llmservice.Client,
 	historyText string,
 	existingSummary string,
 ) (string, error) {

@@ -49,7 +49,7 @@ func CompactMsg1(
 		schema.UserMessage(userContent),
 	}
 
-	result, err := client.GenerateText(ctx, messages, infrallm.GenerateOptions{
+	result, err := client.GenerateText(ctx, messages, llmservice.GenerateOptions{
 		MaxTokens: 4000,
 	})
 	if err != nil {
@@ -4,7 +4,7 @@ import (
 	"context"
 	"fmt"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )

@@ -23,7 +23,7 @@ const compactMsg2SystemPrompt = `You are an execution-log compaction assistant. Your
 // recentText is the retained recent-log original text; it is not compacted.
 func CompactMsg2(
 	ctx context.Context,
-	client *infrallm.Client,
+	client *llmservice.Client,
 	earlyLoopText string,
 ) (string, error) {
 	userContent := fmt.Sprintf(`Early ReAct execution records:

@@ -36,7 +36,7 @@ func CompactMsg2(
 		schema.UserMessage(userContent),
 	}
 
-	result, err := client.GenerateText(ctx, messages, infrallm.GenerateOptions{
+	result, err := client.GenerateText(ctx, messages, llmservice.GenerateOptions{
 		MaxTokens: 4000,
 	})
 	if err != nil {
@@ -26,7 +26,7 @@ var (
 // StreamDecisionResult describes the parser's final output state.
 type StreamDecisionResult struct {
 	// DecisionJSON is the complete JSON string extracted from inside the tag.
-	// Callers should parse it into a concrete decision type via infrallm.ParseJSONObject[T].
+	// Callers should parse it into a concrete decision type via llmservice.ParseJSONObject[T].
 	DecisionJSON string
 
 	// BeforeText is the natural-language preamble before the <SMARTFLOW_DECISION> tag.

@@ -179,7 +179,7 @@ func (p *StreamDecisionParser) Result() *StreamDecisionResult {
 }
 
 // extractJSONFromTag extracts the first complete JSON object from the text inside the tag.
-// Reuses the brace-counting logic, consistent with infrallm.ExtractJSONObject.
+// Reuses the brace-counting logic, consistent with llmservice.ExtractJSONObject.
 func extractJSONFromTag(text string) string {
 	clean := strings.TrimSpace(text)
 	if clean == "" {
@@ -1,10 +1,10 @@
 package newagentshared
 
-import infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
+import llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 
-func ResolveThinkingMode(enabled bool) infrallm.ThinkingMode {
+func ResolveThinkingMode(enabled bool) llmservice.ThinkingMode {
 	if enabled {
-		return infrallm.ThinkingModeEnabled
+		return llmservice.ThinkingModeEnabled
 	}
-	return infrallm.ThinkingModeDisabled
+	return llmservice.ThinkingModeDisabled
 }
@@ -6,11 +6,11 @@ import (
 	"fmt"
 	"log"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
 	newagentmodel "github.com/LoveLosita/smartflow/backend/newAgent/model"
 	newagentprompt "github.com/LoveLosita/smartflow/backend/newAgent/prompt"
 	newagentstream "github.com/LoveLosita/smartflow/backend/newAgent/stream"
 	"github.com/LoveLosita/smartflow/backend/pkg"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 	"github.com/cloudwego/eino/schema"
 )

@@ -22,7 +22,7 @@ import (
 // 3. StageName and StatusBlockID distinguish the log source and the SSE status-push target.
 type UnifiedCompactInput struct {
 	// Client is used to call the LLM that compacts msg1/msg2.
-	Client *infrallm.Client
+	Client *llmservice.Client
 	// CompactionStore persists the compaction summary and token stats; nil skips persistence.
 	CompactionStore newagentmodel.CompactionStore
 	// FlowState provides locating info such as userID / conversationID / roundUsed.
@@ -8,7 +8,7 @@ import (
 	"sync"
 	"time"
 
-	infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
+	llmservice "github.com/LoveLosita/smartflow/backend/services/llm"
 )
 
 // PayloadEmitter is the minimal interface that actually writes chunks to the outer SSE pipe.

@@ -540,7 +540,7 @@ func (e *ChunkEmitter) EmitDone() error {
 // 3. Does not open/close the StreamReader; the caller owns its lifecycle.
 func (e *ChunkEmitter) EmitStreamAssistantText(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	blockID, stage string,
 ) (string, error) {
 	if e == nil || reader == nil {
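Because EmitStreamAssistantText leaves the StreamReader's lifecycle to the caller, a typical call site (assembled from calls that appear elsewhere in this diff; variable names are illustrative) reads:

// The caller opens the stream, hands it to the emitter, then closes it.
reader, err := client.Stream(ctx, messages, llmservice.GenerateOptions{Temperature: 0.5})
if err != nil {
    return err
}
defer func() { _ = reader.Close() }()

text, err := emitter.EmitStreamAssistantText(ctx, reader, blockID, stage)
if err != nil {
    return err
}
_ = text // the fully assembled reply, e.g. for writing into history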
@@ -598,7 +598,7 @@ func (e *ChunkEmitter) EmitStreamAssistantText(
 // Used where only the thinking process needs to be shown, without the body text.
 func (e *ChunkEmitter) EmitStreamReasoningText(
 	ctx context.Context,
-	reader infrallm.StreamReader,
+	reader llmservice.StreamReader,
 	blockID, stage string,
 ) (string, error) {
 	if e == nil || reader == nil {
@@ -5,9 +5,9 @@ import (
 	"sort"
 	"strings"
 
-	infrarag "github.com/LoveLosita/smartflow/backend/infra/rag"
 	"github.com/LoveLosita/smartflow/backend/newAgent/tools/schedule"
 	"github.com/LoveLosita/smartflow/backend/newAgent/tools/web"
+	ragservice "github.com/LoveLosita/smartflow/backend/services/rag"
 )
 
 // ToolHandler defines the uniform execution signature shared by all tools.

@@ -32,7 +32,7 @@ type ToolSchemaEntry struct {
 // 2. Some dependencies may be kept even when not yet used, so the business layer doesn't have to re-new them everywhere;
 // 3. when a concrete dependency is missing, the corresponding tool returns a structured failure result on its own.
 type DefaultRegistryDeps struct {
-	RAGRuntime infrarag.Runtime
+	RAGRuntime ragservice.Runtime
 
 	// When WebSearchProvider is nil, web_search / web_fetch are still registered,
 	// but the handler returns a read-only "not yet enabled" observation and does not block the main flow.
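A sketch of the register-anyway, degrade-gracefully convention described in the hunk above; ToolHandler's exact signature and the observation format are assumptions, not part of this commit:

// Hypothetical handler: with WebSearchProvider nil the tool stays registered
// but returns a read-only "not yet enabled" observation instead of an error,
// so the agent loop is never blocked by a missing dependency.
func makeWebSearchHandler(deps DefaultRegistryDeps) func(ctx context.Context, args string) (string, error) {
    return func(ctx context.Context, args string) (string, error) {
        if deps.WebSearchProvider == nil {
            return `{"status":"disabled","observation":"web search is not yet enabled"}`, nil
        }
        // ... delegate to the real provider here ...
        return "", nil
    }
}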