Files
smartmate/backend/service/course_parse_ark.go
Losita 04b5836b39 Version: 0.9.42.dev.260424
后端:
1. 新增课表图片识别接口,支持上传截图后返回“可编辑草稿”(success / partial / reject),并补齐大图、空图、格式不支持、识别能力未配置等错误分支。
2. 课表识别服务接入多模态 Responses 链路,完善图片请求归一化与安全校验(大小、MIME、内容探测),并对识别结果做结构化清洗、强/弱约束校验、告警去重与默认文案兜底。
3. 新增 Ark Responses 统一客户端抽象,支持文本+图片输入、JSON对象输出、usage统计透传与不完整输出识别;同时补齐模型返回 finish_reason 透传,便于定位截断问题。
4. 启动阶段增加课表识图模型与参数注入(模型名、最大图片字节、最大输出token),并将配置示例收敛为“仅保留当前代码实际读取项”。

前端:
5. 课表中心新增“导入课表”完整闭环:上传图片识别、草稿编辑校对、正式导入落库;并新增对应 API 与类型定义。
6. 导入弹窗支持识别中止、全局告警与行级告警展示、低置信度提示、行内编辑、手动新增、删除、拖拽排序、本地校验与提交前二次确认。
7. 正式导入前将草稿按“课程名+地点+是否允许嵌入”聚合为导入结构,并统一携带幂等键请求头,降低重复提交风险。
8. 周课表画板修复跨节次事件遮挡导致的网格错位问题,改进“完全遮挡/部分遮挡”渲染判定与 grid 行定位。
9. 助手流式区域优化“思考中”指示逻辑与样式,避免已有正文时仍展示回答中占位;同时补充全局组件视觉统一(弹窗/按钮)样式。

仓库:
10. 新增课表图片识别前端对接说明文档,补充主动优化能力 PRD 讨论稿,并在协作规范中新增“实现 Eino 新能力前需先查官方文档”的约束。
2026-04-24 23:33:43 +08:00

225 lines
7.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"encoding/base64"
"fmt"
"log"
"strings"
"time"
infrallm "github.com/LoveLosita/smartflow/backend/infra/llm"
"github.com/LoveLosita/smartflow/backend/model"
)
// ParseCourseTableImage 使用 Ark SDK Responses 解析课程表图片。
func (ss *CourseService) ParseCourseTableImage(ctx context.Context, req model.CourseImageParseRequest) (*model.CourseImageParseResponse, error) {
if ss == nil || ss.courseImageResponsesClient == nil {
log.Printf(
"[COURSE_PARSE][SERVICE] parser unavailable model_name=%q filename=%q mime=%q bytes=%d",
ss.courseImageModel,
req.Filename,
req.MIMEType,
len(req.ImageBytes),
)
return nil, ErrCourseImageParserUnavailable
}
normalizedReq, err := normalizeCourseImageParseRequest(req, ss.courseImageConfig)
if err != nil {
log.Printf(
"[COURSE_PARSE][SERVICE] request normalization failed filename=%q mime=%q bytes=%d err=%v",
req.Filename,
req.MIMEType,
len(req.ImageBytes),
err,
)
return nil, err
}
log.Printf(
"[COURSE_PARSE][SERVICE] normalized request model_name=%q filename=%q mime=%q bytes=%d max_bytes=%d",
ss.courseImageModel,
normalizedReq.Filename,
normalizedReq.MIMEType,
len(normalizedReq.ImageBytes),
ss.courseImageConfig.MaxImageBytes,
)
messages, base64Chars, promptChars := buildCourseImageParseResponsesMessages(normalizedReq)
startAt := time.Now()
log.Printf(
"[COURSE_PARSE][SERVICE] model invoke start model_name=%q filename=%q mime=%q message_count=%d base64_chars=%d prompt_chars=%d payload_chars_estimate=%d thinking=%s temperature=%.2f max_output_tokens=%d text_format=%s",
ss.courseImageModel,
normalizedReq.Filename,
normalizedReq.MIMEType,
len(messages),
base64Chars,
promptChars,
base64Chars+promptChars+len(strings.TrimSpace(courseImageParseSystemPrompt)),
infrallm.ThinkingModeDisabled,
courseImageParseTemperature,
ss.courseImageConfig.MaxTokens,
"json_object",
)
// 1. 课程表图片识别输出体量大,显式透传 max_output_tokens避免被默认值截断。
// 2. text_format 固定为 json_object降低输出混入解释文本导致解析失败的概率。
// 3. thinking 显式关闭,优先保证课程导入链路稳定性。
draft, rawResult, err := infrallm.GenerateArkResponsesJSON[model.CourseImageParseResponse](ctx, ss.courseImageResponsesClient, messages, infrallm.ArkResponsesOptions{
Temperature: courseImageParseTemperature,
MaxOutputTokens: ss.courseImageConfig.MaxTokens,
Thinking: infrallm.ThinkingModeDisabled,
TextFormat: "json_object",
})
if err != nil {
rawText := ""
rawChars := 0
status := ""
incompleteReason := ""
errorCode := ""
errorMessage := ""
inputTokens := int64(0)
outputTokens := int64(0)
totalTokens := int64(0)
if rawResult != nil {
rawText = strings.TrimSpace(rawResult.Text)
rawChars = len(rawText)
status = strings.TrimSpace(rawResult.Status)
incompleteReason = strings.TrimSpace(rawResult.IncompleteReason)
errorCode = strings.TrimSpace(rawResult.ErrorCode)
errorMessage = strings.TrimSpace(rawResult.ErrorMessage)
if rawResult.Usage != nil {
inputTokens = rawResult.Usage.InputTokens
outputTokens = rawResult.Usage.OutputTokens
totalTokens = rawResult.Usage.TotalTokens
}
}
log.Printf(
"[COURSE_PARSE][SERVICE] model invoke failed model_name=%q filename=%q mime=%q cost_ms=%d err=%v status=%q incomplete_reason=%q error_code=%q error_message=%q input_tokens=%d output_tokens=%d total_tokens=%d raw_chars=%d raw_full=\n%s",
ss.courseImageModel,
normalizedReq.Filename,
normalizedReq.MIMEType,
time.Since(startAt).Milliseconds(),
err,
status,
incompleteReason,
errorCode,
errorMessage,
inputTokens,
outputTokens,
totalTokens,
rawChars,
rawText,
)
if isCourseImageOutputTruncated(rawResult) {
return nil, fmt.Errorf(
"课程表识别输出疑似被 max_output_tokens 截断status=%s incomplete_reason=%s output_tokens=%d max_output_tokens=%d",
status,
incompleteReason,
outputTokens,
ss.courseImageConfig.MaxTokens,
)
}
return nil, err
}
rawText := ""
rawChars := 0
status := ""
incompleteReason := ""
errorCode := ""
errorMessage := ""
inputTokens := int64(0)
outputTokens := int64(0)
totalTokens := int64(0)
if rawResult != nil {
rawText = strings.TrimSpace(rawResult.Text)
rawChars = len(rawText)
status = strings.TrimSpace(rawResult.Status)
incompleteReason = strings.TrimSpace(rawResult.IncompleteReason)
errorCode = strings.TrimSpace(rawResult.ErrorCode)
errorMessage = strings.TrimSpace(rawResult.ErrorMessage)
if rawResult.Usage != nil {
inputTokens = rawResult.Usage.InputTokens
outputTokens = rawResult.Usage.OutputTokens
totalTokens = rawResult.Usage.TotalTokens
}
}
log.Printf(
"[COURSE_PARSE][SERVICE] model invoke success model_name=%q filename=%q mime=%q cost_ms=%d status=%q incomplete_reason=%q error_code=%q error_message=%q input_tokens=%d output_tokens=%d total_tokens=%d raw_chars=%d raw_full=\n%s",
ss.courseImageModel,
normalizedReq.Filename,
normalizedReq.MIMEType,
time.Since(startAt).Milliseconds(),
status,
incompleteReason,
errorCode,
errorMessage,
inputTokens,
outputTokens,
totalTokens,
rawChars,
rawText,
)
normalizedDraft, err := normalizeCourseImageParseResponse(draft)
if err != nil {
log.Printf(
"[COURSE_PARSE][SERVICE] draft normalization failed model_name=%q filename=%q err=%v draft_status=%v row_count=%d",
ss.courseImageModel,
normalizedReq.Filename,
err,
draft.DraftStatus,
len(draft.Rows),
)
return nil, err
}
log.Printf(
"[COURSE_PARSE][SERVICE] draft normalization success model_name=%q filename=%q draft_status=%s rows=%d warnings=%d",
ss.courseImageModel,
normalizedReq.Filename,
normalizedDraft.DraftStatus,
len(normalizedDraft.Rows),
len(normalizedDraft.Warnings),
)
return normalizedDraft, nil
}
func buildCourseImageParseResponsesMessages(req *model.CourseImageParseRequest) ([]infrallm.ArkResponsesMessage, int, int) {
userPrompt := fmt.Sprintf(courseImageParseUserPromptTemplate, req.Filename, req.MIMEType)
base64Data := base64.StdEncoding.EncodeToString(req.ImageBytes)
imageDataURL := fmt.Sprintf("data:%s;base64,%s", req.MIMEType, base64Data)
messages := []infrallm.ArkResponsesMessage{
{
Role: "system",
Text: strings.TrimSpace(courseImageParseSystemPrompt),
},
{
Role: "user",
Text: strings.TrimSpace(userPrompt),
ImageURL: imageDataURL,
ImageDetail: "high",
},
}
return messages, len(base64Data), len(strings.TrimSpace(userPrompt))
}
func isCourseImageOutputTruncated(rawResult *infrallm.ArkResponsesResult) bool {
if rawResult == nil {
return false
}
reason := strings.ToLower(strings.TrimSpace(rawResult.IncompleteReason))
if strings.Contains(reason, "max_output_tokens") ||
strings.Contains(reason, "max_tokens") ||
strings.Contains(reason, "length") {
return true
}
return strings.EqualFold(strings.TrimSpace(rawResult.Status), "incomplete") && reason == ""
}