Files
smartmate/backend/memory/utils/normalize_facts.go
LoveLosita fae162162a Version: 0.9.13.dev.260410
后端:
1. Memory Day1 链路打通(chat_history -> outbox -> memory_jobs)
   - 更新 service/events/chat_history_persist.go:聊天消息落库同事务追加 memory.extract.requested 事件(仅 user 消息,失败回滚后由 outbox 重试)
   - 新建 service/events/memory_extract_requested.go:消费 memory.extract.requested 并幂等入队 memory_jobs,补齐 payload 校验、文本截断与 idempotency key
   - 更新 cmd/start.go:注册 RegisterMemoryExtractRequestedHandler
2. Memory 模块骨架落地(先跑通状态机,再接入真实抽取)
   - 新建 memory/model、repo、service、orchestrator、worker、utils 目录与 Day1 mock 抽取执行链
   - 新建 model/memory.go:补齐 memory_items / memory_jobs / memory_audit_logs / memory_user_settings 与事件 payload 模型
   - 更新 inits/mysql.go:接入 4 张 memory 相关表 AutoMigrate
3. RAG 复用基础设施预埋(依赖可替换)
   - 新建 infra/rag:core pipeline + chunk/embed/retrieve/rerank/store/corpus/config 分层实现
   - 默认接入 MockEmbedder + InMemoryStore,预留 Milvus / Eino 适配实现
   - 新增 infra/rag/RAG复用接口实施计划.md
4. 本地依赖与交接文档同步
   - 更新 docker-compose.yml:新增 etcd / minio / milvus / attu 服务与数据卷
   - 删除 newAgent/HANDOFF_工具研究与运行态重置.md、newAgent/阶段3_上下文瘦身设计.md
   - 新增 newAgent/HANDOFF_WebSearch两阶段实施计划.md、memory/HANDOFF-RAG复用后续实施计划.md、memory/README.md
前端:无 仓库:无
2026-04-10 13:07:54 +08:00

103 lines
2.4 KiB
Go

package utils
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"strings"
memorymodel "github.com/LoveLosita/smartflow/backend/memory/model"
)
// Upper bounds, counted in runes, that NormalizeFacts passes to truncateByRune
// when shortening fact text.
const (
// maxTitleLength caps the title of a normalized fact.
maxTitleLength = 64
// maxContentLength caps the content of a normalized fact.
maxContentLength = 1000
)
// NormalizeFacts standardizes and filters candidate facts.
//
// Steps:
//  1. Normalize memory_type and the text fields. A candidate is dropped when
//     NormalizeMemoryType returns "" for its type or when its content is
//     empty after whitespace normalization.
//  2. Truncate over-long content and titles (by rune) so oversized text does
//     not pollute later stages. An empty title falls back to the leading part
//     of the content.
//  3. Deduplicate on "type + lower-cased content" so the same fact is not
//     emitted twice within one call; the first occurrence wins.
//
// A confidence that clamps to 0 is replaced by the default 0.6.
//
// It returns nil for empty input; otherwise the surviving facts in input
// order (the slice may be empty if every candidate was rejected).
func NormalizeFacts(candidates []memorymodel.FactCandidate) []memorymodel.NormalizedFact {
	if len(candidates) == 0 {
		return nil
	}

	result := make([]memorymodel.NormalizedFact, 0, len(candidates))
	seen := make(map[string]struct{}, len(candidates))

	for _, candidate := range candidates {
		memoryType := memorymodel.NormalizeMemoryType(candidate.MemoryType)
		if memoryType == "" {
			continue
		}

		content := normalizeWhitespace(candidate.Content)
		if content == "" {
			continue
		}
		// The cut can land right after a word separator. Trim it so the stored
		// text (and the hash derived from it) never ends in whitespace, which
		// is the invariant normalizeWhitespace established before truncation.
		content = strings.TrimRight(truncateByRune(content, maxContentLength), " ")

		title := normalizeWhitespace(candidate.Title)
		if title == "" {
			// No usable title: derive one from the content.
			title = content
		}
		title = strings.TrimRight(truncateByRune(title, maxTitleLength), " ")

		confidence := clamp01(candidate.Confidence)
		if confidence == 0 {
			// Missing or non-positive confidence gets the default value.
			confidence = 0.6
		}

		// Lower-casing makes both the hash and the dedup key case-insensitive.
		normalizedContent := strings.ToLower(content)
		contentHash := hashContent(memoryType, normalizedContent)

		dedupKey := fmt.Sprintf("%s:%s", memoryType, contentHash)
		if _, exists := seen[dedupKey]; exists {
			continue
		}
		seen[dedupKey] = struct{}{}

		result = append(result, memorymodel.NormalizedFact{
			MemoryType:        memoryType,
			Title:             title,
			Content:           content,
			NormalizedContent: normalizedContent,
			ContentHash:       contentHash,
			Confidence:        confidence,
			IsExplicit:        candidate.IsExplicit,
		})
	}
	return result
}
// normalizeWhitespace collapses every run of Unicode whitespace in raw into a
// single ASCII space and drops leading and trailing whitespace. Input that
// contains only whitespace (or nothing) yields "".
func normalizeWhitespace(raw string) string {
	// strings.Fields already skips leading and trailing whitespace, so the
	// words can simply be re-joined with single spaces.
	words := strings.Fields(raw)
	return strings.Join(words, " ")
}
// truncateByRune returns raw limited to at most max runes, so a multi-byte
// character is never split. A non-positive max yields "". Strings that
// already fit are returned unchanged.
func truncateByRune(raw string, max int) string {
	switch {
	case max <= 0:
		return ""
	case len(raw) <= max:
		// A string never has more runes than bytes, so it already fits and
		// decoding it can be skipped.
		return raw
	}

	if decoded := []rune(raw); len(decoded) > max {
		return string(decoded[:max])
	}
	return raw
}
// clamp01 limits v to the closed interval [0, 1].
//
// Values below 0 (including -Inf) become 0 and values above 1 (including
// +Inf) become 1. NaN is mapped to 0: every ordered comparison against NaN is
// false, so separate "v < 0" and "v > 1" checks would let it through
// unchanged and break the [0, 1] guarantee.
func clamp01(v float64) float64 {
	switch {
	case v > 1:
		return 1
	case v > 0:
		return v
	default:
		// v <= 0, or NaN.
		return 0
	}
}
// hashContent returns the lowercase hex SHA-256 digest of
// memoryType + "::" + normalizedContent. The result is always 64 characters.
func hashContent(memoryType, normalizedContent string) string {
	const separator = "::"

	payload := make([]byte, 0, len(memoryType)+len(separator)+len(normalizedContent))
	payload = append(payload, memoryType...)
	payload = append(payload, separator...)
	payload = append(payload, normalizedContent...)

	digest := sha256.Sum256(payload)
	return hex.EncodeToString(digest[:])
}