后端:
1. Memory Day1 链路打通(chat_history -> outbox -> memory_jobs)
   - 更新 service/events/chat_history_persist.go:聊天消息落库同事务追加 memory.extract.requested 事件(仅 user 消息,失败回滚后由 outbox 重试)
   - 新建 service/events/memory_extract_requested.go:消费 memory.extract.requested 并幂等入队 memory_jobs,补齐 payload 校验、文本截断与 idempotency key
   - 更新 cmd/start.go:注册 RegisterMemoryExtractRequestedHandler
2. Memory 模块骨架落地(先跑通状态机,再接入真实抽取)
   - 新建 memory/model、repo、service、orchestrator、worker、utils 目录与 Day1 mock 抽取执行链
   - 新建 model/memory.go:补齐 memory_items / memory_jobs / memory_audit_logs / memory_user_settings 与事件 payload 模型
   - 更新 inits/mysql.go:接入 4 张 memory 相关表 AutoMigrate
3. RAG 复用基础设施预埋(依赖可替换)
   - 新建 infra/rag:core pipeline + chunk/embed/retrieve/rerank/store/corpus/config 分层实现
   - 默认接入 MockEmbedder + InMemoryStore,预留 Milvus / Eino 适配实现
   - 新增 infra/rag/RAG复用接口实施计划.md
4. 本地依赖与交接文档同步
   - 更新 docker-compose.yml:新增 etcd / minio / milvus / attu 服务与数据卷
   - 删除 newAgent/HANDOFF_工具研究与运行态重置.md、newAgent/阶段3_上下文瘦身设计.md
   - 新增 newAgent/HANDOFF_WebSearch两阶段实施计划.md、memory/HANDOFF-RAG复用后续实施计划.md、memory/README.md

前端:无
仓库:无
86 lines
1.8 KiB
Go
86 lines
1.8 KiB
Go
package chunk
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"strings"
|
||
|
||
"github.com/LoveLosita/smartflow/backend/infra/rag/core"
|
||
)
|
||
|
||
// TextChunker is the default text chunker. It carries no state, so the zero
// value is ready to use.
type TextChunker struct{}

// NewTextChunker returns a pointer to a fresh, zero-valued TextChunker.
func NewTextChunker() *TextChunker {
	return new(TextChunker)
}
|
||
|
||
// Chunk 对文本执行固定窗口切块。
|
||
//
|
||
// 步骤化说明:
|
||
// 1. 先做空白归一,避免无效块进入向量库;
|
||
// 2. 再按 chunk_size/overlap 滑窗切割;
|
||
// 3. 每块继承原文 metadata,并补充 chunk 序号。
|
||
func (c *TextChunker) Chunk(_ context.Context, doc core.SourceDocument, opt core.ChunkOption) ([]core.Chunk, error) {
|
||
if strings.TrimSpace(doc.ID) == "" {
|
||
return nil, fmt.Errorf("empty document id")
|
||
}
|
||
text := strings.TrimSpace(doc.Text)
|
||
if text == "" {
|
||
return nil, nil
|
||
}
|
||
if opt.ChunkSize <= 0 {
|
||
opt.ChunkSize = 400
|
||
}
|
||
if opt.ChunkOverlap < 0 {
|
||
opt.ChunkOverlap = 0
|
||
}
|
||
if opt.ChunkOverlap >= opt.ChunkSize {
|
||
opt.ChunkOverlap = opt.ChunkSize / 5
|
||
}
|
||
|
||
runes := []rune(text)
|
||
step := opt.ChunkSize - opt.ChunkOverlap
|
||
if step <= 0 {
|
||
step = opt.ChunkSize
|
||
}
|
||
|
||
result := make([]core.Chunk, 0, len(runes)/step+1)
|
||
order := 0
|
||
for start := 0; start < len(runes); start += step {
|
||
end := start + opt.ChunkSize
|
||
if end > len(runes) {
|
||
end = len(runes)
|
||
}
|
||
chunkText := strings.TrimSpace(string(runes[start:end]))
|
||
if chunkText == "" {
|
||
continue
|
||
}
|
||
metadata := cloneMap(doc.Metadata)
|
||
metadata["chunk_order"] = order
|
||
result = append(result, core.Chunk{
|
||
ID: fmt.Sprintf("%s#%d", doc.ID, order),
|
||
DocumentID: doc.ID,
|
||
Text: chunkText,
|
||
Order: order,
|
||
Metadata: metadata,
|
||
})
|
||
order++
|
||
if end == len(runes) {
|
||
break
|
||
}
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
// cloneMap returns a shallow copy of src. The result is always a non-nil,
// writable map, even when src is nil or empty; values are copied by
// assignment, not deep-cloned.
func cloneMap(src map[string]any) map[string]any {
	out := make(map[string]any, len(src))
	for key, val := range src {
		out[key] = val
	}
	return out
}
|