Version: 0.9.65.dev.260503
后端: 1. 阶段 1.5/1.6 收口 llm-service / rag-service,统一模型出口与检索基础设施入口,清退 backend/infra/llm 与 backend/infra/rag 旧实现; 2. 同步更新相关调用链与微服务迁移计划文档
This commit is contained in:
85
backend/services/rag/chunk/text_chunker.go
Normal file
85
backend/services/rag/chunk/text_chunker.go
Normal file
@@ -0,0 +1,85 @@
|
||||
package chunk
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/LoveLosita/smartflow/backend/services/rag/core"
|
||||
)
|
||||
|
||||
// TextChunker is the default text chunker. It carries no state, so a single
// value can be reused for any number of documents.
type TextChunker struct{}

// NewTextChunker returns a ready-to-use TextChunker.
func NewTextChunker() *TextChunker {
	return new(TextChunker)
}
|
||||
|
||||
// Chunk 对文本执行固定窗口切块。
|
||||
//
|
||||
// 步骤化说明:
|
||||
// 1. 先做空白归一,避免无效块进入向量库;
|
||||
// 2. 再按 chunk_size/overlap 滑窗切割;
|
||||
// 3. 每块继承原文 metadata,并补充 chunk 序号。
|
||||
func (c *TextChunker) Chunk(_ context.Context, doc core.SourceDocument, opt core.ChunkOption) ([]core.Chunk, error) {
|
||||
if strings.TrimSpace(doc.ID) == "" {
|
||||
return nil, fmt.Errorf("empty document id")
|
||||
}
|
||||
text := strings.TrimSpace(doc.Text)
|
||||
if text == "" {
|
||||
return nil, nil
|
||||
}
|
||||
if opt.ChunkSize <= 0 {
|
||||
opt.ChunkSize = 400
|
||||
}
|
||||
if opt.ChunkOverlap < 0 {
|
||||
opt.ChunkOverlap = 0
|
||||
}
|
||||
if opt.ChunkOverlap >= opt.ChunkSize {
|
||||
opt.ChunkOverlap = opt.ChunkSize / 5
|
||||
}
|
||||
|
||||
runes := []rune(text)
|
||||
step := opt.ChunkSize - opt.ChunkOverlap
|
||||
if step <= 0 {
|
||||
step = opt.ChunkSize
|
||||
}
|
||||
|
||||
result := make([]core.Chunk, 0, len(runes)/step+1)
|
||||
order := 0
|
||||
for start := 0; start < len(runes); start += step {
|
||||
end := start + opt.ChunkSize
|
||||
if end > len(runes) {
|
||||
end = len(runes)
|
||||
}
|
||||
chunkText := strings.TrimSpace(string(runes[start:end]))
|
||||
if chunkText == "" {
|
||||
continue
|
||||
}
|
||||
metadata := cloneMap(doc.Metadata)
|
||||
metadata["chunk_order"] = order
|
||||
result = append(result, core.Chunk{
|
||||
ID: fmt.Sprintf("%s#%d", doc.ID, order),
|
||||
DocumentID: doc.ID,
|
||||
Text: chunkText,
|
||||
Order: order,
|
||||
Metadata: metadata,
|
||||
})
|
||||
order++
|
||||
if end == len(runes) {
|
||||
break
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// cloneMap returns a shallow copy of src. The result is always a non-nil,
// writable map, even when src is nil or empty; values are copied by
// assignment, so reference-typed values are shared with src.
func cloneMap(src map[string]any) map[string]any {
	out := make(map[string]any, len(src))
	for key, val := range src {
		out[key] = val
	}
	return out
}
|
||||
Reference in New Issue
Block a user