better：优化表达方式学习和分割

2025-12-18 16:39:16 +08:00
parent dd891c4b18
commit f7a2f2329a
7 changed files with 1400 additions and 35 deletions
--- a/src/bw_learner/expression_learner.py
+++ b/src/bw_learner/expression_learner.py
@@ -18,6 +18,7 @@ from src.bw_learner.learner_utils import (
    is_bot_message,
    build_context_paragraph,
    contains_bot_self_name,
+    calculate_style_similarity,
 )
 from src.bw_learner.jargon_miner import miner_manager
 from json_repair import repair_json
@@ -405,17 +406,37 @@ class ExpressionLearner:
        context: str,
        current_time: float,
    ) -> None:
-        expr_obj = Expression.select().where((Expression.chat_id == self.chat_id) & (Expression.style == style)).first()
+        # 第一层：检查是否有完全一致的 style（检查 style 字段和 style_list）
+        expr_obj = await self._find_exact_style_match(style)

        if expr_obj:
+            # 找到完全匹配的 style，合并到现有记录（不使用 LLM 总结）
            await self._update_existing_expression(
                expr_obj=expr_obj,
                situation=situation,
+                style=style,
                context=context,
                current_time=current_time,
+                use_llm_summary=False,
            )
            return

+        # 第二层：检查是否有相似的 style（相似度 >= 0.75，检查 style 字段和 style_list）
+        similar_expr_obj = await self._find_similar_style_expression(style, similarity_threshold=0.75)
+
+        if similar_expr_obj:
+            # 找到相似的 style，合并到现有记录（使用 LLM 总结）
+            await self._update_existing_expression(
+                expr_obj=similar_expr_obj,
+                situation=situation,
+                style=style,
+                context=context,
+                current_time=current_time,
+                use_llm_summary=True,
+            )
+            return
+
+        # 没有找到匹配的记录，创建新记录
        await self._create_expression_record(
            situation=situation,
            style=style,
@@ -431,12 +452,14 @@ class ExpressionLearner:
        current_time: float,
    ) -> None:
        content_list = [situation]
-        formatted_situation = await self._compose_situation_text(content_list, 1, situation)
+        # 创建新记录时，直接使用原始的 situation，不进行总结
+        formatted_situation = situation

        Expression.create(
            situation=formatted_situation,
            style=style,
            content_list=json.dumps(content_list, ensure_ascii=False),
+            style_list=None,  # 新记录初始时 style_list 为空
            count=1,
            last_active_time=current_time,
            chat_id=self.chat_id,
@@ -448,23 +471,57 @@ class ExpressionLearner:
        self,
        expr_obj: Expression,
        situation: str,
+        style: str,
        context: str,
        current_time: float,
+        use_llm_summary: bool = True,
    ) -> None:
+        """
+        更新现有 Expression 记录（style 完全匹配或相似的情况）
+        将新的 situation 添加到 content_list，将新的 style 添加到 style_list（如果不同）
+        
+        Args:
+            use_llm_summary: 是否使用 LLM 进行总结，完全匹配时为 False，相似匹配时为 True
+        """
+        # 更新 content_list（添加新的 situation）
        content_list = self._parse_content_list(expr_obj.content_list)
        content_list.append(situation)
-
        expr_obj.content_list = json.dumps(content_list, ensure_ascii=False)
+
+        # 更新 style_list（如果 style 不同，添加到 style_list）
+        style_list = self._parse_style_list(expr_obj.style_list)
+        # 将原有的 style 也加入 style_list（如果还没有的话）
+        if expr_obj.style and expr_obj.style not in style_list:
+            style_list.append(expr_obj.style)
+        # 如果新的 style 不在 style_list 中，添加它
+        if style not in style_list:
+            style_list.append(style)
+        expr_obj.style_list = json.dumps(style_list, ensure_ascii=False)
+
+        # 更新其他字段
        expr_obj.count = (expr_obj.count or 0) + 1
        expr_obj.last_active_time = current_time
        expr_obj.context = context

-        new_situation = await self._compose_situation_text(
-            content_list=content_list,
-            count=expr_obj.count,
-            fallback=expr_obj.situation,
-        )
-        expr_obj.situation = new_situation
+        if use_llm_summary:
+            # 相似匹配时，使用 LLM 重新组合 situation 和 style
+            new_situation = await self._compose_situation_text(
+                content_list=content_list,
+                count=expr_obj.count,
+                fallback=expr_obj.situation,
+            )
+            expr_obj.situation = new_situation
+
+            new_style = await self._compose_style_text(
+                style_list=style_list,
+                count=expr_obj.count,
+                fallback=expr_obj.style or style,
+            )
+            expr_obj.style = new_style
+        else:
+            # 完全匹配时，不进行 LLM 总结，保持原有的 situation 和 style 不变
+            # 只更新 content_list 和 style_list
+            pass

        expr_obj.save()

@@ -477,6 +534,80 @@ class ExpressionLearner:
            return []
        return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else []

+    def _parse_style_list(self, stored_list: Optional[str]) -> List[str]:
+        """Decode the JSON text stored in ``style_list``; return [] unless it is a JSON array, keeping only str items."""
+        try:
+            decoded = json.loads(stored_list) if stored_list else []
+        except json.JSONDecodeError:
+            decoded = []
+        if not isinstance(decoded, list):
+            return []
+        return [str(entry) for entry in decoded if isinstance(entry, str)]
+
+    async def _find_exact_style_match(self, style: str) -> Optional[Expression]:
+        """
+        Return the first Expression of this chat that already records exactly ``style``.
+
+        A record counts as a match when its ``style`` column equals ``style`` or when
+        ``style`` appears among the entries decoded from its ``style_list`` column.
+
+        Args:
+            style: The style text to look for.
+
+        Returns:
+            The first matching Expression in query order, or None when nothing matches.
+        """
+        # Only records that belong to the current chat are candidates.
+        candidates = Expression.select().where(Expression.chat_id == self.chat_id)
+        # The ``or`` short-circuits, so style_list is only decoded when the style
+        # column itself is not already an exact match.
+        return next(
+            (
+                record
+                for record in candidates
+                if record.style == style or style in self._parse_style_list(record.style_list)
+            ),
+            None,
+        )
+
+    async def _find_similar_style_expression(self, style: str, similarity_threshold: float = 0.75) -> Optional[Expression]:
+        """
+        Return the Expression of this chat whose recorded styles best resemble ``style``.
+
+        Each record is scored with the highest ``calculate_style_similarity`` value between
+        ``style`` and its ``style`` column or any entry decoded from its ``style_list`` column.
+
+        Args:
+            style: The style text to compare against the stored records.
+            similarity_threshold: Minimum score a record needs to qualify (default 0.75).
+
+        Returns:
+            The qualifying Expression with the highest score (the earliest one wins ties),
+            or None when no record qualifies.
+        """
+        # Only records that belong to the current chat are candidates.
+        candidates = Expression.select().where(Expression.chat_id == self.chat_id)
+
+        # Best candidate seen so far and its score.
+        top_record = None
+        top_score = 0.0
+
+        for record in candidates:
+            # Score the style column first, then every entry of style_list.
+            known_styles = [record.style, *self._parse_style_list(record.style_list)]
+            score = max(calculate_style_similarity(style, known) for known in known_styles)
+
+            # Strict ">" keeps the earliest record when several share the best score.
+            # A score of exactly 0.0 can never win because the running best starts at 0.0.
+            if score >= similarity_threshold and score > top_score:
+                top_score = score
+                top_record = record
+
+        if top_record:
+            logger.debug(f"找到相似的 style: 相似度={top_score:.3f}, 现有='{top_record.style}', 新='{style}'")
+
+        return top_record
+
    async def _compose_situation_text(self, content_list: List[str], count: int, fallback: str = "") -> str:
        sanitized = [c.strip() for c in content_list if c.strip()]
        summary = await self._summarize_situations(sanitized)
@@ -484,6 +615,39 @@ class ExpressionLearner:
            return summary
        return "/".join(sanitized) if sanitized else fallback

+    async def _compose_style_text(self, style_list: List[str], count: int, fallback: str = "") -> str:
+        """
+        Collapse the collected styles into one style string.
+        Blank entries are dropped; two or more remaining styles are summarised via
+        ``_summarize_styles``, and the first remaining style is used when there is only
+        one or when no summary comes back. ``fallback`` is returned only if none remain.
+        """
+        cleaned = [entry.strip() for entry in style_list if entry.strip()]
+        if not cleaned:
+            return fallback
+        merged = await self._summarize_styles(cleaned) if len(cleaned) > 1 else None
+        return merged or cleaned[0]
+
+    async def _summarize_styles(self, styles: List[str]) -> Optional[str]:
+        """Ask the summary model for one short line covering ``styles``; None for fewer than two styles, a blank reply or an error."""
+        if not styles or len(styles) < 2:
+            return None
+
+        # Only the last ten styles are sent to the model.
+        bullet_lines = "\n".join(f"- {item}" for item in styles[-10:])
+        prompt = (
+            "请阅读以下多个语言风格/表达方式，并将它们概括成一句简短的话，"
+            "长度不超过20个字，保留共同特点：\n"
+            f"{bullet_lines}\n只输出概括内容。"
+        )
+        try:
+            response, _ = await self.summary_model.generate_response_async(prompt, temperature=0.2)
+            condensed = response.strip()
+        except Exception as e:
+            logger.error(f"概括表达风格失败: {e}")
+            return None
+        return condensed or None
+
    async def _summarize_situations(self, situations: List[str]) -> Optional[str]:
        if not situations:
            return None
--- a/src/bw_learner/learner_utils.py
+++ b/src/bw_learner/learner_utils.py
@@ -56,6 +56,38 @@ def calculate_similarity(text1: str, text2: str) -> float:
    return difflib.SequenceMatcher(None, text1, text2).ratio()


+def calculate_style_similarity(style1: str, style2: str) -> float:
+    """
+    Compute a similarity ratio between two style descriptions.
+
+    Every occurrence of "使用" and "句式" is removed from both inputs and the
+    results are stripped before they are compared with ``difflib.SequenceMatcher``.
+
+    Args:
+        style1: First style text.
+        style2: Second style text.
+
+    Returns:
+        float: 0.0 when either input is empty (before or after the clean-up),
+        otherwise the SequenceMatcher ratio of the two cleaned texts.
+    """
+    if not (style1 and style2):
+        return 0.0
+
+    # These words are ignored for the comparison; they are removed in this order.
+    ignored_words = ("使用", "句式")
+    normalized = []
+    for raw in (style1, style2):
+        for word in ignored_words:
+            raw = raw.replace(word, "")
+        normalized.append(raw.strip())
+
+    left, right = normalized
+    if not (left and right):
+        return 0.0
+    return difflib.SequenceMatcher(None, left, right).ratio()
+
+
 def format_create_date(timestamp: float) -> str:
    """
    将时间戳格式化为可读的日期字符串
--- a/src/chat/utils/utils.py
+++ b/src/chat/utils/utils.py
@@ -211,7 +211,40 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
    if len_text < 3:
        return list(text) if random.random() < 0.01 else [text]

-    # 定义分隔符（包含换行符，换行符必须强制分割）
+    # Mark which positions lie inside a quoted span so that the splitting loop below
+    # never cuts in the middle of a quotation.
+    # Every supported quote character is mapped to its partner: curly and corner
+    # quotes use different glyphs to open and close, so a span opened by one glyph
+    # must also be closable by the other; the two ASCII quotes may close each other.
+    quote_partner = {
+        '"': "'",
+        "'": '"',
+        "“": "”",
+        "”": "“",
+        "‘": "’",
+        "’": "‘",
+        "「": "」",
+        "」": "「",
+        "『": "』",
+        "』": "『",
+    }
+    inside_quote = [False] * len_text
+    in_quote = False
+    current_quote_char = ""
+    for idx, ch in enumerate(text):
+        if ch not in quote_partner:
+            inside_quote[idx] = in_quote
+            continue
+        # The quote characters themselves always stay marked as outside.
+        if not in_quote:
+            in_quote = True
+            current_quote_char = ch
+        elif ch == current_quote_char or ch == quote_partner[current_quote_char]:
+            # Closed by the same glyph or by its partner; any other quote is ignored.
+            in_quote = False
+            current_quote_char = ""
+
+    # 定义分隔符（包含换行符）
    separators = {"，", ",", " ", "。", ";", "\n"}
    segments = []
    current_segment = ""
@@ -221,31 +254,35 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
    while i < len(text):
        char = text[i]
        if char in separators:
-            # 换行符必须强制分割，不受其他规则影响
-            if char == "\n":
-                can_split = True
+            # 引号内部一律不作为分割点（包括换行）
+            if inside_quote[i]:
+                can_split = False
            else:
-                # 检查分割条件
-                can_split = True
-                # 检查分隔符左右是否有冒号（中英文），如果有则不分割
-                if i > 0:
-                    prev_char = text[i - 1]
-                    if prev_char in {":", "："}:
-                        can_split = False
-                if i < len(text) - 1:
-                    next_char = text[i + 1]
-                    if next_char in {":", "："}:
-                        can_split = False
-                
-                # 如果左右没有冒号，再检查空格的特殊情况
-                if can_split and char == " " and i > 0 and i < len(text) - 1:
-                    prev_char = text[i - 1]
-                    next_char = text[i + 1]
-                    # 不分割数字和数字、数字和英文、英文和数字、英文和英文之间的空格
-                    prev_is_alnum = prev_char.isdigit() or is_english_letter(prev_char)
-                    next_is_alnum = next_char.isdigit() or is_english_letter(next_char)
-                    if prev_is_alnum and next_is_alnum:
-                        can_split = False
+                # 换行符在不在引号内时都强制分割
+                if char == "\n":
+                    can_split = True
+                else:
+                    # 检查分割条件
+                    can_split = True
+                    # 检查分隔符左右是否有冒号（中英文），如果有则不分割
+                    if i > 0:
+                        prev_char = text[i - 1]
+                        if prev_char in {":", "："}:
+                            can_split = False
+                    if i < len(text) - 1:
+                        next_char = text[i + 1]
+                        if next_char in {":", "："}:
+                            can_split = False
+
+                    # 如果左右没有冒号，再检查空格的特殊情况
+                    if can_split and char == " " and i > 0 and i < len(text) - 1:
+                        prev_char = text[i - 1]
+                        next_char = text[i + 1]
+                        # 不分割数字和数字、数字和英文、英文和数字、英文和英文之间的空格
+                        prev_is_alnum = prev_char.isdigit() or is_english_letter(prev_char)
+                        next_is_alnum = next_char.isdigit() or is_english_letter(next_char)
+                        if prev_is_alnum and next_is_alnum:
+                            can_split = False

            if can_split:
                # 只有当当前段不为空时才添加
--- a/src/common/database/database_model.py
+++ b/src/common/database/database_model.py
@@ -326,6 +326,7 @@ class Expression(BaseModel):
    context = TextField(null=True)

    content_list = TextField(null=True)
+    style_list = TextField(null=True)  # 存储相似的 style，格式与 content_list 相同（JSON 数组）
    count = IntegerField(default=1)
    last_active_time = FloatField()
    chat_id = TextField(index=True)