better:优化表达方式学习和分割

This commit is contained in:
SengokuCola
2025-12-18 16:39:16 +08:00
parent dd891c4b18
commit f7a2f2329a
7 changed files with 1400 additions and 35 deletions

View File

@@ -18,6 +18,7 @@ from src.bw_learner.learner_utils import (
is_bot_message,
build_context_paragraph,
contains_bot_self_name,
calculate_style_similarity,
)
from src.bw_learner.jargon_miner import miner_manager
from json_repair import repair_json
@@ -405,17 +406,37 @@ class ExpressionLearner:
context: str,
current_time: float,
) -> None:
expr_obj = Expression.select().where((Expression.chat_id == self.chat_id) & (Expression.style == style)).first()
# 第一层:检查是否有完全一致的 style检查 style 字段和 style_list
expr_obj = await self._find_exact_style_match(style)
if expr_obj:
# 找到完全匹配的 style合并到现有记录不使用 LLM 总结)
await self._update_existing_expression(
expr_obj=expr_obj,
situation=situation,
style=style,
context=context,
current_time=current_time,
use_llm_summary=False,
)
return
# 第二层:检查是否有相似的 style相似度 >= 0.75,检查 style 字段和 style_list
similar_expr_obj = await self._find_similar_style_expression(style, similarity_threshold=0.75)
if similar_expr_obj:
# 找到相似的 style合并到现有记录使用 LLM 总结)
await self._update_existing_expression(
expr_obj=similar_expr_obj,
situation=situation,
style=style,
context=context,
current_time=current_time,
use_llm_summary=True,
)
return
# 没有找到匹配的记录,创建新记录
await self._create_expression_record(
situation=situation,
style=style,
@@ -431,12 +452,14 @@ class ExpressionLearner:
current_time: float,
) -> None:
content_list = [situation]
formatted_situation = await self._compose_situation_text(content_list, 1, situation)
# 创建新记录时,直接使用原始的 situation,不进行总结
formatted_situation = situation
Expression.create(
situation=formatted_situation,
style=style,
content_list=json.dumps(content_list, ensure_ascii=False),
style_list=None, # 新记录初始时 style_list 为空
count=1,
last_active_time=current_time,
chat_id=self.chat_id,
@@ -448,23 +471,57 @@ class ExpressionLearner:
self,
expr_obj: Expression,
situation: str,
style: str,
context: str,
current_time: float,
use_llm_summary: bool = True,
) -> None:
"""
更新现有 Expression 记录style 完全匹配或相似的情况)
将新的 situation 添加到 content_list将新的 style 添加到 style_list如果不同
Args:
use_llm_summary: 是否使用 LLM 进行总结,完全匹配时为 False相似匹配时为 True
"""
# 更新 content_list添加新的 situation
content_list = self._parse_content_list(expr_obj.content_list)
content_list.append(situation)
expr_obj.content_list = json.dumps(content_list, ensure_ascii=False)
# 更新 style_list如果 style 不同,添加到 style_list
style_list = self._parse_style_list(expr_obj.style_list)
# 将原有的 style 也加入 style_list如果还没有的话
if expr_obj.style and expr_obj.style not in style_list:
style_list.append(expr_obj.style)
# 如果新的 style 不在 style_list 中,添加它
if style not in style_list:
style_list.append(style)
expr_obj.style_list = json.dumps(style_list, ensure_ascii=False)
# 更新其他字段
expr_obj.count = (expr_obj.count or 0) + 1
expr_obj.last_active_time = current_time
expr_obj.context = context
new_situation = await self._compose_situation_text(
content_list=content_list,
count=expr_obj.count,
fallback=expr_obj.situation,
)
expr_obj.situation = new_situation
if use_llm_summary:
# 相似匹配时,使用 LLM 重新组合 situation 和 style
new_situation = await self._compose_situation_text(
content_list=content_list,
count=expr_obj.count,
fallback=expr_obj.situation,
)
expr_obj.situation = new_situation
new_style = await self._compose_style_text(
style_list=style_list,
count=expr_obj.count,
fallback=expr_obj.style or style,
)
expr_obj.style = new_style
else:
# 完全匹配时,不进行 LLM 总结,保持原有的 situation 和 style 不变
# 只更新 content_list 和 style_list
pass
expr_obj.save()
@@ -477,6 +534,80 @@ class ExpressionLearner:
return []
return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else []
def _parse_style_list(self, stored_list: Optional[str]) -> List[str]:
"""解析 style_list JSON 字符串为列表,逻辑与 _parse_content_list 相同"""
if not stored_list:
return []
try:
data = json.loads(stored_list)
except json.JSONDecodeError:
return []
return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else []
async def _find_exact_style_match(self, style: str) -> Optional[Expression]:
"""
查找具有完全匹配 style 的 Expression 记录
检查 style 字段和 style_list 中的每一项
Args:
style: 要查找的 style
Returns:
找到的 Expression 对象,如果没有找到则返回 None
"""
# 查询同一 chat_id 的所有记录
all_expressions = Expression.select().where(Expression.chat_id == self.chat_id)
for expr in all_expressions:
# 检查 style 字段
if expr.style == style:
return expr
# 检查 style_list 中的每一项
style_list = self._parse_style_list(expr.style_list)
if style in style_list:
return expr
return None
async def _find_similar_style_expression(self, style: str, similarity_threshold: float = 0.75) -> Optional[Expression]:
"""
查找具有相似 style 的 Expression 记录
检查 style 字段和 style_list 中的每一项
Args:
style: 要查找的 style
similarity_threshold: 相似度阈值,默认 0.75
Returns:
找到的最相似的 Expression 对象,如果没有找到则返回 None
"""
# 查询同一 chat_id 的所有记录
all_expressions = Expression.select().where(Expression.chat_id == self.chat_id)
best_match = None
best_similarity = 0.0
for expr in all_expressions:
# 检查 style 字段
similarity = calculate_style_similarity(style, expr.style)
if similarity >= similarity_threshold and similarity > best_similarity:
best_similarity = similarity
best_match = expr
# 检查 style_list 中的每一项
style_list = self._parse_style_list(expr.style_list)
for existing_style in style_list:
similarity = calculate_style_similarity(style, existing_style)
if similarity >= similarity_threshold and similarity > best_similarity:
best_similarity = similarity
best_match = expr
if best_match:
logger.debug(f"找到相似的 style: 相似度={best_similarity:.3f}, 现有='{best_match.style}', 新='{style}'")
return best_match
async def _compose_situation_text(self, content_list: List[str], count: int, fallback: str = "") -> str:
sanitized = [c.strip() for c in content_list if c.strip()]
summary = await self._summarize_situations(sanitized)
@@ -484,6 +615,39 @@ class ExpressionLearner:
return summary
return "/".join(sanitized) if sanitized else fallback
async def _compose_style_text(self, style_list: List[str], count: int, fallback: str = "") -> str:
"""
组合 style 文本,如果 style_list 有多个元素则尝试总结
"""
sanitized = [s.strip() for s in style_list if s.strip()]
if len(sanitized) > 1:
# 只有当有多个 style 时才尝试总结
summary = await self._summarize_styles(sanitized)
if summary:
return summary
# 如果只有一个或总结失败,返回第一个或 fallback
return sanitized[0] if sanitized else fallback
async def _summarize_styles(self, styles: List[str]) -> Optional[str]:
"""总结多个 style生成一个概括性的 style 描述"""
if not styles or len(styles) <= 1:
return None
prompt = (
"请阅读以下多个语言风格/表达方式,并将它们概括成一句简短的话,"
"长度不超过20个字保留共同特点\n"
f"{chr(10).join(f'- {s}' for s in styles[-10:])}\n只输出概括内容。"
)
try:
summary, _ = await self.summary_model.generate_response_async(prompt, temperature=0.2)
summary = summary.strip()
if summary:
return summary
except Exception as e:
logger.error(f"概括表达风格失败: {e}")
return None
async def _summarize_situations(self, situations: List[str]) -> Optional[str]:
if not situations:
return None

View File

@@ -56,6 +56,38 @@ def calculate_similarity(text1: str, text2: str) -> float:
return difflib.SequenceMatcher(None, text1, text2).ratio()
def calculate_style_similarity(style1: str, style2: str) -> float:
"""
计算两个 style 的相似度返回0-1之间的值
在计算前会移除"使用""句式"这两个词(参考 expression_similarity_analysis.py
Args:
style1: 第一个 style
style2: 第二个 style
Returns:
float: 相似度值范围0-1
"""
if not style1 or not style2:
return 0.0
# 移除"使用"和"句式"这两个词
def remove_ignored_words(text: str) -> str:
"""移除需要忽略的词"""
text = text.replace("使用", "")
text = text.replace("句式", "")
return text.strip()
cleaned_style1 = remove_ignored_words(style1)
cleaned_style2 = remove_ignored_words(style2)
# 如果清理后文本为空返回0
if not cleaned_style1 or not cleaned_style2:
return 0.0
return difflib.SequenceMatcher(None, cleaned_style1, cleaned_style2).ratio()
def format_create_date(timestamp: float) -> str:
"""
将时间戳格式化为可读的日期字符串

View File

@@ -211,7 +211,40 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
if len_text < 3:
return list(text) if random.random() < 0.01 else [text]
# 定义分隔符(包含换行符,换行符必须强制分割
# 先标记哪些位置位于成对引号内部,避免在引号内部进行句子分割
# 支持的引号包括:中英文单/双引号和常见中文书名号/引号
quote_chars = {
'"',
"'",
"",
"",
"",
"",
"",
"",
"",
"",
}
inside_quote = [False] * len_text
in_quote = False
current_quote_char = ""
for idx, ch in enumerate(text):
if ch in quote_chars:
# 遇到引号时切换状态(英文引号本身开闭相同,用同一个字符表示)
if not in_quote:
in_quote = True
current_quote_char = ch
inside_quote[idx] = False
else:
# 只有遇到同一类引号才视为关闭
if ch == current_quote_char or ch in {'"', "'"} and current_quote_char in {'"', "'"}:
in_quote = False
current_quote_char = ""
inside_quote[idx] = False
else:
inside_quote[idx] = in_quote
# 定义分隔符(包含换行符)
separators = {"", ",", " ", "", ";", "\n"}
segments = []
current_segment = ""
@@ -221,31 +254,35 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
while i < len(text):
char = text[i]
if char in separators:
# 换行符必须强制分割,不受其他规则影响
if char == "\n":
can_split = True
# 引号内部一律不作为分割点(包括换行)
if inside_quote[i]:
can_split = False
else:
# 检查分割条件
can_split = True
# 检查分隔符左右是否有冒号(中英文),如果有则不分割
if i > 0:
prev_char = text[i - 1]
if prev_char in {":", ""}:
can_split = False
if i < len(text) - 1:
next_char = text[i + 1]
if next_char in {":", ""}:
can_split = False
# 如果左右没有冒号,再检查空格的特殊情况
if can_split and char == " " and i > 0 and i < len(text) - 1:
prev_char = text[i - 1]
next_char = text[i + 1]
# 不分割数字和数字、数字和英文、英文和数字、英文和英文之间的空格
prev_is_alnum = prev_char.isdigit() or is_english_letter(prev_char)
next_is_alnum = next_char.isdigit() or is_english_letter(next_char)
if prev_is_alnum and next_is_alnum:
can_split = False
# 换行符在不在引号内时都强制分割
if char == "\n":
can_split = True
else:
# 检查分割条件
can_split = True
# 检查分隔符左右是否有冒号(中英文),如果有则不分割
if i > 0:
prev_char = text[i - 1]
if prev_char in {":", ""}:
can_split = False
if i < len(text) - 1:
next_char = text[i + 1]
if next_char in {":", ""}:
can_split = False
# 如果左右没有冒号,再检查空格的特殊情况
if can_split and char == " " and i > 0 and i < len(text) - 1:
prev_char = text[i - 1]
next_char = text[i + 1]
# 不分割数字和数字、数字和英文、英文和数字、英文和英文之间的空格
prev_is_alnum = prev_char.isdigit() or is_english_letter(prev_char)
next_is_alnum = next_char.isdigit() or is_english_letter(next_char)
if prev_is_alnum and next_is_alnum:
can_split = False
if can_split:
# 只有当当前段不为空时才添加

View File

@@ -326,6 +326,7 @@ class Expression(BaseModel):
context = TextField(null=True)
content_list = TextField(null=True)
style_list = TextField(null=True) # 存储相似的 style格式与 content_list 相同JSON 数组)
count = IntegerField(default=1)
last_active_time = FloatField()
chat_id = TextField(index=True)