fix:修复表情包不保存的问题/新增统计项目

This commit is contained in:
SengokuCola
2025-11-29 20:06:18 +08:00
parent 2b29ba360f
commit e609632455
5 changed files with 218 additions and 68 deletions

View File

@@ -8,8 +8,9 @@ import json
import time
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Any, Dict, List, Optional, Set
from dataclasses import dataclass, field
from json_repair import repair_json
from src.common.logger import get_logger
from src.common.data_models.database_data_model import DatabaseMessages
@@ -369,12 +370,12 @@ class ChatHistorySummarizer:
should_check = False
# 条件1: 消息数量 >= 100触发一次检查
if message_count >= 50:
if message_count >= 80:
should_check = True
logger.info(f"{self.log_prefix} 触发检查条件: 消息数量达到 {message_count} 条(阈值: 100条")
# 条件2: 距离上一次检查 > 3600 秒1小时触发一次检查
elif time_since_last_check > 1200:
elif time_since_last_check > 2400:
should_check = True
logger.info(f"{self.log_prefix} 触发检查条件: 距上次检查 {time_str}(阈值: 1小时")
@@ -483,11 +484,11 @@ class ChatHistorySummarizer:
topics_to_finalize: List[str] = []
for topic, item in self.topic_cache.items():
if item.no_update_checks >= 3:
logger.info(f"{self.log_prefix} 话题[{topic}] 连续 5 次检查无新增内容,触发打包存储")
logger.info(f"{self.log_prefix} 话题[{topic}] 连续 3 次检查无新增内容,触发打包存储")
topics_to_finalize.append(topic)
continue
if len(item.messages) > 8:
logger.info(f"{self.log_prefix} 话题[{topic}] 消息条数超过 30,触发打包存储")
if len(item.messages) > 5:
logger.info(f"{self.log_prefix} 话题[{topic}] 消息条数超过 4,触发打包存储")
topics_to_finalize.append(topic)
for topic in topics_to_finalize:
@@ -606,18 +607,42 @@ class ChatHistorySummarizer:
max_tokens=800,
)
import re
logger.info(f"{self.log_prefix} 话题识别LLM Prompt: {prompt}")
logger.info(f"{self.log_prefix} 话题识别LLM Response: {response}")
json_str = response.strip()
# 移除可能的 markdown 代码块标记
json_str = re.sub(r"^```json\s*", "", json_str, flags=re.MULTILINE)
json_str = re.sub(r"^```\s*", "", json_str, flags=re.MULTILINE)
json_str = json_str.strip()
# 尝试从响应中提取JSON代码块
json_str = None
json_pattern = r"```json\s*(.*?)\s*```"
matches = re.findall(json_pattern, response, re.DOTALL)
if matches:
# 找到JSON代码块使用第一个匹配
json_str = matches[0].strip()
else:
# 如果没有找到代码块尝试查找JSON数组的开始和结束位置
# 查找第一个 [ 和最后一个 ]
start_idx = response.find('[')
end_idx = response.rfind(']')
if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
json_str = response[start_idx:end_idx + 1].strip()
else:
# 如果还是找不到尝试直接使用整个响应移除可能的markdown标记
json_str = response.strip()
json_str = re.sub(r"^```json\s*", "", json_str, flags=re.MULTILINE)
json_str = re.sub(r"^```\s*", "", json_str, flags=re.MULTILINE)
json_str = json_str.strip()
# 尝试直接解析为 JSON 数组
result = json.loads(json_str)
# 使用json_repair修复可能的JSON错误
if json_str:
try:
repaired_json = repair_json(json_str)
result = json.loads(repaired_json) if isinstance(repaired_json, str) else repaired_json
except Exception as repair_error:
# 如果repair失败尝试直接解析
logger.warning(f"{self.log_prefix} JSON修复失败尝试直接解析: {repair_error}")
result = json.loads(json_str)
else:
raise ValueError("无法从响应中提取JSON内容")
if not isinstance(result, list):
logger.error(f"{self.log_prefix} 话题识别返回的 JSON 不是列表: {result}")
@@ -722,41 +747,30 @@ class ChatHistorySummarizer:
)
# 解析JSON响应
import re
# 移除可能的markdown代码块标记
json_str = response.strip()
json_str = re.sub(r"^```json\s*", "", json_str, flags=re.MULTILINE)
json_str = re.sub(r"^```\s*", "", json_str, flags=re.MULTILINE)
json_str = json_str.strip()
# 尝试找到JSON对象的开始结束位置
# 查找第一个 { 和最后一个匹配的 }
# 查找JSON对象的开始结束
start_idx = json_str.find("{")
if start_idx == -1:
raise ValueError("未找到JSON对象开始标记")
# 从后往前查找最后一个 }
end_idx = json_str.rfind("}")
if end_idx == -1 or end_idx <= start_idx:
raise ValueError("未找到JSON对象结束标记")
logger.warning(f"{self.log_prefix} JSON缺少结束标记,尝试自动修复")
extracted_json = json_str[start_idx:]
else:
extracted_json = json_str[start_idx : end_idx + 1]
# 提取JSON字符串
json_str = json_str[start_idx : end_idx + 1]
# 尝试解析JSON
try:
result = json.loads(json_str)
except json.JSONDecodeError:
# 如果解析失败,尝试修复字符串值中的中文引号
# 简单方法:将字符串值中的中文引号替换为转义的英文引号
# 使用状态机方法:遍历字符串,在字符串值内部替换中文引号
fixed_chars = []
def _parse_with_quote_fix(payload: str) -> Dict[str, Any]:
fixed_chars: List[str] = []
in_string = False
escape_next = False
i = 0
while i < len(json_str):
char = json_str[i]
while i < len(payload):
char = payload[i]
if escape_next:
fixed_chars.append(char)
escape_next = False
@@ -766,16 +780,28 @@ class ChatHistorySummarizer:
elif char == '"' and not escape_next:
fixed_chars.append(char)
in_string = not in_string
elif in_string and (char == '"' or char == '"'):
elif in_string and char in {"", ""}:
# 在字符串值内部,将中文引号替换为转义的英文引号
fixed_chars.append('\\"')
else:
fixed_chars.append(char)
i += 1
json_str = "".join(fixed_chars)
# 再次尝试解析
result = json.loads(json_str)
repaired = "".join(fixed_chars)
return json.loads(repaired)
try:
result = json.loads(extracted_json)
except json.JSONDecodeError:
try:
repaired_json = repair_json(extracted_json)
if isinstance(repaired_json, str):
result = json.loads(repaired_json)
else:
result = repaired_json
except Exception as repair_error:
logger.warning(f"{self.log_prefix} repair_json 失败,使用引号修复: {repair_error}")
result = _parse_with_quote_fix(extracted_json)
keywords = result.get("keywords", [])
summary = result.get("summary", "无概括")