135 lines
4.6 KiB
Python
135 lines
4.6 KiB
Python
from json_repair import repair_json
|
||
from typing import List, Tuple
|
||
|
||
import re
|
||
import json
|
||
|
||
from src.common.logger import get_logger
|
||
|
||
logger = get_logger("learner_utils")
|
||
|
||
|
||
def fix_chinese_quotes_in_json(text):
|
||
"""使用状态机修复 JSON 字符串值中的中文引号"""
|
||
result = []
|
||
i = 0
|
||
in_string = False
|
||
escape_next = False
|
||
|
||
while i < len(text):
|
||
char = text[i]
|
||
if escape_next:
|
||
# 当前字符是转义字符后的字符,直接添加
|
||
result.append(char)
|
||
escape_next = False
|
||
i += 1
|
||
continue
|
||
if char == "\\":
|
||
# 转义字符
|
||
result.append(char)
|
||
escape_next = True
|
||
i += 1
|
||
continue
|
||
if char == '"' and not escape_next:
|
||
# 遇到英文引号,切换字符串状态
|
||
in_string = not in_string
|
||
result.append(char)
|
||
i += 1
|
||
continue
|
||
if in_string and char in ["“", "”"]:
|
||
result.append('\\"')
|
||
else:
|
||
result.append(char)
|
||
i += 1
|
||
|
||
return "".join(result)
|
||
|
||
|
||
def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
|
||
"""
|
||
解析 LLM 返回的表达风格总结和黑话 JSON,提取两个列表。
|
||
|
||
期望的 JSON 结构:
|
||
[
|
||
{"situation": "AAAAA", "style": "BBBBB", "source_id": "3"}, // 表达方式
|
||
{"content": "词条", "source_id": "12"}, // 黑话
|
||
...
|
||
]
|
||
|
||
Returns:
|
||
Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
|
||
第一个列表是表达方式 (situation, style, source_id)
|
||
第二个列表是黑话 (content, source_id)
|
||
"""
|
||
if not response:
|
||
return [], []
|
||
|
||
raw = response.strip()
|
||
|
||
if match := re.search(r"```json\s*(.*?)\s*```", raw, re.DOTALL):
|
||
raw = match[1].strip()
|
||
else:
|
||
# 去掉可能存在的通用 ``` 包裹
|
||
raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
|
||
raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
|
||
raw = raw.strip()
|
||
|
||
parsed = None
|
||
expressions: List[Tuple[str, str, str]] = [] # (situation, style, source_id)
|
||
jargon_entries: List[Tuple[str, str]] = [] # (content, source_id)
|
||
|
||
try:
|
||
# 优先尝试直接解析
|
||
if raw.startswith("[") and raw.endswith("]"):
|
||
parsed = json.loads(raw)
|
||
else:
|
||
repaired = repair_json(raw)
|
||
parsed = json.loads(repaired) if isinstance(repaired, str) else repaired
|
||
except Exception as parse_error:
|
||
# 如果解析失败,尝试修复中文引号问题
|
||
# 使用状态机方法,在 JSON 字符串值内部将中文引号替换为转义的英文引号
|
||
try:
|
||
fixed_raw = fix_chinese_quotes_in_json(raw)
|
||
|
||
# 再次尝试解析
|
||
if fixed_raw.startswith("[") and fixed_raw.endswith("]"):
|
||
parsed = json.loads(fixed_raw)
|
||
else:
|
||
repaired = repair_json(fixed_raw)
|
||
parsed = json.loads(repaired) if isinstance(repaired, str) else repaired
|
||
except Exception as fix_error:
|
||
logger.error(f"解析表达风格 JSON 失败,初始错误: {type(parse_error).__name__}: {str(parse_error)}")
|
||
logger.error(f"修复中文引号后仍失败,错误: {type(fix_error).__name__}: {str(fix_error)}")
|
||
logger.error(f"解析表达风格 JSON 失败,原始响应:{response}")
|
||
logger.error(f"处理后的 JSON 字符串(前500字符):{raw[:500]}")
|
||
return [], []
|
||
|
||
if isinstance(parsed, dict):
|
||
parsed_list = [parsed]
|
||
elif isinstance(parsed, list):
|
||
parsed_list = parsed
|
||
else:
|
||
logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
|
||
return [], []
|
||
|
||
for item in parsed_list:
|
||
if not isinstance(item, dict):
|
||
continue
|
||
|
||
# 检查是否是表达方式条目(有 situation 和 style)
|
||
situation = str(item.get("situation", "")).strip()
|
||
style = str(item.get("style", "")).strip()
|
||
source_id = str(item.get("source_id", "")).strip()
|
||
|
||
if situation and style and source_id:
|
||
# 表达方式条目
|
||
expressions.append((situation, style, source_id))
|
||
elif item.get("content"):
|
||
# 黑话条目(有 content 字段)
|
||
content = str(item.get("content", "")).strip()
|
||
source_id = str(item.get("source_id", "")).strip()
|
||
if content and source_id:
|
||
jargon_entries.append((content, source_id))
|
||
|
||
return expressions, jargon_entries
|