更改文件结构

This commit is contained in:
SengokuCola
2026-03-24 11:36:26 +08:00
parent 6a0b902e17
commit 03ed59e388
22 changed files with 248 additions and 93 deletions

View File

@@ -0,0 +1,212 @@
from json_repair import repair_json
from typing import Tuple, Optional, List
import json
import re
from src.config.config import model_config
from src.config.config import global_config
from src.llm_models.utils_model import LLMRequest
from src.prompt.prompt_manager import prompt_manager
from src.common.logger import get_logger
# Module-level logger for the expression utilities.
logger = get_logger("expression_utils")
# TODO: Switch to the new model-invocation API once the LLM-related refactor is complete.
# Shared LLM request object used by check_expression_suitability below.
judge_llm = LLMRequest(model_set=model_config.model_task_config.tool_use, request_type="expression_check")
async def check_expression_suitability(situation: str, style: str) -> Tuple[bool, str, Optional[str]]:
    """
    Run a single LLM evaluation of whether an expression style fits its situation.

    Args:
        situation: The situation (usage condition) of the expression.
        style: The expression style being evaluated.

    Returns:
        A ``(suitable, reason, error)`` tuple. When the LLM response cannot be
        parsed as JSON or does not have the expected shape, ``suitable`` is
        False, ``reason`` describes the failure and ``error`` carries the error
        text. On success ``error`` is None.

    Note:
        Exceptions raised while rendering the prompt or calling the LLM are not
        caught here and propagate to the caller.
    """
    # Built-in evaluation criteria. These strings are sent to the model verbatim.
    base_criteria = [
        "表达方式或言语风格是否与使用条件或使用情景匹配",
        "允许部分语法错误或口头化或缺省出现",
        "表达方式不能太过特指,需要具有泛用性",
        "一般不涉及具体的人名或名称",
    ]
    # Append user-configured criteria, if any are set.
    if custom_criteria := global_config.expression.expression_auto_check_custom_criteria:
        base_criteria.extend(custom_criteria)

    # Render the criteria as a 1-based numbered list, one criterion per line.
    criteria_list = "\n".join(f"{index}. {criterion}" for index, criterion in enumerate(base_criteria, start=1))

    prompt_template = prompt_manager.get_prompt("expression_evaluation")
    prompt_template.add_context("situation", situation)
    prompt_template.add_context("style", style)
    prompt_template.add_context("criteria_list", criteria_list)
    prompt = await prompt_manager.render_prompt(prompt_template)

    logger.info(f"正在评估表达方式: situation={situation}, style={style}")
    response, _ = await judge_llm.generate_response_async(prompt=prompt, temperature=0.6, max_tokens=1024)
    logger.debug(f"评估结果: {response}")

    try:
        evaluation = json.loads(response)
    except json.JSONDecodeError:
        # Strict parsing failed: retry after passing the response through repair_json.
        try:
            evaluation = json.loads(repair_json(response))
        except Exception:
            # Return the error tuple instead of raising. An exception raised
            # inside this handler is NOT caught by the sibling
            # ``except Exception`` below, so raising here would escape the
            # function and break the documented "errors are returned" contract.
            error_message = f"无法解析LLM响应为JSON: {response}"
            return False, f"评估表达方式时发生错误: {error_message}", error_message
    except Exception as e:
        # Non-decode failures, e.g. a non-string response passed to json.loads.
        return False, f"评估表达方式时发生错误: {e}", str(e)

    try:
        # ``evaluation`` may be any JSON value; a value without ``.get`` fails
        # here and is reported as a format error.
        suitable = evaluation.get("suitable", False)
        reason = evaluation.get("reason", "未提供理由")
        logger.debug(f"评估结果: {'通过' if suitable else '不通过'}")
        return suitable, reason, None
    except Exception as e:
        return False, f"评估结果格式错误: {e}", str(e)
def fix_chinese_quotes_in_json(text):
    """
    Escape Chinese double quotes that appear inside JSON string values.

    The text is scanned once with a small state machine that tracks whether the
    current position is inside an ASCII-double-quoted string and whether the
    previous character was a backslash. Inside a string, each Chinese double
    quote (U+201C or U+201D) is replaced by a backslash-escaped ASCII double
    quote. Every other character is copied through unchanged.

    Args:
        text: JSON-like text, possibly containing Chinese quotes in its values.

    Returns:
        The text with in-string Chinese double quotes replaced.
    """
    # Written as escapes so the characters cannot be lost to an encoding or
    # copy/paste mishap (which would silently turn this function into a no-op).
    chinese_quotes = ("\u201c", "\u201d")

    result = []
    in_string = False
    escape_next = False
    for char in text:
        if escape_next:
            # Character following a backslash: copy verbatim, never toggles state.
            result.append(char)
            escape_next = False
        elif char == "\\":
            result.append(char)
            escape_next = True
        elif char == '"':
            # Unescaped ASCII quote opens or closes a JSON string.
            in_string = not in_string
            result.append(char)
        elif in_string and char in chinese_quotes:
            result.append('\\"')
        else:
            result.append(char)
    return "".join(result)
def parse_expression_response(response: str) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
    """
    Parse the LLM's JSON reply into expression entries and jargon entries.

    Expected JSON structure (a single object is also accepted and treated as a
    one-item list):
        [
            {"situation": "AAAAA", "style": "BBBBB", "source_id": "3"},  # expression
            {"content": "term", "source_id": "12"},                      # jargon
            ...
        ]

    Args:
        response: Raw LLM output, optionally wrapped in a Markdown code fence.

    Returns:
        Tuple[List[Tuple[str, str, str]], List[Tuple[str, str]]]:
            The first list holds expressions as (situation, style, source_id);
            the second holds jargon as (content, source_id). Both lists are
            empty when the response is empty or cannot be parsed.
    """
    if not response:
        return [], []

    raw = response.strip()
    if match := re.search(r"```json\s*(.*?)\s*```", raw, re.DOTALL):
        raw = match[1].strip()
    else:
        # Strip a generic ``` fence if one is present.
        raw = re.sub(r"^```\s*", "", raw, flags=re.MULTILINE)
        raw = re.sub(r"```\s*$", "", raw, flags=re.MULTILINE)
        raw = raw.strip()

    parsed = _try_parse(raw)
    if parsed is None:
        # Second attempt: escape Chinese quotes inside string values, then re-parse.
        parsed = _try_parse(fix_chinese_quotes_in_json(raw))
    if parsed is None:
        logger.error(f"处理后的 JSON 字符串前500字符{raw[:500]}")
        return [], []

    if isinstance(parsed, dict):
        parsed_list = [parsed]
    elif isinstance(parsed, list):
        parsed_list = parsed
    else:
        logger.error(f"表达风格解析结果类型异常: {type(parsed)}, 内容: {parsed}")
        return [], []

    def _as_text(value) -> str:
        # JSON null arrives as None; str(None) would produce the truthy text
        # "None" and let an entry with a missing field slip through.
        return "" if value is None else str(value).strip()

    expressions: List[Tuple[str, str, str]] = []  # (situation, style, source_id)
    jargon_entries: List[Tuple[str, str]] = []  # (content, source_id)
    for item in parsed_list:
        if not isinstance(item, dict):
            continue

        situation = _as_text(item.get("situation"))
        style = _as_text(item.get("style"))
        source_id = _as_text(item.get("source_id"))

        # An expression entry needs situation, style and source_id all non-empty.
        if situation and style and source_id:
            expressions.append((situation, style, source_id))
            continue

        # Otherwise treat it as jargon when it has content and a source_id.
        content = _as_text(item.get("content"))
        if content and source_id:
            jargon_entries.append((content, source_id))

    return expressions, jargon_entries
def is_single_char_jargon(content: str) -> bool:
    """
    Tell whether a jargon entry consists of exactly one character.

    Args:
        content: The jargon entry text.

    Returns:
        bool: True when ``content`` is a single character in the range
        U+4E00..U+9FFF, an ASCII letter, or an ASCII digit; False otherwise
        (including for empty input).
    """
    if not content:
        return False
    if len(content) != 1:
        return False

    ch = content[0]
    # Inclusive (low, high) bounds of each accepted character range.
    accepted_ranges = (
        ("\u4e00", "\u9fff"),  # Chinese characters
        ("a", "z"),  # lowercase letters
        ("A", "Z"),  # uppercase letters
        ("0", "9"),  # digits
    )
    return any(low <= ch <= high for low, high in accepted_ranges)
def _try_parse(text):
try:
return json.loads(text)
except Exception:
try:
repaired = repair_json(text)
return json.loads(repaired)
except Exception:
return None