117 lines
4.6 KiB
Python
117 lines
4.6 KiB
Python
"""回复效果 LLM 窄维度评审。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from collections.abc import Awaitable, Callable
|
||
from typing import Any, Dict, List, Tuple
|
||
|
||
import json
|
||
|
||
from .models import FollowupMessageSnapshot, ReplyEffectRecord, RubricScoreItem, RubricScores
|
||
from .scoring import normalize_text_for_prompt
|
||
|
||
JudgeRunner = Callable[[str], Awaitable[str]]
|
||
|
||
|
||
async def judge_reply_effect(record: ReplyEffectRecord, judge_runner: JudgeRunner | None) -> Tuple[RubricScores, str]:
|
||
"""执行 LLM rubric judge,失败时返回中性分。"""
|
||
|
||
if judge_runner is None:
|
||
return RubricScores(), "未提供 LLM judge runner"
|
||
|
||
prompt = build_judge_prompt(record)
|
||
try:
|
||
response_text = await judge_runner(prompt)
|
||
payload = _loads_json_object(response_text)
|
||
return parse_rubric_scores(payload), ""
|
||
except Exception as exc:
|
||
return RubricScores(), str(exc)
|
||
|
||
|
||
def build_judge_prompt(record: ReplyEffectRecord) -> str:
|
||
"""构建窄维度评分 prompt。"""
|
||
|
||
followup_text = _format_followups(record.followup_messages)
|
||
return (
|
||
"你是 Maisaka 回复效果的窄维度评审器,只评估这一次 bot 回复的交互感知质量。\n"
|
||
"不要评价总体满意度,不要给建议,只输出 JSON。\n\n"
|
||
"评分范围:1 到 5,1=很差,3=中性,5=很好。\n"
|
||
"uncanny_risk 的 1=完全不怪,5=非常过度拟人/越界/油腻。\n\n"
|
||
f"bot 回复:\n{normalize_text_for_prompt(record.reply.reply_text, 1200)}\n\n"
|
||
f"后续用户消息:\n{followup_text or '(暂无后续用户消息)'}\n\n"
|
||
"请输出严格 JSON 对象,格式如下:\n"
|
||
"{\n"
|
||
' "social_presence": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
|
||
' "warmth": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
|
||
' "competence": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
|
||
' "appropriateness": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
|
||
' "uncanny_risk": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7}\n'
|
||
"}"
|
||
)
|
||
|
||
|
||
def parse_rubric_scores(payload: Dict[str, Any]) -> RubricScores:
|
||
"""解析 LLM rubric JSON。"""
|
||
|
||
return RubricScores(
|
||
social_presence=_parse_item(payload.get("social_presence")),
|
||
warmth=_parse_item(payload.get("warmth")),
|
||
competence=_parse_item(payload.get("competence")),
|
||
appropriateness=_parse_item(payload.get("appropriateness")),
|
||
uncanny_risk=_parse_item(payload.get("uncanny_risk")),
|
||
available=True,
|
||
)
|
||
|
||
|
||
def _parse_item(raw_item: Any) -> RubricScoreItem:
|
||
if not isinstance(raw_item, dict):
|
||
raw_item = {}
|
||
score = _coerce_float(raw_item.get("score"), 3.0)
|
||
score = max(1.0, min(5.0, score))
|
||
evidence_spans = raw_item.get("evidence_spans")
|
||
if not isinstance(evidence_spans, list):
|
||
evidence_spans = []
|
||
return RubricScoreItem(
|
||
score=score,
|
||
normalized_score=round((score - 1.0) / 4.0, 4),
|
||
reason=str(raw_item.get("reason") or "").strip(),
|
||
evidence_spans=[str(item).strip() for item in evidence_spans if str(item).strip()],
|
||
confidence=max(0.0, min(1.0, _coerce_float(raw_item.get("confidence"), 0.0))),
|
||
)
|
||
|
||
|
||
def _loads_json_object(response_text: str) -> Dict[str, Any]:
|
||
normalized_response = str(response_text or "").strip()
|
||
if normalized_response.startswith("```"):
|
||
normalized_response = normalized_response.strip("`")
|
||
if normalized_response.lower().startswith("json"):
|
||
normalized_response = normalized_response[4:].strip()
|
||
try:
|
||
parsed = json.loads(normalized_response)
|
||
except json.JSONDecodeError:
|
||
start = normalized_response.find("{")
|
||
end = normalized_response.rfind("}")
|
||
if start < 0 or end <= start:
|
||
raise
|
||
parsed = json.loads(normalized_response[start : end + 1])
|
||
if not isinstance(parsed, dict):
|
||
raise ValueError("LLM judge 未返回 JSON 对象")
|
||
return parsed
|
||
|
||
|
||
def _format_followups(followups: List[FollowupMessageSnapshot]) -> str:
|
||
lines: List[str] = []
|
||
for index, followup in enumerate(followups[:5], start=1):
|
||
marker = "目标用户" if followup.is_target_user else "其他用户"
|
||
lines.append(
|
||
f"{index}. [{marker}] {normalize_text_for_prompt(followup.visible_text or followup.plain_text, 500)}"
|
||
)
|
||
return "\n".join(lines)
|
||
|
||
|
||
def _coerce_float(value: Any, default: float) -> float:
|
||
try:
|
||
return float(value)
|
||
except (TypeError, ValueError):
|
||
return default
|