feat:添加回复后打分追踪器

This commit is contained in:
SengokuCola
2026-04-17 23:05:46 +08:00
parent f3f61d6192
commit abada55884
16 changed files with 3707 additions and 51 deletions

View File

@@ -0,0 +1,116 @@
"""回复效果 LLM 窄维度评审。"""
from __future__ import annotations

import json
import math
from collections.abc import Awaitable, Callable
from typing import Any, Dict, List, Tuple

from .models import FollowupMessageSnapshot, ReplyEffectRecord, RubricScoreItem, RubricScores
from .scoring import normalize_text_for_prompt

# Async callable used to run the judge: it is awaited with the prompt text and
# must resolve to the raw response text returned by the LLM.
JudgeRunner = Callable[[str], Awaitable[str]]
async def judge_reply_effect(record: ReplyEffectRecord, judge_runner: JudgeRunner | None) -> Tuple[RubricScores, str]:
    """Run the LLM rubric judge for one reply and return its scores.

    Args:
        record: The reply-effect record to evaluate; it is turned into a
            prompt by ``build_judge_prompt``.
        judge_runner: Async callable that sends the prompt to the LLM and
            returns the raw response text, or ``None`` when no judge is
            configured.

    Returns:
        A ``(scores, error)`` pair. On success ``scores`` comes from
        ``parse_rubric_scores`` and ``error`` is the empty string. When no
        runner is given, or the runner / JSON decoding / parsing raises, a
        default-constructed ``RubricScores`` is returned together with a
        non-empty description of the problem.
    """
    if judge_runner is None:
        return RubricScores(), "未提供 LLM judge runner"

    prompt = build_judge_prompt(record)
    try:
        response_text = await judge_runner(prompt)
        payload = _loads_json_object(response_text)
        return parse_rubric_scores(payload), ""
    except Exception as exc:
        # Deliberate best-effort boundary: a failing judge must not break the
        # caller. Some exceptions (e.g. timeouts) carry no message, so fall
        # back to the class name to keep the error string non-empty and
        # therefore distinguishable from the success case.
        return RubricScores(), str(exc) or type(exc).__name__
def build_judge_prompt(record: ReplyEffectRecord) -> str:
    """Build the narrow-dimension scoring prompt sent to the LLM judge.

    The prompt contains the bot reply (normalized with a 1200 limit passed to
    ``normalize_text_for_prompt``), the follow-up user messages rendered by
    ``_format_followups`` (or a placeholder line when that rendering is
    empty), and a JSON template listing the five rubric dimensions.

    Args:
        record: Record providing ``reply.reply_text`` and
            ``followup_messages``.

    Returns:
        The complete prompt text.
    """
    followup_text = _format_followups(record.followup_messages)
    return (
        "你是 Maisaka 回复效果的窄维度评审器,只评估这一次 bot 回复的交互感知质量。\n"
        "不要评价总体满意度,不要给建议,只输出 JSON。\n\n"
        # The scale description needs explicit separators; without them the
        # text runs together ("1 到 51=很差3=中性5=很好") and is ambiguous.
        "评分范围: 1 到 5; 1=很差, 3=中性, 5=很好。\n"
        "uncanny_risk 的 1=完全不怪, 5=非常过度拟人/越界/油腻。\n\n"
        f"bot 回复:\n{normalize_text_for_prompt(record.reply.reply_text, 1200)}\n\n"
        f"后续用户消息:\n{followup_text or '(暂无后续用户消息)'}\n\n"
        "请输出严格 JSON 对象,格式如下:\n"
        "{\n"
        '  "social_presence": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        '  "warmth": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        '  "competence": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        '  "appropriateness": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        '  "uncanny_risk": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7}\n'
        "}"
    )
def parse_rubric_scores(payload: Dict[str, Any]) -> RubricScores:
    """Turn the judge's decoded JSON object into a ``RubricScores``.

    Each of the five rubric dimensions is looked up by name in ``payload``
    and passed through ``_parse_item``; a missing key is passed as ``None``.
    The result is always built with ``available=True``.
    """
    dimension_names = (
        "social_presence",
        "warmth",
        "competence",
        "appropriateness",
        "uncanny_risk",
    )
    parsed_dimensions = {name: _parse_item(payload.get(name)) for name in dimension_names}
    return RubricScores(available=True, **parsed_dimensions)
def _parse_item(raw_item: Any) -> RubricScoreItem:
    """Build one ``RubricScoreItem`` from a single dimension's raw JSON value.

    Anything that is not a dict is treated as an empty dict. The score
    defaults to 3.0 and is clamped to [1.0, 5.0]; confidence defaults to 0.0
    and is clamped to [0.0, 1.0]. Evidence spans are kept only when the value
    is a list, and each entry is stringified, stripped and dropped if empty.
    """
    fields: Dict[str, Any] = raw_item if isinstance(raw_item, dict) else {}

    bounded_score = max(1.0, min(5.0, _coerce_float(fields.get("score"), 3.0)))
    bounded_confidence = max(0.0, min(1.0, _coerce_float(fields.get("confidence"), 0.0)))

    raw_spans = fields.get("evidence_spans")
    cleaned_spans: List[str] = []
    if isinstance(raw_spans, list):
        for entry in raw_spans:
            span_text = str(entry).strip()
            if span_text:
                cleaned_spans.append(span_text)

    reason_text = str(fields.get("reason") or "").strip()

    return RubricScoreItem(
        score=bounded_score,
        # Map the 1..5 scale linearly onto 0..1.
        normalized_score=round((bounded_score - 1.0) / 4.0, 4),
        reason=reason_text,
        evidence_spans=cleaned_spans,
        confidence=bounded_confidence,
    )
def _loads_json_object(response_text: str) -> Dict[str, Any]:
normalized_response = str(response_text or "").strip()
if normalized_response.startswith("```"):
normalized_response = normalized_response.strip("`")
if normalized_response.lower().startswith("json"):
normalized_response = normalized_response[4:].strip()
try:
parsed = json.loads(normalized_response)
except json.JSONDecodeError:
start = normalized_response.find("{")
end = normalized_response.rfind("}")
if start < 0 or end <= start:
raise
parsed = json.loads(normalized_response[start : end + 1])
if not isinstance(parsed, dict):
raise ValueError("LLM judge 未返回 JSON 对象")
return parsed
def _format_followups(followups: List[FollowupMessageSnapshot]) -> str:
    """Render at most the first five follow-up messages as numbered lines.

    Every line carries a 1-based index, a marker saying whether the message
    came from the target user or another user, and the message text
    (``visible_text``, falling back to ``plain_text``) passed through
    ``normalize_text_for_prompt`` with a limit of 500. Lines are joined with
    newlines; an empty input produces an empty string.
    """
    return "\n".join(
        "{}. [{}] {}".format(
            position,
            "目标用户" if message.is_target_user else "其他用户",
            normalize_text_for_prompt(message.visible_text or message.plain_text, 500),
        )
        for position, message in enumerate(followups[:5], start=1)
    )
def _coerce_float(value: Any, default: float) -> float:
try:
return float(value)
except (TypeError, ValueError):
return default