Files
mai-bot/src/maisaka/reply_effect/judge.py
2026-04-17 23:05:46 +08:00

117 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""回复效果 LLM 窄维度评审。"""
from __future__ import annotations
from collections.abc import Awaitable, Callable
from typing import Any, Dict, List, Tuple
import json
from .models import FollowupMessageSnapshot, ReplyEffectRecord, RubricScoreItem, RubricScores
from .scoring import normalize_text_for_prompt
# Async callable used to run the LLM judge: it is awaited with the prompt
# string and must resolve to the judge's raw response text.
JudgeRunner = Callable[[str], Awaitable[str]]
async def judge_reply_effect(
    record: ReplyEffectRecord,
    judge_runner: JudgeRunner | None,
) -> Tuple[RubricScores, str]:
    """Run the LLM rubric judge for one reply, falling back to default scores on failure.

    Args:
        record: Reply-effect record that the judge prompt is built from.
        judge_runner: Async callable that receives the prompt and returns the
            judge's raw response text, or ``None`` when no judge is configured.

    Returns:
        A ``(scores, error)`` tuple. On success ``scores`` is the parsed rubric
        and ``error`` is ``""``. When no runner is supplied, or when running the
        judge or decoding its response raises, ``scores`` is a
        default-constructed ``RubricScores()`` and ``error`` is a non-empty
        description of the problem.
    """
    if judge_runner is None:
        return RubricScores(), "未提供 LLM judge runner"

    prompt = build_judge_prompt(record)
    try:
        response_text = await judge_runner(prompt)
        payload = _loads_json_object(response_text)
        return parse_rubric_scores(payload), ""
    except Exception as exc:
        # Deliberate best-effort boundary: a failing judge must not break the
        # caller. Some exceptions (e.g. a bare TimeoutError()) stringify to "",
        # which would be indistinguishable from the success value, so fall back
        # to the exception class name to keep the error non-empty.
        return RubricScores(), str(exc) or type(exc).__name__
def build_judge_prompt(record: ReplyEffectRecord) -> str:
    """Build the narrow-dimension scoring prompt for a single bot reply.

    The prompt contains the bot reply text, the formatted follow-up user
    messages (or a placeholder when there are none), the 1-5 scoring scale,
    and the exact JSON shape the judge is asked to return.

    Args:
        record: Reply-effect record; ``record.reply.reply_text`` and
            ``record.followup_messages`` are read.

    Returns:
        The complete prompt string to send to the judge.
    """
    followup_text = _format_followups(record.followup_messages)
    return (
        "你是 Maisaka 回复效果的窄维度评审器,只评估这一次 bot 回复的交互感知质量。\n"
        "不要评价总体满意度,不要给建议,只输出 JSON。\n\n"
        # The scale description needs explicit separators; without them the
        # range reads as "1 到 51" and the anchor values run together.
        "评分范围: 1 到 5 (1=很差, 3=中性, 5=很好)。\n"
        "uncanny_risk 的 1=完全不怪, 5=非常过度拟人/越界/油腻。\n\n"
        # 1200 is the limit argument handed to normalize_text_for_prompt
        # (presumably a maximum length - confirm against .scoring).
        f"bot 回复:\n{normalize_text_for_prompt(record.reply.reply_text, 1200)}\n\n"
        f"后续用户消息:\n{followup_text or '(暂无后续用户消息)'}\n\n"
        "请输出严格 JSON 对象,格式如下:\n"
        "{\n"
        ' "social_presence": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        ' "warmth": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        ' "competence": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        ' "appropriateness": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7},\n'
        ' "uncanny_risk": {"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7}\n'
        "}"
    )
def parse_rubric_scores(payload: Dict[str, Any]) -> RubricScores:
    """Convert the judge's decoded JSON object into a ``RubricScores``.

    Each rubric dimension is looked up in ``payload`` by name and parsed with
    ``_parse_item``; a missing key is passed on as ``None``. The result is
    always marked ``available=True``.

    Args:
        payload: Decoded JSON object returned by the judge.

    Returns:
        A ``RubricScores`` with one parsed item per rubric dimension.
    """
    dimension_names = (
        "social_presence",
        "warmth",
        "competence",
        "appropriateness",
        "uncanny_risk",
    )
    parsed_items = {name: _parse_item(payload.get(name)) for name in dimension_names}
    return RubricScores(**parsed_items, available=True)
def _parse_item(raw_item: Any) -> RubricScoreItem:
    """Parse one rubric dimension entry into a ``RubricScoreItem``.

    Anything that is not a dict is treated as an empty entry. The score
    defaults to 3.0 and is clamped to [1.0, 5.0]; the confidence defaults to
    0.0 and is clamped to [0.0, 1.0]. ``evidence_spans`` is kept only when it
    is a list, and its elements are stringified, stripped, and dropped when
    empty.

    Args:
        raw_item: The value found under a dimension key in the judge payload.

    Returns:
        The populated ``RubricScoreItem``; ``normalized_score`` maps the
        clamped 1-5 score onto 0-1, rounded to four decimals.
    """
    fields = raw_item if isinstance(raw_item, dict) else {}

    bounded_score = max(1.0, min(5.0, _coerce_float(fields.get("score"), 3.0)))

    raw_spans = fields.get("evidence_spans")
    if isinstance(raw_spans, list):
        stripped_spans = (str(span).strip() for span in raw_spans)
        evidence = [span for span in stripped_spans if span]
    else:
        evidence = []

    reason_text = str(fields.get("reason") or "").strip()
    bounded_confidence = max(0.0, min(1.0, _coerce_float(fields.get("confidence"), 0.0)))

    return RubricScoreItem(
        score=bounded_score,
        normalized_score=round((bounded_score - 1.0) / 4.0, 4),
        reason=reason_text,
        evidence_spans=evidence,
        confidence=bounded_confidence,
    )
def _loads_json_object(response_text: str) -> Dict[str, Any]:
normalized_response = str(response_text or "").strip()
if normalized_response.startswith("```"):
normalized_response = normalized_response.strip("`")
if normalized_response.lower().startswith("json"):
normalized_response = normalized_response[4:].strip()
try:
parsed = json.loads(normalized_response)
except json.JSONDecodeError:
start = normalized_response.find("{")
end = normalized_response.rfind("}")
if start < 0 or end <= start:
raise
parsed = json.loads(normalized_response[start : end + 1])
if not isinstance(parsed, dict):
raise ValueError("LLM judge 未返回 JSON 对象")
return parsed
def _format_followups(followups: List[FollowupMessageSnapshot]) -> str:
    """Render up to the first five follow-up messages as a numbered list.

    Each line has the form ``"<n>. [<marker>] <text>"``, numbered from 1. The
    marker says whether the message came from the target user or someone else,
    and the text is ``visible_text`` (or ``plain_text`` when that is falsy)
    passed through ``normalize_text_for_prompt`` with a limit argument of 500.

    Args:
        followups: Follow-up message snapshots, in the order to be listed.

    Returns:
        The lines joined by newlines, or ``""`` when ``followups`` is empty.
    """
    return "\n".join(
        f"{position}. [{'目标用户' if message.is_target_user else '其他用户'}] "
        f"{normalize_text_for_prompt(message.visible_text or message.plain_text, 500)}"
        for position, message in enumerate(followups[:5], start=1)
    )
def _coerce_float(value: Any, default: float) -> float:
try:
return float(value)
except (TypeError, ValueError):
return default