feat：添加回复后打分追踪器

2026-04-17 23:05:46 +08:00
parent f3f61d6192
commit abada55884
16 changed files with 3707 additions and 51 deletions
--- a/src/maisaka/reply_effect/init.py
+++ b/src/maisaka/reply_effect/init.py
@@ -0,0 +1,5 @@
+"""Maisaka 回复效果观察器。"""
+
+from .tracker import ReplyEffectTracker
+
+__all__ = ["ReplyEffectTracker"]
--- a/src/maisaka/reply_effect/image_utils.py
+++ b/src/maisaka/reply_effect/image_utils.py
@@ -0,0 +1,100 @@
+"""回复效果记录中的图片/表情附件提取工具。"""
+
+from base64 import b64encode
+from pathlib import Path
+from typing import Any
+
+from src.common.data_models.message_component_data_model import EmojiComponent, ImageComponent, MessageSequence
+
+
+# Upper bound (2 MiB) on raw image bytes that may be embedded inline as a base64 data URL.
+_MAX_INLINE_IMAGE_BYTES = 2 * 1024 * 1024
+
+
+def extract_visual_attachments_from_sequence(message_sequence: MessageSequence | None) -> list[dict[str, Any]]:
+    """Collect image/emoji attachment descriptors from a message sequence.
+
+    Returns an empty list when ``message_sequence`` is ``None``. Every image or
+    emoji component yields one dict built by ``_build_visual_attachment``. The
+    ``index`` stored in each dict is the component's position in the full
+    sequence, so skipped component kinds still advance the index.
+    """
+
+    if message_sequence is None:
+        return []
+
+    collected: list[dict[str, Any]] = []
+    for position, part in enumerate(message_sequence.components):
+        # ImageComponent is tested first, as before, in case the two types are related.
+        if isinstance(part, ImageComponent):
+            part_kind = "image"
+        elif isinstance(part, EmojiComponent):
+            part_kind = "emoji"
+        else:
+            continue
+        collected.append(_build_visual_attachment(part, index=position, kind=part_kind))
+    return collected
+
+
+def _build_visual_attachment(component: ImageComponent | EmojiComponent, *, index: int, kind: str) -> dict[str, Any]:
+    """Describe one image/emoji component as a JSON-friendly dict.
+
+    The dict always carries ``kind``, ``index``, ``hash``, ``content``, ``path``
+    and ``data_url``. When a stored file is found for the hash, ``path``,
+    ``file_name`` and ``mime_type`` are filled and no inline data is produced.
+    Otherwise, if the component carries binary data no larger than
+    ``_MAX_INLINE_IMAGE_BYTES``, ``mime_type`` and a base64 ``data_url`` are set.
+    """
+
+    digest = str(component.binary_hash or "").strip()
+    info: dict[str, Any] = dict(
+        kind=kind,
+        index=index,
+        hash=digest,
+        content=str(component.content or "").strip(),
+        path="",
+        data_url="",
+    )
+
+    stored_file = _resolve_image_path(digest, kind=kind)
+    if stored_file:
+        # A file on disk wins over inline data.
+        info.update(
+            path=str(stored_file),
+            file_name=stored_file.name,
+            mime_type=_guess_mime_type(stored_file.suffix),
+        )
+        return info
+
+    raw_bytes = bytes(component.binary_data or b"")
+    if not raw_bytes or len(raw_bytes) > _MAX_INLINE_IMAGE_BYTES:
+        return info
+
+    detected_mime = _guess_mime_type_from_bytes(raw_bytes)
+    encoded = b64encode(raw_bytes).decode("ascii")
+    info["mime_type"] = detected_mime
+    info["data_url"] = f"data:{detected_mime};base64,{encoded}"
+    return info
+
+
+def _resolve_image_path(binary_hash: str, *, kind: str) -> Path | None:
+    """Look up the on-disk file recorded for an image/emoji hash.
+
+    Args:
+        binary_hash: Hash used to find the ``Images`` row; an empty value
+            short-circuits to ``None``.
+        kind: ``"emoji"`` selects ``ImageType.EMOJI``; anything else selects
+            ``ImageType.IMAGE``.
+
+    Returns:
+        The resolved path when a matching row exists, is not flagged with
+        ``no_file_flag``, and its ``full_path`` points to an existing file;
+        otherwise ``None``. The lookup is best-effort: any exception raised
+        while importing, querying or resolving yields ``None``.
+    """
+
+    if not binary_hash:
+        return None
+
+    try:
+        # Imported lazily, presumably to keep the database layer optional at import time — confirm.
+        from sqlmodel import select
+
+        from src.common.database.database import get_db_session
+        from src.common.database.database_model import Images, ImageType
+
+        image_type = ImageType.EMOJI if kind == "emoji" else ImageType.IMAGE
+        with get_db_session() as db:
+            statement = select(Images).filter_by(image_hash=binary_hash, image_type=image_type).limit(1)
+            image_record = db.exec(statement).first()
+            if image_record is None or getattr(image_record, "no_file_flag", False):
+                return None
+            # Copy the column value while the session is still open. Reading ORM
+            # attributes after the session closes can fail on an expired or
+            # detached instance, and the broad handler below would hide that
+            # failure as a permanent "no file".
+            stored_path = str(image_record.full_path or "")
+        if not stored_path:
+            return None
+        file_path = Path(stored_path).expanduser().resolve()
+        if file_path.is_file():
+            return file_path
+    except Exception:
+        # Deliberately best-effort: a failed lookup must not break attachment extraction.
+        return None
+    return None
+
+
+def _guess_mime_type(suffix: str) -> str:
+    """Map a file suffix (with or without leading dots, any case) to an image MIME type.
+
+    Unrecognised suffixes fall back to ``image/png``.
+    """
+
+    mime_by_extension = {
+        "jpg": "image/jpeg",
+        "jpeg": "image/jpeg",
+        "gif": "image/gif",
+        "webp": "image/webp",
+        "bmp": "image/bmp",
+    }
+    return mime_by_extension.get(suffix.lower().lstrip("."), "image/png")
+
+
+def _guess_mime_type_from_bytes(binary_data: bytes) -> str:
+    """Guess an image MIME type from the leading magic bytes.
+
+    Recognises JPEG, GIF, WebP (``RIFF`` prefix with ``WEBP`` inside the first
+    16 bytes) and BMP; everything else is reported as ``image/png``.
+    """
+
+    is_webp = binary_data.startswith(b"RIFF") and b"WEBP" in binary_data[:16]
+    signature_checks = (
+        (binary_data.startswith(b"\xff\xd8\xff"), "image/jpeg"),
+        (binary_data.startswith(b"GIF8"), "image/gif"),
+        (is_webp, "image/webp"),
+        (binary_data.startswith(b"BM"), "image/bmp"),
+    )
+    return next((mime for matched, mime in signature_checks if matched), "image/png")
--- a/src/maisaka/reply_effect/judge.py
+++ b/src/maisaka/reply_effect/judge.py
@@ -0,0 +1,116 @@
+"""回复效果 LLM 窄维度评审。"""
+
+from __future__ import annotations
+
+from collections.abc import Awaitable, Callable
+from typing import Any, Dict, List, Tuple
+
+import json
+
+from .models import FollowupMessageSnapshot, ReplyEffectRecord, RubricScoreItem, RubricScores
+from .scoring import normalize_text_for_prompt
+
+# Async callable that receives the judge prompt text and resolves to the raw LLM response text.
+JudgeRunner = Callable[[str], Awaitable[str]]
+
+
+async def judge_reply_effect(record: ReplyEffectRecord, judge_runner: JudgeRunner | None) -> Tuple[RubricScores, str]:
+    """Run the LLM rubric judge for one record, falling back to neutral scores.
+
+    Args:
+        record: The reply-effect record to build the judge prompt from.
+        judge_runner: Async callable that takes the prompt and returns the raw
+            LLM response text, or ``None`` when no judge is configured.
+
+    Returns:
+        ``(scores, error)``. On success ``scores.available`` is ``True`` and
+        ``error`` is empty. When no runner is given, or the runner or the
+        response parsing raises, default ``RubricScores()`` are returned with a
+        non-empty error description.
+    """
+
+    if judge_runner is None:
+        return RubricScores(), "未提供 LLM judge runner"
+
+    prompt = build_judge_prompt(record)
+    try:
+        response_text = await judge_runner(prompt)
+        payload = _loads_json_object(response_text)
+        return parse_rubric_scores(payload), ""
+    except Exception as exc:
+        # Argument-less exceptions (a bare timeout, for instance) stringify to "".
+        # Fall back to the class name so a failure never looks like an empty error.
+        return RubricScores(), str(exc) or type(exc).__name__
+
+
+def build_judge_prompt(record: ReplyEffectRecord) -> str:
+    """Build the narrow-dimension rubric prompt for one record.
+
+    The prompt embeds the bot reply (normalised, capped at 1200 characters),
+    the formatted follow-up messages (or a placeholder when there are none),
+    and a JSON template with one entry per rubric dimension.
+    """
+
+    followup_text = _format_followups(record.followup_messages)
+    reply_text = normalize_text_for_prompt(record.reply.reply_text, 1200)
+    rubric_dimensions = ("social_presence", "warmth", "competence", "appropriateness", "uncanny_risk")
+    # One template line per dimension; they are joined with ",\n" so the last line has no trailing comma.
+    example_lines = [
+        f'  "{dimension}": {{"score": 3, "reason": "...", "evidence_spans": ["..."], "confidence": 0.7}}'
+        for dimension in rubric_dimensions
+    ]
+    prompt_parts = [
+        "你是 Maisaka 回复效果的窄维度评审器，只评估这一次 bot 回复的交互感知质量。\n",
+        "不要评价总体满意度，不要给建议，只输出 JSON。\n\n",
+        "评分范围：1 到 5，1=很差，3=中性，5=很好。\n",
+        "uncanny_risk 的 1=完全不怪，5=非常过度拟人/越界/油腻。\n\n",
+        f"bot 回复：\n{reply_text}\n\n",
+        f"后续用户消息：\n{followup_text or '（暂无后续用户消息）'}\n\n",
+        "请输出严格 JSON 对象，格式如下：\n",
+        "{\n",
+        ",\n".join(example_lines),
+        "\n}",
+    ]
+    return "".join(prompt_parts)
+
+
+def parse_rubric_scores(payload: Dict[str, Any]) -> RubricScores:
+    """Convert the judge's JSON object into ``RubricScores`` marked as available.
+
+    Each of the five rubric dimensions is read from ``payload`` by name and
+    parsed with ``_parse_item``; missing or malformed entries get that helper's
+    defaults.
+    """
+
+    dimension_names = ("social_presence", "warmth", "competence", "appropriateness", "uncanny_risk")
+    parsed_items = {name: _parse_item(payload.get(name)) for name in dimension_names}
+    return RubricScores(available=True, **parsed_items)
+
+
+def _parse_item(raw_item: Any) -> RubricScoreItem:
+    """Parse one rubric entry, tolerating missing or malformed fields.
+
+    A non-dict entry is treated as empty. ``score`` defaults to 3.0 and is
+    bounded to [1, 5]; ``confidence`` defaults to 0.0 and is bounded to [0, 1];
+    ``evidence_spans`` keeps only non-blank items of a list, each stringified
+    and stripped.
+    """
+
+    item = raw_item if isinstance(raw_item, dict) else {}
+    bounded_score = max(1.0, min(5.0, _coerce_float(item.get("score"), 3.0)))
+
+    spans = []
+    raw_spans = item.get("evidence_spans")
+    if isinstance(raw_spans, list):
+        for span in raw_spans:
+            cleaned = str(span).strip()
+            if cleaned:
+                spans.append(cleaned)
+
+    bounded_confidence = max(0.0, min(1.0, _coerce_float(item.get("confidence"), 0.0)))
+    return RubricScoreItem(
+        score=bounded_score,
+        # Rescale 1..5 onto 0..1.
+        normalized_score=round((bounded_score - 1.0) / 4.0, 4),
+        reason=str(item.get("reason") or "").strip(),
+        evidence_spans=spans,
+        confidence=bounded_confidence,
+    )
+
+
+def _loads_json_object(response_text: str) -> Dict[str, Any]:
+    """Extract a JSON object from a raw LLM response.
+
+    Surrounding backticks and a leading ``json`` tag are removed when the text
+    starts with a code fence. If the whole text still does not parse, the span
+    from the first ``{`` to the last ``}`` is tried.
+
+    Raises:
+        json.JSONDecodeError: when no parsable JSON is found.
+        ValueError: when the parsed value is not a JSON object.
+    """
+
+    candidate = str(response_text or "").strip()
+    if candidate.startswith("```"):
+        candidate = candidate.strip("`")
+        if candidate.lower().startswith("json"):
+            candidate = candidate[4:].strip()
+
+    try:
+        decoded = json.loads(candidate)
+    except json.JSONDecodeError:
+        first_brace = candidate.find("{")
+        last_brace = candidate.rfind("}")
+        if first_brace < 0 or last_brace <= first_brace:
+            # No brace pair to salvage: surface the original decode error.
+            raise
+        decoded = json.loads(candidate[first_brace : last_brace + 1])
+
+    if isinstance(decoded, dict):
+        return decoded
+    raise ValueError("LLM judge 未返回 JSON 对象")
+
+
+def _format_followups(followups: List[FollowupMessageSnapshot]) -> str:
+    """Render at most the first five follow-ups as numbered prompt lines.
+
+    Each line is tagged by whether the sender is the target user and shows the
+    visible text (falling back to the plain text), normalised and capped at
+    500 characters.
+    """
+
+    def _render(position: int, snapshot: FollowupMessageSnapshot) -> str:
+        speaker = "目标用户" if snapshot.is_target_user else "其他用户"
+        body = normalize_text_for_prompt(snapshot.visible_text or snapshot.plain_text, 500)
+        return f"{position}. [{speaker}] {body}"
+
+    return "\n".join(_render(position, snapshot) for position, snapshot in enumerate(followups[:5], start=1))
+
+
+def _coerce_float(value: Any, default: float) -> float:
+    """Convert ``value`` to ``float``, returning ``default`` when that is not usable.
+
+    ``default`` is returned when the conversion raises (wrong type, unparsable
+    text, or an integer too large for a float) and when the result is NaN.
+    """
+
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: json.loads can yield integers too large for float().
+        return default
+    # NaN is the only float that differs from itself. It has to be rejected
+    # here because a ``max(lo, min(hi, nan))`` clamp silently yields ``hi``.
+    if number != number:
+        return default
+    return number
--- a/src/maisaka/reply_effect/models.py
+++ b/src/maisaka/reply_effect/models.py
@@ -0,0 +1,164 @@
+"""回复效果观察器的数据模型。"""
+
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+# Written into every serialized record as "schema_version" by ReplyEffectRecord.to_json_dict().
+SCHEMA_VERSION = 1
+
+
+class ReplyEffectStatus(str, Enum):
+    """Lifecycle state of a reply-effect record."""
+
+    # The record is still open and may receive follow-up messages.
+    PENDING = "pending"
+    # The record has been scored and closed.
+    FINALIZED = "finalized"
+
+
+@dataclass(slots=True)
+class SessionSnapshot:
+    """Snapshot of the chat session in which the observed reply was sent."""
+
+    session_id: str
+    # The tracker fills this with build_reply_effect_chat_dir_name(session_id);
+    # the storage layer derives the per-chat directory name from it.
+    platform_type_id: str
+    platform: str
+    # "group" or "private", as set by the tracker.
+    chat_type: str
+    group_id: str
+    user_id: str
+    session_name: str
+
+
+@dataclass(slots=True)
+class UserSnapshot:
+    """Snapshot of a user's identity fields (id, nickname, card name)."""
+
+    user_id: str
+    nickname: str
+    cardname: str
+
+
+@dataclass(slots=True)
+class ReplySnapshot:
+    """Content and provenance of the reply being observed."""
+
+    tool_call_id: str
+    # ID of the message this reply answered.
+    target_message_id: str
+    set_quote: bool
+    reply_text: str
+    reply_segments: List[str]
+    planner_reasoning: str
+    reference_info: str
+    # Free-form extra data supplied by the caller; empty by default.
+    reply_metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(slots=True)
+class FollowupMessageSnapshot:
+    """Snapshot of a user message observed after the reply."""
+
+    message_id: str
+    # ISO-8601 timestamp string.
+    timestamp: str
+    user_id: str
+    nickname: str
+    cardname: str
+    visible_text: str
+    plain_text: str
+    # Seconds between the record's creation and the moment this follow-up was observed.
+    latency_seconds: float
+    # True when the sender is the user the reply was addressed to.
+    is_target_user: bool
+    # Image/emoji descriptors extracted from the message; empty by default.
+    attachments: List[Dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass(slots=True)
+class BehaviorSignals:
+    """Behavioral satisfaction signals derived from follow-up messages.
+
+    The defaults describe the state before any follow-up evidence is applied.
+    """
+
+    continue_2turns: float = 0.0
+    next_user_sentiment: float = 0.5
+    user_expansion: float = 0.0
+    no_correction: float = 1.0
+    no_abort: float = 1.0
+    # One of "target_user_feedback", "indirect_session_feedback" or "no_followup".
+    evidence_source: str = "no_followup"
+
+
+@dataclass(slots=True)
+class RubricScoreItem:
+    """A single LLM rubric dimension result; defaults represent a neutral score."""
+
+    # Raw rubric score; the judge parser bounds it to [1, 5].
+    score: float = 3.0
+    # The judge parser computes this as (score - 1) / 4.
+    normalized_score: float = 0.5
+    reason: str = ""
+    evidence_spans: List[str] = field(default_factory=list)
+    # The judge parser bounds this to [0, 1].
+    confidence: float = 0.0
+
+
+@dataclass(slots=True)
+class RubricScores:
+    """LLM perceived-quality scores, one ``RubricScoreItem`` per dimension."""
+
+    social_presence: RubricScoreItem = field(default_factory=RubricScoreItem)
+    warmth: RubricScoreItem = field(default_factory=RubricScoreItem)
+    competence: RubricScoreItem = field(default_factory=RubricScoreItem)
+    appropriateness: RubricScoreItem = field(default_factory=RubricScoreItem)
+    uncanny_risk: RubricScoreItem = field(default_factory=RubricScoreItem)
+    # False for default-constructed scores; the judge parser sets it to True.
+    available: bool = False
+
+
+@dataclass(slots=True)
+class FrictionSignals:
+    """Friction and aversion signals."""
+
+    explicit_negative: float = 0.0
+    repair_loop: float = 0.0
+    uncanny_risk: float = 0.5
+    # IDs of the follow-up messages that triggered a friction signal.
+    evidence_messages: List[str] = field(default_factory=list)
+
+
+@dataclass(slots=True)
+class ReplyEffectScores:
+    """Final effect scores for one reply, together with the signals they came from."""
+
+    # Overall score on a 0-100 scale.
+    asi: float
+    behavior_score: float
+    relational_score: float
+    friction_score: float
+    behavior_signals: BehaviorSignals
+    rubric_scores: RubricScores
+    friction_signals: FrictionSignals
+    # Error description from the LLM judge; empty when the judge succeeded.
+    judge_error: str = ""
+
+
+@dataclass(slots=True)
+class ReplyEffectRecord:
+    """One reply-effect observation record."""
+
+    effect_id: str
+    status: ReplyEffectStatus
+    created_at: str
+    updated_at: str
+    session: SessionSnapshot
+    reply: ReplySnapshot
+    target_user: UserSnapshot
+    context_snapshot: List[Dict[str, Any]] = field(default_factory=list)
+    followup_messages: List[FollowupMessageSnapshot] = field(default_factory=list)
+    scores: Optional[ReplyEffectScores] = None
+    finalized_at: str = ""
+    finalize_reason: str = ""
+    confidence_note: str = ""
+    followup_summary: Dict[str, Any] = field(default_factory=dict)
+    # Location of the backing JSON file; excluded from repr and from the serialized payload.
+    file_path: Optional[Path] = field(default=None, repr=False)
+
+    def to_json_dict(self) -> Dict[str, Any]:
+        """Return a dict of this record that is ready to be dumped as JSON.
+
+        ``file_path`` is dropped, ``status`` is replaced by its string value,
+        and ``schema_version`` is appended.
+        """
+
+        serialized = asdict(self)
+        serialized.pop("file_path", None)
+        serialized.update(schema_version=SCHEMA_VERSION, status=self.status.value)
+        return serialized
+
+
+def now_iso() -> str:
+    """Return the current local time as a timezone-aware ISO string with second precision."""
+
+    local_now = datetime.now().astimezone()
+    return local_now.isoformat(timespec="seconds")
--- a/src/maisaka/reply_effect/path_utils.py
+++ b/src/maisaka/reply_effect/path_utils.py
@@ -0,0 +1,24 @@
+"""回复效果日志路径工具。"""
+
+from pathlib import Path
+
+from src.maisaka.display.preview_path_utils import build_preview_chat_dir_name, normalize_preview_name
+
+# Default root directory for reply-effect logs (a relative path).
+BASE_DIR = Path("logs") / "maisaka_reply_effect"
+
+
+def build_reply_effect_chat_dir_name(session_id: str) -> str:
+    """Build the per-chat directory name used for reply-effect logs.
+
+    The preview chat directory name for ``session_id`` is normalised; when the
+    result is ``"unknown"``, ``"unknown_chat"`` is returned instead.
+    """
+
+    candidate = normalize_preview_name(build_preview_chat_dir_name(session_id))
+    return "unknown_chat" if candidate == "unknown" else candidate
+
+
+def build_reply_effect_chat_dir(session_id: str, base_dir: Path | None = None) -> Path:
+    """Return the reply-effect log directory for a session.
+
+    ``base_dir`` is used as the parent when given; otherwise ``BASE_DIR`` is.
+    """
+
+    parent_dir = base_dir if base_dir else BASE_DIR
+    return parent_dir / build_reply_effect_chat_dir_name(session_id)
--- a/src/maisaka/reply_effect/scoring.py
+++ b/src/maisaka/reply_effect/scoring.py
@@ -0,0 +1,262 @@
+"""回复效果评分规则。"""
+
+from __future__ import annotations
+
+from typing import Iterable, List
+
+import re
+
+from .models import BehaviorSignals, FollowupMessageSnapshot, FrictionSignals, ReplyEffectScores, RubricScores
+
+# Substrings treated as explicit negative feedback in a follow-up message.
+# NOTE(review): some entries contain others ("你没懂" contains "没懂",
+# "不是这个意思" contains "不是"), so a single phrase can match two patterns.
+NEGATIVE_PATTERNS = (
+    "你没懂",
+    "没懂",
+    "不是这个意思",
+    "不是",
+    "别这样",
+    "好烦",
+    "烦死",
+    "算了",
+    "离谱",
+    "无语",
+    "你在说什么",
+    "听不懂",
+    "看不懂",
+    "错了",
+    "不对",
+)
+# Substrings indicating the user is correcting or restating their request.
+REPAIR_PATTERNS = (
+    "我是说",
+    "我说的是",
+    "重新说",
+    "再说一遍",
+    "不是问",
+    "你理解错",
+    "你搞错",
+    "我问的是",
+    "纠正",
+)
+# Substrings treated as positive feedback.
+POSITIVE_PATTERNS = (
+    "谢谢",
+    "感谢",
+    "懂了",
+    "明白了",
+    "可以",
+    "有用",
+    "不错",
+    "好耶",
+    "太好了",
+)
+
+
+def clamp(value: float, lower: float = 0.0, upper: float = 1.0) -> float:
+    """Bound ``value`` to the closed range [``lower``, ``upper``]."""
+
+    capped = min(upper, value)
+    return max(lower, capped)
+
+
+def score_reply_effect(
+    followups: List[FollowupMessageSnapshot],
+    rubric_scores: RubricScores,
+    *,
+    target_user_id: str = "",
+    judge_error: str = "",
+) -> ReplyEffectScores:
+    """Compute the full score bundle, including the ASI total, for one reply.
+
+    Behavior and friction signals are derived from ``followups``. The three
+    component scores are stored rounded to four decimals, while the ASI total
+    is computed from their unrounded values.
+    """
+
+    behavior = build_behavior_signals(followups, target_user_id=target_user_id)
+    friction = build_friction_signals(followups, rubric_scores, target_user_id=target_user_id)
+
+    behavior_value = calculate_behavior_score(behavior)
+    relational_value = calculate_relational_score(rubric_scores)
+    friction_value = calculate_friction_score(friction)
+    total = calculate_asi_score(behavior_value, relational_value, friction_value)
+
+    return ReplyEffectScores(
+        asi=total,
+        behavior_score=round(behavior_value, 4),
+        relational_score=round(relational_value, 4),
+        friction_score=round(friction_value, 4),
+        behavior_signals=behavior,
+        rubric_scores=rubric_scores,
+        friction_signals=friction,
+        judge_error=judge_error,
+    )
+
+
+def build_behavior_signals(
+    followups: List[FollowupMessageSnapshot],
+    *,
+    target_user_id: str = "",
+) -> BehaviorSignals:
+    """Derive behavioral satisfaction signals from follow-up messages.
+
+    Messages from the target user are used as evidence when there are any;
+    otherwise every follow-up in the session is used. With no follow-ups at
+    all a conservative default is returned.
+    """
+
+    from_target = [item for item in followups if target_user_id and item.user_id == target_user_id]
+    if from_target:
+        evidence, source_label = from_target, "target_user_feedback"
+    elif followups:
+        evidence, source_label = followups, "indirect_session_feedback"
+    else:
+        # Nothing observed: neutral sentiment and a reduced no_abort value.
+        return BehaviorSignals(
+            continue_2turns=0.0,
+            next_user_sentiment=0.5,
+            user_expansion=0.0,
+            no_correction=1.0,
+            no_abort=0.6,
+            evidence_source="no_followup",
+        )
+
+    texts = [item.plain_text for item in evidence]
+    merged = "\n".join(texts)
+    negatives = count_matches(merged, NEGATIVE_PATTERNS)
+    repairs = count_matches(merged, REPAIR_PATTERNS)
+    positives = count_matches(merged, POSITIVE_PATTERNS)
+    mean_length = sum(len(text.strip()) for text in texts) / len(evidence)
+    gave_up = negatives >= 2 or "算了" in merged
+
+    return BehaviorSignals(
+        continue_2turns=0.5 if len(evidence) < 2 else 1.0,
+        next_user_sentiment=estimate_sentiment(positives, negatives, repairs),
+        # Average stripped length mapped so that 8 chars -> 0 and 50 chars -> 1.
+        user_expansion=clamp((mean_length - 8.0) / 42.0),
+        no_correction=1.0 if repairs <= 0 else 0.0,
+        no_abort=0.0 if gave_up else 1.0,
+        evidence_source=source_label,
+    )
+
+
+def build_friction_signals(
+    followups: List[FollowupMessageSnapshot],
+    rubric_scores: RubricScores,
+    *,
+    target_user_id: str = "",
+) -> FrictionSignals:
+    """Derive friction signals from follow-up messages and the judge result.
+
+    A hit from the target user weighs 1.0 and a hit from anyone else 0.65; the
+    strongest hit per signal is kept. ``uncanny_risk`` comes from the judge
+    when its scores are available, and is 0.5 otherwise.
+    """
+
+    flagged_ids = set()
+    worst_negative = 0.0
+    worst_repair = 0.0
+    for item in followups:
+        weight = 1.0 if target_user_id and item.user_id == target_user_id else 0.65
+        body = item.plain_text
+        hit_negative = any(pattern in body for pattern in NEGATIVE_PATTERNS)
+        hit_repair = any(pattern in body for pattern in REPAIR_PATTERNS)
+        if hit_negative:
+            worst_negative = max(worst_negative, weight)
+        if hit_repair:
+            worst_repair = max(worst_repair, weight)
+        if hit_negative or hit_repair:
+            flagged_ids.add(item.message_id)
+
+    judge_uncanny = rubric_scores.uncanny_risk.normalized_score if rubric_scores.available else 0.5
+    return FrictionSignals(
+        explicit_negative=round(clamp(worst_negative), 4),
+        repair_loop=round(clamp(worst_repair), 4),
+        uncanny_risk=round(clamp(judge_uncanny), 4),
+        evidence_messages=sorted(flagged_ids),
+    )
+
+
+def calculate_behavior_score(signals: BehaviorSignals) -> float:
+    """Combine the behavior signals into one weighted score bounded to [0, 1]."""
+
+    # Terms are accumulated in the original left-to-right order so the float result is unchanged.
+    weighted_total = 0.30 * signals.continue_2turns
+    weighted_total += 0.25 * signals.next_user_sentiment
+    weighted_total += 0.20 * signals.user_expansion
+    weighted_total += 0.15 * signals.no_correction
+    weighted_total += 0.10 * signals.no_abort
+    return clamp(weighted_total)
+
+
+def calculate_relational_score(rubric_scores: RubricScores) -> float:
+    """Combine the rubric dimensions into one perceived-quality score in [0, 1].
+
+    Returns the neutral value 0.5 when judge scores are not available.
+    ``uncanny_risk`` is not part of this score.
+    """
+
+    if rubric_scores.available:
+        # Terms are accumulated in the original left-to-right order so the float result is unchanged.
+        weighted_total = 0.35 * rubric_scores.social_presence.normalized_score
+        weighted_total += 0.25 * rubric_scores.warmth.normalized_score
+        weighted_total += 0.25 * rubric_scores.competence.normalized_score
+        weighted_total += 0.15 * rubric_scores.appropriateness.normalized_score
+        return clamp(weighted_total)
+    return 0.5
+
+
+def calculate_friction_score(signals: FrictionSignals) -> float:
+    """Combine the friction signals into one penalty score bounded to [0, 1]."""
+
+    # Terms are accumulated in the original left-to-right order so the float result is unchanged.
+    penalty = 0.40 * signals.explicit_negative
+    penalty += 0.30 * signals.repair_loop
+    penalty += 0.30 * signals.uncanny_risk
+    return clamp(penalty)
+
+
+def calculate_asi_score(behavior_score: float, relational_score: float, friction_score: float) -> float:
+    """Compute the 0-100 ASI total; a higher friction score lowers the result."""
+
+    friction_credit = 1.0 - friction_score
+    blended = 0.45 * behavior_score + 0.35 * relational_score + 0.20 * friction_credit
+    return round(clamp(blended) * 100, 2)
+
+
+def has_explicit_negative_feedback(
+    followups: Iterable[FollowupMessageSnapshot],
+    *,
+    target_user_id: str = "",
+    allow_indirect: bool = False,
+) -> bool:
+    """Tell whether any eligible follow-up contains an explicit negative pattern.
+
+    When ``target_user_id`` is set and ``allow_indirect`` is false, only
+    messages sent by the target user are considered.
+    """
+
+    def _is_eligible(followup: FollowupMessageSnapshot) -> bool:
+        return bool(allow_indirect or not target_user_id or followup.user_id == target_user_id)
+
+    return any(
+        _is_eligible(followup) and any(pattern in followup.plain_text for pattern in NEGATIVE_PATTERNS)
+        for followup in followups
+    )
+
+
+def has_repair_loop(
+    followups: Iterable[FollowupMessageSnapshot],
+    *,
+    target_user_id: str = "",
+    allow_indirect: bool = False,
+) -> bool:
+    """Tell whether any eligible follow-up contains a repair pattern.
+
+    When ``target_user_id`` is set and ``allow_indirect`` is false, only
+    messages sent by the target user are considered.
+    """
+
+    def _is_eligible(followup: FollowupMessageSnapshot) -> bool:
+        return bool(allow_indirect or not target_user_id or followup.user_id == target_user_id)
+
+    return any(
+        _is_eligible(followup) and any(pattern in followup.plain_text for pattern in REPAIR_PATTERNS)
+        for followup in followups
+    )
+
+
+def count_matches(text: str, patterns: Iterable[str]) -> int:
+    """Count how many distinct patterns occur in ``text`` without double-counting nested ones.
+
+    Patterns are tried longest first, and every occurrence of a matched pattern
+    is masked out before shorter patterns are tested. A phrase such as "你没懂"
+    therefore counts once even when "没懂" is also in the list, while a separate
+    occurrence of the shorter pattern elsewhere in the text still counts.
+    Empty patterns are ignored.
+    """
+
+    remaining = text
+    hits = 0
+    # sorted() is stable, so patterns of equal length keep their given order.
+    for pattern in sorted((candidate for candidate in patterns if candidate), key=len, reverse=True):
+        if pattern in remaining:
+            hits += 1
+            # Mask with NUL rather than deleting, so the neighbouring text cannot
+            # join up into a new accidental match.
+            remaining = remaining.replace(pattern, "\x00")
+    return hits
+
+
+def estimate_sentiment(positive_count: int, negative_count: int, repair_count: int) -> float:
+    """Estimate follow-up sentiment in [0, 1] with a lightweight linear rule.
+
+    Starts from the neutral value 0.5, adds 0.2 per positive hit, and subtracts
+    0.25 per negative hit and 0.15 per repair hit.
+    """
+
+    # Applied in the original left-to-right order so the float result is unchanged.
+    raw_score = 0.5 + 0.2 * positive_count
+    raw_score -= 0.25 * negative_count
+    raw_score -= 0.15 * repair_count
+    return round(clamp(raw_score), 4)
+
+
+def normalize_text_for_prompt(text: str, limit: int = 800) -> str:
+    """Collapse whitespace and truncate text for use in a scoring prompt.
+
+    Args:
+        text: Raw text; ``None`` or other falsy values are treated as empty.
+        limit: Maximum length of the result in characters.
+
+    Returns:
+        The text with every whitespace run replaced by a single space and the
+        ends stripped. Text longer than ``limit`` is cut to ``limit - 1``
+        characters plus an ellipsis. A non-positive ``limit`` yields ``""``.
+    """
+
+    normalized_text = re.sub(r"\s+", " ", str(text or "")).strip()
+    if len(normalized_text) <= limit:
+        return normalized_text
+    if limit <= 0:
+        # Without this guard the slice below would use a negative end index and
+        # return almost the whole text instead of truncating it.
+        return ""
+    return normalized_text[: limit - 1] + "…"
--- a/src/maisaka/reply_effect/storage.py
+++ b/src/maisaka/reply_effect/storage.py
@@ -0,0 +1,75 @@
+"""回复效果独立 JSON 存储。"""
+
+from pathlib import Path
+from typing import Dict
+
+import json
+import time
+
+from .models import ReplyEffectRecord
+from .path_utils import BASE_DIR, build_reply_effect_chat_dir, normalize_preview_name
+
+
+class ReplyEffectStorage:
+    """Store each reply-effect record as its own JSON file, grouped in per-chat directories."""
+
+    # Maximum number of ``*.json`` record files kept in one chat directory.
+    _MAX_RECORDS_PER_CHAT = 1024
+    # Minimum number of oldest files removed once the cap is exceeded.
+    _TRIM_COUNT = 100
+
+    def __init__(self, base_dir: Path | None = None) -> None:
+        """Use ``base_dir`` as the storage root, or ``BASE_DIR`` when it is not given."""
+
+        self._base_dir = base_dir or BASE_DIR
+
+    def create_record_file(self, record: ReplyEffectRecord) -> Path:
+        """Assign a file path to a new record, write its initial JSON, and trim old files.
+
+        The chat directory is named after the normalised
+        ``record.session.platform_type_id``; when that normalises to
+        ``"unknown"`` the directory is derived from the session id instead.
+        The file name combines the current time in milliseconds with the
+        record's effect id (dashes removed).
+
+        Returns:
+            The path assigned to ``record.file_path``.
+        """
+
+        chat_dir_name = normalize_preview_name(record.session.platform_type_id)
+        if chat_dir_name == "unknown":
+            chat_dir = build_reply_effect_chat_dir(record.session.session_id, self._base_dir).resolve()
+        else:
+            chat_dir = (self._base_dir / chat_dir_name).resolve()
+        chat_dir.mkdir(parents=True, exist_ok=True)
+        timestamp_ms = int(time.time() * 1000)
+        safe_effect_id = record.effect_id.replace("-", "")
+        file_path = chat_dir / f"{timestamp_ms}_{safe_effect_id}.json"
+        record.file_path = file_path
+        self.save_record(record)
+        self._trim_overflow(chat_dir)
+        return file_path
+
+    def save_record(self, record: ReplyEffectRecord) -> None:
+        """Write the record's JSON via a temp file that is renamed over the target.
+
+        A record without a ``file_path`` is routed through
+        ``create_record_file`` first. If writing or renaming fails, the temp
+        file is removed and the original error is re-raised.
+        """
+
+        if record.file_path is None:
+            self.create_record_file(record)
+            return
+
+        file_path = record.file_path
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        # The dot-prefixed ".tmp" name does not match the "*.json" glob used when trimming.
+        temp_path = file_path.with_name(f".{file_path.name}.tmp")
+        try:
+            temp_path.write_text(
+                json.dumps(record.to_json_dict(), ensure_ascii=False, indent=2, default=str),
+                encoding="utf-8",
+            )
+            temp_path.replace(file_path)
+        except Exception:
+            # Do not leave a half-written temp file behind; the original error still propagates.
+            try:
+                temp_path.unlink(missing_ok=True)
+            except OSError:
+                pass
+            raise
+
+    @staticmethod
+    def read_json(file_path: Path) -> Dict[str, object]:
+        """Read and parse a previously saved JSON file (UTF-8)."""
+
+        return json.loads(file_path.read_text(encoding="utf-8"))
+
+    def _trim_overflow(self, chat_dir: Path) -> None:
+        """Delete the oldest record files once the directory exceeds its cap.
+
+        Nothing happens while the number of ``*.json`` files is at or below
+        ``_MAX_RECORDS_PER_CHAT``. Above it, at least ``_TRIM_COUNT`` files (or
+        the whole overflow, if larger) are removed, oldest modification time
+        first. Files that disappear concurrently are skipped.
+        """
+
+        files = [file_path for file_path in chat_dir.glob("*.json") if file_path.is_file()]
+        if len(files) <= self._MAX_RECORDS_PER_CHAT:
+            return
+
+        dated_files = []
+        for file_path in files:
+            try:
+                modified_at = file_path.stat().st_mtime
+            except FileNotFoundError:
+                # Removed between the glob and the stat; same race unlink() handles below.
+                continue
+            dated_files.append((modified_at, file_path))
+        if len(dated_files) <= self._MAX_RECORDS_PER_CHAT:
+            return
+
+        dated_files.sort(key=lambda entry: entry[0])
+        overflow_count = len(dated_files) - self._MAX_RECORDS_PER_CHAT
+        trim_count = min(len(dated_files), max(self._TRIM_COUNT, overflow_count))
+        for _, old_file in dated_files[:trim_count]:
+            try:
+                old_file.unlink()
+            except FileNotFoundError:
+                continue
--- a/src/maisaka/reply_effect/tracker.py
+++ b/src/maisaka/reply_effect/tracker.py
@@ -0,0 +1,267 @@
+"""会话级回复效果观察器。"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, Dict, List
+
+import asyncio
+import time
+import uuid
+
+from src.chat.message_receive.message import SessionMessage
+from src.maisaka.history_utils import build_session_message_visible_text
+
+from .image_utils import extract_visual_attachments_from_sequence
+from .judge import JudgeRunner, judge_reply_effect
+from .models import (
+    FollowupMessageSnapshot,
+    ReplyEffectRecord,
+    ReplyEffectStatus,
+    ReplySnapshot,
+    SessionSnapshot,
+    UserSnapshot,
+    now_iso,
+)
+from .path_utils import build_reply_effect_chat_dir_name
+from .scoring import (
+    has_explicit_negative_feedback,
+    has_repair_loop,
+    score_reply_effect,
+)
+from .storage import ReplyEffectStorage
+
+# A record is finalized once the target user has sent this many follow-up messages.
+TARGET_USER_FOLLOWUP_LIMIT = 2
+# While the target user has not spoken (or is unknown), a record is finalized
+# once this many follow-ups from anyone have been recorded.
+SESSION_FOLLOWUP_LIMIT = 5
+# Seconds after which a still-pending record is finalized with reason "window_timeout".
+OBSERVATION_WINDOW_SECONDS = 600.0
+
+
+class ReplyEffectTracker:
+    """Track the user feedback that follows ``reply`` tool replies within one Maisaka session.
+
+    Each sent reply becomes a pending ``ReplyEffectRecord`` persisted through
+    ``ReplyEffectStorage``. Follow-up messages are appended to every pending
+    record until a finalize rule fires or the observation window elapses; the
+    record is then judged, scored and written back.
+    """
+
+    def __init__(
+        self,
+        *,
+        session_id: str,
+        session_name: str,
+        chat_stream: Any,
+        judge_runner: JudgeRunner | None = None,
+        storage: ReplyEffectStorage | None = None,
+    ) -> None:
+        """Bind the tracker to one session.
+
+        Args:
+            session_id: Only messages with this session id are observed.
+            session_name: Stored in each record's session snapshot.
+            chat_stream: Object read via ``getattr`` for ``platform``,
+                ``group_id``, ``user_id`` and ``is_group_session``.
+            judge_runner: Optional async LLM runner used when finalizing.
+            storage: Record storage; a default ``ReplyEffectStorage`` is
+                created when omitted.
+        """
+
+        self._session_id = session_id
+        self._session_name = session_name
+        self._chat_stream = chat_stream
+        self._judge_runner = judge_runner
+        self._storage = storage or ReplyEffectStorage()
+        # Records still collecting follow-ups, keyed by effect id.
+        self._pending_records: Dict[str, ReplyEffectRecord] = {}
+        # One observation-window timer task per pending record.
+        self._timeout_tasks: Dict[str, asyncio.Task[None]] = {}
+
+    async def record_reply(
+        self,
+        *,
+        tool_call_id: str,
+        target_message: SessionMessage,
+        set_quote: bool,
+        reply_text: str,
+        reply_segments: List[str],
+        planner_reasoning: str,
+        reference_info: str,
+        reply_metadata: Dict[str, Any] | None = None,
+        context_snapshot: List[Dict[str, Any]] | None = None,
+    ) -> ReplyEffectRecord:
+        """Register a reply that has already been sent and start observing it.
+
+        A pending record is created and written to storage, and a timer task is
+        scheduled that finalizes it after ``OBSERVATION_WINDOW_SECONDS``.
+
+        Returns:
+            The newly created pending record.
+        """
+
+        effect_id = str(uuid.uuid4())
+        target_user_info = target_message.message_info.user_info
+        record = ReplyEffectRecord(
+            effect_id=effect_id,
+            status=ReplyEffectStatus.PENDING,
+            created_at=now_iso(),
+            updated_at=now_iso(),
+            session=self._build_session_snapshot(),
+            reply=ReplySnapshot(
+                tool_call_id=tool_call_id,
+                target_message_id=target_message.message_id,
+                set_quote=set_quote,
+                reply_text=reply_text,
+                # Copies keep the record independent of the caller's containers.
+                reply_segments=list(reply_segments),
+                planner_reasoning=planner_reasoning,
+                reference_info=reference_info,
+                reply_metadata=dict(reply_metadata or {}),
+            ),
+            target_user=UserSnapshot(
+                user_id=str(target_user_info.user_id or "").strip(),
+                nickname=str(target_user_info.user_nickname or "").strip(),
+                cardname=str(target_user_info.user_cardname or "").strip(),
+            ),
+            context_snapshot=list(context_snapshot or []),
+        )
+        self._storage.create_record_file(record)
+        self._pending_records[effect_id] = record
+        self._timeout_tasks[effect_id] = asyncio.create_task(self._finalize_after_timeout(effect_id))
+        return record
+
+    async def observe_user_message(self, message: SessionMessage) -> None:
+        """Attach a follow-up message to every pending record and finalize those whose rule fires.
+
+        Messages from other sessions are ignored, as is any call made while
+        nothing is pending.
+        """
+
+        if not self._pending_records or message.session_id != self._session_id:
+            return
+
+        for effect_id, record in list(self._pending_records.items()):
+            # finalize() awaits the judge, so other coroutines run while this
+            # loop is suspended. A record from the snapshot may already be
+            # claimed by an in-flight finalize(): it has been popped from
+            # _pending_records but keeps status PENDING until the judge
+            # returns. Such a record must not receive further follow-ups.
+            if self._pending_records.get(effect_id) is not record:
+                continue
+            if record.status != ReplyEffectStatus.PENDING:
+                continue
+            followup = self._build_followup_snapshot(message, record)
+            record.followup_messages.append(followup)
+            record.updated_at = now_iso()
+            self._storage.save_record(record)
+
+            reason = self._resolve_finalize_reason(record)
+            if reason:
+                await self.finalize(effect_id, reason)
+
+    async def finalize_all(self, reason: str = "runtime_stop") -> None:
+        """Finalize every record that is still pending in this session."""
+
+        for effect_id in list(self._pending_records.keys()):
+            await self.finalize(effect_id, reason)
+
+    async def finalize(self, effect_id: str, reason: str) -> None:
+        """Judge, score and persist one pending record.
+
+        Does nothing when the id is not pending or the record is already
+        finalized. The record's timer task is cancelled unless this call is
+        running inside that very task.
+        """
+
+        # Popping first claims the record, so concurrent callers see it as gone.
+        record = self._pending_records.pop(effect_id, None)
+        if record is None or record.status == ReplyEffectStatus.FINALIZED:
+            return
+
+        timeout_task = self._timeout_tasks.pop(effect_id, None)
+        current_task = asyncio.current_task()
+        if timeout_task is not None and timeout_task is not current_task:
+            timeout_task.cancel()
+
+        rubric_scores, judge_error = await judge_reply_effect(record, self._judge_runner)
+        record.scores = score_reply_effect(
+            record.followup_messages,
+            rubric_scores,
+            target_user_id=record.target_user.user_id,
+            judge_error=judge_error,
+        )
+        record.status = ReplyEffectStatus.FINALIZED
+        record.finalized_at = now_iso()
+        record.updated_at = record.finalized_at
+        record.finalize_reason = reason
+        record.confidence_note = self._build_confidence_note(record)
+        record.followup_summary = self._build_followup_summary(record)
+        self._storage.save_record(record)
+
+    def _build_session_snapshot(self) -> SessionSnapshot:
+        """Build the session snapshot from the chat stream, tolerating missing attributes."""
+
+        platform = str(getattr(self._chat_stream, "platform", "") or "").strip()
+        group_id = str(getattr(self._chat_stream, "group_id", "") or "").strip()
+        user_id = str(getattr(self._chat_stream, "user_id", "") or "").strip()
+        is_group_session = bool(getattr(self._chat_stream, "is_group_session", False))
+        return SessionSnapshot(
+            session_id=self._session_id,
+            platform_type_id=build_reply_effect_chat_dir_name(self._session_id),
+            platform=platform,
+            chat_type="group" if is_group_session else "private",
+            group_id=group_id,
+            user_id=user_id,
+            session_name=self._session_name,
+        )
+
+    def _build_followup_snapshot(
+        self,
+        message: SessionMessage,
+        record: ReplyEffectRecord,
+    ) -> FollowupMessageSnapshot:
+        """Capture a follow-up message relative to ``record``.
+
+        The visible text falls back to the plain text when it cannot be built.
+        Latency is measured from the record's ``created_at`` to the current
+        time and is never negative.
+        """
+
+        user_info = message.message_info.user_info
+        plain_text = str(message.processed_plain_text or "").strip()
+        try:
+            visible_text = build_session_message_visible_text(message)
+        except Exception:
+            visible_text = plain_text
+        latency_seconds = max(0.0, time.time() - _parse_iso_timestamp(record.created_at))
+        user_id = str(user_info.user_id or "").strip()
+        return FollowupMessageSnapshot(
+            message_id=str(message.message_id or "").strip(),
+            timestamp=_message_timestamp_to_iso(message),
+            user_id=user_id,
+            nickname=str(user_info.user_nickname or "").strip(),
+            cardname=str(user_info.user_cardname or "").strip(),
+            visible_text=visible_text,
+            plain_text=plain_text,
+            latency_seconds=round(latency_seconds, 3),
+            is_target_user=bool(record.target_user.user_id and user_id == record.target_user.user_id),
+            attachments=extract_visual_attachments_from_sequence(message.raw_message),
+        )
+
+    def _resolve_finalize_reason(self, record: ReplyEffectRecord) -> str:
+        """Return the reason the record should be finalized now, or ``""`` to keep observing.
+
+        Feedback from the target user is checked first: an explicit negative,
+        a repair loop, or reaching ``TARGET_USER_FOLLOWUP_LIMIT``. When the
+        target user is unknown or has not spoken yet, the whole session's
+        follow-ups are checked, and ``SESSION_FOLLOWUP_LIMIT`` acts as the cap.
+        """
+
+        target_user_id = record.target_user.user_id
+        target_followups = [
+            followup
+            for followup in record.followup_messages
+            if target_user_id and followup.user_id == target_user_id
+        ]
+        has_target_feedback = bool(target_followups)
+        if has_explicit_negative_feedback(target_followups, target_user_id=target_user_id, allow_indirect=False):
+            return "explicit_negative"
+        if has_repair_loop(target_followups, target_user_id=target_user_id, allow_indirect=False):
+            return "repair_loop"
+        if len(target_followups) >= TARGET_USER_FOLLOWUP_LIMIT:
+            return "target_user_followups"
+
+        if not target_user_id or not has_target_feedback:
+            # Other users' messages count as negative/repair evidence only when no target user is known.
+            allow_indirect = not target_user_id
+            if has_explicit_negative_feedback(
+                record.followup_messages,
+                target_user_id=target_user_id,
+                allow_indirect=allow_indirect,
+            ):
+                return "explicit_negative"
+            if has_repair_loop(
+                record.followup_messages,
+                target_user_id=target_user_id,
+                allow_indirect=allow_indirect,
+            ):
+                return "repair_loop"
+            if len(record.followup_messages) >= SESSION_FOLLOWUP_LIMIT:
+                return "session_followups_limit"
+
+        return ""
+
+    async def _finalize_after_timeout(self, effect_id: str) -> None:
+        """Finalize the record once the observation window elapses; exit quietly when cancelled."""
+
+        try:
+            await asyncio.sleep(OBSERVATION_WINDOW_SECONDS)
+            await self.finalize(effect_id, "window_timeout")
+        except asyncio.CancelledError:
+            return
+
+    @staticmethod
+    def _build_confidence_note(record: ReplyEffectRecord) -> str:
+        """Describe how trustworthy the behavioral evidence is, based on who sent the follow-ups."""
+
+        if not record.followup_messages:
+            return "没有观察到后续用户消息，行为分使用保守中性信号。"
+        if any(followup.is_target_user for followup in record.followup_messages):
+            return "行为反馈包含回复对象本人的后续发言。"
+        return "行为反馈来自同会话其他用户，不是回复对象本人，置信度较低。"
+
+    @staticmethod
+    def _build_followup_summary(record: ReplyEffectRecord) -> Dict[str, Any]:
+        """Summarise follow-up counts, split by target user versus other users."""
+
+        target_count = sum(1 for followup in record.followup_messages if followup.is_target_user)
+        return {
+            "total_count": len(record.followup_messages),
+            "target_user_count": target_count,
+            "other_user_count": len(record.followup_messages) - target_count,
+            "target_user_id": record.target_user.user_id,
+        }
+
+
+def _message_timestamp_to_iso(message: SessionMessage) -> str:
+    """Return the message's ``timestamp`` as a local-timezone ISO string.
+
+    Falls back to the current time when the message has no ``datetime`` timestamp.
+    """
+
+    raw_timestamp = getattr(message, "timestamp", None)
+    if not isinstance(raw_timestamp, datetime):
+        return now_iso()
+    return raw_timestamp.astimezone().isoformat(timespec="seconds")
+
+
+def _parse_iso_timestamp(value: str) -> float:
+    """Convert an ISO timestamp string to epoch seconds.
+
+    Falls back to the current time when a ``ValueError`` is raised.
+    """
+
+    try:
+        moment = datetime.fromisoformat(value)
+        epoch_seconds = moment.timestamp()
+    except ValueError:
+        epoch_seconds = time.time()
+    return epoch_seconds