feat：可以对不同chat自定义一段额外prompt

2025-11-05 00:35:16 +08:00
parent a4d43e1aee
commit 03e06c282c
7 changed files with 329 additions and 76 deletions
--- a/src/chat/emoji_system/emoji_manager.py
+++ b/src/chat/emoji_system/emoji_manager.py
@@ -940,13 +940,13 @@ class EmojiManager:
                    image_base64 = get_image_manager().transform_gif(image_base64)  # type: ignore
                    if not image_base64:
                        raise RuntimeError("GIF表情包转换失败")
-                    prompt = "这是一个动态图表情包，每一张图代表了动态图的某一帧，黑色背景代表透明，简短描述一下表情包表达的情感和内容，描述细节，从互联网梗,meme的角度去分析"
+                    prompt = "这是一个动态图表情包，每一张图代表了动态图的某一帧，黑色背景代表透明，简短描述一下表情包表达的情感和内容，从互联网梗,meme的角度去分析，精简回答"
                    description, _ = await self.vlm.generate_response_for_image(
                        prompt, image_base64, "jpg", temperature=0.5
                    )
                else:
                    prompt = (
-                        "这是一个表情包，请详细描述一下表情包所表达的情感和内容，简短描述细节，从互联网梗,meme的角度去分析"
+                        "这是一个表情包，请详细描述一下表情包所表达的情感和内容，简短描述细节，从互联网梗,meme的角度去分析，精简回答"
                    )
                    description, _ = await self.vlm.generate_response_for_image(
                        prompt, image_base64, image_format, temperature=0.5
--- a/src/chat/heart_flow/heartFC_chat.py
+++ b/src/chat/heart_flow/heartFC_chat.py
@@ -17,12 +17,12 @@ from src.chat.planner_actions.planner import ActionPlanner
 from src.chat.planner_actions.action_modifier import ActionModifier
 from src.chat.planner_actions.action_manager import ActionManager
 from src.chat.heart_flow.hfc_utils import CycleDetail
-from src.chat.heart_flow.hfc_utils import send_typing, stop_typing
 from src.express.expression_learner import expression_learner_manager
 from src.chat.frequency_control.frequency_control import frequency_control_manager
 from src.memory_system.question_maker import QuestionMaker
 from src.memory_system.questions import global_conflict_tracker
 from src.memory_system.curious import check_and_make_question
+from src.jargon import extract_and_store_jargon
 from src.person_info.person_info import Person
 from src.plugin_system.base.component_types import EventType, ActionInfo
 from src.plugin_system.core import events_manager
@@ -336,7 +336,9 @@ class HeartFChatting:
            asyncio.create_task(frequency_control_manager.get_or_create_frequency_control(self.stream_id).trigger_frequency_adjust())  
            
            # 添加curious检测任务 - 检测聊天记录中的矛盾、冲突或需要提问的内容
-            asyncio.create_task(check_and_make_question(self.stream_id, recent_messages_list))
+            asyncio.create_task(check_and_make_question(self.stream_id))
+            # 添加jargon提取任务 - 提取聊天中的黑话/俚语并入库（内部自行取消息并带冷却）
+            asyncio.create_task(extract_and_store_jargon(self.stream_id))
            
            
            cycle_timers, thinking_id = self.start_cycle()
--- a/src/common/database/database_model.py
+++ b/src/common/database/database_model.py
@@ -20,6 +20,8 @@ logger = get_logger("database_model")

 # 定义一个基础模型是一个好习惯，所有其他模型都应继承自它。
 # 这允许您在一个地方为所有模型指定数据库。
+
+
 class BaseModel(Model):
    class Meta:
        # 将下面的 'db' 替换为您实际的数据库实例变量名。
@@ -343,30 +345,45 @@ class MemoryConflict(BaseModel):

    class Meta:
        table_name = "memory_conflicts"
+        
+class Jargon(BaseModel):
+    """
+    Model for storing jargon / slang terms.
+    """
+    content = TextField()  # the term itself
+    raw_content = TextField(null=True)  # sentence the term appeared in
+    type = TextField(null=True)  # category code; the miner writes "p", "c" or "e"
+    translation = TextField(null=True)  # not written by the miner in this commit
+    meaning = TextField(null=True)  # not written by the miner in this commit
+    chat_id = TextField(index=True)  # chat the term was extracted from
+    is_global = BooleanField(default=False)  # the miner always stores False; presumably marks cross-chat terms - TODO confirm
+    count = IntegerField(default=0)  # 1 on insert, +1 each time the miner extracts the term again
+    
+    class Meta:
+        table_name = "jargon"

-
+MODELS = [
+    ChatStreams,
+    LLMUsage,
+    Emoji,
+    Messages,
+    Images,
+    ImageDescriptions,
+    OnlineTime,
+    PersonInfo,
+    Expression,
+    ActionRecords,
+    MemoryChest,
+    MemoryConflict,
+    Jargon,
+]

 def create_tables():
    """
    创建所有在模型中定义的数据库表。
    """
    with db:
-        db.create_tables(
-            [
-                ChatStreams,
-                LLMUsage,
-                Emoji,
-                Messages,
-                Images,
-                ImageDescriptions,
-                OnlineTime,
-                PersonInfo,
-                Expression,
-                ActionRecords,  # 添加 ActionRecords 到初始化列表
-                MemoryChest,
-                MemoryConflict,  # 添加记忆冲突表
-            ]
-        )
+        db.create_tables(MODELS)


 def initialize_database(sync_constraints=False):
@@ -379,24 +396,9 @@ def initialize_database(sync_constraints=False):
                               如果为 True，会检查并修复字段的 NULL 约束不一致问题。
    """

-    models = [
-        ChatStreams,
-        LLMUsage,
-        Emoji,
-        Messages,
-        Images,
-        ImageDescriptions,
-        OnlineTime,
-        PersonInfo,
-        Expression,
-        ActionRecords,  # 添加 ActionRecords 到初始化列表
-        MemoryChest,
-        MemoryConflict,
-    ]
-
    try:
        with db:  # 管理 table_exists 检查的连接
-            for model in models:
+            for model in MODELS:
                table_name = model._meta.table_name
                if not db.table_exists(model):
                    logger.warning(f"表 '{table_name}' 未找到，正在创建...")
@@ -476,24 +478,9 @@ def sync_field_constraints():
    如果发现不一致，会自动修复字段约束。
    """

-    models = [
-        ChatStreams,
-        LLMUsage,
-        Emoji,
-        Messages,
-        Images,
-        ImageDescriptions,
-        OnlineTime,
-        PersonInfo,
-        Expression,
-        ActionRecords,
-        MemoryChest,
-        MemoryConflict,
-    ]
-
    try:
        with db:
-            for model in models:
+            for model in MODELS:
                table_name = model._meta.table_name
                if not db.table_exists(model):
                    logger.warning(f"表 '{table_name}' 不存在，跳过约束检查")
@@ -660,26 +647,11 @@ def check_field_constraints():
    用于在修复前预览需要修复的内容。
    """

-    models = [
-        ChatStreams,
-        LLMUsage,
-        Emoji,
-        Messages,
-        Images,
-        ImageDescriptions,
-        OnlineTime,
-        PersonInfo,
-        Expression,
-        ActionRecords,
-        MemoryChest,
-        MemoryConflict,
-    ]
-
    inconsistencies = {}

    try:
        with db:
-            for model in models:
+            for model in MODELS:
                table_name = model._meta.table_name
                if not db.table_exists(model):
                    continue
--- a/src/jargon/__init__.py
+++ b/src/jargon/__init__.py
@@ -0,0 +1,7 @@
+from .jargon_miner import extract_and_store_jargon
+
+__all__ = [
+    "extract_and_store_jargon",
+]
+
+
--- a/src/jargon/jargon_miner.py
+++ b/src/jargon/jargon_miner.py
@@ -0,0 +1,230 @@
+import time
+import json
+from typing import List
+from json_repair import repair_json
+
+from src.common.logger import get_logger
+from src.common.database.database_model import Jargon
+from src.llm_models.utils_model import LLMRequest
+from src.config.config import model_config
+from src.chat.message_receive.chat_stream import get_chat_manager
+from src.chat.utils.chat_message_builder import (
+    build_anonymous_messages,
+    get_raw_msg_by_timestamp_with_chat_inclusive,
+)
+from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
+
+
+logger = get_logger("jargon")
+
+# Category codes the extraction prompt asks the model to emit; "p" is the fallback.
+_VALID_TYPES = ("p", "c", "e")
+_DEFAULT_TYPE = "p"
+
+
+def _init_prompt() -> None:
+    """Register the jargon-extraction prompt template under the name extract_jargon_prompt."""
+    prompt_str = """
+**聊天内容**
+{chat_str}
+
+请从上面这段聊天内容中提取"可能是黑话"的候选项（黑话/俚语/网络缩写/口头禅）。
+- 必须为对话中真实出现过的短词或短语
+- 必须是你无法理解含义的词语，或者出现频率较高的词语
+- 必须是这几种类别之一：英文或中文缩写、中文拼音短语、字母数字混合、意义不明但频繁的词汇
+- 排除：人名、@、明显的表情/图片占位、纯标点、常规功能词（如的、了、呢、啊等）
+- 每个词条长度建议 2-8 个字符（不强制），尽量短小
+- 合并重复项，去重
+
+分类规则：
+- p（拼音缩写）：由字母或字母和汉字构成的，疑似拼音简写词，例如：nb、yyds、xswl
+- c（中文缩写）：中文词语的缩写，用几个汉字概括一个词汇或含义，例如：社死、内卷
+- e（英文缩写）：英文词语的缩写，用英文字母概括一个词汇或含义，例如：CPU、GPU、API
+
+以 JSON 数组输出，元素为对象（严格按以下结构）：
+[
+  {{"content": "词条", "raw_content": "包含该词条的完整句子", "type": "p"}},
+  {{"content": "词条2", "raw_content": "包含该词条的完整句子", "type": "c"}}
+]
+
+现在请输出：
+"""
+    Prompt(prompt_str, "extract_jargon_prompt")
+
+
+_init_prompt()
+
+
+class JargonMiner:
+    """Extract jargon candidates from one chat's recent messages and persist them."""
+
+    def __init__(self, chat_id: str) -> None:
+        self.chat_id = chat_id
+        # Start of the next learning window; also the reference point for the cooldown.
+        self.last_learning_time: float = time.time()
+        # Rate limiting; adjust as needed.
+        self.min_messages_for_learning: int = 20
+        self.min_learning_interval: float = 30
+
+        self.llm = LLMRequest(
+            model_set=model_config.model_task_config.utils,
+            request_type="jargon.extract",
+        )
+
+    def should_trigger(self) -> bool:
+        """Return True once the cooldown has elapsed and enough new messages exist."""
+        if time.time() - self.last_learning_time < self.min_learning_interval:
+            return False
+
+        recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive(
+            chat_id=self.chat_id,
+            timestamp_start=self.last_learning_time,
+            timestamp_end=time.time(),
+        )
+        return bool(recent_messages and len(recent_messages) >= self.min_messages_for_learning)
+
+    @staticmethod
+    def _parse_entries(response: str) -> List[dict]:
+        """Parse the model reply into entry dicts, de-duplicated by content.
+
+        Raises whatever json.loads / repair_json raise when the reply is unusable.
+        """
+        resp = response.strip()
+        try:
+            parsed = json.loads(resp)
+        except json.JSONDecodeError:
+            # Not strict JSON (code fences, trailing commas, ...): try to repair it.
+            repaired = repair_json(resp)
+            parsed = json.loads(repaired) if isinstance(repaired, str) else repaired
+
+        if isinstance(parsed, dict):
+            parsed = [parsed]
+        if not isinstance(parsed, list):
+            return []
+
+        entries: dict[str, dict] = {}
+        for item in parsed:
+            if not isinstance(item, dict):
+                continue
+            # `or ""` keeps a JSON null from being stored as the literal string "None".
+            content = str(item.get("content") or "").strip()
+            if not content or content in entries:
+                continue
+            type_str = str(item.get("type") or "").strip().lower()
+            entries[content] = {
+                "content": content,
+                "raw_content": str(item.get("raw_content") or "").strip(),
+                "type": type_str if type_str in _VALID_TYPES else _DEFAULT_TYPE,
+            }
+        return list(entries.values())
+
+    def _store_entries(self, entries: List[dict]) -> tuple[int, int]:
+        """Insert new entries or bump existing ones; return (saved, updated) counts."""
+        saved = updated = 0
+        for entry in entries:
+            content = entry["content"]
+            try:
+                obj = Jargon.get_or_none((Jargon.chat_id == self.chat_id) & (Jargon.content == content))
+                if obj is None:
+                    Jargon.create(
+                        content=content,
+                        raw_content=entry["raw_content"],
+                        type=entry["type"],
+                        chat_id=self.chat_id,
+                        is_global=False,
+                        count=1,
+                    )
+                    saved += 1
+                    continue
+                obj.count = (obj.count or 0) + 1
+                # Only fill raw_content / type when the stored row has none.
+                if entry["raw_content"] and not obj.raw_content:
+                    obj.raw_content = entry["raw_content"]
+                if entry["type"] and not obj.type:
+                    obj.type = entry["type"]
+                obj.save()
+                updated += 1
+            except Exception as e:
+                logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
+        return saved, updated
+
+    async def run_once(self) -> None:
+        """Run one extraction pass if should_trigger() allows it.
+
+        Exceptions are logged instead of propagated.
+        """
+        try:
+            if not self.should_trigger():
+                return
+
+            chat_stream = get_chat_manager().get_stream(self.chat_id)
+            if not chat_stream:
+                return
+
+            # Claim the window before the first await: each pass consumes its window
+            # whether or not anything is stored, so an empty or failed extraction cannot
+            # re-run the LLM on every call, and overlapping calls hit the cooldown.
+            window_start = self.last_learning_time
+            window_end = time.time()
+            self.last_learning_time = window_end
+
+            messages = get_raw_msg_by_timestamp_with_chat_inclusive(
+                chat_id=self.chat_id,
+                timestamp_start=window_start,
+                timestamp_end=window_end,
+                limit=20,
+            )
+            if not messages:
+                return
+
+            chat_str: str = await build_anonymous_messages(messages)
+            if not chat_str.strip():
+                return
+
+            prompt: str = await global_prompt_manager.format_prompt(
+                "extract_jargon_prompt",
+                chat_str=chat_str,
+            )
+
+            response, _ = await self.llm.generate_response_async(prompt, temperature=0.2)
+            if not response:
+                return
+
+            logger.info(f"jargon提取提示词: {prompt}")
+            logger.info(f"jargon提取结果: {response}")
+
+            try:
+                entries = self._parse_entries(response)
+            except Exception as e:
+                logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
+                return
+            if not entries:
+                return
+
+            saved, updated = self._store_entries(entries)
+            if saved or updated:
+                logger.info(f"jargon写入: 新增 {saved} 条，更新 {updated} 条，chat_id={self.chat_id}")
+        except Exception as e:
+            logger.error(f"JargonMiner 运行失败: {e}")
+
+
+class JargonMinerManager:
+    """Keep one JargonMiner per chat so its cooldown state persists between calls."""
+
+    def __init__(self) -> None:
+        self._miners: dict[str, JargonMiner] = {}
+
+    def get_miner(self, chat_id: str) -> JargonMiner:
+        """Return the miner for chat_id, creating it on first use."""
+        if chat_id not in self._miners:
+            self._miners[chat_id] = JargonMiner(chat_id)
+        return self._miners[chat_id]
+
+
+miner_manager = JargonMinerManager()
+
+
+async def extract_and_store_jargon(chat_id: str) -> None:
+    """Run one throttled jargon-extraction pass for chat_id."""
+    miner = miner_manager.get_miner(chat_id)
+    await miner.run_once()
--- a/src/llm_models/model_client/openai_client.py
+++ b/src/llm_models/model_client/openai_client.py
@@ -444,7 +444,7 @@ def _default_normal_response_parser(
        choice0 = resp.choices[0]
        reason = getattr(choice0, "finish_reason", None)
        if reason and reason == "length":
-            print(resp)
+            # print(resp)
            _model_name = resp.model
            # 统一日志格式
            logger.info(
--- a/src/memory_system/curious.py
+++ b/src/memory_system/curious.py
@@ -1,9 +1,8 @@
 import time
-import asyncio
-from typing import List, Optional, Tuple
+from typing import List, Optional
 from src.common.logger import get_logger
 from src.chat.utils.chat_message_builder import (
-    get_raw_msg_before_timestamp_with_chat,
+    get_raw_msg_by_timestamp_with_chat_inclusive,
    build_readable_messages_with_id,
 )
 from src.llm_models.utils_model import LLMRequest
@@ -25,7 +24,21 @@ class CuriousDetector:
            model_set=model_config.model_task_config.utils,
            request_type="curious_detector",
        )
+        # 触发控制
+        self.last_detection_time: float = time.time()
+        self.min_interval_seconds: float = 60.0
+        self.min_messages: int = 20
    
+    def should_trigger(self) -> bool:
+        if time.time() - self.last_detection_time < self.min_interval_seconds:
+            return False
+        recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive(
+            chat_id=self.chat_id,
+            timestamp_start=self.last_detection_time,
+            timestamp_end=time.time(),
+        )
+        return bool(recent_messages and len(recent_messages) >= self.min_messages)
+
    async def detect_questions(self, recent_messages: List) -> Optional[str]:
        """
        检测最近消息中是否有需要提问的内容
@@ -91,6 +104,9 @@ class CuriousDetector:

            result_text, _ = await self.llm_request.generate_response_async(prompt, temperature=0.3)
            
+            logger.info(f"好奇心检测提示词: {prompt}")
+            logger.info(f"好奇心检测结果: {result_text}")
+            
            if not result_text:
                return None
            
@@ -154,7 +170,20 @@ class CuriousDetector:
            return False


-async def check_and_make_question(chat_id: str, recent_messages: List) -> bool:
+class CuriousManager:
+    def __init__(self) -> None:
+        self._detectors: dict[str, CuriousDetector] = {}
+
+    def get_detector(self, chat_id: str) -> CuriousDetector:
+        if chat_id not in self._detectors:
+            self._detectors[chat_id] = CuriousDetector(chat_id)
+        return self._detectors[chat_id]
+
+
+curious_manager = CuriousManager()
+
+
+async def check_and_make_question(chat_id: str) -> bool:
    """
    检查聊天记录并生成问题（如果检测到需要提问的内容）
    
@@ -166,8 +195,20 @@ async def check_and_make_question(chat_id: str, recent_messages: List) -> bool:
        bool: 是否检测到并记录了问题
    """
    try:
-        detector = CuriousDetector(chat_id)
-        
+        detector = curious_manager.get_detector(chat_id)
+        if not detector.should_trigger():
+            return False
+
+        # 拉取窗口内消息
+        recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive(
+            chat_id=chat_id,
+            timestamp_start=detector.last_detection_time,
+            timestamp_end=time.time(),
+            limit=80,
+        )
+        if not recent_messages:
+            return False
+
        # 检测是否需要提问
        question = await detector.detect_questions(recent_messages)
        
@@ -176,6 +217,7 @@ async def check_and_make_question(chat_id: str, recent_messages: List) -> bool:
            success = await detector.make_question_from_detection(question)
            if success:
                logger.info(f"成功检测并记录问题: {question}")
+                detector.last_detection_time = time.time()
                return True
        
        return False