From 03e06c282ce2401c9253f6f85b2208347244977e Mon Sep 17 00:00:00 2001 From: SengokuCola <1026294844@qq.com> Date: Wed, 5 Nov 2025 00:35:16 +0800 Subject: [PATCH] =?UTF-8?q?feat=EF=BC=9A=E5=8F=AF=E4=BB=A5=E5=AF=B9?= =?UTF-8?q?=E4=B8=8D=E5=90=8Cchat=E8=87=AA=E5=AE=9A=E4=B9=89=E4=B8=80?= =?UTF-8?q?=E6=AE=B5=E9=A2=9D=E5=A4=96prompt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/emoji_system/emoji_manager.py | 4 +- src/chat/heart_flow/heartFC_chat.py | 6 +- src/common/database/database_model.py | 102 +++----- src/jargon/__init__.py | 7 + src/jargon/jargon_miner.py | 230 +++++++++++++++++++ src/llm_models/model_client/openai_client.py | 2 +- src/memory_system/curious.py | 54 ++++- 7 files changed, 329 insertions(+), 76 deletions(-) create mode 100644 src/jargon/__init__.py create mode 100644 src/jargon/jargon_miner.py diff --git a/src/chat/emoji_system/emoji_manager.py b/src/chat/emoji_system/emoji_manager.py index 512e7e55..b26ab844 100644 --- a/src/chat/emoji_system/emoji_manager.py +++ b/src/chat/emoji_system/emoji_manager.py @@ -940,13 +940,13 @@ class EmojiManager: image_base64 = get_image_manager().transform_gif(image_base64) # type: ignore if not image_base64: raise RuntimeError("GIF表情包转换失败") - prompt = "这是一个动态图表情包,每一张图代表了动态图的某一帧,黑色背景代表透明,简短描述一下表情包表达的情感和内容,描述细节,从互联网梗,meme的角度去分析" + prompt = "这是一个动态图表情包,每一张图代表了动态图的某一帧,黑色背景代表透明,简短描述一下表情包表达的情感和内容,从互联网梗,meme的角度去分析,精简回答" description, _ = await self.vlm.generate_response_for_image( prompt, image_base64, "jpg", temperature=0.5 ) else: prompt = ( - "这是一个表情包,请详细描述一下表情包所表达的情感和内容,简短描述细节,从互联网梗,meme的角度去分析" + "这是一个表情包,请详细描述一下表情包所表达的情感和内容,简短描述细节,从互联网梗,meme的角度去分析,精简回答" ) description, _ = await self.vlm.generate_response_for_image( prompt, image_base64, image_format, temperature=0.5 diff --git a/src/chat/heart_flow/heartFC_chat.py b/src/chat/heart_flow/heartFC_chat.py index e4900197..99e55122 100644 --- a/src/chat/heart_flow/heartFC_chat.py +++ b/src/chat/heart_flow/heartFC_chat.py @@ -17,12 +17,12 @@ from src.chat.planner_actions.planner import ActionPlanner from src.chat.planner_actions.action_modifier import ActionModifier from src.chat.planner_actions.action_manager import ActionManager from src.chat.heart_flow.hfc_utils import CycleDetail -from src.chat.heart_flow.hfc_utils import send_typing, stop_typing from src.express.expression_learner import expression_learner_manager from src.chat.frequency_control.frequency_control import frequency_control_manager from src.memory_system.question_maker import QuestionMaker from src.memory_system.questions import global_conflict_tracker from src.memory_system.curious import check_and_make_question +from src.jargon import extract_and_store_jargon from src.person_info.person_info import Person from src.plugin_system.base.component_types import EventType, ActionInfo from src.plugin_system.core import events_manager @@ -336,7 +336,9 @@ class HeartFChatting: asyncio.create_task(frequency_control_manager.get_or_create_frequency_control(self.stream_id).trigger_frequency_adjust()) # 添加curious检测任务 - 检测聊天记录中的矛盾、冲突或需要提问的内容 - asyncio.create_task(check_and_make_question(self.stream_id, recent_messages_list)) + asyncio.create_task(check_and_make_question(self.stream_id)) + # 添加jargon提取任务 - 提取聊天中的黑话/俚语并入库(内部自行取消息并带冷却) + asyncio.create_task(extract_and_store_jargon(self.stream_id)) cycle_timers, thinking_id = self.start_cycle() diff --git a/src/common/database/database_model.py b/src/common/database/database_model.py index 89e0a019..a1aaaa23 100644 --- a/src/common/database/database_model.py +++ b/src/common/database/database_model.py @@ -20,6 +20,8 @@ logger = get_logger("database_model") # 定义一个基础模型是一个好习惯,所有其他模型都应继承自它。 # 这允许您在一个地方为所有模型指定数据库。 + + class BaseModel(Model): class Meta: # 将下面的 'db' 替换为您实际的数据库实例变量名。 @@ -343,30 +345,45 @@ class MemoryConflict(BaseModel): class Meta: table_name = "memory_conflicts" + +class Jargon(BaseModel): + """ + 用于存储俚语的模型 + """ + content = TextField() + raw_content = TextField(null=True) + type = TextField(null=True) + translation = TextField(null=True) + meaning = TextField(null=True) + chat_id = TextField(index=True) + is_global = BooleanField(default=False) + count = IntegerField(default=0) + + class Meta: + table_name = "jargon" - +MODELS = [ + ChatStreams, + LLMUsage, + Emoji, + Messages, + Images, + ImageDescriptions, + OnlineTime, + PersonInfo, + Expression, + ActionRecords, + MemoryChest, + MemoryConflict, + Jargon, +] def create_tables(): """ 创建所有在模型中定义的数据库表。 """ with db: - db.create_tables( - [ - ChatStreams, - LLMUsage, - Emoji, - Messages, - Images, - ImageDescriptions, - OnlineTime, - PersonInfo, - Expression, - ActionRecords, # 添加 ActionRecords 到初始化列表 - MemoryChest, - MemoryConflict, # 添加记忆冲突表 - ] - ) + db.create_tables(MODELS) def initialize_database(sync_constraints=False): @@ -379,24 +396,9 @@ def initialize_database(sync_constraints=False): 如果为 True,会检查并修复字段的 NULL 约束不一致问题。 """ - models = [ - ChatStreams, - LLMUsage, - Emoji, - Messages, - Images, - ImageDescriptions, - OnlineTime, - PersonInfo, - Expression, - ActionRecords, # 添加 ActionRecords 到初始化列表 - MemoryChest, - MemoryConflict, - ] - try: with db: # 管理 table_exists 检查的连接 - for model in models: + for model in MODELS: table_name = model._meta.table_name if not db.table_exists(model): logger.warning(f"表 '{table_name}' 未找到,正在创建...") @@ -476,24 +478,9 @@ def sync_field_constraints(): 如果发现不一致,会自动修复字段约束。 """ - models = [ - ChatStreams, - LLMUsage, - Emoji, - Messages, - Images, - ImageDescriptions, - OnlineTime, - PersonInfo, - Expression, - ActionRecords, - MemoryChest, - MemoryConflict, - ] - try: with db: - for model in models: + for model in MODELS: table_name = model._meta.table_name if not db.table_exists(model): logger.warning(f"表 '{table_name}' 不存在,跳过约束检查") @@ -660,26 +647,11 @@ def check_field_constraints(): 用于在修复前预览需要修复的内容。 """ - models = [ - ChatStreams, - LLMUsage, - Emoji, - Messages, - Images, - ImageDescriptions, - OnlineTime, - PersonInfo, - Expression, - ActionRecords, - MemoryChest, - MemoryConflict, - ] - inconsistencies = {} try: with db: - for model in models: + for model in MODELS: table_name = model._meta.table_name if not db.table_exists(model): continue diff --git a/src/jargon/__init__.py b/src/jargon/__init__.py new file mode 100644 index 00000000..1a60a94a --- /dev/null +++ b/src/jargon/__init__.py @@ -0,0 +1,7 @@ +from .jargon_miner import extract_and_store_jargon + +__all__ = [ + "extract_and_store_jargon", +] + + diff --git a/src/jargon/jargon_miner.py b/src/jargon/jargon_miner.py new file mode 100644 index 00000000..c0f035d8 --- /dev/null +++ b/src/jargon/jargon_miner.py @@ -0,0 +1,230 @@ +import time +import json +from typing import List +from json_repair import repair_json + +from src.common.logger import get_logger +from src.common.database.database_model import Jargon +from src.llm_models.utils_model import LLMRequest +from src.config.config import model_config +from src.chat.message_receive.chat_stream import get_chat_manager +from src.chat.utils.chat_message_builder import ( + build_anonymous_messages, + get_raw_msg_by_timestamp_with_chat_inclusive, +) +from src.chat.utils.prompt_builder import Prompt, global_prompt_manager + + +logger = get_logger("jargon") + + +def _init_prompt() -> None: + prompt_str = """ +**聊天内容** +{chat_str} + +请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。 +- 必须为对话中真实出现过的短词或短语 +- 必须是你无法理解含义的词语,或者出现频率较高的词语 +- 必须是这几种类别之一:英文或中文缩写、中文拼音短语、字母数字混合、意义不明但频繁的词汇 +- 排除:人名、@、明显的表情/图片占位、纯标点、常规功能词(如的、了、呢、啊等) +- 每个词条长度建议 2-8 个字符(不强制),尽量短小 +- 合并重复项,去重 + +分类规则: +- p(拼音缩写):由字母或字母和汉字构成的,疑似拼音简写词,例如:nb、yyds、xswl +- c(中文缩写):中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷 +- e(英文缩写):英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API + +以 JSON 数组输出,元素为对象(严格按以下结构): +[ + {{"content": "词条", "raw_content": "包含该词条的完整句子", "type": "p"}}, + {{"content": "词条2", "raw_content": "包含该词条的完整句子", "type": "c"}} +] + +现在请输出: +""" + Prompt(prompt_str, "extract_jargon_prompt") + + +_init_prompt() + + +class JargonMiner: + def __init__(self, chat_id: str) -> None: + self.chat_id = chat_id + self.last_learning_time: float = time.time() + # 频率控制,可按需调整 + self.min_messages_for_learning: int = 20 + self.min_learning_interval: float = 30 + + self.llm = LLMRequest( + model_set=model_config.model_task_config.utils, + request_type="jargon.extract", + ) + + def should_trigger(self) -> bool: + # 冷却时间检查 + if time.time() - self.last_learning_time < self.min_learning_interval: + return False + + # 拉取最近消息数量是否足够 + recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive( + chat_id=self.chat_id, + timestamp_start=self.last_learning_time, + timestamp_end=time.time(), + ) + return bool(recent_messages and len(recent_messages) >= self.min_messages_for_learning) + + async def run_once(self) -> None: + try: + if not self.should_trigger(): + return + + chat_stream = get_chat_manager().get_stream(self.chat_id) + if not chat_stream: + return + + # 拉取学习窗口内的消息 + messages = get_raw_msg_by_timestamp_with_chat_inclusive( + chat_id=self.chat_id, + timestamp_start=self.last_learning_time, + timestamp_end=time.time(), + limit=20, + ) + if not messages: + return + + chat_str: str = await build_anonymous_messages(messages) + if not chat_str.strip(): + return + + prompt: str = await global_prompt_manager.format_prompt( + "extract_jargon_prompt", + chat_str=chat_str, + ) + + response, _ = await self.llm.generate_response_async(prompt, temperature=0.2) + if not response: + return + + logger.info(f"jargon提取提示词: {prompt}") + logger.info(f"jargon提取结果: {response}") + + # 解析为JSON + entries: List[dict] = [] + try: + resp = response.strip() + parsed = None + if resp.startswith("[") and resp.endswith("]"): + parsed = json.loads(resp) + else: + repaired = repair_json(resp) + if isinstance(repaired, str): + parsed = json.loads(repaired) + else: + parsed = repaired + + if isinstance(parsed, dict): + parsed = [parsed] + + if not isinstance(parsed, list): + return + + for item in parsed: + if not isinstance(item, dict): + continue + content = str(item.get("content", "")).strip() + raw_content = str(item.get("raw_content", "")).strip() + type_str = str(item.get("type", "")).strip().lower() + + # 验证type是否为有效值 + if type_str not in ["p", "c", "e"]: + type_str = "p" # 默认值 + + if content: + entries.append({ + "content": content, + "raw_content": raw_content, + "type": type_str + }) + except Exception as e: + logger.error(f"解析jargon JSON失败: {e}; 原始: {response}") + return + + if not entries: + return + + # 去重并写入DB(按 chat_id + content 去重) + # 使用content作为去重键 + seen = set() + uniq_entries = [] + for entry in entries: + content_key = entry["content"] + if content_key not in seen: + seen.add(content_key) + uniq_entries.append(entry) + + saved = 0 + updated = 0 + for entry in uniq_entries: + content = entry["content"] + raw_content = entry["raw_content"] + type_str = entry["type"] + try: + query = ( + Jargon.select() + .where((Jargon.chat_id == self.chat_id) & (Jargon.content == content)) + ) + if query.exists(): + obj = query.get() + try: + obj.count = (obj.count or 0) + 1 + except Exception: + obj.count = 1 + # 更新raw_content和type(如果为空或需要更新) + if raw_content and not obj.raw_content: + obj.raw_content = raw_content + if type_str and not obj.type: + obj.type = type_str + obj.save() + updated += 1 + else: + Jargon.create( + content=content, + raw_content=raw_content, + type=type_str, + chat_id=self.chat_id, + is_global=False, + count=1 + ) + saved += 1 + except Exception as e: + logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}") + continue + + if saved or updated: + logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,chat_id={self.chat_id}") + self.last_learning_time = time.time() + except Exception as e: + logger.error(f"JargonMiner 运行失败: {e}") + + +class JargonMinerManager: + def __init__(self) -> None: + self._miners: dict[str, JargonMiner] = {} + + def get_miner(self, chat_id: str) -> JargonMiner: + if chat_id not in self._miners: + self._miners[chat_id] = JargonMiner(chat_id) + return self._miners[chat_id] + + +miner_manager = JargonMinerManager() + + +async def extract_and_store_jargon(chat_id: str) -> None: + miner = miner_manager.get_miner(chat_id) + await miner.run_once() + + diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py index 36af7775..8c91e867 100644 --- a/src/llm_models/model_client/openai_client.py +++ b/src/llm_models/model_client/openai_client.py @@ -444,7 +444,7 @@ def _default_normal_response_parser( choice0 = resp.choices[0] reason = getattr(choice0, "finish_reason", None) if reason and reason == "length": - print(resp) + # print(resp) _model_name = resp.model # 统一日志格式 logger.info( diff --git a/src/memory_system/curious.py b/src/memory_system/curious.py index badb421d..80bffdae 100644 --- a/src/memory_system/curious.py +++ b/src/memory_system/curious.py @@ -1,9 +1,8 @@ import time -import asyncio -from typing import List, Optional, Tuple +from typing import List, Optional from src.common.logger import get_logger from src.chat.utils.chat_message_builder import ( - get_raw_msg_before_timestamp_with_chat, + get_raw_msg_by_timestamp_with_chat_inclusive, build_readable_messages_with_id, ) from src.llm_models.utils_model import LLMRequest @@ -25,7 +24,21 @@ class CuriousDetector: model_set=model_config.model_task_config.utils, request_type="curious_detector", ) + # 触发控制 + self.last_detection_time: float = time.time() + self.min_interval_seconds: float = 60.0 + self.min_messages: int = 20 + def should_trigger(self) -> bool: + if time.time() - self.last_detection_time < self.min_interval_seconds: + return False + recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive( + chat_id=self.chat_id, + timestamp_start=self.last_detection_time, + timestamp_end=time.time(), + ) + return bool(recent_messages and len(recent_messages) >= self.min_messages) + async def detect_questions(self, recent_messages: List) -> Optional[str]: """ 检测最近消息中是否有需要提问的内容 @@ -91,6 +104,9 @@ class CuriousDetector: result_text, _ = await self.llm_request.generate_response_async(prompt, temperature=0.3) + logger.info(f"好奇心检测提示词: {prompt}") + logger.info(f"好奇心检测结果: {result_text}") + if not result_text: return None @@ -154,7 +170,20 @@ class CuriousDetector: return False -async def check_and_make_question(chat_id: str, recent_messages: List) -> bool: +class CuriousManager: + def __init__(self) -> None: + self._detectors: dict[str, CuriousDetector] = {} + + def get_detector(self, chat_id: str) -> CuriousDetector: + if chat_id not in self._detectors: + self._detectors[chat_id] = CuriousDetector(chat_id) + return self._detectors[chat_id] + + +curious_manager = CuriousManager() + + +async def check_and_make_question(chat_id: str) -> bool: """ 检查聊天记录并生成问题(如果检测到需要提问的内容) @@ -166,8 +195,20 @@ async def check_and_make_question(chat_id: str, recent_messages: List) -> bool: bool: 是否检测到并记录了问题 """ try: - detector = CuriousDetector(chat_id) - + detector = curious_manager.get_detector(chat_id) + if not detector.should_trigger(): + return False + + # 拉取窗口内消息 + recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive( + chat_id=chat_id, + timestamp_start=detector.last_detection_time, + timestamp_end=time.time(), + limit=80, + ) + if not recent_messages: + return False + # 检测是否需要提问 question = await detector.detect_questions(recent_messages) @@ -176,6 +217,7 @@ async def check_and_make_question(chat_id: str, recent_messages: List) -> bool: success = await detector.make_question_from_detection(question) if success: logger.info(f"成功检测并记录问题: {question}") + detector.last_detection_time = time.time() return True return False