ref:分离know模块和cli模块
This commit is contained in:
3
src/know_u/__init__.py
Normal file
3
src/know_u/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Knowledge utilities package for Maisaka.
|
||||
"""
|
||||
239
src/know_u/knowledge.py
Normal file
239
src/know_u/knowledge.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"""
|
||||
Maisaka knowledge retrieval and learning helpers.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from src.chat.message_receive.message import SessionMessage
|
||||
from src.chat.utils.utils import is_bot_self
|
||||
from src.common.data_models.llm_service_data_models import LLMGenerationOptions
|
||||
from src.common.logger import get_logger
|
||||
from src.services.llm_service import LLMServiceClient
|
||||
|
||||
from src.know_u.knowledge_store import KNOWLEDGE_CATEGORIES, get_knowledge_store
|
||||
from src.maisaka.message_adapter import get_message_role, get_message_text, parse_speaker_content
|
||||
|
||||
# Module-level logger shared by the knowledge retrieval and learning helpers below.
logger = get_logger("maisaka_knowledge")
|
||||
|
||||
# Chinese phrases which, when found anywhere in a model reply, are treated as
# "no category is relevant" by extract_category_ids_from_result.
NO_RESULT_KEYWORDS = ["无", "没有", "不适用", "无需", "无相关"]
|
||||
|
||||
|
||||
def extract_category_ids_from_result(result: str) -> List[str]:
    """Extract valid category ids from an LLM result string.

    The reply is expected to list category ids separated by whitespace or
    punctuation. Replies that signal "nothing relevant" (English or Chinese
    keywords) yield an empty list.

    Args:
        result: Raw text returned by the model; may be empty or ``None``.

    Returns:
        The ids that are keys of ``KNOWLEDGE_CATEGORIES``, de-duplicated and in
        order of first appearance. Empty when the reply is blank, contains a
        "no result" keyword, or names no known category.
    """
    normalized = result.strip() if result else ""
    if not normalized:
        return []

    lowered = normalized.lower()
    if any(keyword in lowered for keyword in ("none", "no relevant", "no_need", "no need")):
        return []
    if any(keyword in normalized for keyword in NO_RESULT_KEYWORDS):
        return []

    # Treat ASCII and full-width list punctuation alike as separators; models
    # answering in Chinese frequently emit "，" or "、" between ids.
    separators = str.maketrans(dict.fromkeys(",，、;；", " "))
    # Wrapper characters that may cling to an id, e.g. `[1,`, `"2"`, `3.`.
    wrappers = "[](){}【】（）\"'“”‘’.。"

    category_ids: List[str] = []
    for part in normalized.translate(separators).split():
        candidate = part.strip(wrappers)
        if candidate in KNOWLEDGE_CATEGORIES and candidate not in category_ids:
            category_ids.append(candidate)

    return category_ids
|
||||
|
||||
|
||||
async def retrieve_relevant_knowledge(
    knowledge_analyzer: Any,
    chat_history: List[SessionMessage],
) -> str:
    """Return formatted knowledge snippets relevant to the given chat history.

    The analyzer is shown the chat history together with the store's category
    summary and answers with the category ids worth retrieving.

    Args:
        knowledge_analyzer: Object exposing an awaitable
            ``analyze_knowledge_need(chat_history, categories_summary)``.
        chat_history: Messages of the current conversation.

    Returns:
        The formatted knowledge text, or ``""`` when no category is selected
        or when analysis/formatting raises (the error is logged).
    """
    knowledge_store = get_knowledge_store()
    summary = knowledge_store.get_categories_summary()

    try:
        wanted_ids = await knowledge_analyzer.analyze_knowledge_need(chat_history, summary)
        return knowledge_store.get_formatted_knowledge(wanted_ids) if wanted_ids else ""
    except Exception:
        logger.exception("Failed to retrieve relevant knowledge")
        return ""
|
||||
|
||||
|
||||
class KnowledgeLearner:
    """Extract user-profile knowledge from recent chat messages and persist it.

    Messages are buffered through ``add_messages``. ``learn`` renders the most
    recent buffered messages into a text excerpt, asks the LLM for a JSON array
    of ``{"category_id", "content"}`` entries and writes the accepted entries
    to the shared knowledge store.
    """

    # Number of most recent buffered messages rendered into the excerpt.
    _EXCERPT_MESSAGE_LIMIT = 30

    def __init__(self, session_id: str) -> None:
        """Create a learner for one chat session.

        Args:
            session_id: Identifier recorded in the metadata of stored entries.
        """
        self._session_id = session_id
        self._store = get_knowledge_store()
        self._llm = LLMServiceClient(task_name="utils", request_type="maisaka.knowledge.learn")
        self._learning_lock = asyncio.Lock()
        self._messages_cache: List[SessionMessage] = []

    def add_messages(self, messages: List[SessionMessage]) -> None:
        """Buffer messages that are waiting to be learned from."""
        self._messages_cache.extend(messages)

    def get_cache_size(self) -> int:
        """Return the number of buffered messages not yet processed."""
        return len(self._messages_cache)

    async def learn(self) -> int:
        """Extract knowledge from the buffered messages and store it.

        Messages covered by a completed pass are removed from the buffer so
        they are not sent to the model again and the buffer cannot grow
        without bound. When the model call fails the buffer is left untouched
        so the same messages are retried on the next call.

        Returns:
            The number of entries newly added to the knowledge store.
        """
        if not self._messages_cache:
            return 0

        async with self._learning_lock:
            # Snapshot how many messages this pass covers: messages appended
            # while the model call is awaited must stay queued for the next pass.
            pending_count = len(self._messages_cache)

            chat_excerpt = self._build_chat_excerpt()
            if not chat_excerpt:
                # Nothing learnable in these messages; drop them.
                del self._messages_cache[:pending_count]
                return 0

            prompt = self._build_learning_prompt(chat_excerpt)
            try:
                result = await self._llm.generate_response(
                    prompt=prompt,
                    options=LLMGenerationOptions(
                        temperature=0.1,
                        max_tokens=512,
                    ),
                )
            except Exception:
                logger.exception("Knowledge learning model call failed")
                return 0

            knowledge_items = self._parse_learning_result(result.response or "")

            added_count = 0
            for item in knowledge_items:
                stored = self._store.add_knowledge(
                    category_id=item["category_id"],
                    content=item["content"],
                    metadata={
                        "session_id": self._session_id,
                        "source": "maisaka_learning",
                    },
                )
                if stored:
                    added_count += 1

            # Only trim once storing succeeded; if the store raised above, the
            # messages stay buffered and the store's de-duplication makes a
            # retry harmless.
            del self._messages_cache[:pending_count]

            if not knowledge_items:
                logger.debug("Knowledge learning finished without extracted entries")
                return 0

            if added_count > 0:
                logger.info(
                    f"Maisaka knowledge learning finished: session_id={self._session_id} added={added_count}"
                )
            else:
                logger.debug(
                    f"Maisaka knowledge learning finished without new entries: session_id={self._session_id}"
                )

            return added_count

    def _build_chat_excerpt(self) -> str:
        """Render the most recent buffered messages as ``speaker: text`` lines.

        Messages whose role is ``assistant`` or ``tool``, messages sent by the
        bot itself, and messages without text are skipped.

        Returns:
            The lines joined by newlines; ``""`` when nothing qualifies.
        """
        lines: List[str] = []
        for message in self._messages_cache[-self._EXCERPT_MESSAGE_LIMIT:]:
            if get_message_role(message) in ("assistant", "tool"):
                continue
            if is_bot_self(message.platform, message.message_info.user_info.user_id):
                continue

            raw_text = get_message_text(message).strip()
            if not raw_text:
                continue

            speaker_name, body = parse_speaker_content(raw_text)
            # Fall back to the raw text when no separate body was parsed out.
            visible_text = (body or raw_text).strip()
            if not visible_text:
                continue

            speaker = speaker_name or message.message_info.user_info.user_nickname or "用户"
            lines.append(f"{speaker}: {visible_text}")

        return "\n".join(lines)

    def _build_learning_prompt(self, chat_excerpt: str) -> str:
        """Build the extraction prompt from the category list and the excerpt."""
        categories_text = "\n".join(
            f"{category_id}. {category_name}" for category_id, category_name in KNOWLEDGE_CATEGORIES.items()
        )
        return (
            "你是一个用户画像知识提取器,需要从聊天记录里提取稳定、可复用的用户事实。\n"
            "只提取用户明确表达或高置信度可归纳的信息,不要猜测,不要提取一次性情绪,不要重复表述。\n"
            "如果没有可提取内容,返回空数组 []。\n"
            "输出必须是 JSON 数组,每项格式为 "
            '{"category_id":"分类编号","content":"简洁中文陈述"}。\n'
            "分类如下:\n"
            f"{categories_text}\n\n"
            "聊天记录:\n"
            f"{chat_excerpt}"
        )

    @staticmethod
    def _decode_json(text: str) -> Any:
        """Decode ``text`` as JSON, tolerating prose around a JSON array.

        Raises:
            json.JSONDecodeError: If neither the whole text nor its outermost
                ``[...]`` span is valid JSON.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("["), text.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(text[start : end + 1])

    def _parse_learning_result(self, result: str) -> List[Dict[str, str]]:
        """Parse the model reply into validated knowledge entries.

        Args:
            result: Raw model reply, optionally wrapped in a Markdown code
                fence or surrounded by prose.

        Returns:
            Entries of the form ``{"category_id": ..., "content": ...}`` whose
            category id is a key of ``KNOWLEDGE_CATEGORIES`` and whose content
            is non-empty after whitespace normalisation, without duplicates.
            Empty when the reply is blank, not valid JSON, or not a JSON array.
        """
        normalized = result.strip()
        if not normalized:
            return []

        if "```" in normalized:
            normalized = normalized.replace("```json", "").replace("```JSON", "").replace("```", "").strip()

        try:
            parsed = self._decode_json(normalized)
        except json.JSONDecodeError:
            logger.warning("Knowledge learning result is not valid JSON")
            return []

        if not isinstance(parsed, list):
            return []

        normalized_items: List[Dict[str, str]] = []
        seen_pairs: set[tuple[str, str]] = set()
        for item in parsed:
            if not isinstance(item, dict):
                continue

            raw_content = item.get("content")
            if raw_content is None:
                # JSON null must not be stringified into the literal "None".
                continue
            content = " ".join(str(raw_content).split())
            if not content:
                continue

            category_id = str(item.get("category_id", "")).strip()
            if category_id not in KNOWLEDGE_CATEGORIES:
                continue

            pair = (category_id, content)
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            normalized_items.append(
                {
                    "category_id": category_id,
                    "content": content,
                }
            )

        return normalized_items
|
||||
197
src/know_u/knowledge_store.py
Normal file
197
src/know_u/knowledge_store.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
Maisaka knowledge store.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import json
|
||||
|
||||
# Knowledge data lives in <project root>/mai_knowledge; the project root is
# taken to be two levels above the directory containing this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
KNOWLEDGE_DATA_DIR = PROJECT_ROOT.joinpath("mai_knowledge")
KNOWLEDGE_FILE = KNOWLEDGE_DATA_DIR.joinpath("knowledge.json")
|
||||
|
||||
|
||||
# Category id -> display name. An id is the 1-based position of the name in the
# tuple below, rendered as a string; the ids are used as keys of the stored
# knowledge, so new categories should only ever be appended at the end.
KNOWLEDGE_CATEGORIES = {
    str(position): name
    for position, name in enumerate(
        (
            "性别",
            "性格",
            "饮食口味",
            "交友喜好",
            "情绪/理性倾向",
            "兴趣爱好",
            "职业/专业",
            "生活习惯",
            "价值观",
            "沟通风格",
            "学习方式",
            "压力应对方式",
        ),
        start=1,
    )
}
|
||||
|
||||
|
||||
class KnowledgeStore:
    """Simple JSON-backed knowledge store for Maisaka.

    Features:
        - persisted to a JSON file (``KNOWLEDGE_FILE``)
        - user-profile knowledge grouped by category id
        - basic de-duplication on whitespace-normalised content
    """

    def __init__(self) -> None:
        """Create the data directory if needed and load existing knowledge."""
        self._knowledge: Dict[str, List[Dict[str, Any]]] = self._empty_knowledge()
        self._ensure_data_dir()
        self._load()

    @staticmethod
    def _empty_knowledge() -> Dict[str, List[Dict[str, Any]]]:
        """Return a mapping with an empty item list for every known category."""
        return {category_id: [] for category_id in KNOWLEDGE_CATEGORIES}

    def _ensure_data_dir(self) -> None:
        """Make sure the data directory exists."""
        KNOWLEDGE_DATA_DIR.mkdir(parents=True, exist_ok=True)

    def _load(self) -> None:
        """Load knowledge from the data file.

        Unknown categories, non-list category values and non-dict items are
        dropped. If the file is missing, unreadable or not a JSON object the
        store starts empty; read/parse failures are logged instead of being
        swallowed silently.
        """
        import logging  # local import: the module has no logger of its own

        self._knowledge = self._empty_knowledge()
        if not KNOWLEDGE_FILE.exists():
            return

        try:
            with open(KNOWLEDGE_FILE, "r", encoding="utf-8") as file:
                loaded = json.load(file)
        except Exception:
            # Best effort: keep running with an empty store, but leave a trace.
            logging.getLogger(__name__).exception(
                "Failed to load knowledge file %s; starting with an empty store", KNOWLEDGE_FILE
            )
            return

        if not isinstance(loaded, dict):
            logging.getLogger(__name__).warning(
                "Knowledge file %s does not contain a JSON object; starting with an empty store",
                KNOWLEDGE_FILE,
            )
            return

        for category_id in KNOWLEDGE_CATEGORIES:
            category_items = loaded.get(category_id, [])
            if isinstance(category_items, list):
                self._knowledge[category_id] = [item for item in category_items if isinstance(item, dict)]

    def _save(self) -> None:
        """Write the knowledge to the data file.

        The data is written to a temporary sibling file which then replaces
        the real file, so a failure mid-write cannot leave a truncated
        ``knowledge.json`` behind.

        Raises:
            OSError: If the file cannot be written or replaced.
            TypeError: If the knowledge contains values JSON cannot encode.
        """
        self._ensure_data_dir()
        temp_file = KNOWLEDGE_FILE.with_name(f"{KNOWLEDGE_FILE.name}.tmp")
        try:
            with open(temp_file, "w", encoding="utf-8") as file:
                json.dump(self._knowledge, file, ensure_ascii=False, indent=2)
            temp_file.replace(KNOWLEDGE_FILE)
        except Exception:
            # Do not leave a half-written temporary file lying around.
            try:
                temp_file.unlink()
            except OSError:
                pass
            raise

    @staticmethod
    def _normalize_content(content: str) -> str:
        """Collapse all whitespace runs to single spaces (used for de-duplication)."""
        return " ".join(str(content).strip().split())

    def add_knowledge(
        self,
        category_id: str,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> bool:
        """Add one knowledge entry and persist the store.

        Args:
            category_id: Category id; must be a key of ``KNOWLEDGE_CATEGORIES``.
            content: Knowledge text.
            metadata: Optional extra data stored with the entry.

        Returns:
            ``True`` if the entry was added; ``False`` if the category is
            unknown, the content is empty, or an entry with the same
            normalised content already exists in the category.

        Raises:
            OSError, TypeError: If persisting fails; the entry is then removed
                from memory again so memory and disk stay consistent.
        """
        if category_id not in KNOWLEDGE_CATEGORIES:
            return False

        normalized_content = self._normalize_content(content)
        if not normalized_content:
            return False

        items = self._knowledge.setdefault(category_id, [])
        for item in items:
            if self._normalize_content(str(item.get("content", ""))) == normalized_content:
                return False

        # One clock reading so that id and created_at describe the same instant.
        now = datetime.now()
        knowledge_item = {
            "id": f"know_{category_id}_{now.timestamp()}",
            "content": normalized_content,
            # Copy so later mutation of the caller's dict cannot alter the entry.
            "metadata": dict(metadata) if metadata else {},
            "created_at": now.isoformat(),
        }
        items.append(knowledge_item)
        try:
            self._save()
        except Exception:
            items.pop()
            raise
        return True

    def get_category_knowledge(self, category_id: str) -> List[Dict[str, Any]]:
        """Return all entries of a category (empty list for unknown ids)."""
        return self._knowledge.get(category_id, [])

    def get_all_knowledge(self) -> Dict[str, List[Dict[str, Any]]]:
        """Return the complete category-id -> entries mapping."""
        return self._knowledge

    def get_category_name(self, category_id: str) -> str:
        """Return the display name of a category, or a placeholder if unknown."""
        return KNOWLEDGE_CATEGORIES.get(category_id, "未知分类")

    def get_categories_summary(self) -> str:
        """Return one line per category with its entry count.

        Intended for a model to decide which categories are worth retrieving.
        """
        lines: List[str] = []
        for category_id, category_name in KNOWLEDGE_CATEGORIES.items():
            count = len(self._knowledge.get(category_id, []))
            count_text = f"{count}条" if count > 0 else "无数据"
            lines.append(f"{category_id}. {category_name} ({count_text})")
        return "\n".join(lines)

    def get_formatted_knowledge(self, category_ids: List[str], limit_per_category: int = 5) -> str:
        """Format the most recent entries of the given categories as text.

        Args:
            category_ids: Category ids to include, in output order.
            limit_per_category: Maximum number of (most recent) entries per
                category. Values ``<= 0`` select nothing.

        Returns:
            For each category with usable content, a ``【name】`` header followed
            by ``- content`` lines, all joined by newlines; ``""`` if nothing
            qualifies.
        """
        # items[-0:] would be the whole list, so a non-positive limit has to be
        # handled explicitly rather than by slicing.
        if limit_per_category <= 0:
            return ""

        parts: List[str] = []
        for category_id in category_ids:
            items = self.get_category_knowledge(category_id)
            if not items:
                continue

            bullet_lines: List[str] = []
            for item in items[-limit_per_category:]:
                content = str(item.get("content", "")).strip()
                if content:
                    bullet_lines.append(f"- {content}")
            if not bullet_lines:
                # Avoid emitting a header with nothing underneath it.
                continue

            parts.append(f"【{self.get_category_name(category_id)}】")
            parts.extend(bullet_lines)

        return "\n".join(parts)

    def get_stats(self) -> Dict[str, Any]:
        """Return counts and data-file information for the store."""
        total_items = sum(len(items) for items in self._knowledge.values())
        data_exists = KNOWLEDGE_FILE.exists()
        return {
            "total_categories": len(KNOWLEDGE_CATEGORIES),
            "total_items": total_items,
            "data_file": str(KNOWLEDGE_FILE),
            "data_exists": data_exists,
            "data_size_kb": KNOWLEDGE_FILE.stat().st_size / 1024 if data_exists else 0,
        }
|
||||
|
||||
|
||||
# Lazily created process-wide store; stays None until the first
# get_knowledge_store() call.
_knowledge_store_instance: Optional[KnowledgeStore] = None


def get_knowledge_store() -> KnowledgeStore:
    """Return the shared KnowledgeStore, creating it on first use."""
    global _knowledge_store_instance
    store = _knowledge_store_instance
    if store is None:
        store = KnowledgeStore()
        _knowledge_store_instance = store
    return store
|
||||
Reference in New Issue
Block a user