352 lines
14 KiB
Python
352 lines
14 KiB
Python
"""
Personality trait analysis system based on chat logs.
"""
|
||
|
||
from typing import Dict, List
|
||
import json
|
||
import os
|
||
from pathlib import Path
|
||
from dotenv import load_dotenv
|
||
import sys
|
||
import random
|
||
from collections import defaultdict
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
from datetime import datetime
|
||
import matplotlib.font_manager as fm
|
||
|
||
# Locate the project root (three directory levels above this file's folder)
# and the production environment file that is expected to live there.
current_dir = Path(__file__).resolve().parent
project_root = current_dir.parent.parent.parent
env_path = project_root / ".env.prod"

# Make the project root importable so the absolute ``src.plugins...`` imports
# that follow resolve when this file is executed directly as a script.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
sys.path.append(root_path)
|
||
|
||
from src.plugins.personality.scene import get_scene_by_factor, PERSONALITY_SCENES # noqa: E402
|
||
from src.plugins.personality.questionnaire import FACTOR_DESCRIPTIONS # noqa: E402
|
||
from src.plugins.personality.offline_llm import LLMModel # noqa: E402
|
||
from src.plugins.personality.who_r_u import MessageAnalyzer # noqa: E402
|
||
|
||
# Load environment variables from the project's .env.prod file when present;
# otherwise only report that the file is missing and carry on.
if env_path.exists():
    print(f"从 {env_path} 加载环境变量")
    load_dotenv(env_path)
else:
    print(f"未找到环境变量文件: {env_path}")
    print("将使用默认配置")
|
||
|
||
class ChatBasedPersonalityEvaluator:
    """Score a user's Big Five personality traits from their chat history.

    Chat excerpts are fetched through ``MessageAnalyzer``, each excerpt is
    rated by ``LLMModel`` on the five traits (1-6, with 0 meaning "cannot
    judge"), and the per-trait averages are written to a JSON file together
    with a trend plot of the running averages.
    """

    def __init__(self):
        """Create the helper objects and pick up to three random scenes per trait."""
        # The keys are the trait names requested from the LLM; the values are
        # placeholders and are not updated anywhere in this class.
        self.personality_traits = {"开放性": 0, "严谨性": 0, "外向性": 0, "宜人性": 0, "神经质": 0}
        self.scenarios = []
        self.message_analyzer = MessageAnalyzer()
        self.llm = LLMModel()
        # Valid (> 0) scores per trait, in evaluation order.
        self.trait_scores_history = defaultdict(list)

        all_traits = list(PERSONALITY_SCENES)
        for trait in all_traits:
            scenes = get_scene_by_factor(trait)
            if not scenes:
                continue

            # Loop-invariant for this trait, so computed once per trait.
            other_traits = [t for t in all_traits if t != trait]
            scene_keys = list(scenes.keys())
            selected_scenes = random.sample(scene_keys, min(3, len(scene_keys)))

            for scene_key in selected_scenes:
                scene = scenes[scene_key]
                dimensions = [trait]
                # random.choice raises IndexError on an empty list, so a
                # secondary trait is only attached when another trait exists.
                if other_traits:
                    dimensions.append(random.choice(other_traits))
                self.scenarios.append({
                    "场景": scene["scenario"],
                    "评估维度": dimensions,
                    "场景编号": scene_key,
                })

    def analyze_chat_context(self, messages: List[Dict]) -> str:
        """Render messages as ``"nickname: text"`` lines, one per non-empty message.

        Args:
            messages: Message dicts. The nickname is read from
                ``user_info.user_nickname`` and the text from
                ``processed_plain_text``, falling back to
                ``detailed_plain_text`` only when the former key is absent.

        Returns:
            The concatenated lines, each terminated by a newline; messages
            with empty text are skipped.
        """
        lines = []
        for msg in messages:
            # ``or {}`` also covers an explicit ``"user_info": None``.
            user_info = msg.get('user_info') or {}
            nickname = user_info.get('user_nickname', '未知用户')
            content = msg.get('processed_plain_text', msg.get('detailed_plain_text', ''))
            if content:
                lines.append(f"{nickname}: {content}\n")
        return "".join(lines)

    @staticmethod
    def _parse_scores(ai_response: str, dimensions: List[str]):
        """Extract clamped per-dimension scores from the model's reply.

        Args:
            ai_response: Raw reply text, expected to contain one JSON object.
            dimensions: Trait names to read from that object.

        Returns:
            A dict with exactly one entry per requested dimension, each value
            clamped to the range 0-6, or ``None`` when the reply contains no
            ``{...}`` span. Missing, non-numeric or NaN values become 0 and
            keys outside ``dimensions`` are ignored.

        Raises:
            json.JSONDecodeError: If the ``{...}`` span is not valid JSON.
        """
        start_idx = ai_response.find("{")
        end_idx = ai_response.rfind("}") + 1
        if start_idx == -1 or end_idx <= start_idx:
            return None

        raw_scores = json.loads(ai_response[start_idx:end_idx])
        parsed = {}
        for dim in dimensions:
            try:
                value = float(raw_scores.get(dim, 0))
            except (TypeError, ValueError):
                # One unusable value should not discard the other dimensions.
                value = 0
            if value != value:
                # NaN compares unequal to itself; min(6, nan) would yield 6.
                value = 0
            parsed[dim] = max(0, min(6, value))
        return parsed

    def evaluate_chat_response(
            self, user_nickname: str, chat_context: str, dimensions: List[str] = None) -> Dict[str, float]:
        """Ask the LLM to rate ``user_nickname`` on every trait for one chat excerpt.

        Args:
            user_nickname: Name of the participant being rated.
            chat_context: The chat excerpt shown to the model.
            dimensions: Accepted for backward compatibility but ignored; all
                traits in ``self.personality_traits`` are always evaluated.

        Returns:
            Trait name -> score in the range 0-6, where 0 means the trait
            could not be judged. All zeros when the reply is unusable or the
            LLM call fails.
        """
        # Always evaluate every dimension, whatever the caller passed in.
        dimensions = list(self.personality_traits.keys())

        dimension_descriptions = []
        for dim in dimensions:
            desc = FACTOR_DESCRIPTIONS.get(dim, "")
            if desc:
                dimension_descriptions.append(f"- {dim}:{desc}")

        dimensions_text = "\n".join(dimension_descriptions)

        prompt = f"""请根据以下聊天记录,评估"{user_nickname}"在大五人格模型中的维度得分(1-6分)。

聊天记录:
{chat_context}

需要评估的维度说明:
{dimensions_text}

请按照以下格式输出评估结果,注意,你的评价对象是"{user_nickname}"(仅输出JSON格式):
{{
"开放性": 分数,
"严谨性": 分数,
"外向性": 分数,
"宜人性": 分数,
"神经质": 分数
}}

评分标准:
1 = 非常不符合该维度特征
2 = 比较不符合该维度特征
3 = 有点不符合该维度特征
4 = 有点符合该维度特征
5 = 比较符合该维度特征
6 = 非常符合该维度特征

如果你觉得某个维度没有相关信息或者无法判断,请输出0分

请根据聊天记录的内容和语气,结合维度说明进行评分。如果维度可以评分,确保分数在1-6之间。如果没有体现,请输出0分"""

        # Deliberately broad: any failure of the LLM call or of parsing its
        # reply degrades to "no score" instead of aborting the whole run.
        try:
            ai_response, _ = self.llm.generate_response(prompt)
            scores = self._parse_scores(ai_response, dimensions)
            if scores is None:
                print("AI响应格式不正确,使用默认评分")
                return {dim: 0 for dim in dimensions}
            return scores
        except Exception as e:
            print(f"评估过程出错:{str(e)}")
            return {dim: 0 for dim in dimensions}

    def evaluate_user_personality(self, qq_id: str, num_samples: int = 10, context_length: int = 5) -> Dict:
        """Evaluate a user's personality traits from sampled chat excerpts.

        Args:
            qq_id: The user's QQ ID.
            num_samples: Number of chat excerpts to analyze.
            context_length: Context length requested for each excerpt.

        Returns:
            The result dict (also written to
            ``results/personality_result_<qq_id>.json``), or
            ``{"error": message}`` when no messages were found for the user.
        """
        # Fetch random messages of the user together with their context.
        chat_contexts, user_nickname = self.message_analyzer.get_user_random_contexts(
            qq_id, num_messages=num_samples, context_length=context_length)
        if not chat_contexts:
            return {"error": f"没有找到QQ号 {qq_id} 的消息记录"}

        final_scores = defaultdict(float)
        dimension_counts = defaultdict(int)
        chat_samples = []

        # Start from a clean history so repeated calls do not mix users.
        self.trait_scores_history.clear()

        for chat_context in chat_contexts:
            scores = self.evaluate_chat_response(user_nickname, chat_context)

            chat_samples.append({
                "聊天内容": chat_context,
                "评估维度": list(self.personality_traits.keys()),
                "评分": scores,
            })

            for dimension, score in scores.items():
                # A score of 0 means "could not be judged" and is excluded
                # from both the averages and the history.
                if score > 0:
                    final_scores[dimension] += score
                    dimension_counts[dimension] += 1
                    self.trait_scores_history[dimension].append(score)

        # Average per trait; 0 marks a trait without any valid score.
        average_scores = {}
        for dimension in self.personality_traits:
            if dimension_counts[dimension] > 0:
                average_scores[dimension] = round(final_scores[dimension] / dimension_counts[dimension], 2)
            else:
                average_scores[dimension] = 0

        self._generate_trend_plot(qq_id, user_nickname)

        result = {
            "用户QQ": qq_id,
            "用户昵称": user_nickname,
            "样本数量": len(chat_samples),
            "人格特征评分": average_scores,
            "维度评估次数": dict(dimension_counts),
            "详细样本": chat_samples,
            "特质得分历史": dict(self.trait_scores_history),
        }

        # Persist the result; the path is relative to the working directory.
        os.makedirs("results", exist_ok=True)
        result_file = f"results/personality_result_{qq_id}.json"
        with open(result_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        return result

    @staticmethod
    def _cumulative_averages(scores: List[float]) -> List[float]:
        """Return the running average of the valid (> 0) scores.

        A non-positive score repeats the previous average; non-positive
        scores before the first valid one produce no entry at all.
        """
        averages = []
        total = 0
        valid_count = 0
        for score in scores:
            if score > 0:
                total += score
                valid_count += 1
                averages.append(total / valid_count)
            elif averages:
                averages.append(averages[-1])
        return averages

    def _generate_trend_plot(self, qq_id: str, user_nickname: str):
        """Plot each trait's running average score and save it as a PNG.

        The image is written to
        ``results/plots/personality_trend_<qq_id>_<timestamp>.png``.

        Args:
            qq_id: Used in the output file name.
            user_nickname: Used in the plot title.
        """
        # Collect installed fonts whose name suggests Chinese glyph support.
        font_markers = ('简', 'SC', '黑', '宋', '微软')
        chinese_fonts = [
            font.name for font in fm.fontManager.ttflist
            if any(marker in font.name for marker in font_markers)
        ]

        if chinese_fonts:
            plt.rcParams['font.sans-serif'] = chinese_fonts + ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS']
        else:
            # No Chinese font found: romanize the nickname with pypinyin when
            # it is installed, otherwise fall back to a generic English name.
            try:
                from pypinyin import lazy_pinyin
                user_nickname = ''.join(lazy_pinyin(user_nickname))
            except ImportError:
                user_nickname = "User"

        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

        fig = plt.figure(figsize=(12, 6))
        try:
            plt.style.use('bmh')

            colors = {
                "开放性": "#FF9999",
                "严谨性": "#66B2FF",
                "外向性": "#99FF99",
                "宜人性": "#FFCC99",
                "神经质": "#FF99CC",
            }

            # Running average per trait; traits without valid scores are skipped.
            cumulative_averages = {}
            for trait, scores in self.trait_scores_history.items():
                averages = self._cumulative_averages(scores)
                if averages:
                    cumulative_averages[trait] = averages

            for trait, averages in cumulative_averages.items():
                x = list(range(1, len(averages) + 1))
                plt.plot(x, averages, 'o-', label=trait, color=colors.get(trait), linewidth=2, markersize=8)

                # A linear fit needs at least two points; with one point the
                # fit is rank-deficient and the trend line is meaningless.
                if len(averages) >= 2:
                    z = np.polyfit(x, averages, 1)
                    p = np.poly1d(z)
                    plt.plot(x, p(x), '--', color=colors.get(trait), alpha=0.5)

            plt.title(f"{user_nickname} 的人格特质累计平均分变化趋势", fontsize=14, pad=20)
            plt.xlabel("评估次数", fontsize=12)
            plt.ylabel("累计平均分", fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            if cumulative_averages:
                # Without labelled lines legend() has nothing to show.
                plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            plt.ylim(0, 7)
            plt.tight_layout()

            os.makedirs("results/plots", exist_ok=True)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            plot_file = f"results/plots/personality_trend_{qq_id}_{timestamp}.png"
            plt.savefig(plot_file, dpi=300, bbox_inches='tight')
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)
|
||
|
||
def analyze_user_personality(qq_id: str, num_samples: int = 10, context_length: int = 5) -> str:
    """Run a personality evaluation for one user and format it as text.

    Args:
        qq_id: The user's QQ ID.
        num_samples: Number of chat excerpts to analyze.
        context_length: Context length requested for each excerpt.

    Returns:
        The formatted report, or the evaluator's error message when the
        result carries an ``"error"`` key.
    """
    evaluator = ChatBasedPersonalityEvaluator()
    result = evaluator.evaluate_user_personality(qq_id, num_samples, context_length)

    if "error" in result:
        return result["error"]

    eval_counts = result["维度评估次数"]
    score_history = result["特质得分历史"]

    # Collect the report pieces and join once instead of repeated ``+=``.
    parts = [
        f"QQ号 {qq_id} ({result['用户昵称']}) 的人格特征分析结果:\n",
        "=" * 50 + "\n\n",
        "人格特征评分:\n",
    ]

    for trait, score in result["人格特征评分"].items():
        times_scored = eval_counts.get(trait, 0)
        if score == 0:
            # An average of 0 means the trait never received a valid score.
            parts.append(f"{trait}: 数据不足,无法判断 (评估次数: {times_scored})\n")
        else:
            parts.append(f"{trait}: {score}/6 (评估次数: {times_scored})\n")

        # Describe the trend only when at least two valid scores exist,
        # since a slope cannot be fitted through fewer points.
        valid_scores = [s for s in score_history.get(trait, []) if s != 0]
        if len(valid_scores) > 1:
            trend = np.polyfit(range(len(valid_scores)), valid_scores, 1)[0]
            if abs(trend) < 0.1:
                trend_desc = "保持稳定"
            elif trend > 0:
                trend_desc = "呈上升趋势"
            else:
                trend_desc = "呈下降趋势"
            parts.append(f" 变化趋势: {trend_desc} (斜率: {trend:.2f})\n")

    parts.append(f"\n分析样本数量:{result['样本数量']}\n")
    parts.append(f"结果已保存至:results/personality_result_{qq_id}.json\n")
    parts.append("变化趋势图已保存至:results/plots/目录\n")

    return "".join(parts)
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: set test_qq to the QQ ID that should be analyzed.
    test_qq = "1026294844"
    print(analyze_user_personality(test_qq, num_samples=30, context_length=30))
|