feat: the memory system makes a comeback, removing instant memory and periodic memory
@@ -834,3 +834,79 @@ def parse_keywords_string(keywords_input) -> list[str]:
    # If there is no delimiter, return the single keyword
    return [keywords_str] if keywords_str else []


def cut_key_words(concept_name: str) -> list[str]:
    """Tokenize the concept name with jieba and filter out conjunctions, stop words, and punctuation."""
    concept_name_tokens = list(jieba.cut(concept_name))

    # Define common conjunctions, stop words, and punctuation
    conjunctions = {
        "和", "与", "及", "跟", "以及", "并且", "而且", "或", "或者", "并"
    }
    stop_words = {
        "的", "了", "呢", "吗", "吧", "啊", "哦", "恩", "嗯", "呀", "嘛", "哇",
        "在", "是", "很", "也", "又", "就", "都", "还", "更", "最", "被", "把",
        "给", "对", "和", "与", "及", "跟", "并", "而且", "或者", "或", "以及"
    }
    chinese_punctuations = set(",。!?、;:()【】《》“”‘’—…·-——,.!?;:()[]<>'\"/\\")

    # Strip whitespace and filter out punctuation-only tokens in a first pass
    cleaned_tokens = []
    for tok in concept_name_tokens:
        t = tok.strip()
        if not t:
            continue
        # Skip pure punctuation
        if all(ch in chinese_punctuations for ch in t):
            continue
        cleaned_tokens.append(t)

    # Merge the words on both sides of a conjunction (only when both
    # neighbours exist and are neither punctuation nor stop words)
    merged_tokens = []
    i = 0
    n = len(cleaned_tokens)
    while i < n:
        tok = cleaned_tokens[i]
        if tok in conjunctions and merged_tokens and i + 1 < n:
            left = merged_tokens[-1]
            right = cleaned_tokens[i + 1]
            # Both neighbours must be valid words
            if left and right \
                    and left not in conjunctions and right not in conjunctions \
                    and left not in stop_words and right not in stop_words \
                    and not all(ch in chinese_punctuations for ch in left) \
                    and not all(ch in chinese_punctuations for ch in right):
                # Merge into a single new token, replacing the left
                # neighbour and skipping over the right one
                combined = f"{left}{tok}{right}"
                merged_tokens[-1] = combined
                i += 2
                continue
        # Regular advance
        merged_tokens.append(tok)
        i += 1

    # Second pass: drop conjunctions, stop words, punctuation-only tokens,
    # and duplicates
    result_tokens = []
    seen = set()
    # ban_words = set(getattr(global_config.memory, "memory_ban_words", []) or [])
    for tok in merged_tokens:
        if tok in conjunctions:
            # Standalone conjunctions are discarded
            continue
        if tok in stop_words:
            continue
        # if tok in ban_words:
        #     continue
        if all(ch in chinese_punctuations for ch in tok):
            continue
        if tok not in seen:
            seen.add(tok)
            result_tokens.append(tok)

    return result_tokens
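For reference, a minimal usage sketch of the new helper. The segmentation shown is an assumption about jieba's output for this input; actual results vary with the jieba version and dictionary in use:

# Suppose jieba segments "苹果和香蕉的营养" into
# ["苹果", "和", "香蕉", "的", "营养"] (assumed segmentation).
# The conjunction "和" merges its two neighbours into one token,
# and the stop word "的" is dropped in the second filtering pass:
tokens = cut_key_words("苹果和香蕉的营养")
print(tokens)  # expected: ['苹果和香蕉', '营养']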