better:优化了表达方式采样
This commit is contained in:
@@ -8,6 +8,8 @@ except Exception:
|
||||
_HAS_JIEBA = False
|
||||
|
||||
# Matches a maximal run of ASCII letters, digits, or underscores.
# simple_en_tokenize() uses it to pull word tokens out of lowercased text.
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")
|
||||
# Matches a string made up entirely of characters that are neither regex word
# characters (\w) nor in the range U+4E00-U+9FFF, i.e. a symbol-only token.
# NOTE(review): for str patterns Python's \w is Unicode-aware by default, so the
# explicit CJK range looks redundant; also, "_" counts as \w and is therefore
# not treated as a symbol here -- confirm both are intended.
|
||||
_SYMBOL_RE = re.compile(r'^[^\w\u4e00-\u9fff]+$')
|
||||
|
||||
def simple_en_tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercase word tokens.

    The input is lowercased, then every non-overlapping match of the
    module-level ``_WORD_RE`` pattern is collected in order of appearance.

    Args:
        text: The string to tokenize.

    Returns:
        The matched tokens; an empty list when nothing matches.
    """
    lowered = text.lower()
    tokens = _WORD_RE.findall(lowered)
    return tokens
|
||||
@@ -25,4 +27,5 @@ class Tokenizer:
|
||||
toks = [t.strip().lower() for t in jieba.cut(text) if t.strip()]
|
||||
else:
|
||||
toks = simple_en_tokenize(text)
|
||||
return [t for t in toks if t not in self.stopwords]
|
||||
# 过滤掉纯符号和停用词
|
||||
return [t for t in toks if t not in self.stopwords and not _SYMBOL_RE.match(t)]
|
||||
Reference in New Issue
Block a user