Ruff fix
This commit is contained in:
@@ -3,17 +3,20 @@ from typing import List, Optional, Set
|
||||
|
||||
try:
|
||||
import jieba
|
||||
|
||||
_HAS_JIEBA = True
|
||||
except Exception:
|
||||
_HAS_JIEBA = False
|
||||
|
||||
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")
|
||||
# 匹配纯符号的正则表达式
|
||||
_SYMBOL_RE = re.compile(r'^[^\w\u4e00-\u9fff]+$')
|
||||
_SYMBOL_RE = re.compile(r"^[^\w\u4e00-\u9fff]+$")
|
||||
|
||||
|
||||
def simple_en_tokenize(text: str) -> List[str]:
    """Lowercase *text* and return every ASCII word token, in order.

    A token is a maximal run of ``[A-Za-z0-9_]`` as matched by the
    module-level ``_WORD_RE`` pattern; punctuation and whitespace are
    dropped.
    """
    lowered = text.lower()
    return _WORD_RE.findall(lowered)
|
||||
|
||||
|
||||
class Tokenizer:
|
||||
def __init__(self, stopwords: Optional[Set[str]] = None, use_jieba: bool = True):
|
||||
self.stopwords = stopwords or set()
|
||||
@@ -28,4 +31,4 @@ class Tokenizer:
|
||||
else:
|
||||
toks = simple_en_tokenize(text)
|
||||
# 过滤掉纯符号和停用词
|
||||
return [t for t in toks if t not in self.stopwords and not _SYMBOL_RE.match(t)]
|
||||
return [t for t in toks if t not in self.stopwords and not _SYMBOL_RE.match(t)]
|
||||
|
||||
Reference in New Issue
Block a user