better:优化表达方式学习和分割

This commit is contained in:
SengokuCola
2025-12-18 16:39:16 +08:00
parent dd891c4b18
commit f7a2f2329a
7 changed files with 1400 additions and 35 deletions

View File

@@ -211,7 +211,40 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
if len_text < 3:
return list(text) if random.random() < 0.01 else [text]
# 定义分隔符(包含换行符,换行符必须强制分割)
# 先标记哪些位置位于成对引号内部,避免在引号内部进行句子分割
# 支持的引号包括:中英文单/双引号和常见中文书名号/引号
quote_chars = {
'"',
"'",
"",
"",
"",
"",
"",
"",
"",
"",
}
inside_quote = [False] * len_text
in_quote = False
current_quote_char = ""
for idx, ch in enumerate(text):
if ch in quote_chars:
# 遇到引号时切换状态(英文引号本身开闭相同,用同一个字符表示)
if not in_quote:
in_quote = True
current_quote_char = ch
inside_quote[idx] = False
else:
# 只有遇到同一类引号才视为关闭
if ch == current_quote_char or ch in {'"', "'"} and current_quote_char in {'"', "'"}:
in_quote = False
current_quote_char = ""
inside_quote[idx] = False
else:
inside_quote[idx] = in_quote
# 定义分隔符(包含换行符)
separators = {"", ",", " ", "", ";", "\n"}
segments = []
current_segment = ""
@@ -221,31 +254,35 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
while i < len(text):
char = text[i]
if char in separators:
# 换行符必须强制分割,不受其他规则影响
if char == "\n":
can_split = True
# 引号内部一律不作为分割点(包括换行)
if inside_quote[i]:
can_split = False
else:
# 检查分割条件
can_split = True
# 检查分隔符左右是否有冒号(中英文),如果有则不分割
if i > 0:
prev_char = text[i - 1]
if prev_char in {":", ""}:
can_split = False
if i < len(text) - 1:
next_char = text[i + 1]
if next_char in {":", ""}:
can_split = False
# 如果左右没有冒号,再检查空格的特殊情况
if can_split and char == " " and i > 0 and i < len(text) - 1:
prev_char = text[i - 1]
next_char = text[i + 1]
# 不分割数字和数字、数字和英文、英文和数字、英文和英文之间的空格
prev_is_alnum = prev_char.isdigit() or is_english_letter(prev_char)
next_is_alnum = next_char.isdigit() or is_english_letter(next_char)
if prev_is_alnum and next_is_alnum:
can_split = False
# 换行符不在引号内时,一律强制分割
if char == "\n":
can_split = True
else:
# 检查分割条件
can_split = True
# 检查分隔符左右是否有冒号(中英文),如果有则不分割
if i > 0:
prev_char = text[i - 1]
if prev_char in {":", ""}:
can_split = False
if i < len(text) - 1:
next_char = text[i + 1]
if next_char in {":", ""}:
can_split = False
# 如果左右没有冒号,再检查空格的特殊情况
if can_split and char == " " and i > 0 and i < len(text) - 1:
prev_char = text[i - 1]
next_char = text[i + 1]
# 不分割数字和数字、数字和英文、英文和数字、英文和英文之间的空格
prev_is_alnum = prev_char.isdigit() or is_english_letter(prev_char)
next_is_alnum = next_char.isdigit() or is_english_letter(next_char)
if prev_is_alnum and next_is_alnum:
can_split = False
if can_split:
# 只有当当前段不为空时才添加