You've already forked DataMate
feat(annotation): 添加文本分段标注功能
- 引入文本分割器实现长文本按200字符自动分段 - 增加分段状态管理和段落导航界面 - 支持按段落保存和加载标注数据 - 实现分段模式下的标注状态跟踪 - 扩展API接口支持段落索引参数 - 添加分段相关的数据模型定义
This commit is contained in:
@@ -0,0 +1,113 @@
|
||||
"""
|
||||
标注文本分割器
|
||||
|
||||
职责:将长文本按指定规则分割为适合标注的段落
|
||||
- 最大200字符(CJK按1字符计)
|
||||
- 分隔符:。;以及正则 \\?|\\!|(?<!\\d)\\.(?!\\d)
|
||||
- 超长句子保持完整
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, TypedDict
|
||||
|
||||
|
||||
class SegmentInfo(TypedDict):
    """Describe one segment of a text that has been split for annotation.

    Keys:
        idx: Index of the segment.
        text: Text of the segment.
        start: Start position of the segment in the original text.
        end: End position of the segment in the original text.
    """

    idx: int
    text: str
    start: int
    end: int
|
||||
|
||||
|
||||
class AnnotationTextSplitter:
    """Split long text into sentence-aligned segments for annotation.

    Rules applied by :meth:`split`:

    * The text is cut into sentences at the separators described by
      ``SEPARATOR_PATTERN``; every separator stays attached to the sentence
      it terminates.
    * Sentences are merged greedily into one segment for as long as the
      segment stays within ``max_chars`` characters (``len()`` units, so a
      CJK character counts as one).
    * A single sentence longer than ``max_chars`` is never cut; it becomes a
      segment of its own.
    """

    # Sentence separators. The pattern must keep exactly ONE capturing group:
    # that is what makes ``re.split`` return the separators in between the
    # text pieces, which ``_split_sentences`` relies on.
    #   。                 full-width period
    #   ;  /  \uff1b       ASCII semicolon / full-width semicolon
    #   \?  /  \!          ASCII question mark / exclamation mark
    #   (?<!\d)\.(?!\d)    ASCII period not adjacent to a digit ("3.14" stays whole)
    SEPARATOR_PATTERN = r'(。|;|\uff1b|\?|\!|(?<!\d)\.(?!\d))'

    def __init__(self, max_chars: int = 200):
        """Initialise the splitter.

        Args:
            max_chars: Maximum number of characters per segment (default 200).
        """
        self.max_chars = max_chars

    def split(self, text: str) -> "List[SegmentInfo]":
        """Split ``text`` into a list of segments.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with the keys ``idx``, ``text``, ``start`` and
            ``end``. ``start``/``end`` are offsets into ``text`` such that
            ``text[start:end]`` equals the segment's ``text``. Empty input
            yields a single empty segment; text no longer than ``max_chars``
            yields a single segment holding the whole text.
        """
        if not text:
            return [self._make_segment(0, "", 0)]

        # Short text needs no splitting.
        if len(text) <= self.max_chars:
            return [self._make_segment(0, text, 0)]

        return self._merge_sentences(self._split_sentences(text))

    def _split_sentences(self, text: str) -> List[str]:
        """Cut ``text`` into non-empty sentences that keep their separators."""
        parts = re.split(self.SEPARATOR_PATTERN, text)

        # With one capturing group ``parts`` alternates
        # [text, sep, text, sep, ..., text]; the trailing text has no
        # separator, hence the padding with "".
        bodies = parts[0::2]
        separators = parts[1::2] + [""]

        sentences: List[str] = []
        for body, separator in zip(bodies, separators):
            if not body and separator and sentences:
                # A separator that directly follows another one ("?!", "...")
                # belongs to the sentence before it. Keeping it as its own
                # sentence would let a segment boundary fall inside the run.
                sentences[-1] += separator
            elif body or separator:
                sentences.append(body + separator)
        return sentences

    def _merge_sentences(self, sentences: List[str]) -> "List[SegmentInfo]":
        """Greedily pack consecutive sentences into segments."""
        segments: "List[SegmentInfo]" = []
        pending: List[str] = []  # sentences of the segment being built
        pending_len = 0          # total length of ``pending``
        start = 0                # offset of the current segment in the text

        for sentence in sentences:
            # Close the current segment when this sentence would overflow it.
            # An empty segment always accepts the sentence, so an over-long
            # sentence ends up alone in its segment instead of being cut.
            if pending and pending_len + len(sentence) > self.max_chars:
                segments.append(
                    self._make_segment(len(segments), "".join(pending), start)
                )
                start += pending_len
                pending = []
                pending_len = 0
            pending.append(sentence)
            pending_len += len(sentence)

        # Flush the last segment.
        if pending:
            segments.append(
                self._make_segment(len(segments), "".join(pending), start)
            )
        return segments

    @staticmethod
    def _make_segment(idx: int, text: str, start: int) -> "SegmentInfo":
        """Build one segment record; ``end`` is ``start + len(text)``."""
        return {"idx": idx, "text": text, "start": start, "end": start + len(text)}
|
||||
Reference in New Issue
Block a user