feat(annotation): 添加文本分段标注功能

- 引入文本分割器实现长文本按200字符自动分段
- 增加分段状态管理和段落导航界面
- 支持按段落保存和加载标注数据
- 实现分段模式下的标注状态跟踪
- 扩展API接口支持段落索引参数
- 添加分段相关的数据模型定义
2026-01-19 18:18:19 +08:00
parent 3af0f0b3a1
commit 71c4a8d8a6
6 changed files with 395 additions and 41 deletions
--- a/runtime/datamate-python/app/module/annotation/service/annotation_text_splitter.py
+++ b/runtime/datamate-python/app/module/annotation/service/annotation_text_splitter.py
@@ -0,0 +1,113 @@
+"""
+标注文本分割器
+
+职责：将长文本按指定规则分割为适合标注的段落
+- 最大200字符（CJK按1字符计）
+- 分隔符：。；以及正则 \\?|\\!|(?<!\\d)\\.(?!\\d)
+- 超长句子保持完整
+"""
+
+import re
+from typing import List, TypedDict
+
+
+class SegmentInfo(TypedDict):
+    """One segment produced by AnnotationTextSplitter.split()."""
+    idx: int           # zero-based position of the segment in the returned list
+    text: str          # segment text, trailing separators included
+    start: int         # offset of the segment's first character in the source text
+    end: int           # exclusive end offset, i.e. start + len(text)
+
+
+class AnnotationTextSplitter:
+    """Splits long text into sentence-aligned segments for annotation."""
+
+    # Sentence terminators: full-width 。 and ；, any ? or !, and a '.' that has no digit
+    # immediately before or after it. The capture group makes re.split() keep separators.
+    SEPARATOR_PATTERN = r'(。|；|\?|\!|(?<!\d)\.(?!\d))'
+
+    def __init__(self, max_chars: int = 200):
+        """
+        Create a splitter.
+
+        Args:
+            max_chars: soft upper bound on segment length, as counted by len() (default 200)
+        """
+        self.max_chars = max_chars
+
+    def split(self, text: str) -> List[SegmentInfo]:
+        """
+        Split text into a list of segments.
+
+        Rules:
+        1. Cut the text into sentences at the separators (each separator stays with its sentence)
+        2. Greedily merge consecutive sentences while the merged length stays within max_chars
+        3. A single sentence longer than max_chars becomes its own segment (it is never cut)
+
+        Args:
+            text: the text to split
+
+        Returns:
+            List of segments, each a dict with idx, text, start and end
+        """
+        if not text:  # empty input still yields a single, empty segment
+            return [{"idx": 0, "text": "", "start": 0, "end": 0}]
+
+        # Text that already fits in one segment is returned unsplit
+        if len(text) <= self.max_chars:
+            return [{"idx": 0, "text": text, "start": 0, "end": len(text)}]
+
+        # Split on separators; the capture group keeps each separator as its own list item
+        parts = re.split(self.SEPARATOR_PATTERN, text)
+
+        # Re-attach every separator to the text that precedes it
+        sentences: List[str] = []
+        i = 0
+        while i < len(parts):
+            part = parts[i]
+            # Is the next item a separator (i.e. does it match the capture group)?
+            if i + 1 < len(parts) and re.fullmatch(self.SEPARATOR_PATTERN, parts[i + 1]):
+                # Append the separator to the current part and step over both items
+                part += parts[i + 1]
+                i += 2
+            else:
+                i += 1
+            # Skip empty strings (the trailing item is empty when the text ends with a separator)
+            if part:
+                sentences.append(part)
+
+        # Greedy merge: pack consecutive sentences into segments
+        segments: List[SegmentInfo] = []
+        current_text = ""
+        current_start = 0
+        idx = 0
+
+        for sentence in sentences:
+            if not current_text:
+                # Start a new segment
+                current_text = sentence
+            elif len(current_text) + len(sentence) <= self.max_chars:
+                # Still fits: extend the current segment
+                current_text += sentence
+            else:
+                # Current segment is full: emit it and start the next one with this sentence
+                segments.append({
+                    "idx": idx,
+                    "text": current_text,
+                    "start": current_start,
+                    "end": current_start + len(current_text)
+                })
+                idx += 1
+                current_start += len(current_text)  # segments are contiguous in the source text
+                current_text = sentence
+
+        # Emit the final, still-open segment
+        if current_text:
+            segments.append({
+                "idx": idx,
+                "text": current_text,
+                "start": current_start,
+                "end": current_start + len(current_text)
+            })
+
+        return segments