init datamate

2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions
--- a/runtime/ops/slicer/init.py
+++ b/runtime/ops/slicer/init.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+import sys
+from pathlib import Path
+from datamate.common.utils.custom_importer import CustomImporter
+
+
+def _configure_importer():
+    base_path = Path(__file__).resolve().parent
+    sys.meta_path.append(CustomImporter(base_path))
+
+
+_configure_importer()
+
+
+def _import_operators():
+    from . import slide_simple_slicer
+    from . import slide_annotation_slicer
+    from . import segmentation
+
+
+_import_operators()
--- a/runtime/ops/slicer/segmentation/init.py
+++ b/runtime/ops/slicer/segmentation/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='Segmentation',
+                          module_path="ops.slicer.segmentation.process")
--- a/runtime/ops/slicer/segmentation/metadata.yml
+++ b/runtime/ops/slicer/segmentation/metadata.yml
@@ -0,0 +1,16 @@
+name: '文本切分'
+name_en: 'Text Segmentation'
+description: '将文本切分成多个切片。'
+description_en: 'Split text into multiple chunks.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'Segmentation'
+version: '1.0.0'
+types:
+  - 'consolidate'
+modal: 'text'
+effect:
+  before: ''
+  after: ''
+inputs: 'text'
+outputs: 'text'
--- a/runtime/ops/slicer/segmentation/process.py
+++ b/runtime/ops/slicer/segmentation/process.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+Description: 文本切分算子
+Create: 2023/11/09 10:17
+"""
+
+import random
+from typing import List, Dict, Any
+
+from loguru import logger
+
+from datamate.common.utils.text_splitter import TextSplitter
+from datamate.core.base_op import Slicer
+
+
+class TextSegmentationOperator:
+    def __init__(self, max_characters, chunk_size, chunk_overlap):
+        try:
+            self.text_splitter = TextSplitter(max_characters, chunk_size, chunk_overlap)
+        except Exception as err:
+            logger.exception(f"init text splitter failed, error is： {err}")
+            raise Exception(83001, "init text splitter failed") from None
+
+    def process(self, input_data: str) -> List[str]:
+        if input_data.strip() == "":
+            logger.info("input text is empty, return empty chunks.")
+            return []
+        return self.text_splitter.split_text(input_data)
+
+
+class Segmentation(Slicer):
+    """切片算法插件"""
+
+    def __init__(self, *args, **kwargs):
+        super(Segmentation, self).__init__(*args, **kwargs)
+        self.max_characters = kwargs.get("maxCharacters", -1)
+        self.chunk_size = kwargs.get("chunkSize", 800)
+        self.chunk_overlap = kwargs.get("chunkOverlap", 100)
+        self.slice_num = kwargs.get("sliceNum", 5)
+        self.op = TextSegmentationOperator(self.max_characters, self.chunk_size, self.chunk_overlap)
+        self.last_ops = True
+
+    def execute(self, sample: Dict[str, Any]) -> List[Dict]:
+
+        try:
+            chunks = self.op.process(sample[self.text_key])
+        except Exception as err:
+            logger.exception(f"split text failed, error is: {err}")
+            raise Exception(83002, "init text splitter failed") from None
+        num_to_sample = min(self.slice_num, len(chunks))
+        sampled_indices = random.sample(chunks, num_to_sample)
+        for idx, chunk in enumerate(sampled_indices):
+            temp_sample = {self.text_key: chunk, self.data_key: "", self.export_path_key: sample[self.export_path_key],
+                           self.filename_key: sample[self.filename_key], self.fileid_key: sample[self.fileid_key],
+                           "sequenceId": str(idx), "chunkSize": str(len(chunk))}
+            self.save_patch_sample(temp_sample, idx, save_format="text")
+
+        sample["fileNum"] = len(chunks)
+        sample[self.text_key] = "Success"
+
+        return [sample]
--- a/runtime/ops/slicer/slide_annotation_slicer/init.py
+++ b/runtime/ops/slicer/slide_annotation_slicer/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='AnnotationSlicer',
+                          module_path="ops.slicer.slide_annotation_slicer.process")
--- a/runtime/ops/slicer/slide_annotation_slicer/metadata.yml
+++ b/runtime/ops/slicer/slide_annotation_slicer/metadata.yml
@@ -0,0 +1,16 @@
+name: '病理图片标注切片'
+name_en: 'Pathological Image Annotation Slicing'
+description: '根据标注文件对病理图片进行切片。'
+description_en: 'Slicing pathology images based on the annotation file.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'AnnotationSlicer'
+version: '1.0.0'
+types:
+  - 'consolidate'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
--- a/runtime/ops/slicer/slide_annotation_slicer/process.py
+++ b/runtime/ops/slicer/slide_annotation_slicer/process.py
@@ -0,0 +1,117 @@
+# -- encoding: utf-8 --
+
+"""
+Description: 医疗图片按坐标切片
+Create: 2025/02/08 11:00
+"""
+import copy
+import time
+import os
+from typing import List, Dict, Any
+
+import xml.etree.ElementTree as ET
+from loguru import logger
+
+import numpy as np
+import cv2
+from openslide import OpenSlide
+
+from datamate.core.base_op import Slicer
+from datamate.common.utils import bytes_transform
+
+
+class AnnotationSlicer(Slicer):
+
+    def __init__(self, *args, **kwargs):
+        super(AnnotationSlicer, self).__init__(*args, **kwargs)
+        self.last_ops = True
+
+    def execute(self, sample: Dict[str, Any]) -> List[Dict[str, Any]]:
+        start = time.time()
+
+        slide: OpenSlide = OpenSlide(sample[self.filepath_key])
+        if not isinstance(slide, OpenSlide):
+            logger.error("Not desired <Image.Image> object.")
+
+        annotation_path: str = sample["extraFilePath"]
+        annotations = self.parse_xml_annotations(annotation_path)
+
+        patch_num = self.auto_coordinate_slicer(sample, slide, annotations)
+        sample["slice_num"] = patch_num
+
+        file_name = sample[self.filename_key]
+        logger.info(f"fileName: {file_name}, method: CoordinateSlider costs {(time.time() - start):6f} s")
+
+        return [sample]
+
+    def parse_xml_annotations(self, xml_path: str) -> List:
+        """ 解析 XML 文件，提取所有 Annotation 的坐标和 PartOfGroup """
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+
+        annotations = []
+
+        # 找到所有 <Annotations> 标签
+        annotations_tag = root.find('Annotations')
+        if annotations_tag is None:
+            raise ValueError("未找到 Annotations 标签")
+
+        # 遍历所有 <Annotation> 标签
+        for annotation in annotations_tag.findall('Annotation'):
+            part_of_group = annotation.get('PartOfGroup')
+            coordinates = []
+            for coord in annotation.find('Coordinates').findall('Coordinate'):
+                x = float(coord.get('X'))
+                y = float(coord.get('Y'))
+                coordinates.append((x, y))
+            annotations.append({
+                'part_of_group': part_of_group,
+                'coordinates': np.array(coordinates, dtype=np.int32)
+            })
+
+        return annotations
+
+    def auto_coordinate_slicer(
+            self,
+            original_sample: Dict,
+            slide: OpenSlide,
+            annotations: List
+    ) -> int:
+        """
+        自动根据给定的标注文件切片原图像
+
+        Return: 
+            List[Content] 每个 content 都是一个 data 为 patch 的 content
+        """
+        wsi_width, wsi_height = slide.dimensions
+
+        patch_no = 0
+        # 遍历每个 Annotation
+        for _, annotation in enumerate(annotations):
+            part_of_group = annotation['part_of_group']
+            coordinates = annotation['coordinates']
+
+            # 转换坐标为整数（确保在图像范围内）
+            coordinates = coordinates.clip(min=0, max=(wsi_width, wsi_height))
+
+            # 创建掩码（mask）图像
+            mask = np.zeros((wsi_height, wsi_width), dtype=np.uint8)
+            cv2.fillPoly(mask, [coordinates], 255)  # 填充多边形区域为白色
+
+            # 找到掩码中的非零区域（肿瘤区域）
+            x, y, w, h = cv2.boundingRect(coordinates)  # 获取多边形的边界框
+
+            # 读取 WSI 图像的切片区域
+            region = slide.read_region((x, y), 0, (w, h))
+
+            # 转换为 NumPy 数组
+            region_np = np.array(region.convert("RGB"))
+
+            patch_sample = copy.deepcopy(original_sample)
+            patch_sample[self.data_key] = bytes_transform.numpy_to_bytes(region_np, '.png')
+            patch_no += 1
+            self.save_patch_sample(patch_sample, patch_no, save_format="image")
+
+        logger.info(f">>> {patch_no} annotations found and sliced.")
+
+        return patch_no
--- a/runtime/ops/slicer/slide_simple_slicer/init.py
+++ b/runtime/ops/slicer/slide_simple_slicer/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='SimpleSlicer',
+                          module_path="ops.slicer.slide_simple_slicer.process")
--- a/runtime/ops/slicer/slide_simple_slicer/metadata.yml
+++ b/runtime/ops/slicer/slide_simple_slicer/metadata.yml
@@ -0,0 +1,43 @@
+name: '病理图片自动切片'
+name_en: 'Pathological Image Automatic Slicing'
+description: '按照给定规格对病理图片进行切片。'
+description_en: 'Slicing pathology image with given box size.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'SimpleSlicer'
+version: '1.0.0'
+types:
+  - 'consolidate'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
+settings:
+  sliceSize:
+    name: 切片尺寸
+    type: multiple
+    properties:
+      - type: inputNumber
+        name: 宽度
+        description: 像素
+        defaultVal: 128
+        min: 100
+        max: 1024
+        step: 1
+      - type: inputNumber
+        name: 高度
+        description: 像素
+        defaultVal: 128
+        min: 100
+        max: 1024
+        step: 1
+  overlap:
+    name: 重叠比例
+    description: 重叠比例越大，切片间重叠面积越大。
+    type: slider
+    defaultVal: 0
+    min: 0
+    max: 0.9
+    step: 0.1
--- a/runtime/ops/slicer/slide_simple_slicer/process.py
+++ b/runtime/ops/slicer/slide_simple_slicer/process.py
@@ -0,0 +1,98 @@
+# -- encoding: utf-8 --
+
+"""
+Description: slice a medical (pathology) image into fixed-size patches on a regular grid
+Create: 2025/02/08 11:00
+"""
+import copy
+import time
+from typing import List, Tuple, Dict, Any
+
+import itertools
+from loguru import logger
+
+from openslide import OpenSlide
+import numpy as np
+
+from datamate.core.base_op import Slicer
+
+from datamate.common.utils import bytes_transform
+
+
+class SimpleSlicer(Slicer):
+
+    def __init__(self, *args, **kwargs):
+        super(SimpleSlicer, self).__init__(*args, **kwargs)
+
+        self._target_size = kwargs.get("sliceSize", [128, 128])
+        self._overlap = kwargs.get("overlap", 0)
+        self.last_ops = True
+
+        if not isinstance(self._target_size, List):
+            raise TypeError(f"<targetSize> received as {type(self._target_size)}, but expected list.")
+        if len(self._target_size) != 2:
+            raise ValueError(f"<targetSize> has {len(self._target_size)} elements, but expected 2.")
+        if not all(isinstance(dim, int) for dim in self._target_size):
+            raise TypeError(f"Elements in <targetSize> must be integers, but got {self._target_size}.")
+        if not isinstance(self._overlap, (int, float)):
+            raise TypeError(f"<overlap> received as {type(self._overlap)}, but expected int.")
+        if self._overlap < 0 or self._overlap > 1:
+            raise ValueError(
+                f"<overlap> received an out of range value: {self._overlap}, "
+                f"but (0 <= overlap <= 1) is expected."
+            )
+
+    def execute(self, sample: Dict[str, Any]) -> List[Dict]:
+        start = time.time()
+
+        slide: OpenSlide = OpenSlide(sample["filePath"])
+        if not isinstance(slide, OpenSlide):
+            logger.error("Not desired <Image.Image> object.")
+        dimensions: tuple[int, int] = slide.dimensions
+
+        target_size = self._target_size
+        overlap = self._overlap
+
+        patch_num = self.auto_simple_slicer(sample, slide, dimensions, target_size, overlap)
+        sample["slice_num"] = patch_num
+
+        file_name = sample[self.filename_key]
+        logger.info(f"fileName: {file_name}, method: CoordinateSlider costs {(time.time() - start):6f} s")
+
+        return [sample]
+
+    def auto_simple_slicer(
+            self,
+            original_sample: Dict[str, Any],
+            slide: OpenSlide,
+            dimensions: Tuple[int, int],
+            target_size: Tuple[int, int],
+            overlap: float
+    ) -> int:
+        """
+        自动根据给定规格切片原图像
+
+        Return: 
+            List[Content] 每个 content 都是一个 data 为 patch 的 content
+        """
+        stride_x, stride_y = map(lambda x: int(x * (1 - overlap)), target_size)
+        w, h = target_size
+
+        patch_no = 0
+        for x, y in itertools.product(
+                range(0, dimensions[0] - w + 1, stride_x),
+                range(0, dimensions[1] - h + 1, stride_y)
+        ):
+            # 切片
+            region = slide.read_region((x, y), 0, target_size)
+
+            region_np = np.array(region.convert("RGB"))
+
+            patch_sample = copy.deepcopy(original_sample)
+            patch_sample[self.data_key] = bytes_transform.numpy_to_bytes(region_np, ".png")
+            patch_no += 1
+            self.save_patch_sample(patch_sample, patch_no, save_format="image")
+
+        logger.info(f"One image sliced into pieces: {patch_no}")
+
+        return patch_no