init datamate

2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions
--- a/runtime/ops/slicer/slide_annotation_slicer/init.py
+++ b/runtime/ops/slicer/slide_annotation_slicer/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='AnnotationSlicer',
+                          module_path="ops.slicer.slide_annotation_slicer.process")
--- a/runtime/ops/slicer/slide_annotation_slicer/metadata.yml
+++ b/runtime/ops/slicer/slide_annotation_slicer/metadata.yml
@@ -0,0 +1,16 @@
+name: '病理图片标注切片'
+name_en: 'Pathological Image Annotation Slicing'
+description: '根据标注文件对病理图片进行切片。'
+description_en: 'Slice pathology images based on the annotation file.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'AnnotationSlicer'
+version: '1.0.0'
+types:
+  - 'consolidate'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
--- a/runtime/ops/slicer/slide_annotation_slicer/process.py
+++ b/runtime/ops/slicer/slide_annotation_slicer/process.py
@@ -0,0 +1,117 @@
+# -- encoding: utf-8 --
+
+"""
+Description: 医疗图片按坐标切片
+Create: 2025/02/08 11:00
+"""
+import copy
+import time
+import os
+from typing import List, Dict, Any
+
+import xml.etree.ElementTree as ET
+from loguru import logger
+
+import numpy as np
+import cv2
+from openslide import OpenSlide
+
+from datamate.core.base_op import Slicer
+from datamate.common.utils import bytes_transform
+
+
+class AnnotationSlicer(Slicer):
+
+    def __init__(self, *args, **kwargs):
+        super(AnnotationSlicer, self).__init__(*args, **kwargs)
+        self.last_ops = True
+
+    def execute(self, sample: Dict[str, Any]) -> List[Dict[str, Any]]:
+        start = time.time()
+
+        slide: OpenSlide = OpenSlide(sample[self.filepath_key])
+        if not isinstance(slide, OpenSlide):
+            logger.error("Not desired <Image.Image> object.")
+
+        annotation_path: str = sample["extraFilePath"]
+        annotations = self.parse_xml_annotations(annotation_path)
+
+        patch_num = self.auto_coordinate_slicer(sample, slide, annotations)
+        sample["slice_num"] = patch_num
+
+        file_name = sample[self.filename_key]
+        logger.info(f"fileName: {file_name}, method: CoordinateSlider costs {(time.time() - start):6f} s")
+
+        return [sample]
+
+    def parse_xml_annotations(self, xml_path: str) -> List:
+        """ 解析 XML 文件，提取所有 Annotation 的坐标和 PartOfGroup """
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+
+        annotations = []
+
+        # 找到所有 <Annotations> 标签
+        annotations_tag = root.find('Annotations')
+        if annotations_tag is None:
+            raise ValueError("未找到 Annotations 标签")
+
+        # 遍历所有 <Annotation> 标签
+        for annotation in annotations_tag.findall('Annotation'):
+            part_of_group = annotation.get('PartOfGroup')
+            coordinates = []
+            for coord in annotation.find('Coordinates').findall('Coordinate'):
+                x = float(coord.get('X'))
+                y = float(coord.get('Y'))
+                coordinates.append((x, y))
+            annotations.append({
+                'part_of_group': part_of_group,
+                'coordinates': np.array(coordinates, dtype=np.int32)
+            })
+
+        return annotations
+
+    def auto_coordinate_slicer(
+            self,
+            original_sample: Dict,
+            slide: OpenSlide,
+            annotations: List
+    ) -> int:
+        """
+        自动根据给定的标注文件切片原图像
+
+        Return: 
+            List[Content] 每个 content 都是一个 data 为 patch 的 content
+        """
+        wsi_width, wsi_height = slide.dimensions
+
+        patch_no = 0
+        # 遍历每个 Annotation
+        for _, annotation in enumerate(annotations):
+            part_of_group = annotation['part_of_group']
+            coordinates = annotation['coordinates']
+
+            # 转换坐标为整数（确保在图像范围内）
+            coordinates = coordinates.clip(min=0, max=(wsi_width, wsi_height))
+
+            # 创建掩码（mask）图像
+            mask = np.zeros((wsi_height, wsi_width), dtype=np.uint8)
+            cv2.fillPoly(mask, [coordinates], 255)  # 填充多边形区域为白色
+
+            # 找到掩码中的非零区域（肿瘤区域）
+            x, y, w, h = cv2.boundingRect(coordinates)  # 获取多边形的边界框
+
+            # 读取 WSI 图像的切片区域
+            region = slide.read_region((x, y), 0, (w, h))
+
+            # 转换为 NumPy 数组
+            region_np = np.array(region.convert("RGB"))
+
+            patch_sample = copy.deepcopy(original_sample)
+            patch_sample[self.data_key] = bytes_transform.numpy_to_bytes(region_np, '.png')
+            patch_no += 1
+            self.save_patch_sample(patch_sample, patch_no, save_format="image")
+
+        logger.info(f">>> {patch_no} annotations found and sliced.")
+
+        return patch_no