init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
    """Append a ``CustomImporter`` rooted at this package's directory to ``sys.meta_path``."""
    package_dir = Path(__file__).resolve().parent
    importer = CustomImporter(package_dir)
    sys.meta_path.append(importer)


# Executed at import time so the importer is in place before the operator
# sub-packages are imported further down in this module.
_configure_importer()
def _import_operators():
    """Import every operator sub-package of this package, in a fixed order.

    The imported names are not used here; the imports are performed for the
    side effects of executing each sub-package's ``__init__`` module.
    """
    from . import (  # noqa: F401
        slide_simple_slicer,
        slide_annotation_slicer,
        segmentation,
    )


_import_operators()

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the 'Segmentation' operator under its module path.
OPERATORS.register_module(
    module_name='Segmentation',
    module_path="ops.slicer.segmentation.process",
)

View File

@@ -0,0 +1,16 @@
name: '文本切分'
name_en: 'Text Segmentation'
description: '将文本切分成多个切片。'
description_en: 'Text Segmentation.'
language: 'python'
vendor: 'huawei'
raw_id: 'Segmentation'
version: '1.0.0'
types:
- 'consolidate'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,62 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 文本切分算子
Create: 2023/11/09 10:17
"""
import random
from typing import List, Dict, Any
from loguru import logger
from datamate.common.utils.text_splitter import TextSplitter
from datamate.core.base_op import Slicer
class TextSegmentationOperator:
    """Split a text into chunks by delegating to a ``TextSplitter`` instance."""

    def __init__(self, max_characters, chunk_size, chunk_overlap):
        """Create the underlying ``TextSplitter``.

        Args:
            max_characters: Forwarded to ``TextSplitter`` as its first argument.
            chunk_size: Forwarded to ``TextSplitter`` as its second argument.
            chunk_overlap: Forwarded to ``TextSplitter`` as its third argument.

        Raises:
            Exception: ``Exception(83001, "init text splitter failed")`` when the
                splitter cannot be constructed; the original error is logged and
                deliberately not chained.
        """
        try:
            splitter = TextSplitter(max_characters, chunk_size, chunk_overlap)
        except Exception as err:
            logger.exception(f"init text splitter failed, error is: {err}")
            raise Exception(83001, "init text splitter failed") from None
        self.text_splitter = splitter

    def process(self, input_data: str) -> List[str]:
        """Return the chunks of ``input_data``; blank input yields an empty list."""
        has_content = bool(input_data.strip())
        if has_content:
            return self.text_splitter.split_text(input_data)
        logger.info("input text is empty, return empty chunks.")
        return []
class Segmentation(Slicer):
    """Text slicing operator: split a sample's text and save a random subset of chunks."""

    def __init__(self, *args, **kwargs):
        """Read the slicing options from ``kwargs`` and build the text splitter.

        Recognised keyword options (with their defaults):
            maxCharacters (-1), chunkSize (800), chunkOverlap (100), sliceNum (5).
        """
        super(Segmentation, self).__init__(*args, **kwargs)
        self.max_characters = kwargs.get("maxCharacters", -1)
        self.chunk_size = kwargs.get("chunkSize", 800)
        self.chunk_overlap = kwargs.get("chunkOverlap", 100)
        self.slice_num = kwargs.get("sliceNum", 5)
        self.op = TextSegmentationOperator(self.max_characters, self.chunk_size, self.chunk_overlap)
        self.last_ops = True

    def execute(self, sample: Dict[str, Any]) -> List[Dict]:
        """Split the sample text, save up to ``slice_num`` randomly chosen chunks.

        Args:
            sample: Input record; its text is read from ``sample[self.text_key]``.

        Returns:
            A one-element list holding the input ``sample``, updated in place with
            ``fileNum`` and with its text replaced by ``"Success"``.

        Raises:
            Exception: ``Exception(83002, "split text failed")`` when splitting
                fails; the original error is logged and deliberately not chained.
        """
        try:
            chunks = self.op.process(sample[self.text_key])
        except Exception as err:
            logger.exception(f"split text failed, error is: {err}")
            raise Exception(83002, "split text failed") from None

        # Clamp to [0, len(chunks)]: random.sample raises ValueError for a
        # negative count or one larger than the population.
        num_to_sample = max(0, min(self.slice_num, len(chunks)))
        sampled_chunks = random.sample(chunks, num_to_sample)

        for idx, chunk in enumerate(sampled_chunks):
            temp_sample = {
                self.text_key: chunk,
                self.data_key: "",
                self.export_path_key: sample[self.export_path_key],
                self.filename_key: sample[self.filename_key],
                self.fileid_key: sample[self.fileid_key],
                "sequenceId": str(idx),
                "chunkSize": str(len(chunk)),
            }
            self.save_patch_sample(temp_sample, idx, save_format="text")

        # NOTE(review): fileNum reports the total number of chunks, not the
        # number actually sampled and saved above -- confirm this is intended.
        sample["fileNum"] = len(chunks)
        sample[self.text_key] = "Success"
        return [sample]

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the 'AnnotationSlicer' operator under its module path.
OPERATORS.register_module(
    module_name='AnnotationSlicer',
    module_path="ops.slicer.slide_annotation_slicer.process",
)

View File

@@ -0,0 +1,16 @@
name: '病理图片标注切片'
name_en: 'Pathological Image Annotation Slicing'
description: '根据标注文件对病理图片进行切片。'
description_en: 'Slicing pathology images based on an annotation file.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnnotationSlicer'
version: '1.0.0'
types:
- 'consolidate'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,117 @@
# -- encoding: utf-8 --
"""
Description: 医疗图片按坐标切片
Create: 2025/02/08 11:00
"""
import copy
import time
import os
from typing import List, Dict, Any
import xml.etree.ElementTree as ET
from loguru import logger
import numpy as np
import cv2
from openslide import OpenSlide
from datamate.core.base_op import Slicer
from datamate.common.utils import bytes_transform
class AnnotationSlicer(Slicer):
    """Cut one patch per XML annotation out of a whole-slide image.

    For every ``<Annotation>`` in the annotation file, the bounding box of its
    polygon is read from level 0 of the slide and passed to
    ``save_patch_sample`` as a PNG-encoded patch.
    """

    def __init__(self, *args, **kwargs):
        super(AnnotationSlicer, self).__init__(*args, **kwargs)
        self.last_ops = True

    def execute(self, sample: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Slice the slide referenced by ``sample`` according to its annotation file.

        Args:
            sample: Input record. The slide path is read from
                ``sample[self.filepath_key]`` and the annotation XML path from
                ``sample["extraFilePath"]``.

        Returns:
            A one-element list holding ``sample`` with ``slice_num`` set to the
            number of patches produced.
        """
        start = time.time()
        # OpenSlide() either returns an OpenSlide instance or raises, so no
        # isinstance check is needed afterwards.
        slide: OpenSlide = OpenSlide(sample[self.filepath_key])
        try:
            annotation_path: str = sample["extraFilePath"]
            annotations = self.parse_xml_annotations(annotation_path)
            patch_num = self.auto_coordinate_slicer(sample, slide, annotations)
        finally:
            # Always release the native slide handle, even when slicing fails.
            slide.close()
        sample["slice_num"] = patch_num
        file_name = sample[self.filename_key]
        logger.info(f"fileName: {file_name}, method: CoordinateSlider costs {(time.time() - start):6f} s")
        return [sample]

    def parse_xml_annotations(self, xml_path: str) -> List:
        """Parse the XML file and collect coordinates and PartOfGroup of every Annotation.

        Args:
            xml_path: Path of the annotation XML file.

        Returns:
            A list with one dict per ``<Annotation>``: ``part_of_group`` (the
            ``PartOfGroup`` attribute) and ``coordinates`` (an ``(N, 2)`` int32
            array of X/Y pairs; ``N`` is 0 when the annotation has no coordinates).

        Raises:
            ValueError: If the root element has no ``<Annotations>`` child.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Locate the <Annotations> container.
        annotations_tag = root.find('Annotations')
        if annotations_tag is None:
            raise ValueError("未找到 Annotations 标签")

        annotations = []
        # Walk every <Annotation> element.
        for annotation in annotations_tag.findall('Annotation'):
            coordinates_tag = annotation.find('Coordinates')
            # A missing <Coordinates> element is treated as "no points".
            coordinate_nodes = [] if coordinates_tag is None else coordinates_tag.findall('Coordinate')
            coordinates = [
                (float(coord.get('X')), float(coord.get('Y')))
                for coord in coordinate_nodes
            ]
            annotations.append({
                'part_of_group': annotation.get('PartOfGroup'),
                # reshape keeps the (N, 2) layout even when there are no points.
                'coordinates': np.array(coordinates, dtype=np.int32).reshape(-1, 2)
            })
        return annotations

    def auto_coordinate_slicer(
        self,
        original_sample: Dict,
        slide: OpenSlide,
        annotations: List
    ) -> int:
        """Slice the slide into one patch per annotation bounding box.

        Args:
            original_sample: Record that is deep-copied for every patch.
            slide: Open slide to read regions from.
            annotations: Output of ``parse_xml_annotations``.

        Returns:
            The number of patches handed to ``save_patch_sample``.
        """
        wsi_width, wsi_height = slide.dimensions
        # int32 bounds keep the clipped array int32; clipping against plain
        # Python ints in a tuple may promote to int64, which OpenCV's point
        # functions do not accept.
        bounds = np.array([wsi_width, wsi_height], dtype=np.int32)
        patch_no = 0

        for annotation in annotations:
            coordinates = annotation['coordinates']
            if len(coordinates) == 0:
                logger.warning("Annotation without coordinates skipped.")
                continue

            # Keep the polygon inside the slide.
            coordinates = np.clip(coordinates, 0, bounds).astype(np.int32)

            # Only the polygon's bounding box is needed; no full-slide mask is
            # built because nothing reads it and it would be slide-sized.
            x, y, w, h = cv2.boundingRect(coordinates)

            # Read the bounding-box region from level 0 and convert it to RGB.
            region = slide.read_region((x, y), 0, (w, h))
            region_np = np.array(region.convert("RGB"))

            patch_sample = copy.deepcopy(original_sample)
            patch_sample[self.data_key] = bytes_transform.numpy_to_bytes(region_np, '.png')
            patch_no += 1
            self.save_patch_sample(patch_sample, patch_no, save_format="image")

        logger.info(f">>> {patch_no} annotations found and sliced.")
        return patch_no

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the 'SimpleSlicer' operator under its module path.
OPERATORS.register_module(
    module_name='SimpleSlicer',
    module_path="ops.slicer.slide_simple_slicer.process",
)

View File

@@ -0,0 +1,43 @@
name: '病理图片自动切片'
name_en: 'Pathological Image Automatic Slicing'
description: '按照给定规格对病理图片进行切片。'
description_en: 'Slicing pathology image with given box size.'
language: 'python'
vendor: 'huawei'
raw_id: 'SimpleSlicer'
version: '1.0.0'
types:
- 'consolidate'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
sliceSize:
name: 切片尺寸
type: multiple
properties:
- type: inputNumber
name: 宽度
description: 像素
defaultVal: 128
min: 100
max: 1024
step: 1
- type: inputNumber
name: 高度
description: 像素
defaultVal: 128
min: 100
max: 1024
step: 1
overlap:
name: 重叠比例
description: 重叠比例越大,切片间重叠面积越大。
type: slider
defaultVal: 0
min: 0
max: 0.9
step: 0.1

View File

@@ -0,0 +1,98 @@
# -- encoding: utf-8 --
"""
Description: Slice medical (pathology) images into fixed-size patches on a regular grid
Create: 2025/02/08 11:00
"""
import copy
import time
from typing import List, Tuple, Dict, Any
import itertools
from loguru import logger
from openslide import OpenSlide
import numpy as np
from datamate.core.base_op import Slicer
from datamate.common.utils import bytes_transform
class SimpleSlicer(Slicer):
    """Cut a whole-slide image into fixed-size patches on a regular grid.

    Options read from ``kwargs``:
        sliceSize: ``[width, height]`` of each patch in pixels (default ``[128, 128]``).
        overlap: Fraction of a patch shared with its neighbour, ``0 <= overlap < 1``
            (default ``0``).
    """

    def __init__(self, *args, **kwargs):
        """Read and validate ``sliceSize`` and ``overlap``.

        Raises:
            TypeError: If ``sliceSize`` is not a list of ints or ``overlap`` is
                not a number.
            ValueError: If ``sliceSize`` does not hold two positive values or
                ``overlap`` is outside ``[0, 1)``.
        """
        super(SimpleSlicer, self).__init__(*args, **kwargs)
        self._target_size = kwargs.get("sliceSize", [128, 128])
        self._overlap = kwargs.get("overlap", 0)
        self.last_ops = True

        if not isinstance(self._target_size, list):
            raise TypeError(f"<targetSize> received as {type(self._target_size)}, but expected list.")
        if len(self._target_size) != 2:
            raise ValueError(f"<targetSize> has {len(self._target_size)} elements, but expected 2.")
        if not all(isinstance(dim, int) for dim in self._target_size):
            raise TypeError(f"Elements in <targetSize> must be integers, but got {self._target_size}.")
        # A zero or negative size yields a zero/negative stride further down.
        if any(dim <= 0 for dim in self._target_size):
            raise ValueError(f"Elements in <targetSize> must be positive, but got {self._target_size}.")
        if not isinstance(self._overlap, (int, float)):
            raise TypeError(f"<overlap> received as {type(self._overlap)}, but expected int or float.")
        # overlap == 1 would mean a stride of 0, i.e. the grid never advances.
        if self._overlap < 0 or self._overlap >= 1:
            raise ValueError(
                f"<overlap> received an out of range value: {self._overlap}, "
                f"but (0 <= overlap < 1) is expected."
            )

    def execute(self, sample: Dict[str, Any]) -> List[Dict]:
        """Slice the slide referenced by ``sample["filePath"]`` into grid patches.

        Returns:
            A one-element list holding ``sample`` with ``slice_num`` set to the
            number of patches produced.
        """
        start = time.time()
        # NOTE(review): the path is read with the literal key "filePath" here,
        # while AnnotationSlicer uses self.filepath_key -- confirm they agree.
        # OpenSlide() either returns an OpenSlide instance or raises, so no
        # isinstance check is needed afterwards.
        slide: OpenSlide = OpenSlide(sample["filePath"])
        try:
            dimensions: Tuple[int, int] = slide.dimensions
            patch_num = self.auto_simple_slicer(sample, slide, dimensions, self._target_size, self._overlap)
        finally:
            # Always release the native slide handle, even when slicing fails.
            slide.close()
        sample["slice_num"] = patch_num
        file_name = sample[self.filename_key]
        logger.info(f"fileName: {file_name}, method: SimpleSlicer costs {(time.time() - start):6f} s")
        return [sample]

    def auto_simple_slicer(
        self,
        original_sample: Dict[str, Any],
        slide: OpenSlide,
        dimensions: Tuple[int, int],
        target_size: Tuple[int, int],
        overlap: float
    ) -> int:
        """Slice the slide into patches of the given size.

        Args:
            original_sample: Record that is deep-copied for every patch.
            slide: Open slide to read regions from.
            dimensions: ``(width, height)`` of the slide at level 0.
            target_size: ``(width, height)`` of every patch.
            overlap: Fraction of a patch shared with its neighbour.

        Returns:
            The number of patches handed to ``save_patch_sample``. Only patches
            lying fully inside the slide are produced.
        """
        w, h = target_size
        # Truncation can drive the stride to 0 for small sizes and large
        # overlaps; range() rejects a zero step, so advance by at least 1 px.
        stride_x = max(1, int(w * (1 - overlap)))
        stride_y = max(1, int(h * (1 - overlap)))

        patch_no = 0
        for x, y in itertools.product(
            range(0, dimensions[0] - w + 1, stride_x),
            range(0, dimensions[1] - h + 1, stride_y)
        ):
            # Read the patch from level 0 and convert it to RGB.
            region = slide.read_region((x, y), 0, (w, h))
            region_np = np.array(region.convert("RGB"))

            patch_sample = copy.deepcopy(original_sample)
            patch_sample[self.data_key] = bytes_transform.numpy_to_bytes(region_np, ".png")
            patch_no += 1
            self.save_patch_sample(patch_sample, patch_no, save_format="image")

        logger.info(f"One image sliced into pieces: {patch_no}")
        return patch_no