You've already forked DataMate
feat(annotation): 添加文本文件内容读取和多类型标签导出功能
- 新增异步函数 _read_file_content 用于安全读取文本文件内容 - 实现在导出时包含文本文件的实际内容数据 - 扩展 CSV 导出格式支持多种标注类型标签提取 - 添加对矩形标签、多边形标签、画笔标签等多种标注类型的支持 - 更新 COCO 格式导出文档说明bbox坐标转换注意事项
This commit is contained in:
@@ -27,6 +27,32 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||||||
from app.core.logging import get_logger
|
from app.core.logging import get_logger
|
||||||
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
|
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
|
||||||
|
|
||||||
|
|
||||||
|
async def _read_file_content(file_path: str, max_size: int = 10 * 1024 * 1024) -> Optional[str]:
    """Read the content of a text file without blocking the event loop.

    Intended for text files only; the bytes are decoded as UTF-8 and any
    undecodable byte sequence is replaced rather than raising.

    Args:
        file_path: Path of the file to read.
        max_size: Largest file size, in bytes, that will be read
            (default 10 MB).

    Returns:
        The decoded file content; the placeholder string
        ``"[File too large: N bytes]"`` when the file is bigger than
        ``max_size``; or ``None`` when the path is empty, does not point to
        a regular file, or cannot be read.
    """
    # Local import keeps this block self-contained regardless of the
    # module-level import list.
    import asyncio

    # Callers pass getattr(file, "file_path", ""), so empty/None is expected.
    if not file_path:
        return None

    def _read_sync() -> Optional[str]:
        """Blocking part of the read; executed in a worker thread."""
        try:
            # isfile() (rather than exists()) rejects directories up front.
            if not os.path.isfile(file_path):
                return None

            file_size = os.path.getsize(file_path)
            if file_size > max_size:
                return f"[File too large: {file_size} bytes]"

            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                return f.read()
        except (OSError, ValueError, TypeError):
            # Best-effort read: I/O failures, invalid paths (e.g. embedded
            # NUL) and unsupported path types all yield "no content".
            return None

    # File system calls are blocking; run them in the default executor so
    # the event loop stays responsive while large files are read.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _read_sync)
|
||||||
|
|
||||||
from ..schema.export import (
|
from ..schema.export import (
|
||||||
AnnotationExportItem,
|
AnnotationExportItem,
|
||||||
COCOExportFormat,
|
COCOExportFormat,
|
||||||
@@ -147,11 +173,17 @@ class AnnotationExportService:
|
|||||||
|
|
||||||
for ann, file in rows:
|
for ann, file in rows:
|
||||||
annotation_data = ann.annotation or {}
|
annotation_data = ann.annotation or {}
|
||||||
|
# 获取文件内容(如果是文本文件且用户要求包含数据)
|
||||||
|
file_content = None
|
||||||
|
if include_data:
|
||||||
|
file_path = getattr(file, "file_path", "")
|
||||||
|
file_content = await _read_file_content(file_path)
|
||||||
|
|
||||||
items.append(
|
items.append(
|
||||||
AnnotationExportItem(
|
AnnotationExportItem(
|
||||||
file_id=str(file.id),
|
file_id=str(file.id),
|
||||||
file_name=str(getattr(file, "file_name", "")),
|
file_name=str(getattr(file, "file_name", "")),
|
||||||
data={"text": ""} if include_data else None, # TEXT 类型数据需要单独获取
|
data={"text": file_content} if include_data else None,
|
||||||
annotations=[annotation_data] if annotation_data else [],
|
annotations=[annotation_data] if annotation_data else [],
|
||||||
created_at=ann.created_at,
|
created_at=ann.created_at,
|
||||||
updated_at=ann.updated_at,
|
updated_at=ann.updated_at,
|
||||||
@@ -177,12 +209,18 @@ class AnnotationExportService:
|
|||||||
file_id = str(file.id)
|
file_id = str(file.id)
|
||||||
ann = annotations.get(file_id)
|
ann = annotations.get(file_id)
|
||||||
annotation_data = ann.annotation if ann else {}
|
annotation_data = ann.annotation if ann else {}
|
||||||
|
|
||||||
|
# 获取文件内容(如果是文本文件且用户要求包含数据)
|
||||||
|
file_content = None
|
||||||
|
if include_data:
|
||||||
|
file_path = getattr(file, "file_path", "")
|
||||||
|
file_content = await _read_file_content(file_path)
|
||||||
|
|
||||||
items.append(
|
items.append(
|
||||||
AnnotationExportItem(
|
AnnotationExportItem(
|
||||||
file_id=file_id,
|
file_id=file_id,
|
||||||
file_name=str(getattr(file, "file_name", "")),
|
file_name=str(getattr(file, "file_name", "")),
|
||||||
data={"text": ""} if include_data else None,
|
data={"text": file_content} if include_data else None,
|
||||||
annotations=[annotation_data] if annotation_data else [],
|
annotations=[annotation_data] if annotation_data else [],
|
||||||
created_at=ann.created_at if ann else None,
|
created_at=ann.created_at if ann else None,
|
||||||
updated_at=ann.updated_at if ann else None,
|
updated_at=ann.updated_at if ann else None,
|
||||||
@@ -256,12 +294,14 @@ class AnnotationExportService:
|
|||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
# 提取标签信息
|
# 提取标签信息(支持多种标注类型)
|
||||||
labels = []
|
labels = []
|
||||||
for ann in item.annotations:
|
for ann in item.annotations:
|
||||||
results = ann.get("result", [])
|
results = ann.get("result", [])
|
||||||
for r in results:
|
for r in results:
|
||||||
value = r.get("value", {})
|
value = r.get("value", {})
|
||||||
|
label_type = r.get("type", "")
|
||||||
|
|
||||||
# 提取不同类型的标签值
|
# 提取不同类型的标签值
|
||||||
if "choices" in value:
|
if "choices" in value:
|
||||||
labels.extend(value["choices"])
|
labels.extend(value["choices"])
|
||||||
@@ -269,6 +309,18 @@ class AnnotationExportService:
|
|||||||
labels.append(value["text"])
|
labels.append(value["text"])
|
||||||
elif "labels" in value:
|
elif "labels" in value:
|
||||||
labels.extend(value["labels"])
|
labels.extend(value["labels"])
|
||||||
|
elif "rectanglelabels" in value:
|
||||||
|
labels.extend(value["rectanglelabels"])
|
||||||
|
elif "polygonlabels" in value:
|
||||||
|
labels.extend(value["polygonlabels"])
|
||||||
|
elif "brushlabels" in value:
|
||||||
|
labels.extend(value["brushlabels"])
|
||||||
|
elif "hypertextlabels" in value:
|
||||||
|
labels.extend(value["hypertextlabels"])
|
||||||
|
elif "timeserieslabels" in value:
|
||||||
|
labels.extend(value["timeserieslabels"])
|
||||||
|
elif "transcription" in value:
|
||||||
|
labels.append(value["transcription"])
|
||||||
|
|
||||||
writer.writerow({
|
writer.writerow({
|
||||||
"file_id": item.file_id,
|
"file_id": item.file_id,
|
||||||
@@ -286,7 +338,11 @@ class AnnotationExportService:
|
|||||||
def _export_coco(
|
def _export_coco(
|
||||||
self, items: List[AnnotationExportItem], project_name: str
|
self, items: List[AnnotationExportItem], project_name: str
|
||||||
) -> Tuple[bytes, str, str]:
|
) -> Tuple[bytes, str, str]:
|
||||||
"""导出为 COCO 格式(适用于目标检测标注)"""
|
"""导出为 COCO 格式(适用于目标检测标注)
|
||||||
|
|
||||||
|
注意:当前实现中图片宽高被设置为0,因为需要读取实际图片文件获取尺寸。
|
||||||
|
bbox 坐标使用 Label Studio 的百分比值(0-100),使用时需要转换为像素坐标。
|
||||||
|
"""
|
||||||
coco_format = COCOExportFormat(
|
coco_format = COCOExportFormat(
|
||||||
info={
|
info={
|
||||||
"description": f"Exported from DataMate project: {project_name}",
|
"description": f"Exported from DataMate project: {project_name}",
|
||||||
|
|||||||
Reference in New Issue
Block a user