You've already forked DataMate
feat: Enhance file tag update functionality with automatic format conversion (#84)
- Updated `update_file_tags` to support both simplified and full tag formats.
- Introduced `TagFormatConverter` to handle conversion from simplified external tags to internal storage format.
- Added logic to fetch and utilize the appropriate annotation template for conversion.
- Improved error handling for missing templates and unknown controls during tag updates.
- Created example script demonstrating the usage of the new tag format conversion feature.
- Added unit tests for `TagFormatConverter` to ensure correct functionality and edge case handling.
This commit is contained in:
@@ -2,5 +2,10 @@
|
||||
Annotation Module Utilities
|
||||
"""
|
||||
from .config_validator import LabelStudioConfigValidator
|
||||
from .tag_converter import TagFormatConverter, create_converter_from_template_config
|
||||
|
||||
# Names exported by ``from <package> import *``. A single assignment is kept:
# the earlier one-element list was immediately overwritten by this one.
__all__ = [
    'LabelStudioConfigValidator',
    'TagFormatConverter',
    'create_converter_from_template_config',
]
|
||||
|
||||
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Tag Format Converter
|
||||
|
||||
Converts simplified external tag format to internal storage format by looking up
|
||||
the type from the annotation template configuration.
|
||||
|
||||
External format (from users):
|
||||
[
|
||||
{
|
||||
"from_name": "label",
|
||||
"to_name": "image",
|
||||
"values": ["cat", "dog"]
|
||||
}
|
||||
]
|
||||
|
||||
Internal storage format:
|
||||
[
|
||||
{
|
||||
"id": "unique_id",
|
||||
"from_name": "label",
|
||||
"to_name": "image",
|
||||
"type": "choices",
|
||||
"value": {
|
||||
"choices": ["cat", "dog"]
|
||||
}
|
||||
}
|
||||
]
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from app.core.logging import get_logger
|
||||
from ..schema.template import TemplateConfiguration
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class TagFormatConverter:
    """Convert between simplified external tag format and internal storage format.

    The converter is built from a template configuration. Each label
    definition in the template contributes one ``from_name -> type`` entry;
    that type becomes both the ``type`` field of a converted tag and the key
    used inside its nested ``value`` dict.
    """

    def __init__(self, template_config: TemplateConfiguration):
        """
        Initialize converter with template configuration

        Args:
            template_config: The template configuration containing label definitions
        """
        self.template_config = template_config
        # Build a lookup map: from_name -> type
        self._type_map = self._build_type_map()

    def _build_type_map(self) -> Dict[str, str]:
        """
        Build a mapping from from_name to type from template labels

        If two label definitions share a from_name, the later one wins.

        Returns:
            Dictionary mapping from_name to control type
        """
        type_map = {}
        for label_def in self.template_config.labels:
            from_name = label_def.from_name
            control_type = label_def.type
            type_map[from_name] = control_type
            logger.debug(f"Registered control: {from_name} -> {control_type}")

        return type_map

    def get_type_for_from_name(self, from_name: str) -> Optional[str]:
        """
        Get the control type for a given from_name

        Args:
            from_name: The control name

        Returns:
            Control type or None if not found
        """
        return self._type_map.get(from_name)

    @staticmethod
    def _normalize_values(values: Any) -> Any:
        """
        Normalize the ``values`` field of a simplified tag

        The simplified format allows a single value instead of a list, while
        the internal storage format holds a list under ``value[<type>]``.
        A lone scalar (str, int, float, bool) is therefore wrapped in a
        one-element list and a tuple is converted to a list. Lists are
        returned as-is, and anything else (including None) is passed through
        untouched so no caller-provided structure is reshaped by guesswork.

        Args:
            values: Raw ``values`` entry taken from a simplified tag

        Returns:
            The normalized values
        """
        if isinstance(values, list):
            return values
        if isinstance(values, tuple):
            return list(values)
        if isinstance(values, (str, int, float, bool)):
            return [values]
        return values

    def convert_simplified_to_full(
        self,
        simplified_tags: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Convert simplified tag format to full internal storage format

        Tags without from_name/to_name, and tags whose from_name is not
        present in the template, are skipped with a warning rather than
        raising.

        Args:
            simplified_tags: List of tags in simplified format with structure:
                [
                    {
                        "from_name": "label",
                        "to_name": "image",
                        "values": ["cat", "dog"]  # Can be list or single value
                    }
                ]
                The camelCase keys ``fromName`` / ``toName`` are accepted too,
                and an optional ``id`` is preserved when present.

        Returns:
            List of tags in full internal format:
                [
                    {
                        "id": "unique_id",
                        "from_name": "label",
                        "to_name": "image",
                        "type": "choices",
                        "value": {
                            "choices": ["cat", "dog"]
                        }
                    }
                ]
        """
        full_tags = []

        for simplified_tag in simplified_tags:
            # Support both camelCase and snake_case from external sources
            from_name = simplified_tag.get('from_name') or simplified_tag.get('fromName')
            to_name = simplified_tag.get('to_name') or simplified_tag.get('toName')
            # A single scalar value is wrapped so storage is always list-shaped
            values = self._normalize_values(simplified_tag.get('values'))
            tag_id = simplified_tag.get('id')  # Use existing ID if provided

            if not from_name or not to_name:
                logger.warning(f"Skipping tag with missing from_name or to_name: {simplified_tag}")
                continue

            # Look up the type from template configuration
            control_type = self.get_type_for_from_name(from_name)

            if not control_type:
                logger.warning(
                    f"Could not find type for from_name '{from_name}' in template. "
                    f"Tag will be skipped. Available controls: {list(self._type_map.keys())}"
                )
                continue

            # Generate ID if not provided
            if not tag_id:
                tag_id = str(uuid.uuid4())

            # Convert values to the proper nested structure
            # The key in the value dict should match the control type
            full_tag = {
                "id": tag_id,
                "from_name": from_name,
                "to_name": to_name,
                "type": control_type,
                "value": {
                    control_type: values
                }
            }

            full_tags.append(full_tag)

            # Count for the log line only; None means no value was supplied
            if isinstance(values, list):
                value_count = len(values)
            elif values is None:
                value_count = 0
            else:
                value_count = 1
            logger.debug(f"Converted tag: {from_name} ({control_type}) with {value_count} values")

        return full_tags

    def is_simplified_format(self, tag: Dict[str, Any]) -> bool:
        """
        Check if a tag is in simplified format (missing type field)

        Args:
            tag: Tag dictionary to check

        Returns:
            True if tag appears to be in simplified format
        """
        # Simplified format has 'values' at top level and no 'type' field
        has_values = 'values' in tag
        has_type = 'type' in tag
        has_value = 'value' in tag

        # If it has 'values' but no 'type', it's simplified
        # If it has 'type' and nested 'value', it's already full format
        return has_values and not has_type and not has_value

    def convert_if_needed(
        self,
        tags: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Convert tags to full format if they are in simplified format

        This method can handle mixed formats - it will convert simplified tags
        and pass through tags that are already in full format. A simplified
        tag that cannot be converted is dropped, so the result may be shorter
        than the input.

        Args:
            tags: List of tags in either format

        Returns:
            List of tags in full internal format
        """
        if not tags:
            return []

        result = []

        for tag in tags:
            if self.is_simplified_format(tag):
                # Convert simplified format
                converted = self.convert_simplified_to_full([tag])
                result.extend(converted)
            else:
                # Already in full format, pass through
                result.append(tag)

        return result
|
||||
|
||||
|
||||
def create_converter_from_template_config(
    template_config_dict: Dict[str, Any]
) -> TagFormatConverter:
    """
    Create a TagFormatConverter from a template configuration dictionary

    Args:
        template_config_dict: Template configuration as dict (from database JSON)

    Returns:
        TagFormatConverter instance

    Raises:
        ValueError: If template configuration is invalid. The original
            exception is attached as ``__cause__`` so the underlying
            failure stays visible in tracebacks.
    """
    try:
        # Parse the configuration using Pydantic model
        from ..schema.template import TemplateConfiguration

        template_config = TemplateConfiguration(**template_config_dict)
        return TagFormatConverter(template_config)
    except Exception as e:
        # Any failure while building the converter is reported to callers as
        # a ValueError, chained to the original error.
        logger.error(f"Failed to create tag converter from template config: {e}")
        raise ValueError(f"Invalid template configuration: {e}") from e
|
||||
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Unit tests for TagFormatConverter
|
||||
|
||||
Run with: pytest app/module/annotation/utils/test_tag_converter.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from .tag_converter import TagFormatConverter, create_converter_from_template_config
|
||||
from ..schema.template import TemplateConfiguration, LabelDefinition, ObjectDefinition
|
||||
|
||||
|
||||
@pytest.fixture
def sample_template_config():
    """Build the three-control template shared by the converter tests."""
    # One control of each kind used by the tests: choices, rectangle
    # labels and a free-text area.
    sentiment_control = LabelDefinition(
        fromName="sentiment",
        toName="text",
        type="choices",
        options=["positive", "negative", "neutral"],
        required=True,
        labels=None,
        description=None,
    )
    bbox_control = LabelDefinition(
        fromName="bbox",
        toName="image",
        type="rectanglelabels",
        labels=["cat", "dog", "bird"],
        required=False,
        options=None,
        description=None,
    )
    comment_control = LabelDefinition(
        fromName="comment",
        toName="text",
        type="textarea",
        required=False,
        options=None,
        labels=None,
        description=None,
    )

    # The objects the controls above point at through their toName.
    text_object = ObjectDefinition(name="text", type="Text", value="$text")
    image_object = ObjectDefinition(name="image", type="Image", value="$image")

    return TemplateConfiguration(
        labels=[sentiment_control, bbox_control, comment_control],
        objects=[text_object, image_object],
        metadata=None,
    )
|
||||
|
||||
|
||||
@pytest.fixture
def converter(sample_template_config):
    """Provide a TagFormatConverter built from the sample template."""
    tag_converter = TagFormatConverter(sample_template_config)
    return tag_converter
|
||||
|
||||
|
||||
class TestTagFormatConverter:
    """Behavioural tests for TagFormatConverter."""

    def test_type_map_building(self, converter):
        """Every template control resolves to its type; unknown names give None."""
        expected_types = {
            "sentiment": "choices",
            "bbox": "rectanglelabels",
            "comment": "textarea",
        }
        for control_name, control_type in expected_types.items():
            assert converter.get_type_for_from_name(control_name) == control_type
        assert converter.get_type_for_from_name("nonexistent") is None

    def test_convert_simplified_to_full_single_value(self, converter):
        """A one-element values list is converted into a full tag."""
        payload = [{"from_name": "sentiment", "to_name": "text", "values": ["positive"]}]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 1
        first = converted[0]
        assert first["from_name"] == "sentiment"
        assert first["to_name"] == "text"
        assert first["type"] == "choices"
        assert first["value"] == {"choices": ["positive"]}
        assert "id" in first

    def test_convert_simplified_to_full_multiple_values(self, converter):
        """Several values are kept together under the control-type key."""
        payload = [{"from_name": "bbox", "to_name": "image", "values": ["cat", "dog"]}]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 1
        first = converted[0]
        assert first["type"] == "rectanglelabels"
        assert first["value"] == {"rectanglelabels": ["cat", "dog"]}

    def test_convert_simplified_camelcase(self, converter):
        """camelCase keys are accepted and emitted as snake_case."""
        payload = [
            {
                "fromName": "sentiment",  # camelCase
                "toName": "text",  # camelCase
                "values": ["neutral"],
            }
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 1
        first = converted[0]
        assert first["from_name"] == "sentiment"
        assert first["to_name"] == "text"

    def test_convert_multiple_tags(self, converter):
        """Several tags are converted in one call, keeping input order."""
        sentiment_tag = {"from_name": "sentiment", "to_name": "text", "values": ["positive"]}
        bbox_tag = {"from_name": "bbox", "to_name": "image", "values": ["cat"]}

        converted = converter.convert_simplified_to_full([sentiment_tag, bbox_tag])

        assert len(converted) == 2
        assert converted[0]["type"] == "choices"
        assert converted[1]["type"] == "rectanglelabels"

    def test_convert_with_existing_id(self, converter):
        """An id supplied by the caller is kept instead of generating one."""
        supplied_id = "my-custom-id-123"
        payload = [
            {
                "id": supplied_id,
                "from_name": "sentiment",
                "to_name": "text",
                "values": ["positive"],
            }
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert converted[0]["id"] == supplied_id

    def test_skip_unknown_from_name(self, converter):
        """A tag whose from_name is not in the template is dropped."""
        payload = [{"from_name": "unknown_control", "to_name": "text", "values": ["value"]}]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 0  # Should be skipped

    def test_skip_missing_fields(self, converter):
        """A tag lacking a required field is dropped."""
        # to_name is deliberately left out
        payload = [{"from_name": "sentiment", "values": ["positive"]}]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 0  # Should be skipped

    def test_is_simplified_format(self, converter):
        """Format detection for simplified, full and ambiguous tags."""
        simplified_tag = {"from_name": "x", "to_name": "y", "values": ["a"]}
        assert converter.is_simplified_format(simplified_tag) is True

        full_tag = {
            "id": "123",
            "from_name": "x",
            "to_name": "y",
            "type": "choices",
            "value": {"choices": ["a"]},
        }
        assert converter.is_simplified_format(full_tag) is False

        # Edge case: carries both 'type' and 'values', so it must not be
        # treated as simplified
        ambiguous_tag = {
            "from_name": "x",
            "to_name": "y",
            "type": "choices",
            "values": ["a"],
        }
        assert converter.is_simplified_format(ambiguous_tag) is False

    def test_convert_if_needed_mixed_formats(self, converter):
        """Simplified tags are converted while full tags pass through."""
        simplified_tag = {"from_name": "sentiment", "to_name": "text", "values": ["positive"]}
        full_tag = {
            "id": "existing-123",
            "from_name": "bbox",
            "to_name": "image",
            "type": "rectanglelabels",
            "value": {"rectanglelabels": ["cat"]},
        }

        converted = converter.convert_if_needed([simplified_tag, full_tag])

        assert len(converted) == 2
        from_simplified, passed_through = converted[0], converted[1]
        # First should be converted
        assert from_simplified["type"] == "choices"
        assert from_simplified["value"] == {"choices": ["positive"]}
        # Second should pass through unchanged
        assert passed_through["id"] == "existing-123"
        assert passed_through["type"] == "rectanglelabels"
|
||||
|
||||
|
||||
class TestCreateConverterFromDict:
    """Tests for the dict-based factory function."""

    def test_create_from_valid_dict(self):
        """A well-formed configuration dict yields a working converter."""
        label_entry = {
            "fromName": "label",
            "toName": "image",
            "type": "choices",
            "options": ["a", "b"],
        }
        object_entry = {"name": "image", "type": "Image", "value": "$image"}
        config_dict = {"labels": [label_entry], "objects": [object_entry]}

        built = create_converter_from_template_config(config_dict)

        assert isinstance(built, TagFormatConverter)
        assert built.get_type_for_from_name("label") == "choices"

    def test_create_from_invalid_dict(self):
        """A malformed configuration is reported as ValueError."""
        # 'labels' should be a list
        invalid_config = {"labels": "not-a-list", "objects": []}

        with pytest.raises(ValueError, match="Invalid template configuration"):
            create_converter_from_template_config(invalid_config)
|
||||
|
||||
|
||||
class TestIntegrationScenarios:
    """End-to-end style scenarios for the converter."""

    def test_external_api_submission(self, converter):
        """An external caller submits camelCase simplified tags through the API."""
        # What the user sends: simplified format with camelCase keys
        submitted = [
            {
                "fromName": "sentiment",
                "toName": "text",
                "values": ["positive", "negative"],
            }
        ]

        # What the system stores: internal format
        stored = converter.convert_if_needed(submitted)

        assert len(stored) == 1
        only_tag = stored[0]
        assert only_tag["type"] == "choices"
        assert only_tag["value"] == {"choices": ["positive", "negative"]}
        assert "id" in only_tag

    def test_update_existing_tags(self, converter):
        """A simplified update request keeps the id of the tag it targets."""
        # Tag as it already sits in the database (full format); kept here
        # only to illustrate the scenario
        stored_tags = [
            {
                "id": "tag-001",
                "from_name": "sentiment",
                "to_name": "text",
                "type": "choices",
                "value": {"choices": ["positive"]},
            }
        ]

        # Incoming update in simplified format, same id with a new value
        incoming = [
            {
                "id": "tag-001",
                "from_name": "sentiment",
                "to_name": "text",
                "values": ["negative"],
            }
        ]

        converted = converter.convert_if_needed(incoming)

        # Merge logic would replace tag-001
        updated = converted[0]
        assert updated["id"] == "tag-001"
        assert updated["value"] == {"choices": ["negative"]}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests in verbose mode when executed as a script.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|
||||
Reference in New Issue
Block a user