feat: Enhance file tag update functionality with automatic format conversion (#84)

- Updated `update_file_tags` to support both simplified and full tag formats.
- Introduced `TagFormatConverter` to handle conversion from simplified external tags to internal storage format.
- Added logic to fetch and utilize the appropriate annotation template for conversion.
- Improved error handling for missing templates and unknown controls during tag updates.
- Created example script demonstrating the usage of the new tag format conversion feature.
- Added unit tests for `TagFormatConverter` to ensure correct functionality and edge case handling.
This commit is contained in:
Jason Wang
2025-11-14 12:42:39 +08:00
committed by GitHub
parent 5cef9cb273
commit df853a5177
10 changed files with 1127 additions and 54 deletions

View File

@@ -2,5 +2,10 @@
Annotation Module Utilities
"""
from .config_validator import LabelStudioConfigValidator
from .tag_converter import TagFormatConverter, create_converter_from_template_config
__all__ = ['LabelStudioConfigValidator']
__all__ = [
'LabelStudioConfigValidator',
'TagFormatConverter',
'create_converter_from_template_config'
]

View File

@@ -0,0 +1,232 @@
"""
Tag Format Converter
Converts simplified external tag format to internal storage format by looking up
the type from the annotation template configuration.
External format (from users):
[
{
"from_name": "label",
"to_name": "image",
"values": ["cat", "dog"]
}
]
Internal storage format:
[
{
"id": "unique_id",
"from_name": "label",
"to_name": "image",
"type": "choices",
"value": {
"choices": ["cat", "dog"]
}
}
]
"""
import uuid
from typing import List, Dict, Any, Optional
from datetime import datetime
from app.core.logging import get_logger
from ..schema.template import TemplateConfiguration
logger = get_logger(__name__)
class TagFormatConverter:
    """Translate simplified external tags into the internal storage format.

    The simplified format omits the control ``type``; this class recovers it
    by looking up each tag's ``from_name`` among the labels of the template
    configuration supplied at construction time.
    """

    def __init__(self, template_config: TemplateConfiguration):
        """Keep the template and precompute the from_name -> type lookup.

        Args:
            template_config: Template configuration whose ``labels`` entries
                expose the ``from_name`` and ``type`` of every control.
        """
        self.template_config = template_config
        # Built once so each conversion is a plain dictionary lookup.
        self._type_map = self._build_type_map()

    def _build_type_map(self) -> Dict[str, str]:
        """Index the control type of every template label by its from_name.

        Returns:
            Dictionary mapping from_name to control type. When two labels
            share a from_name, the later one overwrites the earlier one.
        """
        lookup = {}
        for definition in self.template_config.labels:
            from_name, control_type = definition.from_name, definition.type
            lookup[from_name] = control_type
            logger.debug(f"Registered control: {from_name} -> {control_type}")
        return lookup

    def get_type_for_from_name(self, from_name: str) -> Optional[str]:
        """Look up the control type registered for a control name.

        Args:
            from_name: The control name to resolve.

        Returns:
            The control type, or None when the name is not in the template.
        """
        registered = self._type_map
        return registered.get(from_name)

    def convert_simplified_to_full(
        self,
        simplified_tags: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Expand simplified tags into full internal storage tags.

        Tags are skipped (with a warning) when ``from_name``/``to_name`` is
        missing or empty, or when ``from_name`` is not a control of the
        template. Both snake_case and camelCase key spellings are accepted.

        Args:
            simplified_tags: Tags shaped like::

                [
                    {
                        "from_name": "label",
                        "to_name": "image",
                        "values": ["cat", "dog"]
                    }
                ]

                An optional truthy ``id`` is reused; otherwise a UUID4 string
                is generated. ``values`` is stored exactly as given.

        Returns:
            Tags in full internal format, in input order::

                [
                    {
                        "id": "unique_id",
                        "from_name": "label",
                        "to_name": "image",
                        "type": "choices",
                        "value": {
                            "choices": ["cat", "dog"]
                        }
                    }
                ]
        """
        converted = []
        for simplified_tag in simplified_tags:
            # External callers may send either snake_case or camelCase keys.
            from_name = simplified_tag.get('from_name') or simplified_tag.get('fromName')
            to_name = simplified_tag.get('to_name') or simplified_tag.get('toName')
            if not (from_name and to_name):
                logger.warning(f"Skipping tag with missing from_name or to_name: {simplified_tag}")
                continue

            # The template is the single source of truth for the control type.
            control_type = self.get_type_for_from_name(from_name)
            if not control_type:
                logger.warning(
                    f"Could not find type for from_name '{from_name}' in template. "
                    f"Tag will be skipped. Available controls: {list(self._type_map.keys())}"
                )
                continue

            values = simplified_tag.get('values')
            # Reuse a caller-supplied ID; mint a fresh one only when absent.
            identifier = simplified_tag.get('id') or str(uuid.uuid4())

            # The nested value dict is keyed by the control type itself.
            converted.append({
                "id": identifier,
                "from_name": from_name,
                "to_name": to_name,
                "type": control_type,
                "value": {
                    control_type: values
                }
            })
            logger.debug(f"Converted tag: {from_name} ({control_type}) with {len(values) if isinstance(values, list) else 1} values")
        return converted

    def is_simplified_format(self, tag: Dict[str, Any]) -> bool:
        """Tell whether a tag is in the simplified external format.

        A tag counts as simplified only when it carries a top-level
        ``values`` key and has neither a ``type`` nor a ``value`` key.

        Args:
            tag: Tag dictionary to inspect.

        Returns:
            True if the tag appears to be in simplified format.
        """
        if 'values' not in tag:
            return False
        # Either marker of the full format disqualifies the tag.
        return 'type' not in tag and 'value' not in tag

    def convert_if_needed(
        self,
        tags: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Bring a possibly mixed list of tags into the full format.

        Simplified tags are converted (and may be dropped if they cannot be
        converted); every other tag is passed through untouched.

        Args:
            tags: List of tags in either format; may be empty or None.

        Returns:
            List of tags in full internal format, preserving input order.
        """
        if not tags:
            return []

        normalized = []
        for tag in tags:
            if not self.is_simplified_format(tag):
                # Treated as already full format: keep the same object.
                normalized.append(tag)
                continue
            normalized.extend(self.convert_simplified_to_full([tag]))
        return normalized
def create_converter_from_template_config(
    template_config_dict: Dict[str, Any]
) -> TagFormatConverter:
    """Create a TagFormatConverter from a template configuration dictionary.

    Args:
        template_config_dict: Template configuration as dict (from database
            JSON); it is unpacked as keyword arguments into
            ``TemplateConfiguration``.

    Returns:
        TagFormatConverter instance built from the parsed configuration.

    Raises:
        ValueError: If the configuration cannot be parsed or the converter
            cannot be built. The triggering exception is attached as
            ``__cause__`` so the original traceback is not lost.
    """
    try:
        # Parse the configuration using the Pydantic model.
        from ..schema.template import TemplateConfiguration
        template_config = TemplateConfiguration(**template_config_dict)
        return TagFormatConverter(template_config)
    except Exception as e:
        logger.error(f"Failed to create tag converter from template config: {e}")
        # Chain explicitly: callers still catch ValueError, but the root
        # cause stays inspectable instead of being reported as an error
        # raised "during handling" of another one.
        raise ValueError(f"Invalid template configuration: {e}") from e

View File

@@ -0,0 +1,337 @@
"""
Unit tests for TagFormatConverter
Run with: pytest app/module/annotation/utils/test_tag_converter.py -v
"""
import pytest
from .tag_converter import TagFormatConverter, create_converter_from_template_config
from ..schema.template import TemplateConfiguration, LabelDefinition, ObjectDefinition
@pytest.fixture
def sample_template_config():
    """Provide a template with choices, rectanglelabels and textarea controls."""
    sentiment_control = LabelDefinition(
        fromName="sentiment",
        toName="text",
        type="choices",
        options=["positive", "negative", "neutral"],
        required=True,
        labels=None,
        description=None,
    )
    bbox_control = LabelDefinition(
        fromName="bbox",
        toName="image",
        type="rectanglelabels",
        labels=["cat", "dog", "bird"],
        required=False,
        options=None,
        description=None,
    )
    comment_control = LabelDefinition(
        fromName="comment",
        toName="text",
        type="textarea",
        required=False,
        options=None,
        labels=None,
        description=None,
    )

    # Data objects that the controls above point at via toName.
    data_objects = [
        ObjectDefinition(name="text", type="Text", value="$text"),
        ObjectDefinition(name="image", type="Image", value="$image"),
    ]

    return TemplateConfiguration(
        labels=[sentiment_control, bbox_control, comment_control],
        objects=data_objects,
        metadata=None,
    )
@pytest.fixture
def converter(sample_template_config):
    """Provide a TagFormatConverter built on the sample template."""
    tag_converter = TagFormatConverter(sample_template_config)
    return tag_converter
class TestTagFormatConverter:
    """Exercise the core behaviour of TagFormatConverter."""

    def test_type_map_building(self, converter):
        """Every template control resolves to its type; unknown names give None."""
        expected_types = {
            "sentiment": "choices",
            "bbox": "rectanglelabels",
            "comment": "textarea",
        }
        for control_name, control_type in expected_types.items():
            assert converter.get_type_for_from_name(control_name) == control_type
        assert converter.get_type_for_from_name("nonexistent") is None

    def test_convert_simplified_to_full_single_value(self, converter):
        """A one-element values list is nested under the control type."""
        payload = [
            {"from_name": "sentiment", "to_name": "text", "values": ["positive"]}
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 1
        tag = converted[0]
        assert tag["from_name"] == "sentiment"
        assert tag["to_name"] == "text"
        assert tag["type"] == "choices"
        assert tag["value"] == {"choices": ["positive"]}
        assert "id" in tag

    def test_convert_simplified_to_full_multiple_values(self, converter):
        """A multi-element values list is carried over unchanged."""
        payload = [
            {"from_name": "bbox", "to_name": "image", "values": ["cat", "dog"]}
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 1
        tag = converted[0]
        assert tag["type"] == "rectanglelabels"
        assert tag["value"] == {"rectanglelabels": ["cat", "dog"]}

    def test_convert_simplified_camelcase(self, converter):
        """camelCase keys are accepted and emitted as snake_case."""
        payload = [
            {
                "fromName": "sentiment",  # camelCase
                "toName": "text",  # camelCase
                "values": ["neutral"],
            }
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 1
        tag = converted[0]
        assert tag["from_name"] == "sentiment"
        assert tag["to_name"] == "text"

    def test_convert_multiple_tags(self, converter):
        """Several tags are converted in one call, keeping their order."""
        payload = [
            {"from_name": "sentiment", "to_name": "text", "values": ["positive"]},
            {"from_name": "bbox", "to_name": "image", "values": ["cat"]},
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 2
        assert [tag["type"] for tag in converted] == ["choices", "rectanglelabels"]

    def test_convert_with_existing_id(self, converter):
        """A caller-supplied ID survives the conversion."""
        existing_id = "my-custom-id-123"
        payload = [
            {
                "id": existing_id,
                "from_name": "sentiment",
                "to_name": "text",
                "values": ["positive"],
            }
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert converted[0]["id"] == existing_id

    def test_skip_unknown_from_name(self, converter):
        """Tags naming a control absent from the template are dropped."""
        payload = [
            {"from_name": "unknown_control", "to_name": "text", "values": ["value"]}
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 0  # Should be skipped

    def test_skip_missing_fields(self, converter):
        """Tags lacking a required field are dropped."""
        payload = [
            {
                "from_name": "sentiment",
                # to_name deliberately omitted
                "values": ["positive"],
            }
        ]

        converted = converter.convert_simplified_to_full(payload)

        assert len(converted) == 0  # Should be skipped

    def test_is_simplified_format(self, converter):
        """Format detection distinguishes simplified, full and hybrid tags."""
        simplified_tag = {"from_name": "x", "to_name": "y", "values": ["a"]}
        assert converter.is_simplified_format(simplified_tag) is True

        full_tag = {
            "id": "123",
            "from_name": "x",
            "to_name": "y",
            "type": "choices",
            "value": {"choices": ["a"]},
        }
        assert converter.is_simplified_format(full_tag) is False

        # Edge case: 'type' together with 'values' is not treated as simplified.
        hybrid_tag = {
            "from_name": "x",
            "to_name": "y",
            "type": "choices",
            "values": ["a"],
        }
        assert converter.is_simplified_format(hybrid_tag) is False

    def test_convert_if_needed_mixed_formats(self, converter):
        """Simplified tags are converted while full tags pass through."""
        simplified_tag = {
            "from_name": "sentiment",
            "to_name": "text",
            "values": ["positive"],
        }
        full_tag = {
            "id": "existing-123",
            "from_name": "bbox",
            "to_name": "image",
            "type": "rectanglelabels",
            "value": {"rectanglelabels": ["cat"]},
        }

        result = converter.convert_if_needed([simplified_tag, full_tag])

        assert len(result) == 2
        converted_tag, untouched_tag = result
        # The simplified tag gains its type and nested value.
        assert converted_tag["type"] == "choices"
        assert converted_tag["value"] == {"choices": ["positive"]}
        # The full tag is passed through unchanged.
        assert untouched_tag["id"] == "existing-123"
        assert untouched_tag["type"] == "rectanglelabels"
class TestCreateConverterFromDict:
    """Exercise the dict-based factory for TagFormatConverter."""

    def test_create_from_valid_dict(self):
        """A well-formed configuration dict yields a working converter."""
        label_entry = {
            "fromName": "label",
            "toName": "image",
            "type": "choices",
            "options": ["a", "b"],
        }
        object_entry = {"name": "image", "type": "Image", "value": "$image"}
        config_dict = {"labels": [label_entry], "objects": [object_entry]}

        built = create_converter_from_template_config(config_dict)

        assert isinstance(built, TagFormatConverter)
        assert built.get_type_for_from_name("label") == "choices"

    def test_create_from_invalid_dict(self):
        """A malformed configuration surfaces as ValueError."""
        invalid_config = {
            "labels": "not-a-list",  # a list is expected here
            "objects": [],
        }

        with pytest.raises(ValueError, match="Invalid template configuration"):
            create_converter_from_template_config(invalid_config)
class TestIntegrationScenarios:
    """Walk through end-to-end usage scenarios."""

    def test_external_api_submission(self, converter):
        """An external camelCase submission is stored in internal format."""
        # What an API client would send: simplified, camelCase keys.
        user_submission = [
            {
                "fromName": "sentiment",
                "toName": "text",
                "values": ["positive", "negative"],
            }
        ]

        internal_tags = converter.convert_if_needed(user_submission)

        # The stored representation carries type, nested value and an ID.
        assert len(internal_tags) == 1
        stored = internal_tags[0]
        assert stored["type"] == "choices"
        assert stored["value"] == {"choices": ["positive", "negative"]}
        assert "id" in stored

    def test_update_existing_tags(self, converter):
        """A simplified update keeps the ID of the tag it targets."""
        # Full-format tag as it would already sit in the database; kept only
        # to illustrate the scenario, the converter never sees it.
        existing_tags = [
            {
                "id": "tag-001",
                "from_name": "sentiment",
                "to_name": "text",
                "type": "choices",
                "value": {"choices": ["positive"]},
            }
        ]

        # The update reuses the same ID but carries a new value.
        update_request = [
            {
                "id": "tag-001",
                "from_name": "sentiment",
                "to_name": "text",
                "values": ["negative"],
            }
        ]

        converted_update = converter.convert_if_needed(update_request)

        # Downstream merge logic would replace tag-001 with this entry.
        replacement = converted_update[0]
        assert replacement["id"] == "tag-001"
        assert replacement["value"] == {"choices": ["negative"]}
# Allow running this test module directly as a script, verbosely.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])