Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Fixed
- Fix Protobuf export for arrays of objects and improve message/enum naming to UpperCamelCase (#1012 @Schokuroff)

## [0.11.8] - 2026-04-10

### Added
Expand Down
275 changes: 214 additions & 61 deletions datacontract/export/protobuf_exporter.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
import sys
from typing import List, Optional

from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty

from datacontract.export.exporter import Exporter

OBJECT_TYPES: set = {"object", "record", "struct"}


class ProtoBufExporter(Exporter):
    """Exporter that renders a data contract as a Protobuf schema."""

    def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> str:
        """
        Exports data contract to Protobuf format.

        Only ``data_contract`` is used; ``schema_name``, ``server``,
        ``sql_server_type`` and ``export_args`` are accepted to satisfy the
        exporter interface and are ignored here.

        Returns:
            The Protobuf schema text produced by ``to_protobuf`` (a string,
            not a dict).
        """
        return to_protobuf(data_contract)


def _get_config_value(prop: SchemaProperty, key: str) -> Optional[str]:
"""Get a custom property value."""
"""Get a custom property value from customProperties."""
if prop.customProperties is None:
return None
for cp in prop.customProperties:
Expand Down Expand Up @@ -57,10 +58,10 @@ def to_protobuf(data_contract: OpenDataContractStandard) -> str:

# Build header with syntax and package declarations.
header = 'syntax = "proto3";\n\n'
package = "example" # Default package
package = "example" # Default package, can be customized
header += f"package {package};\n\n"

# Append enum definitions.
# Append enum definitions before messages.
for enum_name, enum_values in enum_definitions.items():
header += f"// Enum for {enum_name}\n"
header += f"enum {enum_name} {{\n"
Expand All @@ -72,6 +73,7 @@ def to_protobuf(data_contract: OpenDataContractStandard) -> str:
else:
header += f" // Warning: Enum values for {enum_name} are not a dictionary\n"
header += "}\n\n"

return header + messages


Expand All @@ -86,11 +88,12 @@ def _is_enum_field(prop: SchemaProperty) -> bool:
def _get_enum_name(prop: SchemaProperty) -> str:
    """
    Returns the enum type name for a field, formatted as UpperCamelCase.

    The "enum_name" custom property wins when it is set to a non-empty value;
    otherwise the name is derived from the field name.
    """
    explicit_name = _get_config_value(prop, "enum_name")
    # Both sources go through the same normalization so declarations and
    # references always agree on the spelling.
    return _snake_to_upper_camel(explicit_name or prop.name)


def _get_enum_values(prop: SchemaProperty) -> dict:
Expand All @@ -103,69 +106,126 @@ def _get_enum_values(prop: SchemaProperty) -> dict:
return {}


def _snake_to_upper_camel(name: str) -> str:
    """
    Convert snake_case to UpperCamelCase.

    The first character of every underscore-separated part is upper-cased and
    the rest of the part is left untouched, so existing capitalization is
    preserved. Empty parts (leading, trailing or doubled underscores) are
    dropped.

    Examples:
        "fsa_room"    -> "FsaRoom"
        "FsaRegister" -> "FsaRegister" (already in UpperCamelCase)
        "simple_obj"  -> "SimpleObj"
        "userID"      -> "UserID"

    Args:
        name: Field, model or enum name. May be empty.

    Returns:
        The UpperCamelCase form of ``name``; a falsy ``name`` is returned unchanged.
    """
    if not name:
        return name
    # A name that is already UpperCamelCase is a single part whose first
    # character is upper case, so it passes through unchanged without a
    # dedicated early return.
    return "".join(part[0].upper() + part[1:] for part in name.split("_") if part)


def _get_type_name(prop: SchemaProperty) -> str:
    """
    Get appropriate message/enum type name in UpperCamelCase.
    Used for message declarations and field type references.

    Resolution order:
      1. Enum fields use their enum name.
      2. Arrays whose items are objects use ``items.name`` when it is set.
      3. Everything else (plain objects, arrays of objects without an item
         name, any other field) derives the name from the property name.
    """
    if _is_enum_field(prop):
        return _get_enum_name(prop)

    logical_type = (prop.logicalType or "").lower()
    items = prop.items
    if logical_type == "array" and items and (items.logicalType or "").lower() in OBJECT_TYPES:
        # An explicit item name is normalized the same way as message declarations.
        item_name = getattr(items, "name", None)
        if item_name:
            return _snake_to_upper_camel(item_name)

    return _snake_to_upper_camel(prop.name)


def _should_create_nested_message(prop: SchemaProperty) -> bool:
    """
    Check if we need to create a nested message for this property.

    Returns:
        True for objects (any type in OBJECT_TYPES) and for arrays whose items
        are objects; False otherwise, including when no logical type is set.
    """
    logical_type = (prop.logicalType or "").lower()

    # Regular object
    if logical_type in OBJECT_TYPES:
        return True

    # Array of objects
    if logical_type == "array" and prop.items:
        return (prop.items.logicalType or "").lower() in OBJECT_TYPES

    return False


def _get_nested_properties(prop: SchemaProperty) -> Optional[List[SchemaProperty]]:
    """
    Get properties for nested message.

    Returns:
        The object's own properties, or the item properties for an array of
        objects (an empty list when none are declared). None when the property
        is neither an object nor an array of objects.
    """
    logical_type = (prop.logicalType or "").lower()

    if logical_type in OBJECT_TYPES:
        return prop.properties or []

    items = prop.items
    if logical_type == "array" and items and (items.logicalType or "").lower() in OBJECT_TYPES:
        return items.properties or []

    return None


def _get_nested_description(prop: SchemaProperty) -> str:
    """
    Pick the description used for the nested message of a property.

    An object contributes its own description, an array of objects contributes
    the description of its items, and anything else yields an empty string.
    A missing description is also returned as an empty string.
    """
    kind = (prop.logicalType or "").lower()

    if kind in OBJECT_TYPES:
        described = prop
    elif kind == "array" and prop.items and (prop.items.logicalType or "").lower() in OBJECT_TYPES:
        described = prop.items
    else:
        return ""

    return described.description or ""


def _get_primitive_type(prop: SchemaProperty) -> str:
"""
Get Protobuf type for primitive fields.
Handles recursive type resolution for arrays of primitives.
"""
field_type = prop.logicalType or ""
lower_type = field_type.lower()

Expand All @@ -185,19 +245,112 @@ def _convert_type(prop: SchemaProperty) -> str:
return "bool"
if lower_type in ["bytes"]:
return "bytes"
if lower_type in ["object", "record", "struct"]:
return _to_protobuf_message_name(prop.name)

# Recursive handling for arrays of primitives
if lower_type == "array" and prop.items:
return _get_primitive_type(prop.items)

return "string" # Fallback for unrecognized types


def _get_field_type(prop: SchemaProperty) -> str:
    """
    Get Protobuf type for field (string, int32, repeated TypeName, etc).
    Combines repeated keyword with type name for arrays.

    Resolution order: arrays, then objects, then enums, then primitive types.
    """
    logical_type = (prop.logicalType or "").lower()

    # Handle arrays
    if logical_type == "array":
        items = prop.items
        if not items:
            return "repeated string"  # Default array type
        if (items.logicalType or "").lower() in OBJECT_TYPES:
            # Array of objects: reference the nested message, e.g. FsaRoom
            return f"repeated {_get_type_name(prop)}"
        # Array of primitives
        return f"repeated {_get_primitive_type(items)}"

    # Handle regular objects, e.g. SimpleObj
    if logical_type in OBJECT_TYPES:
        return _get_type_name(prop)

    # Handle enums
    if _is_enum_field(prop):
        return _get_enum_name(prop)

    # Handle primitive types
    return _get_primitive_type(prop)


def _proto_comment_lines(text: str, prefix: str) -> str:
    """Render ``text`` as ``//`` comment lines, one per input line, each starting with ``prefix``."""
    # Every line gets its own marker so that a multi-line description cannot
    # leak uncommented text into the schema. Trailing whitespace is trimmed.
    return "".join(f"{prefix}// {line}".rstrip() + "\n" for line in text.splitlines())


def to_protobuf_message(
    model_name: str, properties: List[SchemaProperty], description: str, indent_level: int = 0
) -> str:
    """
    Generates a Protobuf message definition from the model's fields.
    Handles nested messages for complex types recursively.

    Args:
        model_name: Name of the model; emitted in UpperCamelCase.
        properties: Fields of the message. Objects and arrays of objects also
            produce a nested message declaration.
        description: Optional text emitted as a comment above the message.
        indent_level: Nesting depth used for indentation.

    Returns:
        The message definition as text, terminated by a newline.
    """
    properties = properties or []
    outer = indent(indent_level)
    inner = indent(indent_level + 1)

    # Message name always in UpperCamelCase
    chunks = [
        _proto_comment_lines(description or "", outer),
        f"{outer}message {_snake_to_upper_camel(model_name)} {{\n",
    ]

    # Phase 1: Create all nested messages
    for prop in properties:
        if not _should_create_nested_message(prop):
            continue
        nested_props = _get_nested_properties(prop)
        if nested_props is None:
            continue
        chunks.append(
            to_protobuf_message(
                _get_type_name(prop), nested_props, _get_nested_description(prop), indent_level + 1
            )
        )
        chunks.append("\n")

    # Phase 2: Create all fields; field names are kept exactly as declared.
    for number, prop in enumerate(properties, start=1):
        chunks.append(_proto_comment_lines(prop.description or "", inner))
        chunks.append(f"{inner}{_get_field_declaration(prop)} {prop.name} = {number};\n")

    chunks.append(f"{outer}}}\n")
    return "".join(chunks)


def indent(indent_level: int) -> str:
    """Build the leading whitespace for the given nesting depth."""
    # NOTE(review): the unit is a single space as it appears here - confirm the intended width.
    unit = " "
    return unit * indent_level


def _get_field_declaration(prop: SchemaProperty) -> str:
    """
    Build the type part of a field declaration, prefixed with 'optional' when applicable.

    The field type (which already carries "repeated" for arrays) is returned
    as is for arrays and message-typed fields. For any other field, 'optional'
    is prepended only when the property has a ``required`` attribute that is
    explicitly False.
    """
    declared_type = _get_field_type(prop)

    kind = (prop.logicalType or "").lower()
    if kind == "array" or kind in OBJECT_TYPES:
        return declared_type

    # A missing or None ``required`` is not treated as "not required".
    if getattr(prop, "required", None) is False:
        return f"optional {declared_type}"
    return declared_type
Loading
Loading