Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions src/getml_io/getml/feature_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from __future__ import annotations

from collections.abc import Set as AbstractSet
from typing import Annotated, Literal

from getml.feature_learning.aggregations.types import (
FastPropAggregations,
MultirelAggregations,
)
from getml.feature_learning.loss_functions import (
CrossEntropyLossType,
SquareLossType,
)
from pydantic import Field
from pydantic.dataclasses import dataclass


@dataclass(frozen=True)
class FastProp:
    """Immutable configuration for the getML FastProp feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.FastProp`` (assumed from naming — confirm
    against the getML API reference). ``type`` is the literal tag pydantic
    uses to discriminate variants of the ``FeatureLearner`` union.
    """

    # Set of aggregation functions FastProp may apply when propositionalizing.
    aggregation: AbstractSet[FastPropAggregations]
    delta_t: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_lag: int
    min_df: int
    n_most_frequent: int
    num_features: int
    num_threads: int
    sampling_factor: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["fast_prop"] = "fast_prop"


@dataclass(frozen=True)
class Fastboost:
    """Immutable configuration for the getML Fastboost feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.Fastboost`` (assumed — confirm against the
    getML API reference). ``type`` is the pydantic discriminator tag for
    the ``FeatureLearner`` union.
    """

    gamma: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_depth: int
    min_child_weights: float
    num_features: int
    num_threads: int
    reg_lambda: float
    seed: int
    shrinkage: float
    silent: bool
    subsample: float
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["fastboost"] = "fastboost"


@dataclass(frozen=True)
class Multirel:
    """Immutable configuration for the getML Multirel feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.Multirel`` (assumed — confirm against the
    getML API reference). ``propositionalization`` nests a full ``FastProp``
    configuration. ``type`` is the pydantic discriminator tag for the
    ``FeatureLearner`` union.
    """

    # Set of aggregation functions Multirel may apply.
    aggregation: AbstractSet[MultirelAggregations]
    allow_sets: bool
    delta_t: float
    grid_factor: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_length: int
    min_df: int
    min_num_samples: int
    num_features: int
    num_subfeatures: int
    num_threads: int
    # FastProp settings used for the propositionalization sub-step.
    propositionalization: FastProp
    regularization: float
    round_robin: bool
    sampling_factor: float
    seed: int
    share_aggregations: float
    share_conditions: float
    shrinkage: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["multirel"] = "multirel"


@dataclass(frozen=True)
class Relboost:
    """Immutable configuration for the getML Relboost feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.Relboost`` (assumed — confirm against the
    getML API reference). ``propositionalization`` nests a full ``FastProp``
    configuration. ``type`` is the pydantic discriminator tag for the
    ``FeatureLearner`` union.
    """

    allow_null_weights: bool
    delta_t: float
    gamma: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_depth: int
    min_df: int
    min_num_samples: int
    num_features: int
    num_subfeatures: int
    num_threads: int
    # FastProp settings used for the propositionalization sub-step.
    propositionalization: FastProp
    reg_lambda: float
    sampling_factor: float
    seed: int
    shrinkage: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["relboost"] = "relboost"


@dataclass(frozen=True)
class RelMT:
    """Immutable configuration for the getML RelMT feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.RelMT`` (assumed — confirm against the getML
    API reference). ``propositionalization`` nests a full ``FastProp``
    configuration. ``type`` is the pydantic discriminator tag for the
    ``FeatureLearner`` union.
    """

    allow_avg: bool
    delta_t: float
    gamma: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_depth: int
    min_df: int
    min_num_samples: int
    num_features: int
    num_subfeatures: int
    num_threads: int
    # FastProp settings used for the propositionalization sub-step.
    propositionalization: FastProp
    reg_lambda: float
    sampling_factor: float
    seed: int
    shrinkage: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["rel_mt"] = "rel_mt"


# Tagged union of all feature-learner configurations. Pydantic selects the
# concrete variant by matching the ``type`` literal field of each dataclass.
FeatureLearner = Annotated[
    FastProp | Fastboost | Multirel | Relboost | RelMT,
    Field(discriminator="type"),
]
Comment thread
Urfoex marked this conversation as resolved.
126 changes: 126 additions & 0 deletions src/getml_io/getml/predictors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from __future__ import annotations

from typing import Annotated, Literal

from pydantic import Field
from pydantic.dataclasses import dataclass


@dataclass(frozen=True)
class LinearRegression:
    """Immutable configuration for the getML linear-regression predictor.

    Field names mirror ``getml.predictors.LinearRegression`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``FeatureSelector``/``Predictor`` union.
    """

    learning_rate: float
    reg_lambda: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["linear_regression"] = "linear_regression"


@dataclass(frozen=True)
class LogisticRegression:
    """Immutable configuration for the getML logistic-regression predictor.

    Field names mirror ``getml.predictors.LogisticRegression`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``FeatureSelector``/``Predictor`` union.
    """

    learning_rate: float
    reg_lambda: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["logistic_regression"] = "logistic_regression"


@dataclass(frozen=True)
class ScaleGBMClassifier:
    """Immutable configuration for the getML ScaleGBM classifier.

    Field names mirror ``getml.predictors.ScaleGBMClassifier`` (assumed —
    confirm against the getML API reference). ``objective`` is pinned to the
    only value valid for binary classification here.
    """

    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    gamma: float
    goss_a: float
    goss_b: float
    learning_rate: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    n_jobs: int
    # Only the binary logistic objective is accepted for the classifier.
    objective: Literal["binary:logistic"]
    reg_lambda: float
    seed: int
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["scale_gbm_classifier"] = "scale_gbm_classifier"


@dataclass(frozen=True)
class ScaleGBMRegressor:
    """Immutable configuration for the getML ScaleGBM regressor.

    Field names mirror ``getml.predictors.ScaleGBMRegressor`` (assumed —
    confirm against the getML API reference). ``objective`` is pinned to the
    only value valid for regression here.
    """

    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    gamma: float
    goss_a: float
    goss_b: float
    learning_rate: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    n_jobs: int
    # Only the squared-error objective is accepted for the regressor.
    objective: Literal["reg:squarederror"]
    reg_lambda: float
    seed: int
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["scale_gbm_regressor"] = "scale_gbm_regressor"


@dataclass(frozen=True)
class XGBoostClassifier:
    """Immutable configuration for the getML XGBoost classifier.

    Field names mirror ``getml.predictors.XGBoostClassifier`` (assumed —
    confirm against the getML API reference).

    NOTE(review): field order differs from ``XGBoostRegressor`` —
    ``external_memory`` sits after ``n_estimators`` here but after
    ``early_stopping_rounds`` there. Reordering would change positional
    construction, so it is left as-is; prefer keyword construction.
    """

    booster: str
    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    gamma: float
    learning_rate: float
    max_delta_step: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    external_memory: bool
    normalize_type: str
    num_parallel_tree: int
    n_jobs: int
    # Classification objectives accepted by this wrapper.
    objective: Literal["reg:logistic", "binary:logistic", "binary:logitraw"]
    one_drop: bool
    rate_drop: float
    reg_alpha: float
    reg_lambda: float
    sample_type: str
    silent: bool
    skip_drop: float
    subsample: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["xgboost_classifier"] = "xgboost_classifier"


@dataclass(frozen=True)
class XGBoostRegressor:
    """Immutable configuration for the getML XGBoost regressor.

    Field names mirror ``getml.predictors.XGBoostRegressor`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``FeatureSelector``/``Predictor`` union.
    """

    booster: str
    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    external_memory: bool
    gamma: float
    learning_rate: float
    max_delta_step: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    normalize_type: str
    num_parallel_tree: int
    n_jobs: int
    # Regression objectives accepted by this wrapper.
    objective: Literal["reg:squarederror", "reg:tweedie", "reg:linear"]
    one_drop: bool
    rate_drop: float
    reg_alpha: float
    reg_lambda: float
    sample_type: str
    silent: bool
    skip_drop: float
    subsample: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["xgboost_regressor"] = "xgboost_regressor"


# Tagged union of all predictor configurations. Pydantic selects the concrete
# variant by matching the ``type`` literal field of each dataclass.
FeatureSelector = Annotated[
    LinearRegression
    | LogisticRegression
    | ScaleGBMRegressor
    | ScaleGBMClassifier
    | XGBoostRegressor
    | XGBoostClassifier,
    Field(discriminator="type"),
]
# Feature selectors and predictors share the same set of models in getML,
# so the alias is reused for both roles.
Predictor = FeatureSelector
69 changes: 69 additions & 0 deletions src/getml_io/getml/preprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from __future__ import annotations

from collections.abc import Set as AbstractSet
from typing import Annotated, Literal

from getml.feature_learning.aggregations.types import MappingAggregations
from pydantic import Field
from pydantic.dataclasses import dataclass


@dataclass(frozen=True)
class CategoryTrimmer:
    """Immutable configuration for the getML CategoryTrimmer preprocessor.

    Field names mirror ``getml.preprocessors.CategoryTrimmer`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``Preprocessor`` union.
    """

    max_num_categories: int
    min_freq: int
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["category_trimmer"] = "category_trimmer"


@dataclass(frozen=True)
class EmailDomain:
    """Marker configuration for the getML EmailDomain preprocessor.

    Carries no parameters; only the ``type`` discriminator tag used by the
    ``Preprocessor`` union.
    """

    type: Literal["email_domain"] = "email_domain"


@dataclass(frozen=True)
class Imputation:
    """Immutable configuration for the getML Imputation preprocessor.

    Field names mirror ``getml.preprocessors.Imputation`` (assumed — confirm
    against the getML API reference).
    """

    add_dummies: bool
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["imputation"] = "imputation"


@dataclass(frozen=True)
class Mapping:
    """Immutable configuration for the getML Mapping preprocessor.

    Field names mirror ``getml.preprocessors.Mapping`` (assumed — confirm
    against the getML API reference).
    """

    # Set of aggregation functions the mapping may apply.
    aggregation: AbstractSet[MappingAggregations]
    min_freq: int
    multithreading: bool
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["mapping"] = "mapping"


@dataclass(frozen=True)
class Seasonal:
    """Immutable configuration for the getML Seasonal preprocessor.

    Each ``disable_*`` flag turns off extraction of the corresponding
    seasonal component (assumed from naming — confirm against the getML
    API reference).
    """

    disable_year: bool
    disable_month: bool
    disable_weekday: bool
    disable_hour: bool
    disable_minute: bool
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["seasonal"] = "seasonal"


@dataclass(frozen=True)
class Substring:
    """Immutable configuration for the getML Substring preprocessor.

    Field names mirror ``getml.preprocessors.Substring`` (assumed — confirm
    against the getML API reference).
    """

    begin: int
    length: int
    unit: str
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["substring"] = "substring"


@dataclass(frozen=True)
class TextFieldSplitter:
    """Marker configuration for the getML TextFieldSplitter preprocessor.

    Carries no parameters; only the ``type`` discriminator tag used by the
    ``Preprocessor`` union.
    """

    type: Literal["text_field_splitter"] = "text_field_splitter"


# Tagged union of all preprocessor configurations. Pydantic selects the
# concrete variant by matching the ``type`` literal field of each dataclass.
Preprocessor = Annotated[
    CategoryTrimmer
    | EmailDomain
    | Imputation
    | Mapping
    | Seasonal
    | Substring
    | TextFieldSplitter,
    Field(discriminator="type"),
]
13 changes: 13 additions & 0 deletions src/getml_io/getml/relationships.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from enum import Enum

from getml.data import relationship as getml_relationship


class Relationship(str, Enum):
    """String enum of getML join relationships.

    Values are taken directly from ``getml.data.relationship`` so that
    serialized strings stay in sync with the getML library.
    """

    MANY_TO_MANY = getml_relationship.many_to_many
    MANY_TO_ONE = getml_relationship.many_to_one
    ONE_TO_MANY = getml_relationship.one_to_many
    ONE_TO_ONE = getml_relationship.one_to_one
    PROPOSITIONALIZATION = getml_relationship.propositionalization
30 changes: 30 additions & 0 deletions src/getml_io/getml/roles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import annotations

from collections.abc import Sequence
from enum import Enum

from getml.data import roles
from pydantic.dataclasses import dataclass


class Role(str, Enum):
    """String enum of getML column roles.

    Values are taken directly from ``getml.data.roles`` so that serialized
    strings stay in sync with the getML library.
    """

    CATEGORICAL = roles.categorical
    JOIN_KEY = roles.join_key
    NUMERICAL = roles.numerical
    TARGET = roles.target
    TEXT = roles.text
    TIME_STAMP = roles.time_stamp
    UNUSED_FLOAT = roles.unused_float
    UNUSED_STRING = roles.unused_string


@dataclass(frozen=True)
class Roles:
    """Column names grouped by their getML role.

    Each field lists the names of the columns assigned to that role
    (presumably mirroring ``getml.data.Roles`` — confirm against the getML
    API reference).

    NOTE(review): ``Sequence[str]`` permits mutable lists, so a frozen
    instance is not reliably hashable; consider ``tuple[str, ...]`` if
    instances must be used as dict keys or set members.
    """

    categorical: Sequence[str]
    join_key: Sequence[str]
    numerical: Sequence[str]
    target: Sequence[str]
    text: Sequence[str]
    time_stamp: Sequence[str]
    unused_float: Sequence[str]
    unused_string: Sequence[str]
1 change: 1 addition & 0 deletions src/getml_io/metadata/container_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ContainerInformation:
deep_copy: bool
path: Annotated[Path, Field(exclude=True)]

# TODO @urfoex: #53 Adjust relative path already in serialization function
@model_serializer()
def _serialize_model(
self,
Expand Down
13 changes: 13 additions & 0 deletions src/getml_io/metadata/data_model_information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from collections.abc import Mapping, Sequence

from pydantic.dataclasses import dataclass

from getml_io.metadata.placeholder_information import PlaceholderInformation


@dataclass(frozen=True)
class DataModelInformation:
    """Immutable description of a getML data model.

    ``population`` describes the population table; ``peripheral`` maps each
    peripheral name to the placeholder descriptions joined under it.
    """

    # Placeholder description of the population (root) table.
    population: PlaceholderInformation
    # Peripheral-table name -> placeholder descriptions joined under it.
    peripheral: Mapping[str, Sequence[PlaceholderInformation]]
Loading
Loading