Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions src/getml_io/getml/feature_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from __future__ import annotations

from collections.abc import Set as AbstractSet
from typing import Annotated, Literal

from getml.feature_learning.aggregations.types import (
FastPropAggregations,
MultirelAggregations,
)
from getml.feature_learning.loss_functions import (
CrossEntropyLossType,
SquareLossType,
)
from pydantic import Field
from pydantic.dataclasses import dataclass


@dataclass(frozen=True)
class FastProp:
    """Immutable configuration for the getML FastProp feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.FastProp`` (assumed from naming — confirm
    against the getML API reference). ``type`` is the literal tag pydantic
    uses to discriminate variants of the ``FeatureLearner`` union.
    """

    # Set of aggregation functions FastProp may apply when propositionalizing.
    aggregation: AbstractSet[FastPropAggregations]
    delta_t: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_lag: int
    min_df: int
    n_most_frequent: int
    num_features: int
    num_threads: int
    sampling_factor: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["fast_prop"] = "fast_prop"


@dataclass(frozen=True)
class Fastboost:
    """Immutable configuration for the getML Fastboost feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.Fastboost`` (assumed — confirm against the
    getML API reference). ``type`` is the pydantic discriminator tag for
    the ``FeatureLearner`` union.
    """

    gamma: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_depth: int
    min_child_weights: float
    num_features: int
    num_threads: int
    reg_lambda: float
    seed: int
    shrinkage: float
    silent: bool
    subsample: float
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["fastboost"] = "fastboost"


@dataclass(frozen=True)
class Multirel:
    """Immutable configuration for the getML Multirel feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.Multirel`` (assumed — confirm against the
    getML API reference). ``propositionalization`` nests a full ``FastProp``
    configuration. ``type`` is the pydantic discriminator tag for the
    ``FeatureLearner`` union.
    """

    # Set of aggregation functions Multirel may apply.
    aggregation: AbstractSet[MultirelAggregations]
    allow_sets: bool
    delta_t: float
    grid_factor: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_length: int
    min_df: int
    min_num_samples: int
    num_features: int
    num_subfeatures: int
    num_threads: int
    # FastProp settings used for the propositionalization sub-step.
    propositionalization: FastProp
    regularization: float
    round_robin: bool
    sampling_factor: float
    seed: int
    share_aggregations: float
    share_conditions: float
    shrinkage: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["multirel"] = "multirel"


@dataclass(frozen=True)
class Relboost:
    """Immutable configuration for the getML Relboost feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.Relboost`` (assumed — confirm against the
    getML API reference). ``propositionalization`` nests a full ``FastProp``
    configuration. ``type`` is the pydantic discriminator tag for the
    ``FeatureLearner`` union.
    """

    allow_null_weights: bool
    delta_t: float
    gamma: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_depth: int
    min_df: int
    min_num_samples: int
    num_features: int
    num_subfeatures: int
    num_threads: int
    # FastProp settings used for the propositionalization sub-step.
    propositionalization: FastProp
    reg_lambda: float
    sampling_factor: float
    seed: int
    shrinkage: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["relboost"] = "relboost"


@dataclass(frozen=True)
class RelMT:
    """Immutable configuration for the getML RelMT feature learner.

    Field names mirror the constructor parameters of
    ``getml.feature_learning.RelMT`` (assumed — confirm against the getML
    API reference). ``propositionalization`` nests a full ``FastProp``
    configuration. ``type`` is the pydantic discriminator tag for the
    ``FeatureLearner`` union.
    """

    allow_avg: bool
    delta_t: float
    gamma: float
    loss_function: CrossEntropyLossType | SquareLossType | None
    max_depth: int
    min_df: int
    min_num_samples: int
    num_features: int
    num_subfeatures: int
    num_threads: int
    # FastProp settings used for the propositionalization sub-step.
    propositionalization: FastProp
    reg_lambda: float
    sampling_factor: float
    seed: int
    shrinkage: float
    silent: bool
    vocab_size: int
    # Discriminator tag; must stay unique across FeatureLearner variants.
    type: Literal["rel_mt"] = "rel_mt"


# Tagged union of all feature-learner configurations. Pydantic selects the
# concrete variant by matching the ``type`` literal field of each dataclass.
FeatureLearner = Annotated[
    FastProp | Fastboost | Multirel | Relboost | RelMT,
    Field(discriminator="type"),
]
Comment thread
Urfoex marked this conversation as resolved.
126 changes: 126 additions & 0 deletions src/getml_io/getml/predictors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from __future__ import annotations

from typing import Annotated, Literal

from pydantic import Field
from pydantic.dataclasses import dataclass


@dataclass(frozen=True)
class LinearRegression:
    """Immutable configuration for the getML linear-regression predictor.

    Field names mirror ``getml.predictors.LinearRegression`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``FeatureSelector``/``Predictor`` union.
    """

    learning_rate: float
    reg_lambda: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["linear_regression"] = "linear_regression"


@dataclass(frozen=True)
class LogisticRegression:
    """Immutable configuration for the getML logistic-regression predictor.

    Field names mirror ``getml.predictors.LogisticRegression`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``FeatureSelector``/``Predictor`` union.
    """

    learning_rate: float
    reg_lambda: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["logistic_regression"] = "logistic_regression"


@dataclass(frozen=True)
class ScaleGBMClassifier:
    """Immutable configuration for the getML ScaleGBM classifier.

    Field names mirror ``getml.predictors.ScaleGBMClassifier`` (assumed —
    confirm against the getML API reference). ``objective`` is pinned to the
    only value valid for binary classification here.
    """

    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    gamma: float
    goss_a: float
    goss_b: float
    learning_rate: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    n_jobs: int
    # Only the binary logistic objective is accepted for the classifier.
    objective: Literal["binary:logistic"]
    reg_lambda: float
    seed: int
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["scale_gbm_classifier"] = "scale_gbm_classifier"


@dataclass(frozen=True)
class ScaleGBMRegressor:
    """Immutable configuration for the getML ScaleGBM regressor.

    Field names mirror ``getml.predictors.ScaleGBMRegressor`` (assumed —
    confirm against the getML API reference). ``objective`` is pinned to the
    only value valid for regression here.
    """

    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    gamma: float
    goss_a: float
    goss_b: float
    learning_rate: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    n_jobs: int
    # Only the squared-error objective is accepted for the regressor.
    objective: Literal["reg:squarederror"]
    reg_lambda: float
    seed: int
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["scale_gbm_regressor"] = "scale_gbm_regressor"


@dataclass(frozen=True)
class XGBoostClassifier:
    """Immutable configuration for the getML XGBoost classifier.

    Field names mirror ``getml.predictors.XGBoostClassifier`` (assumed —
    confirm against the getML API reference).

    NOTE(review): field order differs from ``XGBoostRegressor`` —
    ``external_memory`` sits after ``n_estimators`` here but after
    ``early_stopping_rounds`` there. Reordering would change positional
    construction, so it is left as-is; prefer keyword construction.
    """

    booster: str
    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    gamma: float
    learning_rate: float
    max_delta_step: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    external_memory: bool
    normalize_type: str
    num_parallel_tree: int
    n_jobs: int
    # Classification objectives accepted by this wrapper.
    objective: Literal["reg:logistic", "binary:logistic", "binary:logitraw"]
    one_drop: bool
    rate_drop: float
    reg_alpha: float
    reg_lambda: float
    sample_type: str
    silent: bool
    skip_drop: float
    subsample: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["xgboost_classifier"] = "xgboost_classifier"


@dataclass(frozen=True)
class XGBoostRegressor:
    """Immutable configuration for the getML XGBoost regressor.

    Field names mirror ``getml.predictors.XGBoostRegressor`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``FeatureSelector``/``Predictor`` union.
    """

    booster: str
    colsample_bylevel: float
    colsample_bytree: float
    early_stopping_rounds: int
    external_memory: bool
    gamma: float
    learning_rate: float
    max_delta_step: float
    max_depth: int
    min_child_weights: float
    n_estimators: int
    normalize_type: str
    num_parallel_tree: int
    n_jobs: int
    # Regression objectives accepted by this wrapper.
    objective: Literal["reg:squarederror", "reg:tweedie", "reg:linear"]
    one_drop: bool
    rate_drop: float
    reg_alpha: float
    reg_lambda: float
    sample_type: str
    silent: bool
    skip_drop: float
    subsample: float
    # Discriminator tag; must stay unique across predictor variants.
    type: Literal["xgboost_regressor"] = "xgboost_regressor"


# Tagged union of all predictor configurations. Pydantic selects the concrete
# variant by matching the ``type`` literal field of each dataclass.
FeatureSelector = Annotated[
    LinearRegression
    | LogisticRegression
    | ScaleGBMRegressor
    | ScaleGBMClassifier
    | XGBoostRegressor
    | XGBoostClassifier,
    Field(discriminator="type"),
]
# Feature selectors and predictors share the same set of models in getML,
# so the alias is reused for both roles.
Predictor = FeatureSelector
69 changes: 69 additions & 0 deletions src/getml_io/getml/preprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from __future__ import annotations

from collections.abc import Set as AbstractSet
from typing import Annotated, Literal

from getml.feature_learning.aggregations.types import MappingAggregations
from pydantic import Field
from pydantic.dataclasses import dataclass


@dataclass(frozen=True)
class CategoryTrimmer:
    """Immutable configuration for the getML CategoryTrimmer preprocessor.

    Field names mirror ``getml.preprocessors.CategoryTrimmer`` (assumed —
    confirm against the getML API reference). ``type`` is the pydantic
    discriminator tag for the ``Preprocessor`` union.
    """

    max_num_categories: int
    min_freq: int
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["category_trimmer"] = "category_trimmer"


@dataclass(frozen=True)
class EmailDomain:
    """Marker configuration for the getML EmailDomain preprocessor.

    Carries no parameters; only the ``type`` discriminator tag used by the
    ``Preprocessor`` union.
    """

    type: Literal["email_domain"] = "email_domain"


@dataclass(frozen=True)
class Imputation:
    """Immutable configuration for the getML Imputation preprocessor.

    Field names mirror ``getml.preprocessors.Imputation`` (assumed — confirm
    against the getML API reference).
    """

    add_dummies: bool
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["imputation"] = "imputation"


@dataclass(frozen=True)
class Mapping:
    """Immutable configuration for the getML Mapping preprocessor.

    Field names mirror ``getml.preprocessors.Mapping`` (assumed — confirm
    against the getML API reference).
    """

    # Set of aggregation functions the mapping may apply.
    aggregation: AbstractSet[MappingAggregations]
    min_freq: int
    multithreading: bool
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["mapping"] = "mapping"


@dataclass(frozen=True)
class Seasonal:
    """Immutable configuration for the getML Seasonal preprocessor.

    Each ``disable_*`` flag turns off extraction of the corresponding
    seasonal component (assumed from naming — confirm against the getML
    API reference).
    """

    disable_year: bool
    disable_month: bool
    disable_weekday: bool
    disable_hour: bool
    disable_minute: bool
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["seasonal"] = "seasonal"


@dataclass(frozen=True)
class Substring:
    """Immutable configuration for the getML Substring preprocessor.

    Field names mirror ``getml.preprocessors.Substring`` (assumed — confirm
    against the getML API reference).
    """

    begin: int
    length: int
    unit: str
    # Discriminator tag; must stay unique across preprocessor variants.
    type: Literal["substring"] = "substring"


@dataclass(frozen=True)
class TextFieldSplitter:
    """Marker configuration for the getML TextFieldSplitter preprocessor.

    Carries no parameters; only the ``type`` discriminator tag used by the
    ``Preprocessor`` union.
    """

    type: Literal["text_field_splitter"] = "text_field_splitter"


# Tagged union of all preprocessor configurations. Pydantic selects the
# concrete variant by matching the ``type`` literal field of each dataclass.
Preprocessor = Annotated[
    CategoryTrimmer
    | EmailDomain
    | Imputation
    | Mapping
    | Seasonal
    | Substring
    | TextFieldSplitter,
    Field(discriminator="type"),
]
13 changes: 13 additions & 0 deletions src/getml_io/getml/relationships.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from enum import Enum

from getml.data import relationship as getml_relationship


class Relationship(str, Enum):
    """String enum of getML join relationships.

    Values are taken directly from ``getml.data.relationship`` so that
    serialized strings stay in sync with the getML library.
    """

    MANY_TO_MANY = getml_relationship.many_to_many
    MANY_TO_ONE = getml_relationship.many_to_one
    ONE_TO_MANY = getml_relationship.one_to_many
    ONE_TO_ONE = getml_relationship.one_to_one
    PROPOSITIONALIZATION = getml_relationship.propositionalization
30 changes: 30 additions & 0 deletions src/getml_io/getml/roles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import annotations

from collections.abc import Sequence
from enum import Enum

from getml.data import roles
from pydantic.dataclasses import dataclass


class Role(str, Enum):
    """String enum of getML column roles.

    Values are taken directly from ``getml.data.roles`` so that serialized
    strings stay in sync with the getML library.
    """

    CATEGORICAL = roles.categorical
    JOIN_KEY = roles.join_key
    NUMERICAL = roles.numerical
    TARGET = roles.target
    TEXT = roles.text
    TIME_STAMP = roles.time_stamp
    UNUSED_FLOAT = roles.unused_float
    UNUSED_STRING = roles.unused_string


@dataclass(frozen=True)
class Roles:
    """Column names grouped by their getML role.

    Each field lists the names of the columns assigned to that role
    (presumably mirroring ``getml.data.Roles`` — confirm against the getML
    API reference).

    NOTE(review): ``Sequence[str]`` permits mutable lists, so a frozen
    instance is not reliably hashable; consider ``tuple[str, ...]`` if
    instances must be used as dict keys or set members.
    """

    categorical: Sequence[str]
    join_key: Sequence[str]
    numerical: Sequence[str]
    target: Sequence[str]
    text: Sequence[str]
    time_stamp: Sequence[str]
    unused_float: Sequence[str]
    unused_string: Sequence[str]
1 change: 1 addition & 0 deletions src/getml_io/metadata/container_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ContainerInformation:
deep_copy: bool
path: Annotated[Path, Field(exclude=True)]

# TODO @urfoex: #53 Adjust relative path already in serialization function
@model_serializer()
def _serialize_model(
self,
Expand Down
13 changes: 13 additions & 0 deletions src/getml_io/metadata/data_model_information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from collections.abc import Mapping, Sequence

from pydantic.dataclasses import dataclass

from getml_io.metadata.placeholder_information import PlaceholderInformation


@dataclass(frozen=True)
class DataModelInformation:
    """Immutable description of a getML data model.

    ``population`` describes the population table; ``peripheral`` maps each
    peripheral name to the placeholder descriptions joined under it.
    """

    # Placeholder description of the population (root) table.
    population: PlaceholderInformation
    # Peripheral-table name -> placeholder descriptions joined under it.
    peripheral: Mapping[str, Sequence[PlaceholderInformation]]
Loading
Loading