Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/getml_io/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def deserialize(
pipeline_id=pipeline,
container_id=container,
)
message = f"Deserializing {project_information} from {root_storage_directory}"
message = f"Deserializing {project_information!r} from {root_storage_directory}"
# TODO @urfoex: #20: Implement deserialization logic
raise NotImplementedError(message)

Expand Down
30 changes: 17 additions & 13 deletions src/getml_io/getml/feature_learning.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from collections.abc import Set as AbstractSet
from typing import Annotated, Literal
from typing import Annotated, ClassVar, Literal

from getml.feature_learning.aggregations.types import (
FastPropAggregations,
Expand All @@ -11,12 +11,12 @@
CrossEntropyLossType,
SquareLossType,
)
from pydantic import Field
from pydantic.dataclasses import dataclass
from pydantic import BaseModel, ConfigDict, Field


@dataclass(frozen=True)
class FastProp:
class FastProp(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)
Comment thread
Urfoex marked this conversation as resolved.

aggregation: AbstractSet[FastPropAggregations]
delta_t: float
loss_function: CrossEntropyLossType | SquareLossType | None
Expand All @@ -31,8 +31,9 @@ class FastProp:
type: Literal["fast_prop"] = "fast_prop"


@dataclass(frozen=True)
class Fastboost:
class Fastboost(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

gamma: float
loss_function: CrossEntropyLossType | SquareLossType | None
max_depth: int
Expand All @@ -47,8 +48,9 @@ class Fastboost:
type: Literal["fastboost"] = "fastboost"


@dataclass(frozen=True)
class Multirel:
class Multirel(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

aggregation: AbstractSet[MultirelAggregations]
allow_sets: bool
delta_t: float
Expand All @@ -73,8 +75,9 @@ class Multirel:
type: Literal["multirel"] = "multirel"


@dataclass(frozen=True)
class Relboost:
class Relboost(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

allow_null_weights: bool
delta_t: float
gamma: float
Expand All @@ -95,8 +98,9 @@ class Relboost:
type: Literal["relboost"] = "relboost"


@dataclass(frozen=True)
class RelMT:
class RelMT(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

allow_avg: bool
delta_t: float
gamma: float
Expand Down
35 changes: 20 additions & 15 deletions src/getml_io/getml/predictors.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
from __future__ import annotations

from typing import Annotated, Literal
from typing import Annotated, ClassVar, Literal

from pydantic import Field
from pydantic.dataclasses import dataclass
from pydantic import BaseModel, ConfigDict, Field


@dataclass(frozen=True)
class LinearRegression:
class LinearRegression(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

learning_rate: float
reg_lambda: float
type: Literal["linear_regression"] = "linear_regression"


@dataclass(frozen=True)
class LogisticRegression:
class LogisticRegression(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

learning_rate: float
reg_lambda: float
type: Literal["logistic_regression"] = "logistic_regression"


@dataclass(frozen=True)
class ScaleGBMClassifier:
class ScaleGBMClassifier(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

colsample_bylevel: float
colsample_bytree: float
early_stopping_rounds: int
Expand All @@ -39,8 +41,9 @@ class ScaleGBMClassifier:
type: Literal["scale_gbm_classifier"] = "scale_gbm_classifier"


@dataclass(frozen=True)
class ScaleGBMRegressor:
class ScaleGBMRegressor(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

colsample_bylevel: float
colsample_bytree: float
early_stopping_rounds: int
Expand All @@ -58,8 +61,9 @@ class ScaleGBMRegressor:
type: Literal["scale_gbm_regressor"] = "scale_gbm_regressor"


@dataclass(frozen=True)
class XGBoostClassifier:
class XGBoostClassifier(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

booster: str
colsample_bylevel: float
colsample_bytree: float
Expand All @@ -86,8 +90,9 @@ class XGBoostClassifier:
type: Literal["xgboost_classifier"] = "xgboost_classifier"


@dataclass(frozen=True)
class XGBoostRegressor:
class XGBoostRegressor(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

booster: str
colsample_bylevel: float
colsample_bytree: float
Expand Down
40 changes: 23 additions & 17 deletions src/getml_io/getml/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,45 @@
from __future__ import annotations

from collections.abc import Set as AbstractSet
from typing import Annotated, Literal
from typing import Annotated, ClassVar, Literal

from getml.feature_learning.aggregations.types import MappingAggregations
from pydantic import Field
from pydantic.dataclasses import dataclass
from pydantic import BaseModel, ConfigDict, Field


@dataclass(frozen=True)
class CategoryTrimmer:
class CategoryTrimmer(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

max_num_categories: int
min_freq: int
type: Literal["category_trimmer"] = "category_trimmer"


@dataclass(frozen=True)
class EmailDomain:
class EmailDomain(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

type: Literal["email_domain"] = "email_domain"


@dataclass(frozen=True)
class Imputation:
class Imputation(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

add_dummies: bool
type: Literal["imputation"] = "imputation"


@dataclass(frozen=True)
class Mapping:
class Mapping(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

aggregation: AbstractSet[MappingAggregations]
min_freq: int
multithreading: bool
type: Literal["mapping"] = "mapping"


@dataclass(frozen=True)
class Seasonal:
class Seasonal(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

disable_year: bool
disable_month: bool
disable_weekday: bool
Expand All @@ -44,16 +48,18 @@ class Seasonal:
type: Literal["seasonal"] = "seasonal"


@dataclass(frozen=True)
class Substring:
class Substring(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

begin: int
length: int
unit: str
type: Literal["substring"] = "substring"


@dataclass(frozen=True)
class TextFieldSplitter:
class TextFieldSplitter(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

type: Literal["text_field_splitter"] = "text_field_splitter"


Expand Down
11 changes: 8 additions & 3 deletions src/getml_io/getml/project.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import logging
from collections.abc import Generator
from contextlib import contextmanager
from dataclasses import dataclass
from typing import ClassVar

from getml.data import Container
from getml.pipeline import Pipeline
from pydantic import BaseModel, ConfigDict

from getml_io.getml.exception import (
PipelineNotFoundError,
Expand All @@ -24,8 +25,12 @@
logger: logging.Logger = logging.getLogger(__name__)


@dataclass
class Project:
class Project(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(
arbitrary_types_allowed=True,
frozen=True,
)

name: str
pipeline: Pipeline
container: Container
Expand Down
9 changes: 6 additions & 3 deletions src/getml_io/getml/project_information.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from dataclasses import dataclass
from typing import ClassVar

from pydantic import BaseModel, ConfigDict


class ProjectInformation(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

@dataclass
class ProjectInformation:
project_name: str
pipeline_id: str
container_id: str
8 changes: 5 additions & 3 deletions src/getml_io/getml/roles.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

from collections.abc import Sequence
from enum import Enum
from typing import ClassVar

from getml.data import roles
from pydantic.dataclasses import dataclass
from pydantic import BaseModel, ConfigDict


class Role(str, Enum):
Expand All @@ -18,8 +19,9 @@ class Role(str, Enum):
UNUSED_STRING = roles.unused_string


@dataclass(frozen=True)
class Roles:
class Roles(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

categorical: Sequence[str]
join_key: Sequence[str]
numerical: Sequence[str]
Expand Down
62 changes: 10 additions & 52 deletions src/getml_io/metadata/container_information.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,20 @@
from __future__ import annotations

from collections.abc import Mapping
from pathlib import Path
from typing import Annotated
from typing import ClassVar

from pydantic import Field, model_serializer
from pydantic.dataclasses import dataclass
from typing_extensions import TypedDict
from pydantic import BaseModel, ConfigDict

from getml_io.metadata.dataframe_information import DataFrameInformation
from getml_io.metadata.utils import derive_instance_with_relative_path
from getml_io.metadata.dataframe_information import (
DataFrameInformation,
DataFrameInformationByName,
)


class ContainerInformationDict(TypedDict):
Comment thread
Urfoex marked this conversation as resolved.
id: str
population: DataFrameInformation | None
peripheral: Mapping[str, DataFrameInformation]
subsets: Mapping[str, DataFrameInformation]
deep_copy: bool
class ContainerInformation(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)


@dataclass
class ContainerInformation:
id: str
population: DataFrameInformation | None
peripheral: Mapping[str, DataFrameInformation]
subsets: Mapping[str, DataFrameInformation]
peripheral: DataFrameInformationByName
subsets: DataFrameInformationByName
deep_copy: bool
path: Annotated[Path, Field(exclude=True)]

# TODO @urfoex: #53 Adjust relative path already in serialization function
@model_serializer()
def _serialize_model(
self,
) -> ContainerInformationDict:
return {
"id": self.id,
"population": self._create_dataframe_information_with_relative_path(
self.population,
)
if self.population
else None,
"peripheral": {
name: self._create_dataframe_information_with_relative_path(peripheral)
for name, peripheral in self.peripheral.items()
},
"subsets": {
name: self._create_dataframe_information_with_relative_path(subset)
for name, subset in self.subsets.items()
},
"deep_copy": self.deep_copy,
}

def _create_dataframe_information_with_relative_path(
self,
dataframe_information: DataFrameInformation,
) -> DataFrameInformation:
return derive_instance_with_relative_path(
dataframe_information,
self.path,
)
8 changes: 5 additions & 3 deletions src/getml_io/metadata/data_model_information.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from __future__ import annotations

from collections.abc import Mapping, Sequence
from typing import ClassVar

from pydantic.dataclasses import dataclass
from pydantic import BaseModel, ConfigDict

from getml_io.metadata.placeholder_information import PlaceholderInformation


@dataclass(frozen=True)
class DataModelInformation:
class DataModelInformation(BaseModel):
model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

population: PlaceholderInformation
peripheral: Mapping[str, Sequence[PlaceholderInformation]]
Loading
Loading