Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ jobs:
run: |
uv run ruff check --extend-ignore FIX .
uv run ruff format --check .
uv run basedpyright .
- name: Show TODOs
run: |
uv run ruff check --select FIX . || true
Expand Down
2 changes: 1 addition & 1 deletion src/getml_io/getml/getml.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def alive_getml() -> Generator[None]:
getml_is_alive_on_entry = getml.communication.is_monitor_alive()
if not getml_is_alive_on_entry:
try:
getml.engine.launch(launch_browser=False)
getml.engine.launch(launch_browser=False) # pyright: ignore [reportUnknownMemberType]
except Exception as exception:
message = (
"Failed to launch getML engine. "
Expand Down
2 changes: 0 additions & 2 deletions src/getml_io/metadata/container_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing_extensions import TypedDict

from getml_io.metadata.dataframe_information import DataFrameInformation
from getml_io.metadata.exception import DataFrameInformationPathNotRelativeError
from getml_io.metadata.utils import derive_instance_with_relative_path


Expand Down Expand Up @@ -60,5 +59,4 @@ def _create_dataframe_information_with_relative_path(
return derive_instance_with_relative_path(
dataframe_information,
self.path,
DataFrameInformationPathNotRelativeError,
)
2 changes: 1 addition & 1 deletion src/getml_io/metadata/dataframe_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class ColumnStatisticsDouble:
q25: float
q50: float
q75: float
std: float
std: float | None
null_percentage: float
column_type: Literal["DOUBLE"]

Expand Down
22 changes: 0 additions & 22 deletions src/getml_io/metadata/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,25 +42,3 @@ def __init__(
path=path,
base_path=base_path,
)


class TableInformationPathNotRelativeError(PathNotRelativeError):
    """Signal that a TableInformation path is not relative to its base path.

    The error is parameterised with the name of the offending
    TableInformation, its path, and the base path it was expected
    to be relative to.
    """

    def __init__(self, name: str, path: Path, base_path: Path) -> None:
        """Forward the TableInformation details to the base exception.

        Args:
            name: Name of the TableInformation whose path is erroneous.
            path: The path that is not relative to `base_path`.
            base_path: The base path the item was checked against.

        """
        # The item type is fixed for this subclass; the remaining values are
        # handed through unchanged as keyword arguments.
        super().__init__(
            item_type="TableInformation",
            item_name=name,
            base_path=base_path,
            path=path,
        )
10 changes: 2 additions & 8 deletions src/getml_io/metadata/pipeline_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@
from getml_io.getml.predictors import FeatureSelector, Predictor
from getml_io.getml.preprocessors import Preprocessor
from getml_io.metadata.data_model_information import DataModelInformation
from getml_io.metadata.exception import (
DataFrameInformationPathNotRelativeError,
TableInformationPathNotRelativeError,
)
from getml_io.metadata.feature_sets import FeatureSets
from getml_io.metadata.placeholder_information import PlaceholderInformation
from getml_io.metadata.prediction_results import PredictionResults
Expand Down Expand Up @@ -92,17 +88,15 @@ def _serialize_model(self) -> PipelineInformationDict:
"id": self.id,
"predictions": {
name: derive_instance_with_relative_path(
table_information,
dataframe_information,
self.path,
TableInformationPathNotRelativeError,
)
for name, table_information in self.predictions.items()
for name, dataframe_information in self.predictions.items()
},
"feature_sets": {
name: derive_instance_with_relative_path(
dataframe_information,
self.path,
DataFrameInformationPathNotRelativeError,
)
for name, dataframe_information in self.feature_sets.items()
},
Expand Down
4 changes: 2 additions & 2 deletions src/getml_io/metadata/prediction_results.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections.abc import Mapping
from typing import TypeAlias

from getml_io.metadata.table_information import TableInformation
from getml_io.metadata.dataframe_information import DataFrameInformation

PredictionResults: TypeAlias = Mapping[str, TableInformation]
PredictionResults: TypeAlias = Mapping[str, DataFrameInformation]
9 changes: 0 additions & 9 deletions src/getml_io/metadata/table_information.py

This file was deleted.

47 changes: 15 additions & 32 deletions src/getml_io/metadata/utils.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,38 @@
import dataclasses
from collections.abc import Callable
from pathlib import Path
from typing import Protocol, TypeVar

from getml_io.metadata.exception import PathNotRelativeError


class InstanceProtocol(Protocol):
name: str
path: Path


InstanceType = TypeVar("InstanceType", bound=InstanceProtocol)
ErrorType = TypeVar("ErrorType", bound=PathNotRelativeError)
ErrorFactory = Callable[[str, Path, Path], ErrorType]
from getml_io.metadata.dataframe_information import DataFrameInformation
from getml_io.metadata.exception import (
DataFrameInformationPathNotRelativeError,
)


def derive_instance_with_relative_path(
instance: InstanceType,
dataframe_information: DataFrameInformation,
base_path: Path,
error_factory: ErrorFactory[ErrorType],
) -> InstanceType:
) -> DataFrameInformation:
"""Derive a copy of an instance with a path relative to the given base path.

Args:
instance: The instance to use as a template.
dataframe_information: The instance to use as a template.
base_path: The base path to which the instance's path should be relative.
error_factory: A callable that creates an error if the path is not relative.

Returns:
A new instance with the path relative to the base path.

Raises:
PathNotRelativeError: If the instance's path cannot be made relative
to the base path. The specific subclass raised is determined
by the `error_factory`.
TypeError: If the instance is not a dataclass.
DataFrameInformationPathNotRelativeError: If the instance's path cannot be made
relative to the base path.

"""
if not dataclasses.is_dataclass(instance):
message = f"Instance must be a dataclass: {type(instance)}"
raise TypeError(message)
try:
return dataclasses.replace(
instance,
path=instance.path.relative_to(base_path),
dataframe_information,
path=dataframe_information.path.relative_to(base_path),
)
except Exception as exception:
error = error_factory(
instance.name,
instance.path,
raise DataFrameInformationPathNotRelativeError(
dataframe_information.name,
dataframe_information.path,
base_path,
)
raise error from exception
) from exception
Comment thread
Urfoex marked this conversation as resolved.
4 changes: 2 additions & 2 deletions src/getml_io/serialize/dataframe_or_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def _fetch_raw_summary_statistics(
parquet_filepath: Path,
) -> dict[str, dict[str, str | int | float]]:
with (
duckdb.connect() as connection,
duckdb.connect() as connection, # pyright: ignore [reportUnknownMemberType]
):
logger.debug(
"Calculating summary statistics for Parquet '%s'",
Expand All @@ -141,7 +141,7 @@ def _fetch_raw_summary_statistics(
"dict[str, dict[str, str | int | float]]",
cast(
"object",
connection.execute(
connection.execute( # pyright: ignore [reportUnknownMemberType]
SUMMARIZE_STATEMENT_TEMPLATE,
[str(parquet_filepath)],
)
Expand Down
8 changes: 0 additions & 8 deletions src/getml_io/serialize/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,6 @@ def __init__(self, pipeline_id: str, pipeline_json_path: Path) -> None:
super().__init__("pipeline information", pipeline_id, pipeline_json_path)


class TableParquetStorageError(GetMLIOStorageError):
    """Signal that writing a Table to a parquet file did not succeed."""

    def __init__(self, name: str, path: Path) -> None:
        """Forward the table name and target path to the storage error base.

        Args:
            name: Name of the Table that could not be stored.
            path: Path that was passed as the storage location.

        """
        # The first positional argument labels what was being stored.
        stored_item = "Table as parquet"
        super().__init__(stored_item, name, path)


class UnsupportedColumnStatisticsError(GetMLIOError):
"""Exception raised when an unsupported column statistics is encountered."""

Expand Down
54 changes: 0 additions & 54 deletions src/getml_io/serialize/ndarray.py

This file was deleted.

31 changes: 18 additions & 13 deletions src/getml_io/serialize/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import cast

import numpy as np
import pyarrow as pa
from getml import feature_learning as getml_feature_learner
from getml import predictors as getml_predictor
from getml import preprocessors as getml_preprocessor
Expand Down Expand Up @@ -47,10 +48,8 @@
PipelineInformation,
)
from getml_io.metadata.prediction_results import PredictionResults
from getml_io.metadata.table_information import TableInformation
from getml_io.serialize.data_model import serialize_data_model
from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view
from getml_io.serialize.ndarray import serialize_ndarray
from getml_io.serialize.pipeline_information import serialize_pipeline_information
from getml_io.serialize.placeholder import serialize_placeholder
from getml_io.utils.convert import (
Expand Down Expand Up @@ -151,16 +150,22 @@ def serialize_predictions(
predict_storage_directory = target_storage_directory / "predictions"
prediction_results: PredictionResults = {}
for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets):
prediction = pipeline.predict(container[subset_name])
# TODO @urfoex: #54 Convert NDArray to DataFrame and use dataframe serialization
path = serialize_ndarray(
array=cast("NDArray[np.float64]", prediction),
target_storage_directory=predict_storage_directory,
name=subset_name,
prediction = cast(
"NDArray[np.float64]",
pipeline.predict(container[subset_name]), # pyright: ignore [reportUnknownMemberType]
)
Comment thread
Urfoex marked this conversation as resolved.
prediction_results[subset_name] = TableInformation(
name=subset_name,
path=path,
prediction_table = pa.Table.from_arrays( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType]
Comment thread
Urfoex marked this conversation as resolved.
prediction.T,
names=list(map(str, range(prediction.shape[1]))),
)
prediction_dataframe = DataFrame.from_arrow( # pyright: ignore [reportUnknownMemberType]
prediction_table, # pyright: ignore [reportUnknownArgumentType]
Comment thread
Urfoex marked this conversation as resolved.
f"prediction.{subset_name}",
)

prediction_results[subset_name] = serialize_dataframe_or_view(
prediction_dataframe,
predict_storage_directory,
)

return prediction_results
Expand All @@ -187,7 +192,7 @@ def serialize_feature_sets(
transform_storage_directory = target_storage_directory / "feature_sets"
feature_sets: FeatureSets = {}
for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets):
features = pipeline.transform(
features = pipeline.transform( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType]
container[subset_name],
df_name=f"features.{subset_name}",
)
Expand Down Expand Up @@ -263,7 +268,7 @@ def serialize_predictor(
return TypeAdapter(XGBoostRegressor).validate_python(predictor_as_dict)


def serialize_preprocessor(
def serialize_preprocessor( # noqa: PLR0911
preprocessor: getml_preprocessor.CategoryTrimmer
| getml_preprocessor.EmailDomain
| getml_preprocessor.Imputation
Expand Down
2 changes: 1 addition & 1 deletion tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


def as_magicmock(
value: Any, # noqa: ANN401
value: Any, # noqa: ANN401 # pyright: ignore [reportAny, reportExplicitAny]
) -> unittest.mock.MagicMock:
"""Assume `value` conforms to the return type for static analysis.

Expand Down
11 changes: 1 addition & 10 deletions tests/integration/assertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
DataFrameInformation,
)
from getml_io.metadata.pipeline_information import PipelineInformation
from getml_io.metadata.table_information import TableInformation


def assert_container_parquets(
Expand Down Expand Up @@ -168,7 +167,7 @@ def assert_pipeline_information(
prediction_name,
prediction,
) in expected_pipeline_information.predictions.items():
assert_table_information(
assert_dataframe_information(
pipeline_information.predictions[prediction_name],
prediction,
)
Expand Down Expand Up @@ -211,11 +210,3 @@ def assert_pipeline_information(
assert pipeline_information.tags
assert pipeline_information.targets == expected_pipeline_information.targets
assert pipeline_information.data_model == expected_pipeline_information.data_model


def assert_table_information(
    table_information: TableInformation,
    expected_table_information: TableInformation,
) -> None:
    """Assert that two TableInformation objects agree on name and path.

    Args:
        table_information: The object under test.
        expected_table_information: The object holding the expected values.

    Raises:
        AssertionError: If the names differ or, with equal names, the paths
            differ.

    """
    # Compared in a fixed order so a name mismatch is reported before a path
    # mismatch.
    for attribute in ("name", "path"):
        assert getattr(table_information, attribute) == getattr(
            expected_table_information,
            attribute,
        )
2 changes: 1 addition & 1 deletion tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def data_path() -> Path:

@pytest.fixture
def project_name(request: pytest.FixtureRequest) -> str:
name = cast("str", request.node.name)
name = cast("str", request.node.name) # pyright: ignore [reportUnknownMemberType]
sanitized_name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
name_hash = hashlib.md5(name.encode("utf-8")).hexdigest()[:6] # noqa: S324
return f"getml-io-{sanitized_name}-{name_hash}"
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/data/getmlproject.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _save_project_bundle(self) -> None:
self._project_name,
str(self._path),
)
getml.project.save(filename=self._path)
getml.project.save(filename=self._path) # pyright: ignore [reportUnknownMemberType]


@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
Expand Down
Loading