diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 732aec98..259b0b87 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -10,6 +10,8 @@ concurrency: jobs: ruff: runs-on: ubuntu-latest + permissions: + contents: read strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] @@ -31,6 +33,8 @@ jobs: pyright: runs-on: ubuntu-latest + permissions: + contents: read strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] diff --git a/.github/workflows/pytest-full.yaml b/.github/workflows/pytest-full.yaml index 20cf0d10..81885f05 100644 --- a/.github/workflows/pytest-full.yaml +++ b/.github/workflows/pytest-full.yaml @@ -8,22 +8,24 @@ on: jobs: pytest_full: runs-on: ubuntu-latest + permissions: + contents: read strategy: max-parallel: 2 matrix: - optimization-test: + data-qa-test: - test: classification/classification_aws_test.py - env: RUN_CLASSIFICATION_AWS_OPTIMIZATION + env: RUN_CLASSIFICATION_AWS_DATA_QA - test: classification/classification_gcp_test.py - env: RUN_CLASSIFICATION_GCP_OPTIMIZATION + env: RUN_CLASSIFICATION_GCP_DATA_QA - test: classification/sanity_gcp_test.py - env: RUN_CLASSIFICATION_GCP_SANITY_OPTIMIZATION + env: RUN_CLASSIFICATION_GCP_SANITY_DATA_QA - test: object-detection/od_aws_test.py - env: RUN_AWS_OD_OPTIMIZATION + env: RUN_AWS_OD_DATA_QA - test: object-detection/od_git_test.py - env: RUN_OD_GIT_OPTIMIZATION + env: RUN_OD_GIT_DATA_QA - test: tests/object-detection/sama_coco_test.py - env: RUN_COCO_OD_GCP_SANITY_OPTIMIZATION + env: RUN_COCO_OD_GCP_SANITY_DATA_QA steps: - uses: actions/checkout@v4 - name: Set up Python @@ -38,7 +40,7 @@ jobs: source .venv/bin/activate pip install -r requirements/dev.txt -r requirements/polars.txt - name: Run PyTest - run: .venv/bin/pytest tests/${{ matrix.optimization-test['test'] }} + run: .venv/bin/pytest tests/${{ matrix.data-qa-test['test'] }} env: API_HOST: ${{ secrets.API_HOST }} API_KEY: ${{ secrets.API_KEY }} @@ -47,4 +49,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} UNIQUE_ID: ${{ github.ref }}-${{ github.run_number }} - ${{ matrix.optimization-test['env'] }}: true + ${{ matrix.data-qa-test['env'] }}: true diff --git a/.github/workflows/pytest-sanity.yaml b/.github/workflows/pytest-sanity.yaml index 7561a196..ad800870 100644 --- a/.github/workflows/pytest-sanity.yaml +++ b/.github/workflows/pytest-sanity.yaml @@ -14,6 +14,8 @@ concurrency: jobs: pytest_sanity: runs-on: ${{ matrix.os }} + permissions: + contents: read strategy: max-parallel: 4 matrix: diff --git a/.github/workflows/safety-scan.yml b/.github/workflows/safety-scan.yml index 0a69f36d..e4d2dc35 100644 --- a/.github/workflows/safety-scan.yml +++ b/.github/workflows/safety-scan.yml @@ -15,6 +15,8 @@ concurrency: jobs: safety-scan: runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout hirundo-client uses: actions/checkout@v4 diff --git a/README.md b/README.md index cd4cb83a..9679f35d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Hirundo -This package exposes access to Hirundo APIs for dataset optimization for Machine Learning. +This package exposes access to Hirundo APIs for dataset QA for Machine Learning. -Dataset optimization is currently available for datasets labelled for classification and object detection. +Dataset QA is currently available for datasets labelled for classification and object detection. Support dataset storage configs include: @@ -73,7 +73,7 @@ Classification example: from hirundo import ( HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageGCP, StorageConfig, StorageTypes, @@ -84,7 +84,7 @@ gcp_bucket = StorageGCP( project="Hirundo-global", credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name="TEST-GCP cifar 100 classification dataset", labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, storage_config=StorageConfig( @@ -99,7 +99,7 @@ test_dataset = OptimizationDataset( classes=cifar100_classes, ) -test_dataset.run_optimization() +test_dataset.run_qa() results = test_dataset.check_run() print(results) ``` @@ -111,7 +111,7 @@ from hirundo import ( GitRepo, HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageGit, StorageConfig, StorageTypes, @@ -124,7 +124,7 @@ git_storage = StorageGit( ), branch="main", ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset", labeling_type=LabelingType.OBJECT_DETECTION, storage_config=StorageConfig( @@ -140,7 +140,7 @@ test_dataset = OptimizationDataset( ), ) -test_dataset.run_optimization() +test_dataset.run_qa() results = test_dataset.check_run() print(results) ``` diff --git a/docs/hirundo.dataset_optimization.rst b/docs/hirundo.dataset_optimization.rst index df2a20e5..06b359f1 100644 --- a/docs/hirundo.dataset_optimization.rst +++ b/docs/hirundo.dataset_optimization.rst @@ -1,10 +1,10 @@ .. meta:: :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' -hirundo.dataset\_optimization module +hirundo.dataset\_qa module ==================================== -.. automodule:: hirundo.dataset_optimization +.. automodule:: hirundo.dataset_qa :members: :undoc-members: :show-inheritance: diff --git a/docs/hirundo.rst b/docs/hirundo.rst index 924f2687..b34766ca 100644 --- a/docs/hirundo.rst +++ b/docs/hirundo.rst @@ -11,7 +11,7 @@ Submodules :maxdepth: 4 hirundo.cli - hirundo.dataset_optimization + hirundo.dataset_qa hirundo.enum hirundo.git hirundo.storage diff --git a/hirundo/__init__.py b/hirundo/__init__.py index f569fa0e..1f3e31ab 100644 --- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -3,13 +3,15 @@ LabelingType, StorageTypes, ) -from .dataset_optimization import ( +from .dataset_qa import ( + ClassificationRunArgs, + Domain, HirundoError, - OptimizationDataset, + ObjectDetectionRunArgs, + QADataset, RunArgs, - VisionRunArgs, ) -from .dataset_optimization_results import DatasetOptimizationResults +from .dataset_qa_results import DatasetQAResults from .git import GitPlainAuth, GitRepo, GitSSHAuth from .labeling import ( COCO, @@ -40,9 +42,11 @@ "KeylabsObjDetVideo", "KeylabsObjSegImages", "KeylabsObjSegVideo", - "OptimizationDataset", + "QADataset", + "Domain", "RunArgs", - "VisionRunArgs", + "ClassificationRunArgs", + "ObjectDetectionRunArgs", "DatasetMetadataType", "LabelingType", "GitPlainAuth", @@ -54,7 +58,7 @@ # "StorageAzure", TODO: Azure storage is coming soon "StorageGit", "StorageConfig", - "DatasetOptimizationResults", + "DatasetQAResults", "load_df", "load_from_zip", ] diff --git a/hirundo/_constraints.py b/hirundo/_constraints.py index 7fc3862b..1703273b 100644 --- a/hirundo/_constraints.py +++ b/hirundo/_constraints.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from hirundo._urls import HirundoUrl - from hirundo.dataset_optimization import LabelingInfo + from hirundo.dataset_qa import LabelingInfo from hirundo.storage import ( ResponseStorageConfig, StorageConfig, diff --git a/hirundo/cli.py b/hirundo/cli.py index c9db2da4..fc64d52d 100644 --- a/hirundo/cli.py +++ b/hirundo/cli.py @@ -198,9 +198,9 @@ def check_run( """ Check the status of a run. """ - from hirundo.dataset_optimization import OptimizationDataset + from hirundo.dataset_qa import QADataset - results = OptimizationDataset.check_run_by_id(run_id) + results = QADataset.check_run_by_id(run_id) print(f"Run results saved to {results.cached_zip_path}") @@ -209,9 +209,9 @@ def list_runs(): """ List all runs available. """ - from hirundo.dataset_optimization import OptimizationDataset + from hirundo.dataset_qa import QADataset - runs = OptimizationDataset.list_runs() + runs = QADataset.list_runs() console = Console() table = Table( diff --git a/hirundo/dataset_optimization.py b/hirundo/dataset_qa.py similarity index 80% rename from hirundo/dataset_optimization.py rename to hirundo/dataset_qa.py index a82c3b68..117d015b 100644 --- a/hirundo/dataset_optimization.py +++ b/hirundo/dataset_qa.py @@ -19,7 +19,7 @@ from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo._urls import HirundoUrl from hirundo.dataset_enum import DatasetMetadataType, LabelingType -from hirundo.dataset_optimization_results import DatasetOptimizationResults +from hirundo.dataset_qa_results import DatasetQAResults from hirundo.labeling import YOLO, LabelingInfo from hirundo.logger import get_logger from hirundo.storage import ResponseStorageConfig, StorageConfig @@ -30,7 +30,7 @@ class HirundoError(Exception): """ - Custom exception used to indicate errors in `hirundo` dataset optimization runs + Custom exception used to indicate errors in `hirundo` dataset QA runs """ pass @@ -51,14 +51,14 @@ class RunStatus(Enum): STATUS_TO_TEXT_MAP = { - RunStatus.STARTED.value: "Optimization run in progress. Downloading dataset", - RunStatus.PENDING.value: "Optimization run queued and not yet started", - RunStatus.SUCCESS.value: "Optimization run completed successfully", - RunStatus.FAILURE.value: "Optimization run failed", + RunStatus.STARTED.value: "Dataset QA run in progress. Downloading dataset", + RunStatus.PENDING.value: "Dataset QA run queued and not yet started", + RunStatus.SUCCESS.value: "Dataset QA run completed successfully", + RunStatus.FAILURE.value: "Dataset QA run failed", RunStatus.AWAITING_MANUAL_APPROVAL.value: "Awaiting manual approval", - RunStatus.RETRY.value: "Optimization run failed. Retrying", - RunStatus.REVOKED.value: "Optimization run was cancelled", - RunStatus.REJECTED.value: "Optimization run was rejected", + RunStatus.RETRY.value: "Dataset QA run failed. Retrying", + RunStatus.REVOKED.value: "Dataset QA run was cancelled", + RunStatus.REJECTED.value: "Dataset QA run was rejected", } STATUS_TO_PROGRESS_MAP = { RunStatus.STARTED.value: 0.0, @@ -72,33 +72,51 @@ class RunStatus(Enum): } -class VisionRunArgs(BaseModel): - upsample: bool = False +class ClassificationRunArgs(BaseModel): + image_size: typing.Optional[tuple[int, int]] = (224, 224) + """ + Size (width, height) to which to resize classification images. + It is recommended to keep this value at (224, 224) unless your classes are differentiated by very small differences. + """ + upsample: typing.Optional[bool] = False """ Whether to upsample the dataset to attempt to balance the classes. """ - min_abs_bbox_size: int = 0 + + +class ObjectDetectionRunArgs(ClassificationRunArgs): + min_abs_bbox_size: typing.Optional[int] = None """ - Minimum valid size (in pixels) of a bounding box to keep it in the dataset for optimization. + Minimum valid size (in pixels) of a bounding box to keep it in the dataset for QA. """ - min_abs_bbox_area: int = 0 + min_abs_bbox_area: typing.Optional[int] = None """ - Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for optimization. + Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for QA. """ - min_rel_bbox_size: float = 0.0 + min_rel_bbox_size: typing.Optional[float] = None """ Minimum valid size (as a fraction of both image height and width) for a bounding box - to keep it in the dataset for optimization, relative to the corresponding dimension size, + to keep it in the dataset for QA, relative to the corresponding dimension size, i.e. if the bounding box is 10% of the image width and 5% of the image height, it will be kept if this value is 0.05, but not if the value is 0.06 (since both width and height are checked). """ - min_rel_bbox_area: float = 0.0 + min_rel_bbox_area: typing.Optional[float] = None + """ + Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for QA. + """ + crop_ratio: typing.Optional[float] = None + """ + Ratio of the bounding box to crop. + Change this value at your own risk. It is recommended to keep it at 1.0 unless you know what you are doing. + """ + add_mask_channel: typing.Optional[bool] = None """ - Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for optimization. + Whether to add a mask channel to the image. + Change at your own risk. It is recommended to keep it at False unless you know what you are doing. """ -RunArgs = typing.Union[VisionRunArgs] +RunArgs = typing.Union[ClassificationRunArgs, ObjectDetectionRunArgs] class AugmentationName(str, Enum): @@ -111,13 +129,14 @@ class AugmentationName(str, Enum): GAUSSIAN_BLUR = "GaussianBlur" -class Modality(str, Enum): - IMAGE = "Image" - RADAR = "Radar" - EKG = "EKG" +class Domain(str, Enum): + RADAR = "RADAR" + VISION = "VISION" + SPEECH = "SPEECH" + TABULAR = "TABULAR" -class OptimizationDataset(BaseModel): +class QADataset(BaseModel): id: typing.Optional[int] = Field(default=None) """ The ID of the dataset created on the server. @@ -171,15 +190,15 @@ class OptimizationDataset(BaseModel): For audio datasets, this field is ignored. If no value is provided, all augmentations are applied to vision datasets. """ - modality: Modality = Modality.IMAGE + domain: Domain = Domain.VISION """ - Used to define the modality of the dataset. + Used to define the domain of the dataset. Defaults to Image. """ run_id: typing.Optional[str] = Field(default=None, init=False) """ - The ID of the Dataset Optimization run created on the server. + The ID of the Dataset QA run created on the server. """ status: typing.Optional[RunStatus] = None @@ -229,52 +248,52 @@ def validate_dataset(self): return self @staticmethod - def get_by_id(dataset_id: int) -> "OptimizationDataset": + def get_by_id(dataset_id: int) -> "QADataset": """ - Get a `OptimizationDataset` instance from the server by its ID + Get a `QADataset` instance from the server by its ID Args: - dataset_id: The ID of the `OptimizationDataset` instance to get + dataset_id: The ID of the `QADataset` instance to get """ response = requests.get( - f"{API_HOST}/dataset-optimization/dataset/{dataset_id}", + f"{API_HOST}/dataset-qa/dataset/{dataset_id}", headers=get_headers(), timeout=READ_TIMEOUT, ) raise_for_status_with_reason(response) dataset = response.json() - return OptimizationDataset(**dataset) + return QADataset(**dataset) @staticmethod - def get_by_name(name: str) -> "OptimizationDataset": + def get_by_name(name: str) -> "QADataset": """ - Get a `OptimizationDataset` instance from the server by its name + Get a `QADataset` instance from the server by its name Args: - name: The name of the `OptimizationDataset` instance to get + name: The name of the `QADataset` instance to get """ response = requests.get( - f"{API_HOST}/dataset-optimization/dataset/by-name/{name}", + f"{API_HOST}/dataset-qa/dataset/by-name/{name}", headers=get_headers(), timeout=READ_TIMEOUT, ) raise_for_status_with_reason(response) dataset = response.json() - return OptimizationDataset(**dataset) + return QADataset(**dataset) @staticmethod def list_datasets( organization_id: typing.Optional[int] = None, - ) -> list["DataOptimizationDatasetOut"]: + ) -> list["QADatasetOut"]: """ - Lists all the optimization datasets created by user's default organization + Lists all the datasets created by user's default organization or the `organization_id` passed Args: organization_id: The ID of the organization to list the datasets for. """ response = requests.get( - f"{API_HOST}/dataset-optimization/dataset/", + f"{API_HOST}/dataset-qa/dataset/", params={"dataset_organization_id": organization_id}, headers=get_headers(), timeout=READ_TIMEOUT, @@ -282,7 +301,7 @@ def list_datasets( raise_for_status_with_reason(response) datasets = response.json() return [ - DataOptimizationDatasetOut( + QADatasetOut( **ds, ) for ds in datasets @@ -291,17 +310,17 @@ def list_datasets( @staticmethod def list_runs( organization_id: typing.Optional[int] = None, - ) -> list["DataOptimizationRunOut"]: + ) -> list["DataQARunOut"]: """ - Lists all the `OptimizationDataset` instances created by user's default organization + Lists all the `QADataset` instances created by user's default organization or the `organization_id` passed - Note: The return type is `list[dict]` and not `list[OptimizationDataset]` + Note: The return type is `list[dict]` and not `list[QADataset]` Args: organization_id: The ID of the organization to list the datasets for. """ response = requests.get( - f"{API_HOST}/dataset-optimization/run/list", + f"{API_HOST}/dataset-qa/run/list", params={"dataset_organization_id": organization_id}, headers=get_headers(), timeout=READ_TIMEOUT, @@ -309,7 +328,7 @@ def list_runs( raise_for_status_with_reason(response) runs = response.json() return [ - DataOptimizationRunOut( + DataQARunOut( **run, ) for run in runs @@ -318,13 +337,13 @@ def list_runs( @staticmethod def delete_by_id(dataset_id: int) -> None: """ - Deletes a `OptimizationDataset` instance from the server by its ID + Deletes a `QADataset` instance from the server by its ID Args: - dataset_id: The ID of the `OptimizationDataset` instance to delete + dataset_id: The ID of the `QADataset` instance to delete """ response = requests.delete( - f"{API_HOST}/dataset-optimization/dataset/{dataset_id}", + f"{API_HOST}/dataset-qa/dataset/{dataset_id}", headers=get_headers(), timeout=MODIFY_TIMEOUT, ) @@ -333,14 +352,14 @@ def delete_by_id(dataset_id: int) -> None: def delete(self, storage_config=True) -> None: """ - Deletes the active `OptimizationDataset` instance from the server. - It can only be used on a `OptimizationDataset` instance that has been created. + Deletes the active `QADataset` instance from the server. + It can only be used on a `QADataset` instance that has been created. Args: - storage_config: If True, the `OptimizationDataset`'s `StorageConfig` will also be deleted + storage_config: If True, the `QADataset`'s `StorageConfig` will also be deleted Note: If `storage_config` is not set to `False` then the `storage_config_id` must be set - This can either be set manually or by creating the `StorageConfig` instance via the `OptimizationDataset`'s + This can either be set manually or by creating the `StorageConfig` instance via the `QADataset`'s `create` method """ if storage_config: @@ -357,7 +376,7 @@ def create( replace_if_exists: bool = False, ) -> int: """ - Create a `OptimizationDataset` instance on the server. + Create a `QADataset` instance on the server. If the `storage_config_id` field is not set, the storage config will also be created and the field will be set. Args: @@ -366,7 +385,7 @@ def create( (this is determined by a dataset of the same name in the same organization). Returns: - The ID of the created `OptimizationDataset` instance + The ID of the created `QADataset` instance """ if self.storage_config is None and self.storage_config_id is None: raise ValueError("No dataset storage has been provided") @@ -391,7 +410,7 @@ def create( model_dict = self.model_dump(mode="json") # ⬆️ Get dict of model fields from Pydantic model instance dataset_response = requests.post( - f"{API_HOST}/dataset-optimization/dataset/", + f"{API_HOST}/dataset-qa/dataset/", json={ **{k: model_dict[k] for k in model_dict.keys() - {"storage_config"}}, "organization_id": organization_id, @@ -408,17 +427,17 @@ def create( return self.id @staticmethod - def launch_optimization_run( + def launch_qa_run( dataset_id: int, organization_id: typing.Optional[int] = None, run_args: typing.Optional[RunArgs] = None, ) -> str: """ - Run the dataset optimization process on the server using the dataset with the given ID + Run the dataset QA process on the server using the dataset with the given ID i.e. `dataset_id`. Args: - dataset_id: The ID of the dataset to run optimization on. + dataset_id: The ID of the dataset to run QA on. Returns: ID of the run (`run_id`). @@ -429,7 +448,7 @@ def launch_optimization_run( if run_args: run_info["run_args"] = run_args.model_dump(mode="json") run_response = requests.post( - f"{API_HOST}/dataset-optimization/run/{dataset_id}", + f"{API_HOST}/dataset-qa/run/{dataset_id}", json=run_info if len(run_info) > 0 else None, headers=get_headers(), timeout=MODIFY_TIMEOUT, @@ -440,12 +459,16 @@ def launch_optimization_run( def _validate_run_args(self, run_args: RunArgs) -> None: if self.labeling_type == LabelingType.SPEECH_TO_TEXT: raise Exception("Speech to text cannot have `run_args` set") - if self.labeling_type != LabelingType.OBJECT_DETECTION and any( - ( - run_args.min_abs_bbox_size != 0, - run_args.min_abs_bbox_area != 0, - run_args.min_rel_bbox_size != 0, - run_args.min_rel_bbox_area != 0, + if ( + self.labeling_type != LabelingType.OBJECT_DETECTION + and isinstance(run_args, ObjectDetectionRunArgs) + and any( + ( + run_args.min_abs_bbox_size != 0, + run_args.min_abs_bbox_area != 0, + run_args.min_rel_bbox_size != 0, + run_args.min_rel_bbox_area != 0, + ) ) ): raise Exception( @@ -454,7 +477,7 @@ def _validate_run_args(self, run_args: RunArgs) -> None: + f"labeling type {self.labeling_type}" ) - def run_optimization( + def run_qa( self, organization_id: typing.Optional[int] = None, replace_dataset_if_exists: bool = False, @@ -462,13 +485,13 @@ def run_optimization( ) -> str: """ If the dataset was not created on the server yet, it is created. - Run the dataset optimization process on the server using the active `OptimizationDataset` instance + Run the dataset QA process on the server using the active `QADataset` instance Args: - organization_id: The ID of the organization to run the optimization for. + organization_id: The ID of the organization to run the QA for. replace_dataset_if_exists: If True, the dataset will be replaced if it already exists (this is determined by a dataset of the same name in the same organization). - run_args: The run arguments to use for the optimization run + run_args: The run arguments to use for the QA run Returns: An ID of the run (`run_id`) and stores that `run_id` on the instance @@ -478,7 +501,7 @@ def run_optimization( self.id = self.create(replace_if_exists=replace_dataset_if_exists) if run_args is not None: self._validate_run_args(run_args) - run_id = self.launch_optimization_run(self.id, organization_id, run_args) + run_id = self.launch_qa_run(self.id, organization_id, run_args) self.run_id = run_id logger.info("Started the run with ID: %s", run_id) return run_id @@ -516,7 +539,7 @@ def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]: for sse in iter_sse_retrying( client, "GET", - f"{API_HOST}/dataset-optimization/run/{run_id}", + f"{API_HOST}/dataset-qa/run/{run_id}", headers=get_headers(), ): if sse.event == "ping": @@ -542,50 +565,46 @@ def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]: raise HirundoError("Unknown error") yield data if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value: - OptimizationDataset._check_run_by_id(run_id, retry + 1) + QADataset._check_run_by_id(run_id, retry + 1) @staticmethod def _handle_failure(iteration: dict): if iteration["result"]: - raise HirundoError( - f"Optimization run failed with error: {iteration['result']}" - ) + raise HirundoError(f"QA run failed with error: {iteration['result']}") else: - raise HirundoError( - "Optimization run failed with an unknown error in _handle_failure" - ) + raise HirundoError("QA run failed with an unknown error in _handle_failure") @staticmethod @overload def check_run_by_id( run_id: str, stop_on_manual_approval: typing.Literal[True] - ) -> typing.Optional[DatasetOptimizationResults]: ... + ) -> typing.Optional[DatasetQAResults]: ... @staticmethod @overload def check_run_by_id( run_id: str, stop_on_manual_approval: typing.Literal[False] = False - ) -> DatasetOptimizationResults: ... + ) -> DatasetQAResults: ... @staticmethod @overload def check_run_by_id( run_id: str, stop_on_manual_approval: bool - ) -> typing.Optional[DatasetOptimizationResults]: ... + ) -> typing.Optional[DatasetQAResults]: ... @staticmethod def check_run_by_id( run_id: str, stop_on_manual_approval: bool = False - ) -> typing.Optional[DatasetOptimizationResults]: + ) -> typing.Optional[DatasetQAResults]: """ Check the status of a run given its ID Args: - run_id: The `run_id` produced by a `run_optimization` call + run_id: The `run_id` produced by a `run_qa` call stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval Returns: - A DatasetOptimizationResults object with the results of the optimization run + A DatasetQAResults object with the results of the QA run Raises: HirundoError: If the maximum number of retries is reached or if the run fails @@ -593,7 +612,7 @@ def check_run_by_id( logger.debug("Checking run with ID: %s", run_id) with logging_redirect_tqdm(): t = tqdm(total=100.0) - for iteration in OptimizationDataset._check_run_by_id(run_id): + for iteration in QADataset._check_run_by_id(run_id): if iteration["state"] in STATUS_TO_PROGRESS_MAP: t.set_description(STATUS_TO_TEXT_MAP[iteration["state"]]) t.n = STATUS_TO_PROGRESS_MAP[iteration["state"]] @@ -608,11 +627,11 @@ def check_run_by_id( "State is failure, rejected, or revoked: %s", iteration["state"], ) - OptimizationDataset._handle_failure(iteration) + QADataset._handle_failure(iteration) elif iteration["state"] == RunStatus.SUCCESS.value: t.close() zip_temporary_url = iteration["result"] - logger.debug("Optimization run completed. Downloading results") + logger.debug("QA run completed. Downloading results") return download_and_extract_zip( run_id, @@ -644,7 +663,7 @@ def check_run_by_id( stage = "Unknown progress state" current_progress_percentage = t.n # Keep the same progress desc = ( - "Optimization run completed. Uploading results" + "QA run completed. Uploading results" if current_progress_percentage == 100.0 else stage ) @@ -652,28 +671,26 @@ def check_run_by_id( t.n = current_progress_percentage logger.debug("Setting progress to %s", t.n) t.refresh() - raise HirundoError( - "Optimization run failed with an unknown error in check_run_by_id" - ) + raise HirundoError("QA run failed with an unknown error in check_run_by_id") @overload def check_run( self, stop_on_manual_approval: typing.Literal[True] - ) -> typing.Optional[DatasetOptimizationResults]: ... + ) -> typing.Optional[DatasetQAResults]: ... @overload def check_run( self, stop_on_manual_approval: typing.Literal[False] = False - ) -> DatasetOptimizationResults: ... + ) -> DatasetQAResults: ... def check_run( self, stop_on_manual_approval: bool = False - ) -> typing.Optional[DatasetOptimizationResults]: + ) -> typing.Optional[DatasetQAResults]: """ Check the status of the current active instance's run. Returns: - A pandas DataFrame with the results of the optimization run + A pandas DataFrame with the results of the QA run """ if not self.run_id: @@ -690,7 +707,7 @@ async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]: This generator will produce values to show progress of the run. Args: - run_id: The `run_id` produced by a `run_optimization` call + run_id: The `run_id` produced by a `run_qa` call retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually. Yields: @@ -709,7 +726,7 @@ async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]: async_iterator = await aiter_sse_retrying( client, "GET", - f"{API_HOST}/dataset-optimization/run/{run_id}", + f"{API_HOST}/dataset-qa/run/{run_id}", headers=get_headers(), ) async for sse in async_iterator: @@ -725,7 +742,7 @@ async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]: last_event = json.loads(sse.data) yield last_event["data"] if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value: - OptimizationDataset.acheck_run_by_id(run_id, retry + 1) + QADataset.acheck_run_by_id(run_id, retry + 1) async def acheck_run(self) -> AsyncGenerator[dict, None]: """ @@ -749,14 +766,14 @@ async def acheck_run(self) -> AsyncGenerator[dict, None]: @staticmethod def cancel_by_id(run_id: str) -> None: """ - Cancel the dataset optimization run for the given `run_id`. + Cancel the dataset QA run for the given `run_id`. Args: run_id: The ID of the run to cancel """ logger.info("Cancelling run with ID: %s", run_id) response = requests.delete( - f"{API_HOST}/dataset-optimization/run/{run_id}", + f"{API_HOST}/dataset-qa/run/{run_id}", headers=get_headers(), timeout=MODIFY_TIMEOUT, ) @@ -773,14 +790,14 @@ def cancel(self) -> None: @staticmethod def archive_run_by_id(run_id: str) -> None: """ - Archive the dataset optimization run for the given `run_id`. + Archive the dataset QA run for the given `run_id`. Args: run_id: The ID of the run to archive """ logger.info("Archiving run with ID: %s", run_id) response = requests.patch( - f"{API_HOST}/dataset-optimization/run/archive/{run_id}", + f"{API_HOST}/dataset-qa/run/archive/{run_id}", headers=get_headers(), timeout=MODIFY_TIMEOUT, ) @@ -795,7 +812,7 @@ def archive(self) -> None: self.archive_run_by_id(self.run_id) -class DataOptimizationDatasetOut(BaseModel): +class QADatasetOut(BaseModel): id: int name: str @@ -814,7 +831,7 @@ class DataOptimizationDatasetOut(BaseModel): updated_at: datetime.datetime -class DataOptimizationRunOut(BaseModel): +class DataQARunOut(BaseModel): id: int name: str dataset_id: int diff --git a/hirundo/dataset_optimization_results.py b/hirundo/dataset_qa_results.py similarity index 82% rename from hirundo/dataset_optimization_results.py rename to hirundo/dataset_qa_results.py index c962ee9f..5c776324 100644 --- a/hirundo/dataset_optimization_results.py +++ b/hirundo/dataset_qa_results.py @@ -21,7 +21,7 @@ T = typing.TypeVar("T") -class DatasetOptimizationResults(BaseModel, typing.Generic[T]): +class DatasetQAResults(BaseModel, typing.Generic[T]): model_config = {"arbitrary_types_allowed": True} cached_zip_path: Path @@ -30,13 +30,13 @@ class DatasetOptimizationResults(BaseModel, typing.Generic[T]): """ suspects: T """ - A polars/pandas DataFrame containing the results of the optimization run + A polars/pandas DataFrame containing the results of the data QA run """ object_suspects: typing.Optional[T] """ - A polars/pandas DataFrame containing the object-level results of the optimization run + A polars/pandas DataFrame containing the object-level results of the data QA run """ warnings_and_errors: T """ - A polars/pandas DataFrame containing the warnings and errors of the optimization run + A polars/pandas DataFrame containing the warnings and errors of the data QA run """ diff --git a/hirundo/unzip.py b/hirundo/unzip.py index aba2a85b..30fd40ba 100644 --- a/hirundo/unzip.py +++ b/hirundo/unzip.py @@ -19,9 +19,9 @@ from hirundo._env import API_HOST from hirundo._headers import _get_auth_headers from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT -from hirundo.dataset_optimization_results import ( +from hirundo.dataset_qa_results import ( DataFrameType, - DatasetOptimizationResults, + DatasetQAResults, ) from hirundo.logger import get_logger @@ -117,7 +117,7 @@ def get_mislabel_suspect_filename(filenames: list[str]): def download_and_extract_zip( run_id: str, zip_url: str -) -> DatasetOptimizationResults[DataFrameType]: +) -> DatasetQAResults[DataFrameType]: """ Download and extract the zip file from the given URL. @@ -127,11 +127,11 @@ def download_and_extract_zip( and `warnings_and_errors.csv` files from the zip file. Args: - run_id: The ID of the optimization run. + run_id: The ID of the dataset QA run. zip_url: The URL of the zip file to download. Returns: - The dataset optimization results object. + The dataset QA results object. """ # Define the local file path cache_dir = Path.home() / ".hirundo" / "cache" @@ -140,9 +140,8 @@ def download_and_extract_zip( headers = None if Url(zip_url).scheme == "file": - zip_url = ( - f"{API_HOST}/dataset-optimization/run/local-download" - + zip_url.replace("file://", "") + zip_url = f"{API_HOST}/dataset-qa/run/local-download" + zip_url.replace( + "file://", "" ) headers = _get_auth_headers() # Stream the zip file download @@ -217,7 +216,7 @@ def download_and_extract_zip( "Failed to load warnings and errors into DataFrame", exc_info=e ) - return DatasetOptimizationResults[DataFrameType]( + return DatasetQAResults[DataFrameType]( cached_zip_path=zip_file_path, suspects=suspects_df, object_suspects=object_suspects_df, diff --git a/notebooks/Cancel_Dataset_Optimization_Run.ipynb b/notebooks/Cancel_Dataset_QA_Run.ipynb similarity index 74% rename from notebooks/Cancel_Dataset_Optimization_Run.ipynb rename to notebooks/Cancel_Dataset_QA_Run.ipynb index d6a1dfc0..4b2f7a38 100644 --- a/notebooks/Cancel_Dataset_Optimization_Run.ipynb +++ b/notebooks/Cancel_Dataset_QA_Run.ipynb @@ -7,7 +7,7 @@ "id": "view-in-github" }, "source": [ - "\"Open" + "\"Open" ] }, { @@ -16,7 +16,7 @@ "id": "WOHjhp9ed6LM" }, "source": [ - "# How to use cancel a Hirundo's Dataset Optimization run\n", + "# How to use cancel a Hirundo's Dataset QA run\n", "\n", "---\n", "\n", @@ -51,7 +51,7 @@ "id": "szD58TSCivAN" }, "source": [ - "2. Import `OptimizeDataset` class from `hirundo`" + "2. Import `QADataset` class from `hirundo`" ] }, { @@ -63,7 +63,7 @@ "outputs": [], "source": [ "from hirundo import (\n", - " OptimizationDataset,\n", + " QADataset,\n", ")" ] }, @@ -73,7 +73,7 @@ "id": "eJhmScbels65" }, "source": [ - "3. Cancel dataset optimization run by calling `cancel_by_id` with the dataset optimization run ID (a string)" + "3. Cancel dataset QA run by calling `cancel_by_id` with the dataset QA run ID (a string)" ] }, { @@ -84,12 +84,12 @@ }, "outputs": [], "source": [ - "dataset_optimization_run_id = \"\" # @param {type:\"string\"}\n", + "dataset_qa_run_id = \"\" # @param {type:\"string\"}\n", "\n", - "if not dataset_optimization_run_id:\n", - " raise ValueError(\"You must enter a dataset optimization run ID\")\n", + "if not dataset_qa_run_id:\n", + " raise ValueError(\"You must enter a dataset QA run ID\")\n", "\n", - "OptimizationDataset.cancel_by_id(dataset_optimization_run_id)" + "QADataset.cancel_by_id(dataset_qa_run_id)" ] } ], diff --git a/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb b/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb index 0e490015..f63d3b41 100644 --- a/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb +++ b/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb @@ -203,7 +203,7 @@ " \"bbox_id\": str(\n", " i\n", " ), # Box index in image used for box ID (since no unique ID is provided)\n", - " \"label\": label[\"category\"],\n", + " \"class_name\": label[\"category\"],\n", " \"xmin\": int(label[\"box2d\"][\"x1\"]),\n", " \"ymin\": int(label[\"box2d\"][\"y1\"]),\n", " \"xmax\": int(label[\"box2d\"][\"x2\"]),\n", diff --git a/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb b/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb index fdf79e88..f2d310cd 100644 --- a/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb +++ b/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -188,7 +188,7 @@ " \"bbox_id\": str(\n", " i\n", " ), # Box index in image used for box ID (since no unique ID is provided)\n", - " \"label\": label[\"category\"],\n", + " \"class_name\": label[\"category\"],\n", " \"xmin\": int(label[\"box2d\"][\"x1\"]),\n", " \"ymin\": int(label[\"box2d\"][\"y1\"]),\n", " \"xmax\": int(label[\"box2d\"][\"x2\"]),\n", diff --git a/notebooks/Create_cifar100_dataset_GCP.ipynb b/notebooks/Create_cifar100_dataset_GCP.ipynb index 5e47fd50..8d766887 100644 --- a/notebooks/Create_cifar100_dataset_GCP.ipynb +++ b/notebooks/Create_cifar100_dataset_GCP.ipynb @@ -112,13 +112,13 @@ "img_dir = temp_dir_dataset_path / \"images\"\n", "img_dir.mkdir()\n", "\n", - "csv = pd.DataFrame(columns=[\"image_path\", \"label\"], index=range(len(cifar100)))\n", + "csv = pd.DataFrame(columns=[\"image_path\", \"class_name\"], index=range(len(cifar100)))\n", "for i, item in enumerate(cifar100):\n", " image, target = item\n", " image_path = img_dir / f\"{i}.png\"\n", " csv.loc[i] = {\n", " \"image_path\": image_path.relative_to(temp_dir_dataset_path),\n", - " \"label\": cifar100.classes[target],\n", + " \"class_name\": cifar100.classes[target],\n", " }\n", " image.save(image_path)\n", "\n", diff --git a/notebooks/Hirundo_Dataset_Optimization_S3.ipynb b/notebooks/Hirundo_Dataset_Optimization_S3.ipynb index 418026bf..6b8364c9 100644 --- a/notebooks/Hirundo_Dataset_Optimization_S3.ipynb +++ b/notebooks/Hirundo_Dataset_Optimization_S3.ipynb @@ -7,7 +7,7 @@ "id": "view-in-github" }, "source": [ - "\"Open" + "\"Open" ] }, { @@ -16,7 +16,7 @@ "id": "WOHjhp9ed6LM" }, "source": [ - "# How to use Hirundo's Dataset Optimization (S3)\n", + "# How to use Hirundo's Dataset QA (S3)\n", "\n", "---\n", "\n", @@ -60,7 +60,7 @@ "id": "3qSUfS1liadM" }, "source": [ - "2. We import the `OptimizationDataset` class, as well as the `LabelingType` enum, the `StorageConfig` (to indicate where the dataset files are saved) class, `the StorageTypes` enum, and the `StorageS3` storage class" + "2. We import the `QADataset` class, as well as the `LabelingType` enum, the `StorageConfig` (to indicate where the dataset files are saved) class, `the StorageTypes` enum, and the `StorageS3` storage class" ] }, { @@ -74,7 +74,7 @@ "from hirundo import (\n", " HirundoCSV,\n", " LabelingType,\n", - " OptimizationDataset,\n", + " QADataset,\n", " StorageConfig,\n", " StorageS3,\n", " StorageTypes,\n", @@ -87,7 +87,7 @@ "id": "8KhP3Cw6gqlB" }, "source": [ - "3. First we create the `OptimizationDataset` object" + "3. First we create the `QADataset` object" ] }, { @@ -104,7 +104,7 @@ " access_key_id=os.environ[\"AWS_ACCESS_KEY\"],\n", " secret_access_key=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n", ")\n", - "test_dataset = OptimizationDataset(\n", + "test_dataset = QADataset(\n", " name=\"AWS-test-OD-BDD-validation-dataset\",\n", " labeling_type=LabelingType.OBJECT_DETECTION,\n", " storage_config=StorageConfig(\n", @@ -129,7 +129,7 @@ "id": "eJhmScbels65" }, "source": [ - "4. Now that we have created our dataset, we can launch a dataset optimization run" + "4. Now that we have created our dataset, we can launch a dataset QA run" ] }, { @@ -140,8 +140,8 @@ }, "outputs": [], "source": [ - "run_id = test_dataset.run_optimization(replace_dataset_if_exists=True)\n", - "print(\"Running optimization. Run ID is \", run_id)\n", + "run_id = test_dataset.run_qa(replace_dataset_if_exists=True)\n", + "print(\"Running dataset QA. Run ID is \", run_id)\n", "test_dataset.check_run()" ] } diff --git a/notebooks/Hirundo_Dataset_Optimization_HuggingFace.ipynb b/notebooks/Hirundo_QA_Dataset_HuggingFace.ipynb similarity index 83% rename from notebooks/Hirundo_Dataset_Optimization_HuggingFace.ipynb rename to notebooks/Hirundo_QA_Dataset_HuggingFace.ipynb index e4bcd2da..781a1f14 100644 --- a/notebooks/Hirundo_Dataset_Optimization_HuggingFace.ipynb +++ b/notebooks/Hirundo_QA_Dataset_HuggingFace.ipynb @@ -7,7 +7,7 @@ "id": "view-in-github" }, "source": [ - "\"Open" + "\"Open" ] }, { @@ -16,7 +16,7 @@ "id": "WOHjhp9ed6LM" }, "source": [ - "# How to use Hirundo's Dataset Optimization (HuggingFace)\n", + "# How to use Hirundo's Dataset QA (HuggingFace)\n", "\n", "---\n", "\n", @@ -57,7 +57,7 @@ "id": "pywDlD47if67" }, "source": [ - "2. We import the `GitRepo` class, the `OptimizationDataset` class, as well as the `LabelingType` enum, the `StorageConfig` class (to indicate where the dataset files are saved), `the StorageTypes` enum, and the `StorageGit` storage class" + "2. We import the `GitRepo` class, the `QADataset` class, as well as the `LabelingType` enum, the `StorageConfig` class (to indicate where the dataset files are saved), `the StorageTypes` enum, and the `StorageGit` storage class" ] }, { @@ -72,7 +72,7 @@ " GitRepo,\n", " HirundoCSV,\n", " LabelingType,\n", - " OptimizationDataset,\n", + " QADataset,\n", " StorageConfig,\n", " StorageGit,\n", " StorageTypes,\n", @@ -85,7 +85,7 @@ "id": "eJhmScbels65" }, "source": [ - "3. First we create the `OptimizationDataset` object" + "3. First we create the `QADataset` object" ] }, { @@ -103,7 +103,7 @@ " ),\n", " branch=\"main\",\n", ")\n", - "test_dataset = OptimizationDataset(\n", + "test_dataset = QADataset(\n", " name=\"HuggingFace-test-OD-BDD-validation dataset\",\n", " labeling_type=LabelingType.OBJECT_DETECTION,\n", " storage_config=StorageConfig(\n", @@ -126,7 +126,7 @@ "id": "FT50g4x7rsaW" }, "source": [ - "4. Now that we have created our dataset, we can launch a dataset optimization run" + "4. Now that we have created our dataset, we can launch a dataset QA run" ] }, { @@ -137,8 +137,8 @@ }, "outputs": [], "source": [ - "run_id = test_dataset.run_optimization()\n", - "print(\"Running optimization. Run ID is \", run_id)\n", + "run_id = test_dataset.run_qa()\n", + "print(\"Running dataset QA. Run ID is \", run_id)\n", "test_dataset.check_run()" ] } diff --git a/on-prem/on_prem_audio_ar_test_notebook.ipynb b/on-prem/on_prem_audio_ar_test_notebook.ipynb index 7f945aa8..45eacffa 100644 --- a/on-prem/on_prem_audio_ar_test_notebook.ipynb +++ b/on-prem/on_prem_audio_ar_test_notebook.ipynb @@ -10,11 +10,12 @@ "import os\n", "\n", "from hirundo import (\n", + " Domain,\n", " GitPlainAuth,\n", " GitRepo,\n", " HirundoCSV,\n", " LabelingType,\n", - " OptimizationDataset,\n", + " QADataset,\n", " StorageConfig,\n", " StorageGit,\n", " StorageTypes,\n", @@ -26,18 +27,18 @@ "\n", "run_ids = [\n", " run.run_id\n", - " for run in OptimizationDataset.list_runs()\n", + " for run in QADataset.list_runs()\n", " if run.run_id and run.name.startswith(\"TEST-STT-MASC-dataset\")\n", "]\n", "for run_id in run_ids:\n", - " OptimizationDataset.cancel_by_id(run_id)\n", + " QADataset.cancel_by_id(run_id)\n", "dataset_ids = [\n", " dataset.id\n", - " for dataset in OptimizationDataset.list_datasets()\n", + " for dataset in QADataset.list_datasets()\n", " if dataset.name.startswith(\"TEST-STT-MASC-dataset\")\n", "]\n", "for dataset_id in dataset_ids:\n", - " OptimizationDataset.delete_by_id(dataset_id)\n", + " QADataset.delete_by_id(dataset_id)\n", "storage_ids = [\n", " storage_config.id\n", " for storage_config in StorageConfig.list()\n", @@ -64,8 +65,9 @@ " ),\n", " branch=\"main\",\n", ")\n", - "test_dataset = OptimizationDataset(\n", + "test_dataset = QADataset(\n", " name=f\"TEST-STT-MASC-dataset{unique_id}\",\n", + " domain=Domain.SPEECH,\n", " labeling_type=LabelingType.SPEECH_TO_TEXT,\n", " language=\"ar\",\n", " storage_config=StorageConfig(\n", @@ -79,59 +81,14 @@ " ),\n", ")\n", "\n", - "# test_dataset.run_optimization()" + "# test_dataset.run_qa()" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Optimization run queued and not yet started: 0%| | 0.0/100.0 [00:00 69\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:113\u001b[0m, in \u001b[0;36mResponseStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m part \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_httpcore_stream:\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m part\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/connection_pool.py:367\u001b[0m, in \u001b[0;36mPoolByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m--> 367\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/connection_pool.py:363\u001b[0m, in \u001b[0;36mPoolByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m part \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stream:\n\u001b[1;32m 364\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m part\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:349\u001b[0m, in \u001b[0;36mHTTP11ConnectionByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m--> 349\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:341\u001b[0m, in \u001b[0;36mHTTP11ConnectionByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreceive_response_body\u001b[39m\u001b[38;5;124m\"\u001b[39m, logger, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request, kwargs):\n\u001b[0;32m--> 341\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39m_receive_response_body(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m chunk\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:210\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_response_body\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 210\u001b[0m event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_receive_event\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 211\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(event, h11\u001b[38;5;241m.\u001b[39mData):\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:221\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_exceptions({h11\u001b[38;5;241m.\u001b[39mRemoteProtocolError: RemoteProtocolError}):\n\u001b[0;32m--> 221\u001b[0m event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_h11_state\u001b[38;5;241m.\u001b[39mnext_event()\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m event \u001b[38;5;129;01mis\u001b[39;00m h11\u001b[38;5;241m.\u001b[39mNEED_DATA:\n", - "File \u001b[0;32m~/.pyenv/versions/3.9.19/lib/python3.9/contextlib.py:137\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_exceptions.py:14\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[0;34m(map)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exc, from_exc):\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m to_exc(exc) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mexc\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", - "\u001b[0;31mRemoteProtocolError\u001b[0m: peer closed connection without sending complete message body (incomplete chunked read)", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mRemoteProtocolError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mtest_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m results\u001b[38;5;241m.\u001b[39msuspects\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mar-on-prem-audio-test.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:507\u001b[0m, in \u001b[0;36mOptimizationDataset.check_run\u001b[0;34m(self, stop_on_manual_approval)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun_id:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo run has been started\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 507\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_run_by_id\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_on_manual_approval\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:440\u001b[0m, in \u001b[0;36mOptimizationDataset.check_run_by_id\u001b[0;34m(run_id, stop_on_manual_approval)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m logging_redirect_tqdm():\n\u001b[1;32m 439\u001b[0m t \u001b[38;5;241m=\u001b[39m tqdm(total\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100.0\u001b[39m)\n\u001b[0;32m--> 440\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m iteration \u001b[38;5;129;01min\u001b[39;00m OptimizationDataset\u001b[38;5;241m.\u001b[39m_check_run_by_id(run_id):\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m iteration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstate\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;129;01min\u001b[39;00m STATUS_TO_PROGRESS_MAP:\n\u001b[1;32m 442\u001b[0m t\u001b[38;5;241m.\u001b[39mset_description(STATUS_TO_TEXT_MAP[iteration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstate\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n", - "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:370\u001b[0m, in \u001b[0;36mOptimizationDataset._check_run_by_id\u001b[0;34m(run_id, retry)\u001b[0m\n\u001b[1;32m 368\u001b[0m last_event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m httpx\u001b[38;5;241m.\u001b[39mClient(timeout\u001b[38;5;241m=\u001b[39mhttpx\u001b[38;5;241m.\u001b[39mTimeout(\u001b[38;5;28;01mNone\u001b[39;00m, connect\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5.0\u001b[39m)) \u001b[38;5;28;01mas\u001b[39;00m client:\n\u001b[0;32m--> 370\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sse \u001b[38;5;129;01min\u001b[39;00m iter_sse_retrying(\n\u001b[1;32m 371\u001b[0m client,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGET\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 373\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mAPI_HOST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/dataset-optimization/run/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrun_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 374\u001b[0m headers\u001b[38;5;241m=\u001b[39mget_auth_headers(),\n\u001b[1;32m 375\u001b[0m ):\n\u001b[1;32m 376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sse\u001b[38;5;241m.\u001b[39mevent \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mping\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 377\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n", - "File \u001b[0;32m~/Programming/hirundo-client/hirundo/_iter_sse_retrying.py:62\u001b[0m, in \u001b[0;36miter_sse_retrying.._iter_sse\u001b[0;34m()\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m connect_sse(client, method, url, headers\u001b[38;5;241m=\u001b[39mconnect_headers) \u001b[38;5;28;01mas\u001b[39;00m event_source:\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sse \u001b[38;5;129;01min\u001b[39;00m event_source\u001b[38;5;241m.\u001b[39miter_sse():\n\u001b[1;32m 63\u001b[0m last_event_id \u001b[38;5;241m=\u001b[39m sse\u001b[38;5;241m.\u001b[39mid\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sse\u001b[38;5;241m.\u001b[39mretry \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx_sse/_api.py:30\u001b[0m, in \u001b[0;36mEventSource.iter_sse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_content_type()\n\u001b[1;32m 29\u001b[0m decoder \u001b[38;5;241m=\u001b[39m SSEDecoder()\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_response\u001b[38;5;241m.\u001b[39miter_lines():\n\u001b[1;32m 31\u001b[0m line \u001b[38;5;241m=\u001b[39m line\u001b[38;5;241m.\u001b[39mrstrip(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 32\u001b[0m sse \u001b[38;5;241m=\u001b[39m decoder\u001b[38;5;241m.\u001b[39mdecode(line)\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:861\u001b[0m, in \u001b[0;36mResponse.iter_lines\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 859\u001b[0m decoder \u001b[38;5;241m=\u001b[39m LineDecoder()\n\u001b[1;32m 860\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 861\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m text \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miter_text():\n\u001b[1;32m 862\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m decoder\u001b[38;5;241m.\u001b[39mdecode(text):\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m line\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:848\u001b[0m, in \u001b[0;36mResponse.iter_text\u001b[0;34m(self, chunk_size)\u001b[0m\n\u001b[1;32m 846\u001b[0m chunker \u001b[38;5;241m=\u001b[39m TextChunker(chunk_size\u001b[38;5;241m=\u001b[39mchunk_size)\n\u001b[1;32m 847\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m byte_content \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miter_bytes():\n\u001b[1;32m 849\u001b[0m text_content \u001b[38;5;241m=\u001b[39m decoder\u001b[38;5;241m.\u001b[39mdecode(byte_content)\n\u001b[1;32m 850\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunker\u001b[38;5;241m.\u001b[39mdecode(text_content):\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:829\u001b[0m, in \u001b[0;36mResponse.iter_bytes\u001b[0;34m(self, chunk_size)\u001b[0m\n\u001b[1;32m 827\u001b[0m chunker \u001b[38;5;241m=\u001b[39m ByteChunker(chunk_size\u001b[38;5;241m=\u001b[39mchunk_size)\n\u001b[1;32m 828\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 829\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m raw_bytes \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miter_raw():\n\u001b[1;32m 830\u001b[0m decoded \u001b[38;5;241m=\u001b[39m decoder\u001b[38;5;241m.\u001b[39mdecode(raw_bytes)\n\u001b[1;32m 831\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunker\u001b[38;5;241m.\u001b[39mdecode(decoded):\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:883\u001b[0m, in \u001b[0;36mResponse.iter_raw\u001b[0;34m(self, chunk_size)\u001b[0m\n\u001b[1;32m 880\u001b[0m chunker \u001b[38;5;241m=\u001b[39m ByteChunker(chunk_size\u001b[38;5;241m=\u001b[39mchunk_size)\n\u001b[1;32m 882\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 883\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m raw_stream_bytes \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstream:\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_bytes_downloaded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(raw_stream_bytes)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunker\u001b[38;5;241m.\u001b[39mdecode(raw_stream_bytes):\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_client.py:126\u001b[0m, in \u001b[0;36mBoundSyncStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mIterator[\u001b[38;5;28mbytes\u001b[39m]:\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stream:\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m chunk\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:114\u001b[0m, in \u001b[0;36mResponseStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m part \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_httpcore_stream:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m part\n", - "File \u001b[0;32m~/.pyenv/versions/3.9.19/lib/python3.9/contextlib.py:137\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 135\u001b[0m value \u001b[38;5;241m=\u001b[39m typ()\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m exc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m value\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:86\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 85\u001b[0m message \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(exc)\n\u001b[0;32m---> 86\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mapped_exc(message) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mexc\u001b[39;00m\n", - "\u001b[0;31mRemoteProtocolError\u001b[0m: peer closed connection without sending complete message body (incomplete chunked read)" - ] - } - ], + "outputs": [], "source": [ "results = test_dataset.check_run()\n", "results.suspects.to_csv(\"ar-on-prem-audio-test.csv\")" diff --git a/on-prem/on_prem_audio_he_small_test_notebook.ipynb b/on-prem/on_prem_audio_he_small_test_notebook.ipynb index aaa6b7a5..7546037d 100644 --- a/on-prem/on_prem_audio_he_small_test_notebook.ipynb +++ b/on-prem/on_prem_audio_he_small_test_notebook.ipynb @@ -10,11 +10,12 @@ "import os\n", "\n", "from hirundo import (\n", + " Domain,\n", " GitPlainAuth,\n", " GitRepo,\n", " HirundoCSV,\n", " LabelingType,\n", - " OptimizationDataset,\n", + " QADataset,\n", " StorageConfig,\n", " StorageGit,\n", " StorageTypes,\n", @@ -24,12 +25,12 @@ "\n", "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", "\n", - "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n", + "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n", "for run_id in run_ids:\n", - " OptimizationDataset.cancel_by_id(run_id)\n", - "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n", + " QADataset.cancel_by_id(run_id)\n", + "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n", "for dataset_id in dataset_ids:\n", - " OptimizationDataset.delete_by_id(dataset_id)\n", + " QADataset.delete_by_id(dataset_id)\n", "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", "for storage_id in storage_ids:\n", " StorageConfig.delete_by_id(storage_id)\n", @@ -48,8 +49,9 @@ " ),\n", " branch=\"main\",\n", ")\n", - "test_dataset = OptimizationDataset(\n", + "test_dataset = QADataset(\n", " name=f\"TEST-STT-RoboShaulGolden-dataset{unique_id}\",\n", + " domain=Domain.SPEECH,\n", " labeling_type=LabelingType.SPEECH_TO_TEXT,\n", " language=\"he\",\n", " storage_config=StorageConfig(\n", @@ -63,7 +65,7 @@ " ),\n", ")\n", "\n", - "test_dataset.run_optimization()" + "test_dataset.run_qa()" ] }, { diff --git a/on-prem/on_prem_audio_he_test_notebook.ipynb b/on-prem/on_prem_audio_he_test_notebook.ipynb index dc407217..636fa5e4 100644 --- a/on-prem/on_prem_audio_he_test_notebook.ipynb +++ b/on-prem/on_prem_audio_he_test_notebook.ipynb @@ -4,40 +4,18 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "ename": "ValidationError", - "evalue": "5 validation errors for DataOptimizationDatasetOut\nlabeling_type\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstorage_integration\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\ndata_root_url\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nlabeling_info\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstatus\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 20\u001b[0m\n\u001b[1;32m 15\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m 17\u001b[0m unique_id \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mgetenv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNIQUE_ID\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 19\u001b[0m run_ids \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m---> 20\u001b[0m dataset\u001b[38;5;241m.\u001b[39mrun_id \u001b[38;5;28;01mfor\u001b[39;00m dataset \u001b[38;5;129;01min\u001b[39;00m \u001b[43mOptimizationDataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m dataset\u001b[38;5;241m.\u001b[39mrun_id\n\u001b[1;32m 21\u001b[0m ]\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m run_id \u001b[38;5;129;01min\u001b[39;00m run_ids:\n\u001b[1;32m 23\u001b[0m OptimizationDataset\u001b[38;5;241m.\u001b[39mcancel_by_id(run_id)\n", - "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:298\u001b[0m, in \u001b[0;36mOptimizationDataset.list\u001b[0;34m(organization_id)\u001b[0m\n\u001b[1;32m 296\u001b[0m raise_for_status_with_reason(response)\n\u001b[1;32m 297\u001b[0m datasets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()\n\u001b[0;32m--> 298\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[1;32m 299\u001b[0m DataOptimizationDatasetOut(\n\u001b[1;32m 300\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mds,\n\u001b[1;32m 301\u001b[0m )\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m datasets\n\u001b[1;32m 303\u001b[0m ]\n", - "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:299\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 296\u001b[0m raise_for_status_with_reason(response)\n\u001b[1;32m 297\u001b[0m datasets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()\n\u001b[1;32m 298\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[0;32m--> 299\u001b[0m \u001b[43mDataOptimizationDatasetOut\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 300\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 301\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m datasets\n\u001b[1;32m 303\u001b[0m ]\n", - "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/pydantic/main.py:193\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 192\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mValidationError\u001b[0m: 5 validation errors for DataOptimizationDatasetOut\nlabeling_type\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstorage_integration\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\ndata_root_url\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nlabeling_info\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstatus\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "import os\n", "\n", "from hirundo import (\n", + " Domain,\n", " GitPlainAuth,\n", " GitRepo,\n", " HirundoCSV,\n", " LabelingType,\n", - " OptimizationDataset,\n", + " QADataset,\n", " StorageConfig,\n", " StorageGit,\n", " StorageTypes,\n", @@ -47,12 +25,12 @@ "\n", "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", "\n", - "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n", + "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n", "for run_id in run_ids:\n", - " OptimizationDataset.cancel_by_id(run_id)\n", - "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n", + " QADataset.cancel_by_id(run_id)\n", + "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n", "for dataset_id in dataset_ids:\n", - " OptimizationDataset.delete_by_id(dataset_id)\n", + " QADataset.delete_by_id(dataset_id)\n", "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", "for storage_id in storage_ids:\n", " StorageConfig.delete_by_id(storage_id)\n", @@ -71,8 +49,9 @@ " ),\n", " branch=\"main\",\n", ")\n", - "test_dataset = OptimizationDataset(\n", + "test_dataset = QADataset(\n", " name=f\"TEST-STT-RoboShaul-dataset{unique_id}\",\n", + " domain=Domain.SPEECH,\n", " labeling_type=LabelingType.SPEECH_TO_TEXT,\n", " language=\"he\",\n", " storage_config=StorageConfig(\n", @@ -86,7 +65,7 @@ " ),\n", ")\n", "\n", - "test_dataset.run_optimization()" + "test_dataset.run_qa()" ] }, { @@ -99,13 +78,6 @@ "results.suspects.to_csv(\"he-on-prem-audio-test.csv\")" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/on-prem/on_prem_audio_sanity_test_notebook.ipynb b/on-prem/on_prem_audio_sanity_test_notebook.ipynb index 7ba0b2a1..d82a9f0a 100644 --- a/on-prem/on_prem_audio_sanity_test_notebook.ipynb +++ b/on-prem/on_prem_audio_sanity_test_notebook.ipynb @@ -1,179 +1,181 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Deleted storage integration with ID: 9\n", - "Created storage integration with ID: 11\n", - "Created dataset with ID: 8\n", - "Started the run with ID: 2f7e0a41-75c4-4568-95fc-a177411c5162\n", - "Optimization run completed successfully: 100%|██████████| 100.0/100.0 [08:35<00:00, 5.16s/it] " - ] + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Deleted storage integration with ID: 9\n", + "Created storage integration with ID: 11\n", + "Created dataset with ID: 8\n", + "Started the run with ID: 2f7e0a41-75c4-4568-95fc-a177411c5162\n", + "Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [08:35<00:00, 5.16s/it] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "suspects= audio_segment_id audio_path \\\n", + "0 6 wavs/automatic_0012 \n", + "1 20 wavs/automatic_0017 \n", + "2 15 wavs/automatic_0018 \n", + "3 36 wavs/automatic_0020 \n", + "4 37 wavs/automatic_0010 \n", + ".. ... ... \n", + "120 32 wavs/automatic_0004 \n", + "121 34 wavs/automatic_0012 \n", + "122 39 wavs/automatic_0017 \n", + "123 39 wavs/automatic_0017 \n", + "124 3 wavs/automatic_0017 \n", + "\n", + " transcript \\\n", + "0 ‏וגם אם לא תהיה פה השעה בסופו של דבר \n", + "1 ‏פרשה שהתחילה בחקירה שנפתחה יום אחד ב2017 בעצם... \n", + "2 ‏ללשכת ראש הממשלה הדבר הזה נולד רק דרך אילן יש... \n", + "3 ‏ה \n", + "4 ‏הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... \n", + ".. ... \n", + "120 ‏היום אנחנו יוצאים אה נו לא אני גאון אני יושב ... \n", + "121 ‏הסיפור הזה נותן לנו הצצה לדברים שאנחנו אף פעם... \n", + "122 ‏כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n", + "123 ‏כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n", + "124 ‏הוא האיש האחראי לזה שתיק 4000 בכלל קיים ובא ל... \n", + "\n", + " suspect_region_start_index suspect_region_end_index \\\n", + "0 0 6 \n", + "1 0 12 \n", + "2 0 9 \n", + "3 0 -2 \n", + "4 138 157 \n", + ".. ... ... \n", + "120 20 32 \n", + "121 0 10 \n", + "122 17 33 \n", + "123 58 -2 \n", + "124 0 8 \n", + "\n", + " suspect_region suspect_level \\\n", + "0 ‏וגם אם » לא תהיה פה השעה בסופו של דבר 1.0 \n", + "1 ‏פרשה שהתחילה » בחקירה שנפתחה יום אחד ב2017 בע... 1.0 \n", + "2 ‏ללשכת ראש » הממשלה הדבר הזה נולד רק דרך אילן ... 1.0 \n", + "3 ‏ה 1.0 \n", + "4 ‏הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... 1.0 \n", + ".. ... ... \n", + "120 ‏היום אנחנו יוצאים אה « נו לא אני » גאון אני י... 0.5 \n", + "121 ‏הסיפור הזה » נותן לנו הצצה לדברים שאנחנו אף פ... 0.5 \n", + "122 ‏כי במקור זו הייתה « פרשה שקראו לה » פרשת בזק ... 0.5 \n", + "123 ‏כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... 0.5 \n", + "124 ‏הוא האיש » האחראי לזה שתיק 4000 בכלל קיים ובא... 0.5 \n", + "\n", + " suspect_score rank start_time end_time \n", + "0 1.000000 1 0.00 2.96 \n", + "1 0.946653 2 58.64 68.00 \n", + "2 0.794076 3 8.00 16.00 \n", + "3 0.741773 4 0.00 1.00 \n", + "4 0.728718 5 17.00 34.00 \n", + ".. ... ... ... ... \n", + "120 0.001446 121 0.00 5.20 \n", + "121 0.001355 122 2.96 6.64 \n", + "122 0.001330 123 50.96 58.64 \n", + "123 0.001073 124 50.96 58.64 \n", + "124 0.001006 125 44.48 50.96 \n", + "\n", + "[125 rows x 11 columns] warnings_and_errors=Empty DataFrame\n", + "Columns: [audio_path, status]\n", + "Index: []\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import logging\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "from hirundo import (\n", + " Domain,\n", + " HirundoCSV,\n", + " LabelingType,\n", + " QADataset,\n", + " StorageConfig,\n", + " StorageTypes,\n", + ")\n", + "from pydantic_core import Url\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", + "\n", + "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n", + "for run_id in run_ids:\n", + " QADataset.cancel_by_id(run_id)\n", + "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n", + "for dataset_id in dataset_ids:\n", + " QADataset.delete_by_id(dataset_id)\n", + "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", + "for storage_id in storage_ids:\n", + " StorageConfig.delete_by_id(storage_id)\n", + "\n", + "test_dataset = QADataset(\n", + " name=f\"TEST-STT-RoboShaulTiny-dataset{unique_id}\",\n", + " domain=Domain.SPEECH,\n", + " labeling_type=LabelingType.SPEECH_TO_TEXT,\n", + " language=\"he\",\n", + " storage_config=StorageConfig(\n", + " name=f\"STT-RoboShaulTiny-dataset{unique_id}\",\n", + " type=StorageTypes.LOCAL,\n", + " ),\n", + " data_root_url=Url(Path(\"/datasets/RoboShaulTiny/wavs\").as_uri()),\n", + " labeling_info=HirundoCSV(\n", + " csv_url=Url(Path(\"/datasets/RoboShaulTiny/meta.csv\").as_uri()),\n", + " ),\n", + ")\n", + "\n", + "test_dataset.run_qa()\n", + "results = test_dataset.check_run()\n", + "print(results)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "suspects= audio_segment_id audio_path \\\n", - "0 6 wavs/automatic_0012 \n", - "1 20 wavs/automatic_0017 \n", - "2 15 wavs/automatic_0018 \n", - "3 36 wavs/automatic_0020 \n", - "4 37 wavs/automatic_0010 \n", - ".. ... ... \n", - "120 32 wavs/automatic_0004 \n", - "121 34 wavs/automatic_0012 \n", - "122 39 wavs/automatic_0017 \n", - "123 39 wavs/automatic_0017 \n", - "124 3 wavs/automatic_0017 \n", - "\n", - " transcript \\\n", - "0 ‏וגם אם לא תהיה פה השעה בסופו של דבר \n", - "1 ‏פרשה שהתחילה בחקירה שנפתחה יום אחד ב2017 בעצם... \n", - "2 ‏ללשכת ראש הממשלה הדבר הזה נולד רק דרך אילן יש... \n", - "3 ‏ה \n", - "4 ‏הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... \n", - ".. ... \n", - "120 ‏היום אנחנו יוצאים אה נו לא אני גאון אני יושב ... \n", - "121 ‏הסיפור הזה נותן לנו הצצה לדברים שאנחנו אף פעם... \n", - "122 ‏כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n", - "123 ‏כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n", - "124 ‏הוא האיש האחראי לזה שתיק 4000 בכלל קיים ובא ל... \n", - "\n", - " suspect_region_start_index suspect_region_end_index \\\n", - "0 0 6 \n", - "1 0 12 \n", - "2 0 9 \n", - "3 0 -2 \n", - "4 138 157 \n", - ".. ... ... \n", - "120 20 32 \n", - "121 0 10 \n", - "122 17 33 \n", - "123 58 -2 \n", - "124 0 8 \n", - "\n", - " suspect_region suspect_level \\\n", - "0 ‏וגם אם » לא תהיה פה השעה בסופו של דבר 1.0 \n", - "1 ‏פרשה שהתחילה » בחקירה שנפתחה יום אחד ב2017 בע... 1.0 \n", - "2 ‏ללשכת ראש » הממשלה הדבר הזה נולד רק דרך אילן ... 1.0 \n", - "3 ‏ה 1.0 \n", - "4 ‏הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... 1.0 \n", - ".. ... ... \n", - "120 ‏היום אנחנו יוצאים אה « נו לא אני » גאון אני י... 0.5 \n", - "121 ‏הסיפור הזה » נותן לנו הצצה לדברים שאנחנו אף פ... 0.5 \n", - "122 ‏כי במקור זו הייתה « פרשה שקראו לה » פרשת בזק ... 0.5 \n", - "123 ‏כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... 0.5 \n", - "124 ‏הוא האיש » האחראי לזה שתיק 4000 בכלל קיים ובא... 0.5 \n", - "\n", - " suspect_score rank start_time end_time \n", - "0 1.000000 1 0.00 2.96 \n", - "1 0.946653 2 58.64 68.00 \n", - "2 0.794076 3 8.00 16.00 \n", - "3 0.741773 4 0.00 1.00 \n", - "4 0.728718 5 17.00 34.00 \n", - ".. ... ... ... ... \n", - "120 0.001446 121 0.00 5.20 \n", - "121 0.001355 122 2.96 6.64 \n", - "122 0.001330 123 50.96 58.64 \n", - "123 0.001073 124 50.96 58.64 \n", - "124 0.001006 125 44.48 50.96 \n", - "\n", - "[125 rows x 11 columns] warnings_and_errors=Empty DataFrame\n", - "Columns: [audio_path, status]\n", - "Index: []\n" - ] + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" } - ], - "source": [ - "import logging\n", - "import os\n", - "from pathlib import Path\n", - "\n", - "from hirundo import (\n", - " HirundoCSV,\n", - " LabelingType,\n", - " OptimizationDataset,\n", - " StorageConfig,\n", - " StorageTypes,\n", - ")\n", - "from pydantic_core import Url\n", - "\n", - "logger = logging.getLogger(__name__)\n", - "\n", - "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", - "\n", - "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n", - "for run_id in run_ids:\n", - " OptimizationDataset.cancel_by_id(run_id)\n", - "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n", - "for dataset_id in dataset_ids:\n", - " OptimizationDataset.delete_by_id(dataset_id)\n", - "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", - "for storage_id in storage_ids:\n", - " StorageConfig.delete_by_id(storage_id)\n", - "\n", - "test_dataset = OptimizationDataset(\n", - " name=f\"TEST-STT-RoboShaulTiny-dataset{unique_id}\",\n", - " labeling_type=LabelingType.SPEECH_TO_TEXT,\n", - " language=\"he\",\n", - " storage_config=StorageConfig(\n", - " name=f\"STT-RoboShaulTiny-dataset{unique_id}\",\n", - " type=StorageTypes.LOCAL,\n", - " ),\n", - " data_root_url=Url(Path(\"/datasets/RoboShaulTiny/wavs\").as_uri()),\n", - " labeling_info=HirundoCSV(\n", - " csv_url=Url(Path(\"/datasets/RoboShaulTiny/meta.csv\").as_uri()),\n", - " ),\n", - ")\n", - "\n", - "test_dataset.run_optimization()\n", - "results = test_dataset.check_run()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/on-prem/on_prem_test_notebook.ipynb b/on-prem/on_prem_test_notebook.ipynb index 807fc511..62742cb7 100644 --- a/on-prem/on_prem_test_notebook.ipynb +++ b/on-prem/on_prem_test_notebook.ipynb @@ -1,153 +1,153 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Cancelling run with ID: e2ee490c-99de-444b-8f34-83caf2d715c4\n", - "Deleted dataset with ID: 135\n", - "Deleted storage integration with ID: 140\n", - "Created storage integration with ID: 141\n", - "Created dataset with ID: 136\n", - "Started the run with ID: efd28fc7-d295-4a8b-9d95-b4d3bbceadc8\n", - "Optimization run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] " - ] + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Cancelling run with ID: e2ee490c-99de-444b-8f34-83caf2d715c4\n", + "Deleted dataset with ID: 135\n", + "Deleted storage integration with ID: 140\n", + "Created storage integration with ID: 141\n", + "Created dataset with ID: 136\n", + "Started the run with ID: efd28fc7-d295-4a8b-9d95-b4d3bbceadc8\n", + "Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "suspects= image_path label split suspect_score suspect_level \\\n", + "3632 /images/36320.png cat train 1.000000e+00 1.0 \n", + "1479 /images/14790.png ship train 9.710029e-01 1.0 \n", + "894 /images/8940.png ship train 7.296918e-01 1.0 \n", + "2034 /images/20340.png deer train 7.133279e-01 1.0 \n", + "709 /images/7090.png dog train 4.025281e-01 1.0 \n", + "... ... ... ... ... ... \n", + "2216 /images/22160.png truck train 1.899035e-13 0.0 \n", + "1814 /images/18140.png ship train 9.983570e-14 0.0 \n", + "1679 /images/16790.png bird train 7.334427e-14 0.0 \n", + "288 /images/2880.png truck train 5.965531e-14 0.0 \n", + "2204 /images/22040.png airplane train 2.194576e-14 0.0 \n", + "\n", + " suggested_label suggested_label_conf rank \n", + "3632 cat 0.999084 1 \n", + "1479 ship 1.000000 2 \n", + "894 ship 0.999997 3 \n", + "2034 deer 0.999986 4 \n", + "709 dog 1.000000 5 \n", + "... ... ... ... \n", + "2216 NaN NaN 4996 \n", + "1814 NaN NaN 4997 \n", + "1679 NaN NaN 4998 \n", + "288 NaN NaN 4999 \n", + "2204 NaN NaN 5000 \n", + "\n", + "[5000 rows x 8 columns] warnings_and_errors=Empty DataFrame\n", + "Columns: [image_path, label, status, split]\n", + "Index: []\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import logging\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "from hirundo import (\n", + " HirundoCSV,\n", + " LabelingType,\n", + " QADataset,\n", + " StorageConfig,\n", + " StorageTypes,\n", + ")\n", + "from pydantic_core import Url\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", + "\n", + "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n", + "for run_id in run_ids:\n", + " QADataset.cancel_by_id(run_id)\n", + "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n", + "for dataset_id in dataset_ids:\n", + " QADataset.delete_by_id(dataset_id)\n", + "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", + "for storage_id in storage_ids:\n", + " StorageConfig.delete_by_id(storage_id)\n", + "\n", + "test_dataset = QADataset(\n", + " name=f\"TEST-GCP sanity dataset{unique_id}\",\n", + " labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,\n", + " storage_config=StorageConfig(\n", + " name=f\"local{unique_id}\",\n", + " type=StorageTypes.LOCAL,\n", + " ),\n", + " data_root_url=Url(Path(\"/datasets/pytorch-cifar/data\").as_uri()),\n", + " labeling_info=HirundoCSV(\n", + " csv_url=Url(Path(\"/datasets/pytorch-cifar/cifar1.csv\").as_uri()),\n", + " ),\n", + " classes=[\n", + " \"airplane\",\n", + " \"automobile\",\n", + " \"bird\",\n", + " \"cat\",\n", + " \"deer\",\n", + " \"dog\",\n", + " \"frog\",\n", + " \"horse\",\n", + " \"ship\",\n", + " \"truck\",\n", + " ],\n", + ")\n", + "\n", + "test_dataset.run_qa()\n", + "results = test_dataset.check_run()\n", + "print(results)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "suspects= image_path label split suspect_score suspect_level \\\n", - "3632 /images/36320.png cat train 1.000000e+00 1.0 \n", - "1479 /images/14790.png ship train 9.710029e-01 1.0 \n", - "894 /images/8940.png ship train 7.296918e-01 1.0 \n", - "2034 /images/20340.png deer train 7.133279e-01 1.0 \n", - "709 /images/7090.png dog train 4.025281e-01 1.0 \n", - "... ... ... ... ... ... \n", - "2216 /images/22160.png truck train 1.899035e-13 0.0 \n", - "1814 /images/18140.png ship train 9.983570e-14 0.0 \n", - "1679 /images/16790.png bird train 7.334427e-14 0.0 \n", - "288 /images/2880.png truck train 5.965531e-14 0.0 \n", - "2204 /images/22040.png airplane train 2.194576e-14 0.0 \n", - "\n", - " suggested_label suggested_label_conf rank \n", - "3632 cat 0.999084 1 \n", - "1479 ship 1.000000 2 \n", - "894 ship 0.999997 3 \n", - "2034 deer 0.999986 4 \n", - "709 dog 1.000000 5 \n", - "... ... ... ... \n", - "2216 NaN NaN 4996 \n", - "1814 NaN NaN 4997 \n", - "1679 NaN NaN 4998 \n", - "288 NaN NaN 4999 \n", - "2204 NaN NaN 5000 \n", - "\n", - "[5000 rows x 8 columns] warnings_and_errors=Empty DataFrame\n", - "Columns: [image_path, label, status, split]\n", - "Index: []\n" - ] + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" } - ], - "source": [ - "import logging\n", - "import os\n", - "from pathlib import Path\n", - "\n", - "from hirundo import (\n", - " HirundoCSV,\n", - " LabelingType,\n", - " OptimizationDataset,\n", - " StorageConfig,\n", - " StorageTypes,\n", - ")\n", - "from pydantic_core import Url\n", - "\n", - "logger = logging.getLogger(__name__)\n", - "\n", - "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", - "\n", - "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n", - "for run_id in run_ids:\n", - " OptimizationDataset.cancel_by_id(run_id)\n", - "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n", - "for dataset_id in dataset_ids:\n", - " OptimizationDataset.delete_by_id(dataset_id)\n", - "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", - "for storage_id in storage_ids:\n", - " StorageConfig.delete_by_id(storage_id)\n", - "\n", - "test_dataset = OptimizationDataset(\n", - " name=f\"TEST-GCP sanity dataset{unique_id}\",\n", - " labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,\n", - " storage_config=StorageConfig(\n", - " name=f\"local{unique_id}\",\n", - " type=StorageTypes.LOCAL,\n", - " ),\n", - " data_root_url=Url(Path(\"/datasets/pytorch-cifar/data\").as_uri()),\n", - " labeling_info=HirundoCSV(\n", - " csv_url=Url(Path(\"/datasets/pytorch-cifar/cifar1.csv\").as_uri()),\n", - " ),\n", - " classes=[\n", - " \"airplane\",\n", - " \"automobile\",\n", - " \"bird\",\n", - " \"cat\",\n", - " \"deer\",\n", - " \"dog\",\n", - " \"frog\",\n", - " \"horse\",\n", - " \"ship\",\n", - " \"truck\",\n", - " ],\n", - ")\n", - "\n", - "test_dataset.run_optimization()\n", - "results = test_dataset.check_run()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/on_prem_test_notebook.ipynb b/on_prem_test_notebook.ipynb index f3fc76b5..759479ba 100644 --- a/on_prem_test_notebook.ipynb +++ b/on_prem_test_notebook.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -17,7 +17,7 @@ "Created storage integration with ID: 141\n", "Created dataset with ID: 136\n", "Started the run with ID: efd28fc7-d295-4a8b-9d95-b4d3bbceadc8\n", - "Optimization run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] " + "Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] " ] }, { @@ -67,39 +67,34 @@ "import logging\n", "import os\n", "\n", - "from hirundo.dataset_optimization import OptimizationDataset\n", - "from hirundo.enum import LabellingType\n", - "from hirundo.storage import StorageIntegration, StorageLink, StorageTypes\n", + "from hirundo import HirundoCSV, LabelingType, QADataset, StorageConfig, StorageTypes\n", + "from pydantic_core import Url\n", "\n", "logger = logging.getLogger(__name__)\n", "\n", "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n", "\n", - "run_ids = [\n", - " dataset[\"run_id\"] for dataset in OptimizationDataset.list() if dataset[\"run_id\"]\n", - "]\n", + "run_ids = [dataset.run_id for dataset in QADataset.list_runs() if dataset.run_id]\n", "for run_id in run_ids:\n", - " OptimizationDataset.cancel_by_id(run_id)\n", - "dataset_ids = [dataset[\"id\"] for dataset in OptimizationDataset.list()]\n", + " QADataset.cancel_by_id(run_id)\n", + "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n", "for dataset_id in dataset_ids:\n", - " OptimizationDataset.delete_by_id(dataset_id)\n", - "storage_integration_ids = [\n", - " storage_integration[\"id\"] for storage_integration in StorageIntegration.list()\n", - "]\n", - "for storage_integration_id in storage_integration_ids:\n", - " StorageIntegration.delete_by_id(storage_integration_id)\n", + " QADataset.delete_by_id(dataset_id)\n", + "storage_config_ids = [storage_config.id for storage_config in StorageConfig.list()]\n", + "for storage_config_id in storage_config_ids:\n", + " StorageConfig.delete_by_id(storage_config_id)\n", "\n", - "test_dataset = OptimizationDataset(\n", + "test_dataset = QADataset(\n", " name=f\"TEST-GCP sanity dataset{unique_id}\",\n", - " labelling_type=LabellingType.SingleLabelClassification,\n", - " dataset_storage=StorageLink(\n", - " storage_integration=StorageIntegration(\n", - " name=f\"local{unique_id}\",\n", - " type=StorageTypes.LOCAL,\n", - " ),\n", - " path=\"/datasets/pytorch-cifar/data\",\n", + " labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,\n", + " storage_config=StorageConfig(\n", + " name=f\"local{unique_id}\",\n", + " type=StorageTypes.LOCAL,\n", + " ),\n", + " data_root_url=Url(\"/datasets/pytorch-cifar/data\"),\n", + " labeling_info=HirundoCSV(\n", + " csv_url=Url(\"/datasets/pytorch-cifar/data/cifar1.csv\"),\n", " ),\n", - " dataset_metadata_path=\"cifar1.csv\",\n", " classes=[\n", " \"airplane\",\n", " \"automobile\",\n", @@ -114,7 +109,7 @@ " ],\n", ")\n", "\n", - "test_dataset.run_optimization()\n", + "test_dataset.run_qa()\n", "results = test_dataset.check_run()\n", "print(results)" ] diff --git a/pyproject.toml b/pyproject.toml index 9a603e13..c7db1848 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,6 @@ [build-system] -requires = ["setuptools", "wheel"] +requires = ["setuptools>=78.1.1", "wheel"] +# ⬆️ Required to fix vulnerability GHSA-5rjg-fvgr-3xxf build-backend = "setuptools.build_meta" [tool.setuptools] @@ -31,6 +32,12 @@ dependencies = [ "stamina>=24.2.0", "httpx-sse>=0.4.0", "tqdm>=4.66.5", + "h11>=0.16.0", + # ⬆️ Required to fix vulnerability GHSA-vqfr-h8mv-ghfj + "requests>=2.32.4", + # ⬆️ Required to fix vulnerability GHSA-9hjg-9r4m-mvj7 + "urllib3>=2.5.0", + # ⬆️ Required to fix vulnerabilities GHSA-pq67-6m6q-mj2v and GHSA-48p4-8xcf-vxj5 ] [project.scripts] @@ -58,10 +65,15 @@ dev = [ "pre-commit>=3.7.1", "virtualenv>=20.6.6", # ⬆️ Needed for `pre-commit` and locking version for `safety-cli` + # version fixes vulnerability GHSA-rqc4-2hc7-8c8v "ruff>=0.12.0", "bumpver", "platformdirs>=4.3.6", "safety>=3.2.13", + "cryptography>=44.0.1", + # ⬆️ Required to fix vulnerability GHSA-79v4-65xg-pq4g + "jinja2>=3.1.6", + # ⬆️ Required to fix vulnerabilities GHSA-cpwx-vrp4-4pq7 , GHSA-gmj6-6f8f-6699 & GHSA-q2x7-8rv6-6q7h ] docs = [ "sphinx>=7.4.7", @@ -71,9 +83,12 @@ docs = [ "furo", "sphinx-multiversion", "esbonio", - "starlette>0.40.0", + "starlette>=0.47.2", + # ⬆️ Required to fix vulnerability GHSA-2c2j-9gv5-cj73 "markupsafe>=3.0.2", # Force `starlette` and `markupsafe` to versions compatible with `dev` dependencies. + "jinja2>=3.1.6", + # ⬆️ Required to fix vulnerabilities GHSA-cpwx-vrp4-4pq7 , GHSA-gmj6-6f8f-6699 & GHSA-q2x7-8rv6-6q7h ] pandas = ["pandas>=2.2.3"] polars = ["polars>=1.0.0"] @@ -172,4 +187,3 @@ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" [tool.pyright] typeCheckingMode = "standard" -autoSearchPaths = true diff --git a/requirements/dev.txt b/requirements/dev.txt index b93dbd03..4236766c 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -37,14 +37,11 @@ click==8.1.7 # safety # typer colorama==0.4.6 + # via bumpver +cryptography==45.0.5 # via - # -c requirements/requirements.txt - # bumpver - # click - # pytest - # tqdm -cryptography==44.0.0 - # via authlib + # hirundo (pyproject.toml) + # authlib distlib==0.3.8 # via virtualenv docutils==0.21.2 @@ -64,11 +61,12 @@ filelock==3.12.4 # via # safety # virtualenv -h11==0.14.0 +h11==0.16.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # httpcore -httpcore==1.0.5 +httpcore==1.0.9 # via # -c requirements/requirements.txt # httpx @@ -107,8 +105,10 @@ jaraco-functools==4.0.1 # via # -c requirements/requirements.txt # keyring -jinja2==3.1.4 - # via safety +jinja2==3.1.6 + # via + # hirundo (pyproject.toml) + # safety keyring==25.2.1 # via # -c requirements/requirements.txt @@ -188,10 +188,6 @@ python-dotenv==1.0.1 # via # -c requirements/requirements.txt # hirundo (pyproject.toml) -pywin32-ctypes==0.2.3 - # via - # -c requirements/requirements.txt - # keyring pyyaml==6.0.1 # via # -c requirements/requirements.txt @@ -201,9 +197,10 @@ readme-renderer==44.0 # via # -c requirements/requirements.txt # twine -requests==2.32.3 +requests==2.32.4 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests-toolbelt # safety # twine @@ -291,9 +288,10 @@ typing-extensions==4.12.2 # safety-schemas # stamina # typer -urllib3==2.2.2 +urllib3==2.5.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests # safety # twine diff --git a/requirements/docs.txt b/requirements/docs.txt index 554fb74c..3267a9f8 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -47,12 +47,7 @@ click==8.1.7 # typer # uvicorn colorama==0.4.6 - # via - # -c requirements/requirements.txt - # click - # sphinx - # sphinx-autobuild - # tqdm + # via sphinx-autobuild docutils==0.21.2 # via # -c requirements/requirements.txt @@ -68,12 +63,13 @@ exceptiongroup==1.2.2 # cattrs furo==2024.7.18 # via hirundo (pyproject.toml) -h11==0.14.0 +h11==0.16.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # httpcore # uvicorn -httpcore==1.0.5 +httpcore==1.0.9 # via # -c requirements/requirements.txt # httpx @@ -111,8 +107,10 @@ jaraco-functools==4.0.1 # via # -c requirements/requirements.txt # keyring -jinja2==3.1.4 - # via sphinx +jinja2==3.1.6 + # via + # hirundo (pyproject.toml) + # sphinx keyring==25.2.1 # via # -c requirements/requirements.txt @@ -176,10 +174,6 @@ python-dotenv==1.0.1 # -c requirements/requirements.txt # hirundo (pyproject.toml) # pydantic-settings -pywin32-ctypes==0.2.3 - # via - # -c requirements/requirements.txt - # keyring pyyaml==6.0.1 # via # -c requirements/requirements.txt @@ -188,9 +182,10 @@ readme-renderer==44.0 # via # -c requirements/requirements.txt # twine -requests==2.32.3 +requests==2.32.4 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests-toolbelt # sphinx # twine @@ -254,7 +249,7 @@ stamina==24.2.0 # via # -c requirements/requirements.txt # hirundo (pyproject.toml) -starlette==0.41.2 +starlette==0.47.2 # via # hirundo (pyproject.toml) # sphinx-autobuild @@ -295,9 +290,10 @@ typing-extensions==4.12.2 # starlette # typer # uvicorn -urllib3==2.2.2 +urllib3==2.5.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests # twine # types-requests diff --git a/requirements/pandas.txt b/requirements/pandas.txt index d7b65866..fbf1120e 100644 --- a/requirements/pandas.txt +++ b/requirements/pandas.txt @@ -26,11 +26,6 @@ click==8.1.7 # via # -c requirements/requirements.txt # typer -colorama==0.4.6 - # via - # -c requirements/requirements.txt - # click - # tqdm docutils==0.21.2 # via # -c requirements/requirements.txt @@ -39,11 +34,12 @@ exceptiongroup==1.2.2 # via # -c requirements/requirements.txt # anyio -h11==0.14.0 +h11==0.16.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # httpcore -httpcore==1.0.5 +httpcore==1.0.9 # via # -c requirements/requirements.txt # httpx @@ -128,10 +124,6 @@ python-dotenv==1.0.1 # hirundo (pyproject.toml) pytz==2024.1 # via pandas -pywin32-ctypes==0.2.3 - # via - # -c requirements/requirements.txt - # keyring pyyaml==6.0.1 # via # -c requirements/requirements.txt @@ -140,9 +132,10 @@ readme-renderer==44.0 # via # -c requirements/requirements.txt # twine -requests==2.32.3 +requests==2.32.4 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests-toolbelt # twine requests-toolbelt==1.0.0 @@ -207,9 +200,10 @@ typing-extensions==4.12.2 # typer tzdata==2024.1 # via pandas -urllib3==2.2.2 +urllib3==2.5.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests # twine # types-requests diff --git a/requirements/polars.txt b/requirements/polars.txt index e2ab3bdf..e9d8e619 100644 --- a/requirements/polars.txt +++ b/requirements/polars.txt @@ -26,11 +26,6 @@ click==8.1.7 # via # -c requirements/requirements.txt # typer -colorama==0.4.6 - # via - # -c requirements/requirements.txt - # click - # tqdm docutils==0.21.2 # via # -c requirements/requirements.txt @@ -39,11 +34,12 @@ exceptiongroup==1.2.2 # via # -c requirements/requirements.txt # anyio -h11==0.14.0 +h11==0.16.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # httpcore -httpcore==1.0.5 +httpcore==1.0.9 # via # -c requirements/requirements.txt # httpx @@ -122,10 +118,6 @@ python-dotenv==1.0.1 # via # -c requirements/requirements.txt # hirundo (pyproject.toml) -pywin32-ctypes==0.2.3 - # via - # -c requirements/requirements.txt - # keyring pyyaml==6.0.1 # via # -c requirements/requirements.txt @@ -134,9 +126,10 @@ readme-renderer==44.0 # via # -c requirements/requirements.txt # twine -requests==2.32.3 +requests==2.32.4 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests-toolbelt # twine requests-toolbelt==1.0.0 @@ -197,9 +190,10 @@ typing-extensions==4.12.2 # pydantic-core # stamina # typer -urllib3==2.2.2 +urllib3==2.5.0 # via # -c requirements/requirements.txt + # hirundo (pyproject.toml) # requests # twine # types-requests diff --git a/requirements/requirements.txt b/requirements/requirements.txt index b261d5e0..af7cf92b 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -15,17 +15,15 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via typer -colorama==0.4.6 - # via - # click - # tqdm docutils==0.21.2 # via readme-renderer exceptiongroup==1.2.2 # via anyio -h11==0.14.0 - # via httpcore -httpcore==1.0.5 +h11==0.16.0 + # via + # hirundo (pyproject.toml) + # httpcore +httpcore==1.0.9 # via httpx httpx==0.27.0 # via hirundo (pyproject.toml) @@ -70,14 +68,13 @@ pygments==2.18.0 # rich python-dotenv==1.0.1 # via hirundo (pyproject.toml) -pywin32-ctypes==0.2.3 - # via keyring pyyaml==6.0.1 # via hirundo (pyproject.toml) readme-renderer==44.0 # via twine -requests==2.32.3 +requests==2.32.4 # via + # hirundo (pyproject.toml) # requests-toolbelt # twine requests-toolbelt==1.0.0 @@ -115,8 +112,9 @@ typing-extensions==4.12.2 # pydantic-core # stamina # typer -urllib3==2.2.2 +urllib3==2.5.0 # via + # hirundo (pyproject.toml) # requests # twine # types-requests diff --git a/tests/classification/classification_aws_test.py b/tests/classification/classification_aws_test.py index c1ea0c2d..c133a1e5 100644 --- a/tests/classification/classification_aws_test.py +++ b/tests/classification/classification_aws_test.py @@ -5,14 +5,14 @@ from hirundo import ( HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageS3, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_sync_test, + dataset_qa_sync_test, get_unique_id, ) @@ -25,7 +25,7 @@ access_key_id=os.environ["AWS_ACCESS_KEY"], secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-AWS cifar10 classification dataset{unique_id}", labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, storage_config=StorageConfig( @@ -59,13 +59,11 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( - test_dataset, "RUN_CLASSIFICATION_AWS_OPTIMIZATION" - ) +def test_dataset_qa(): + full_run = dataset_qa_sync_test(test_dataset, "RUN_CLASSIFICATION_AWS_DATA_QA") if full_run is not None: assert full_run.warnings_and_errors is not None assert full_run.warnings_and_errors.shape[0] == 0 # TODO: Add more assertions for results else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") diff --git a/tests/classification/classification_gcp_test.py b/tests/classification/classification_gcp_test.py index 90490103..d3c444f0 100644 --- a/tests/classification/classification_gcp_test.py +++ b/tests/classification/classification_gcp_test.py @@ -6,15 +6,15 @@ from hirundo import ( HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGCP, StorageTypes, ) from tests.classification.cifar100_classes import cifar100_classes -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_sync_test, + dataset_qa_sync_test, get_unique_id, ) @@ -26,7 +26,7 @@ project="Hirundo-global", credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-GCP cifar 100 classification dataset{unique_id}", labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, storage_config=StorageConfig( @@ -49,13 +49,11 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( - test_dataset, "RUN_CLASSIFICATION_GCP_OPTIMIZATION" - ) +def test_dataset_qa(): + full_run = dataset_qa_sync_test(test_dataset, "RUN_CLASSIFICATION_GCP_DATA_QA") if full_run is not None: assert full_run.warnings_and_errors is not None assert full_run.warnings_and_errors.shape[0] == 0 # TODO: Add more assertions for results else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") diff --git a/tests/classification/sanity_gcp_test.py b/tests/classification/sanity_gcp_test.py index 17d6975b..8b1fc1d5 100644 --- a/tests/classification/sanity_gcp_test.py +++ b/tests/classification/sanity_gcp_test.py @@ -6,16 +6,16 @@ from hirundo import ( HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGCP, StorageTypes, ) -from hirundo.dataset_optimization import AugmentationName -from tests.dataset_optimization_shared import ( +from hirundo.dataset_qa import AugmentationName +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_async_test, - dataset_optimization_sync_test, + dataset_qa_async_test, + dataset_qa_sync_test, get_unique_id, ) @@ -27,7 +27,7 @@ project="Hirundo-global", credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-GCP sanity dataset{unique_id}", labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, storage_config=StorageConfig( @@ -66,11 +66,11 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( +def test_dataset_qa(): + full_run = dataset_qa_sync_test( test_dataset, sanity=True, - alternative_env="RUN_CLASSIFICATION_GCP_SANITY_OPTIMIZATION", + alternative_env="RUN_CLASSIFICATION_GCP_SANITY_DATA_QA", ) if full_run is not None: assert full_run.warnings_and_errors is not None @@ -78,11 +78,9 @@ def test_dataset_optimization(): assert full_run.suspects is not None assert full_run.suspects.shape[0] >= 5_000 else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") @pytest.mark.asyncio -async def test_async_dataset_optimization(): - await dataset_optimization_async_test( - test_dataset, "RUN_CLASSIFICATION_GCP_SANITY_OPTIMIZATION" - ) +async def test_async_dataset_qa(): + await dataset_qa_async_test(test_dataset, "RUN_CLASSIFICATION_GCP_SANITY_DATA_QA") diff --git a/tests/dataset_optimization_shared.py b/tests/dataset_qa_shared.py similarity index 80% rename from tests/dataset_optimization_shared.py rename to tests/dataset_qa_shared.py index c59780e6..754290b8 100644 --- a/tests/dataset_optimization_shared.py +++ b/tests/dataset_qa_shared.py @@ -6,11 +6,11 @@ import requests from hirundo import ( GitRepo, - OptimizationDataset, + QADataset, RunArgs, StorageConfig, ) -from hirundo.dataset_optimization import RunStatus +from hirundo.dataset_qa import RunStatus from hirundo.logger import get_logger logger = get_logger(__name__) @@ -25,9 +25,9 @@ def get_unique_id(): def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]): if not unique_id: return - runs = OptimizationDataset.list_runs() + runs = QADataset.list_runs() conflicting_run_ids = [run.run_id for run in runs if unique_id in run.name] - datasets = OptimizationDataset.list_datasets() + datasets = QADataset.list_datasets() conflicting_dataset_ids = [ dataset.id for dataset in datasets if unique_id in dataset.name ] @@ -41,7 +41,7 @@ def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]): ] for conflicting_run_id in conflicting_run_ids: try: - OptimizationDataset.archive_run_by_id(conflicting_run_id) + QADataset.archive_run_by_id(conflicting_run_id) except Exception as e: logger.warning( "Failed to archive run with ID %s and exception %s", @@ -50,7 +50,7 @@ def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]): ) for conflicting_dataset_id in conflicting_dataset_ids: try: - OptimizationDataset.delete_by_id(conflicting_dataset_id) + QADataset.delete_by_id(conflicting_dataset_id) except Exception as e: logger.warning( "Failed to delete dataset with ID %s and exception %s", @@ -78,13 +78,13 @@ def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]): @contextmanager -def _handle_not_found_error(dataset: OptimizationDataset): +def _handle_not_found_error(dataset: QADataset): try: yield except requests.HTTPError as e: if e.response.status_code == 404: logger.info( - "Optimization dataset with name %s not found, skipping cleanup", + "QA dataset with name %s not found, skipping cleanup", dataset.name, ) return @@ -93,7 +93,7 @@ def _handle_not_found_error(dataset: OptimizationDataset): def _get_runs_by_dataset(): - runs = OptimizationDataset.list_runs() + runs = QADataset.list_runs() runs_by_dataset = defaultdict(list) for run in runs: if run.dataset_id is not None and run.run_id is not None: @@ -101,17 +101,17 @@ def _get_runs_by_dataset(): return runs_by_dataset -def cleanup(test_dataset: OptimizationDataset): +def cleanup(test_dataset: QADataset): logger.info("Started cleanup") with _handle_not_found_error(test_dataset): - dataset = OptimizationDataset.get_by_name(test_dataset.name) + dataset = QADataset.get_by_name(test_dataset.name) storage_config_id = ( dataset.storage_config.id if dataset.storage_config is not None else None ) runs_by_dataset = _get_runs_by_dataset() if dataset.id is not None: logger.debug( - "Found optimization dataset with the same name, deleting it", + "Found QA dataset with the same name, deleting it", ) logger.debug( "Note: If I am not the owner, I will not be able to delete them" @@ -119,14 +119,12 @@ def cleanup(test_dataset: OptimizationDataset): try: if dataset.id in runs_by_dataset: for run_id in runs_by_dataset[dataset.id]: - logger.debug( - "Archiving optimization dataset with run ID %s", run_id - ) - OptimizationDataset.archive_run_by_id(run_id) - OptimizationDataset.delete_by_id(dataset.id) + logger.debug("Archiving QA dataset with run ID %s", run_id) + QADataset.archive_run_by_id(run_id) + QADataset.delete_by_id(dataset.id) except Exception as e: logger.warning( - "Unable to delete optimization dataset with ID %s and exception %s", + "Unable to delete QA dataset with ID %s and exception %s", dataset.id, e, ) @@ -170,8 +168,8 @@ def cleanup(test_dataset: OptimizationDataset): logger.info("Finished cleanup") -def dataset_optimization_sync_test( - test_dataset: OptimizationDataset, +def dataset_qa_sync_test( + test_dataset: QADataset, alternative_env: typing.Optional[str] = None, sanity=False, run_args: typing.Optional[RunArgs] = None, @@ -180,10 +178,8 @@ def dataset_optimization_sync_test( if (os.getenv("FULL_TEST", "false") == "true" and sanity) or ( alternative_env and os.getenv(alternative_env, "false") == "true" ): - run_id = test_dataset.run_optimization( - replace_dataset_if_exists=True, run_args=run_args - ) - logger.info("Sync: Started dataset optimization run with run ID %s", run_id) + run_id = test_dataset.run_qa(replace_dataset_if_exists=True, run_args=run_args) + logger.info("Sync: Started dataset QA run with run ID %s", run_id) logger.info("Sync: Checking run progress") result = test_dataset.check_run(stop_on_manual_approval=True) logger.info("Sync: Results %s", result) @@ -194,17 +190,15 @@ def dataset_optimization_sync_test( return None -async def dataset_optimization_async_test( - test_dataset: OptimizationDataset, +async def dataset_qa_async_test( + test_dataset: QADataset, env: str, run_args: typing.Optional[RunArgs] = None, ): logger.info("Async: Finished cleanup") if os.getenv(env, "false") == "true": - run_id = test_dataset.run_optimization( - replace_dataset_if_exists=True, run_args=run_args - ) - logger.info("Async: Started dataset optimization run with run ID %s", run_id) + run_id = test_dataset.run_qa(replace_dataset_if_exists=True, run_args=run_args) + logger.info("Async: Started dataset QA run with run ID %s", run_id) events_generator = test_dataset.acheck_run() logger.info("Async: Checking run progress") last_event = {} diff --git a/tests/get_by_name_test.py b/tests/get_by_name_test.py index 65f13c41..78348f2c 100644 --- a/tests/get_by_name_test.py +++ b/tests/get_by_name_test.py @@ -7,20 +7,20 @@ GitRepo, HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGCP, StorageGit, StorageTypes, ) -from tests.dataset_optimization_shared import get_unique_id +from tests.dataset_qa_shared import get_unique_id unique_id = get_unique_id() gcp_storage_config_name = f"T-cifar1bucket_get_by_name{unique_id}" -gcp_optimization_dataset_name = f"T-cifar1_get_by_name{unique_id}" +gcp_qa_dataset_name = f"T-cifar1_get_by_name{unique_id}" git_storage_config_name = f"T-BDD-100k-validation-git_get_by_name{unique_id}" git_repository_name = f"T-BDD-100k-validation-git-repo_get_by_name{unique_id}" -git_optimization_dataset_name = f"T-BDD-100k-validation-dataset_get_by_name{unique_id}" +git_qa_dataset_name = f"T-BDD-100k-validation-dataset_get_by_name{unique_id}" new_storage_config: typing.Optional[StorageConfig] = None new_dataset = None @@ -56,8 +56,8 @@ def test_get_by_name_gcp(): assert new_storage_config.gcp is not None storage_gcp = new_storage_config.gcp - OptimizationDataset( - name=gcp_optimization_dataset_name, + QADataset( + name=gcp_qa_dataset_name, labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, storage_config=new_storage_config, labeling_info=HirundoCSV( @@ -66,7 +66,7 @@ def test_get_by_name_gcp(): data_root_url=storage_gcp.get_url("/pytorch-cifar/data"), ).create(replace_if_exists=True) - dataset = OptimizationDataset.get_by_name(gcp_optimization_dataset_name) + dataset = QADataset.get_by_name(gcp_qa_dataset_name) assert dataset is not None @@ -90,8 +90,8 @@ def test_get_by_name_git(): assert new_storage_config.git is not None storage_git = new_storage_config.git - OptimizationDataset( - name=git_optimization_dataset_name, + QADataset( + name=git_qa_dataset_name, labeling_type=LabelingType.OBJECT_DETECTION, storage_config=new_storage_config, labeling_info=HirundoCSV( @@ -102,5 +102,5 @@ def test_get_by_name_git(): data_root_url=storage_git.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"), ).create(replace_if_exists=True) - new_dataset = OptimizationDataset.get_by_name(git_optimization_dataset_name) + new_dataset = QADataset.get_by_name(git_qa_dataset_name) assert new_dataset is not None diff --git a/tests/object-detection/od_aws_test.py b/tests/object-detection/od_aws_test.py index 5632c582..da74d18d 100644 --- a/tests/object-detection/od_aws_test.py +++ b/tests/object-detection/od_aws_test.py @@ -5,14 +5,14 @@ from hirundo import ( HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageS3, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_sync_test, + dataset_qa_sync_test, get_unique_id, ) @@ -25,7 +25,7 @@ access_key_id=os.environ["AWS_ACCESS_KEY"], secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-AWS-BDD-100k-validation-OD-dataset{unique_id}", labeling_type=LabelingType.OBJECT_DETECTION, storage_config=StorageConfig( @@ -62,10 +62,10 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test(test_dataset, "RUN_AWS_OD_OPTIMIZATION") +def test_dataset_qa(): + full_run = dataset_qa_sync_test(test_dataset, "RUN_AWS_OD_DATA_QA") if full_run is not None: pass # TODO: Add add assertion for result else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") diff --git a/tests/object-detection/od_git_test.py b/tests/object-detection/od_git_test.py index 8f2af855..9836ff62 100644 --- a/tests/object-detection/od_git_test.py +++ b/tests/object-detection/od_git_test.py @@ -5,14 +5,14 @@ GitRepo, HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGit, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_sync_test, + dataset_qa_sync_test, get_unique_id, ) @@ -26,7 +26,7 @@ ), branch="main", ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset{unique_id}", labeling_type=LabelingType.OBJECT_DETECTION, storage_config=StorageConfig( @@ -65,10 +65,10 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test(test_dataset, "RUN_OD_GIT_OPTIMIZATION") +def test_dataset_qa(): + full_run = dataset_qa_sync_test(test_dataset, "RUN_OD_GIT_DATA_QA") if full_run is not None: pass # TODO: Add add assertion for result else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") diff --git a/tests/object-detection/rockpaperscisssors_yolo_test.py b/tests/object-detection/rockpaperscisssors_yolo_test.py index 7a6683b6..6bca0b34 100644 --- a/tests/object-detection/rockpaperscisssors_yolo_test.py +++ b/tests/object-detection/rockpaperscisssors_yolo_test.py @@ -6,15 +6,15 @@ from hirundo import ( YOLO, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGCP, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_async_test, - dataset_optimization_sync_test, + dataset_qa_async_test, + dataset_qa_sync_test, get_unique_id, ) @@ -26,7 +26,7 @@ project="Hirundo-global", credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-GCP sanity YOLO dataset{unique_id}", labeling_type=LabelingType.OBJECT_DETECTION, storage_config=StorageConfig( @@ -55,10 +55,10 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( +def test_dataset_qa(): + full_run = dataset_qa_sync_test( test_dataset, - alternative_env="RUN_YOLO_OD_GCP_SANITY_OPTIMIZATION", + alternative_env="RUN_YOLO_OD_GCP_SANITY_DATA_QA", ) if full_run is not None: assert full_run.warnings_and_errors is not None @@ -67,11 +67,9 @@ def test_dataset_optimization(): assert full_run.suspects.shape[0] >= 30_000 # TODO: Add more assertions for results else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") @pytest.mark.asyncio -async def test_async_dataset_optimization(): - await dataset_optimization_async_test( - test_dataset, "RUN_YOLO_OD_GCP_SANITY_OPTIMIZATION" - ) +async def test_async_dataset_qa(): + await dataset_qa_async_test(test_dataset, "RUN_YOLO_OD_GCP_SANITY_DATA_QA") diff --git a/tests/object-detection/sama_coco_test.py b/tests/object-detection/sama_coco_test.py index 0d70b361..50fb23dd 100644 --- a/tests/object-detection/sama_coco_test.py +++ b/tests/object-detection/sama_coco_test.py @@ -6,15 +6,15 @@ from hirundo import ( COCO, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGCP, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_async_test, - dataset_optimization_sync_test, + dataset_qa_async_test, + dataset_qa_sync_test, get_unique_id, ) @@ -26,7 +26,7 @@ project="Hirundo-global", credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-GCP sanity COCO dataset{unique_id}", labeling_type=LabelingType.OBJECT_DETECTION, storage_config=StorageConfig( @@ -50,10 +50,10 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( +def test_dataset_qa(): + full_run = dataset_qa_sync_test( test_dataset, - alternative_env="RUN_COCO_OD_GCP_SANITY_OPTIMIZATION", + alternative_env="RUN_COCO_OD_GCP_SANITY_DATA_QA", ) if full_run is not None: assert full_run.warnings_and_errors is not None @@ -62,11 +62,9 @@ def test_dataset_optimization(): assert full_run.suspects.shape[0] >= 30_000 # TODO: Add more assertions for results else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") @pytest.mark.asyncio -async def test_async_dataset_optimization(): - await dataset_optimization_async_test( - test_dataset, "RUN_COCO_OD_GCP_SANITY_OPTIMIZATION" - ) +async def test_async_dataset_qa(): + await dataset_qa_async_test(test_dataset, "RUN_COCO_OD_GCP_SANITY_DATA_QA") diff --git a/tests/object-detection/sanity_aws_test.py b/tests/object-detection/sanity_aws_test.py index 038b690d..f5375a16 100644 --- a/tests/object-detection/sanity_aws_test.py +++ b/tests/object-detection/sanity_aws_test.py @@ -5,16 +5,16 @@ from hirundo import ( HirundoCSV, LabelingType, - OptimizationDataset, + ObjectDetectionRunArgs, + QADataset, StorageConfig, StorageS3, StorageTypes, - VisionRunArgs, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_async_test, - dataset_optimization_sync_test, + dataset_qa_async_test, + dataset_qa_sync_test, get_unique_id, ) @@ -27,7 +27,7 @@ access_key_id=os.environ["AWS_ACCESS_KEY"], secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-AWS-BDD-100k-subset-1000-OD-dataset{unique_id}", labeling_type=LabelingType.OBJECT_DETECTION, storage_config=StorageConfig( @@ -66,12 +66,12 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( +def test_dataset_qa(): + full_run = dataset_qa_sync_test( test_dataset, sanity=True, - alternative_env="RUN_OD_AWS_SANITY_OPTIMIZATION", - run_args=VisionRunArgs( + alternative_env="RUN_OD_AWS_SANITY_DATA_QA", + run_args=ObjectDetectionRunArgs( upsample=True, min_abs_bbox_size=11, min_abs_bbox_area=121, @@ -87,11 +87,9 @@ def test_dataset_optimization(): assert full_run.suspects.shape[0] == 1_107 # TODO: Add more assertions for results else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") @pytest.mark.asyncio -async def test_async_dataset_optimization(): - await dataset_optimization_async_test( - test_dataset, "RUN_AWS_OD_SANITY_OPTIMIZATION" - ) +async def test_async_dataset_qa(): + await dataset_qa_async_test(test_dataset, "RUN_AWS_OD_SANITY_DATA_QA") diff --git a/tests/speech-to-text/sanity_stt_git_test.py b/tests/speech-to-text/sanity_stt_git_test.py index 0e7cfec0..8c7bff95 100644 --- a/tests/speech-to-text/sanity_stt_git_test.py +++ b/tests/speech-to-text/sanity_stt_git_test.py @@ -3,18 +3,19 @@ import pytest from hirundo import ( + Domain, GitPlainAuth, GitRepo, HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGit, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_sync_test, + dataset_qa_sync_test, get_unique_id, ) @@ -32,8 +33,9 @@ ), branch="main", ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-STT-RoboShaulTiny-dataset{unique_id}", + domain=Domain.SPEECH, labeling_type=LabelingType.SPEECH_TO_TEXT, language="he", storage_config=StorageConfig( @@ -55,9 +57,9 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test( - test_dataset, sanity=True, alternative_env="RUN_STT_GIT_OPTIMIZATION" +def test_dataset_qa(): + full_run = dataset_qa_sync_test( + test_dataset, sanity=True, alternative_env="RUN_STT_GIT_DATA_QA" ) if full_run is not None: assert full_run.warnings_and_errors is not None @@ -66,4 +68,4 @@ def test_dataset_optimization(): assert full_run.suspects.shape[0] > 45 assert full_run.suspects.shape[0] < 100 else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!") diff --git a/tests/speech-to-text/stt_git_test.py b/tests/speech-to-text/stt_git_test.py index e8f5054e..eeaaf2f9 100644 --- a/tests/speech-to-text/stt_git_test.py +++ b/tests/speech-to-text/stt_git_test.py @@ -3,18 +3,19 @@ import pytest from hirundo import ( + Domain, GitPlainAuth, GitRepo, HirundoCSV, LabelingType, - OptimizationDataset, + QADataset, StorageConfig, StorageGit, StorageTypes, ) -from tests.dataset_optimization_shared import ( +from tests.dataset_qa_shared import ( cleanup, - dataset_optimization_sync_test, + dataset_qa_sync_test, get_unique_id, ) @@ -32,8 +33,9 @@ ), branch="main", ) -test_dataset = OptimizationDataset( +test_dataset = QADataset( name=f"TEST-STT-MASC-dataset{unique_id}", + domain=Domain.SPEECH, labeling_type=LabelingType.SPEECH_TO_TEXT, language="ar", storage_config=StorageConfig( @@ -55,8 +57,8 @@ def cleanup_tests(): cleanup(test_dataset) -def test_dataset_optimization(): - full_run = dataset_optimization_sync_test(test_dataset, "RUN_STT_GIT_OPTIMIZATION") +def test_dataset_qa(): + full_run = dataset_qa_sync_test(test_dataset, "RUN_STT_GIT_DATA_QA") if full_run is not None: assert full_run.warnings_and_errors is not None assert ( @@ -65,4 +67,4 @@ def test_dataset_optimization(): assert full_run.suspects is not None assert full_run.suspects.shape[0] <= 10_000 else: - logger.info("Full dataset optimization was not run!") + logger.info("Full dataset QA was not run!")