diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 732aec98..259b0b87 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -10,6 +10,8 @@ concurrency:
jobs:
ruff:
runs-on: ubuntu-latest
+ permissions:
+ contents: read
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
@@ -31,6 +33,8 @@ jobs:
pyright:
runs-on: ubuntu-latest
+ permissions:
+ contents: read
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
diff --git a/.github/workflows/pytest-full.yaml b/.github/workflows/pytest-full.yaml
index 20cf0d10..81885f05 100644
--- a/.github/workflows/pytest-full.yaml
+++ b/.github/workflows/pytest-full.yaml
@@ -8,22 +8,24 @@ on:
jobs:
pytest_full:
runs-on: ubuntu-latest
+ permissions:
+ contents: read
strategy:
max-parallel: 2
matrix:
- optimization-test:
+ data-qa-test:
- test: classification/classification_aws_test.py
- env: RUN_CLASSIFICATION_AWS_OPTIMIZATION
+ env: RUN_CLASSIFICATION_AWS_DATA_QA
- test: classification/classification_gcp_test.py
- env: RUN_CLASSIFICATION_GCP_OPTIMIZATION
+ env: RUN_CLASSIFICATION_GCP_DATA_QA
- test: classification/sanity_gcp_test.py
- env: RUN_CLASSIFICATION_GCP_SANITY_OPTIMIZATION
+ env: RUN_CLASSIFICATION_GCP_SANITY_DATA_QA
- test: object-detection/od_aws_test.py
- env: RUN_AWS_OD_OPTIMIZATION
+ env: RUN_AWS_OD_DATA_QA
- test: object-detection/od_git_test.py
- env: RUN_OD_GIT_OPTIMIZATION
+ env: RUN_OD_GIT_DATA_QA
- test: tests/object-detection/sama_coco_test.py
- env: RUN_COCO_OD_GCP_SANITY_OPTIMIZATION
+ env: RUN_COCO_OD_GCP_SANITY_DATA_QA
steps:
- uses: actions/checkout@v4
- name: Set up Python
@@ -38,7 +40,7 @@ jobs:
source .venv/bin/activate
pip install -r requirements/dev.txt -r requirements/polars.txt
- name: Run PyTest
- run: .venv/bin/pytest tests/${{ matrix.optimization-test['test'] }}
+ run: .venv/bin/pytest tests/${{ matrix.data-qa-test['test'] }}
env:
API_HOST: ${{ secrets.API_HOST }}
API_KEY: ${{ secrets.API_KEY }}
@@ -47,4 +49,4 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
UNIQUE_ID: ${{ github.ref }}-${{ github.run_number }}
- ${{ matrix.optimization-test['env'] }}: true
+ ${{ matrix.data-qa-test['env'] }}: true
diff --git a/.github/workflows/pytest-sanity.yaml b/.github/workflows/pytest-sanity.yaml
index 7561a196..ad800870 100644
--- a/.github/workflows/pytest-sanity.yaml
+++ b/.github/workflows/pytest-sanity.yaml
@@ -14,6 +14,8 @@ concurrency:
jobs:
pytest_sanity:
runs-on: ${{ matrix.os }}
+ permissions:
+ contents: read
strategy:
max-parallel: 4
matrix:
diff --git a/.github/workflows/safety-scan.yml b/.github/workflows/safety-scan.yml
index 0a69f36d..e4d2dc35 100644
--- a/.github/workflows/safety-scan.yml
+++ b/.github/workflows/safety-scan.yml
@@ -15,6 +15,8 @@ concurrency:
jobs:
safety-scan:
runs-on: ubuntu-latest
+ permissions:
+ contents: read
steps:
- name: Checkout hirundo-client
uses: actions/checkout@v4
diff --git a/README.md b/README.md
index cd4cb83a..9679f35d 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
# Hirundo
-This package exposes access to Hirundo APIs for dataset optimization for Machine Learning.
+This package exposes access to Hirundo APIs for dataset QA for Machine Learning.
-Dataset optimization is currently available for datasets labelled for classification and object detection.
+Dataset QA is currently available for datasets labelled for classification and object detection.
Support dataset storage configs include:
@@ -73,7 +73,7 @@ Classification example:
from hirundo import (
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageGCP,
StorageConfig,
StorageTypes,
@@ -84,7 +84,7 @@ gcp_bucket = StorageGCP(
project="Hirundo-global",
credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name="TEST-GCP cifar 100 classification dataset",
labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
storage_config=StorageConfig(
@@ -99,7 +99,7 @@ test_dataset = OptimizationDataset(
classes=cifar100_classes,
)
-test_dataset.run_optimization()
+test_dataset.run_qa()
results = test_dataset.check_run()
print(results)
```
@@ -111,7 +111,7 @@ from hirundo import (
GitRepo,
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageGit,
StorageConfig,
StorageTypes,
@@ -124,7 +124,7 @@ git_storage = StorageGit(
),
branch="main",
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=StorageConfig(
@@ -140,7 +140,7 @@ test_dataset = OptimizationDataset(
),
)
-test_dataset.run_optimization()
+test_dataset.run_qa()
results = test_dataset.check_run()
print(results)
```
diff --git a/docs/hirundo.dataset_optimization.rst b/docs/hirundo.dataset_optimization.rst
index df2a20e5..06b359f1 100644
--- a/docs/hirundo.dataset_optimization.rst
+++ b/docs/hirundo.dataset_optimization.rst
@@ -1,10 +1,10 @@
.. meta::
:http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
-hirundo.dataset\_optimization module
+hirundo.dataset\_qa module
====================================
-.. automodule:: hirundo.dataset_optimization
+.. automodule:: hirundo.dataset_qa
:members:
:undoc-members:
:show-inheritance:
diff --git a/docs/hirundo.rst b/docs/hirundo.rst
index 924f2687..b34766ca 100644
--- a/docs/hirundo.rst
+++ b/docs/hirundo.rst
@@ -11,7 +11,7 @@ Submodules
:maxdepth: 4
hirundo.cli
- hirundo.dataset_optimization
+ hirundo.dataset_qa
hirundo.enum
hirundo.git
hirundo.storage
diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index f569fa0e..1f3e31ab 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -3,13 +3,15 @@
LabelingType,
StorageTypes,
)
-from .dataset_optimization import (
+from .dataset_qa import (
+ ClassificationRunArgs,
+ Domain,
HirundoError,
- OptimizationDataset,
+ ObjectDetectionRunArgs,
+ QADataset,
RunArgs,
- VisionRunArgs,
)
-from .dataset_optimization_results import DatasetOptimizationResults
+from .dataset_qa_results import DatasetQAResults
from .git import GitPlainAuth, GitRepo, GitSSHAuth
from .labeling import (
COCO,
@@ -40,9 +42,11 @@
"KeylabsObjDetVideo",
"KeylabsObjSegImages",
"KeylabsObjSegVideo",
- "OptimizationDataset",
+ "QADataset",
+ "Domain",
"RunArgs",
- "VisionRunArgs",
+ "ClassificationRunArgs",
+ "ObjectDetectionRunArgs",
"DatasetMetadataType",
"LabelingType",
"GitPlainAuth",
@@ -54,7 +58,7 @@
# "StorageAzure", TODO: Azure storage is coming soon
"StorageGit",
"StorageConfig",
- "DatasetOptimizationResults",
+ "DatasetQAResults",
"load_df",
"load_from_zip",
]
diff --git a/hirundo/_constraints.py b/hirundo/_constraints.py
index 7fc3862b..1703273b 100644
--- a/hirundo/_constraints.py
+++ b/hirundo/_constraints.py
@@ -11,7 +11,7 @@
if TYPE_CHECKING:
from hirundo._urls import HirundoUrl
- from hirundo.dataset_optimization import LabelingInfo
+ from hirundo.dataset_qa import LabelingInfo
from hirundo.storage import (
ResponseStorageConfig,
StorageConfig,
diff --git a/hirundo/cli.py b/hirundo/cli.py
index c9db2da4..fc64d52d 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -198,9 +198,9 @@ def check_run(
"""
Check the status of a run.
"""
- from hirundo.dataset_optimization import OptimizationDataset
+ from hirundo.dataset_qa import QADataset
- results = OptimizationDataset.check_run_by_id(run_id)
+ results = QADataset.check_run_by_id(run_id)
print(f"Run results saved to {results.cached_zip_path}")
@@ -209,9 +209,9 @@ def list_runs():
"""
List all runs available.
"""
- from hirundo.dataset_optimization import OptimizationDataset
+ from hirundo.dataset_qa import QADataset
- runs = OptimizationDataset.list_runs()
+ runs = QADataset.list_runs()
console = Console()
table = Table(
diff --git a/hirundo/dataset_optimization.py b/hirundo/dataset_qa.py
similarity index 80%
rename from hirundo/dataset_optimization.py
rename to hirundo/dataset_qa.py
index a82c3b68..117d015b 100644
--- a/hirundo/dataset_optimization.py
+++ b/hirundo/dataset_qa.py
@@ -19,7 +19,7 @@
from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
from hirundo._urls import HirundoUrl
from hirundo.dataset_enum import DatasetMetadataType, LabelingType
-from hirundo.dataset_optimization_results import DatasetOptimizationResults
+from hirundo.dataset_qa_results import DatasetQAResults
from hirundo.labeling import YOLO, LabelingInfo
from hirundo.logger import get_logger
from hirundo.storage import ResponseStorageConfig, StorageConfig
@@ -30,7 +30,7 @@
class HirundoError(Exception):
"""
- Custom exception used to indicate errors in `hirundo` dataset optimization runs
+ Custom exception used to indicate errors in `hirundo` dataset QA runs
"""
pass
@@ -51,14 +51,14 @@ class RunStatus(Enum):
STATUS_TO_TEXT_MAP = {
- RunStatus.STARTED.value: "Optimization run in progress. Downloading dataset",
- RunStatus.PENDING.value: "Optimization run queued and not yet started",
- RunStatus.SUCCESS.value: "Optimization run completed successfully",
- RunStatus.FAILURE.value: "Optimization run failed",
+ RunStatus.STARTED.value: "Dataset QA run in progress. Downloading dataset",
+ RunStatus.PENDING.value: "Dataset QA run queued and not yet started",
+ RunStatus.SUCCESS.value: "Dataset QA run completed successfully",
+ RunStatus.FAILURE.value: "Dataset QA run failed",
RunStatus.AWAITING_MANUAL_APPROVAL.value: "Awaiting manual approval",
- RunStatus.RETRY.value: "Optimization run failed. Retrying",
- RunStatus.REVOKED.value: "Optimization run was cancelled",
- RunStatus.REJECTED.value: "Optimization run was rejected",
+ RunStatus.RETRY.value: "Dataset QA run failed. Retrying",
+ RunStatus.REVOKED.value: "Dataset QA run was cancelled",
+ RunStatus.REJECTED.value: "Dataset QA run was rejected",
}
STATUS_TO_PROGRESS_MAP = {
RunStatus.STARTED.value: 0.0,
@@ -72,33 +72,51 @@ class RunStatus(Enum):
}
-class VisionRunArgs(BaseModel):
- upsample: bool = False
+class ClassificationRunArgs(BaseModel):
+ image_size: typing.Optional[tuple[int, int]] = (224, 224)
+ """
+ Size (width, height) to which to resize classification images.
+ It is recommended to keep this value at (224, 224) unless your classes are differentiated by very small differences.
+ """
+ upsample: typing.Optional[bool] = False
"""
Whether to upsample the dataset to attempt to balance the classes.
"""
- min_abs_bbox_size: int = 0
+
+
+class ObjectDetectionRunArgs(ClassificationRunArgs):
+ min_abs_bbox_size: typing.Optional[int] = None
"""
- Minimum valid size (in pixels) of a bounding box to keep it in the dataset for optimization.
+ Minimum valid size (in pixels) of a bounding box to keep it in the dataset for QA.
"""
- min_abs_bbox_area: int = 0
+ min_abs_bbox_area: typing.Optional[int] = None
"""
- Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for optimization.
+ Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for QA.
"""
- min_rel_bbox_size: float = 0.0
+ min_rel_bbox_size: typing.Optional[float] = None
"""
Minimum valid size (as a fraction of both image height and width) for a bounding box
- to keep it in the dataset for optimization, relative to the corresponding dimension size,
+ to keep it in the dataset for QA, relative to the corresponding dimension size,
i.e. if the bounding box is 10% of the image width and 5% of the image height, it will be kept if this value is 0.05, but not if the
value is 0.06 (since both width and height are checked).
"""
- min_rel_bbox_area: float = 0.0
+ min_rel_bbox_area: typing.Optional[float] = None
+ """
+ Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for QA.
+ """
+ crop_ratio: typing.Optional[float] = None
+ """
+ Ratio of the bounding box to crop.
+ Change this value at your own risk. It is recommended to keep it at 1.0 unless you know what you are doing.
+ """
+ add_mask_channel: typing.Optional[bool] = None
"""
- Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for optimization.
+ Whether to add a mask channel to the image.
+ Change at your own risk. It is recommended to keep it at False unless you know what you are doing.
"""
-RunArgs = typing.Union[VisionRunArgs]
+RunArgs = typing.Union[ClassificationRunArgs, ObjectDetectionRunArgs]
class AugmentationName(str, Enum):
@@ -111,13 +129,14 @@ class AugmentationName(str, Enum):
GAUSSIAN_BLUR = "GaussianBlur"
-class Modality(str, Enum):
- IMAGE = "Image"
- RADAR = "Radar"
- EKG = "EKG"
+class Domain(str, Enum):
+ RADAR = "RADAR"
+ VISION = "VISION"
+ SPEECH = "SPEECH"
+ TABULAR = "TABULAR"
-class OptimizationDataset(BaseModel):
+class QADataset(BaseModel):
id: typing.Optional[int] = Field(default=None)
"""
The ID of the dataset created on the server.
@@ -171,15 +190,15 @@ class OptimizationDataset(BaseModel):
For audio datasets, this field is ignored.
If no value is provided, all augmentations are applied to vision datasets.
"""
- modality: Modality = Modality.IMAGE
+ domain: Domain = Domain.VISION
"""
- Used to define the modality of the dataset.
+ Used to define the domain of the dataset.
Defaults to Image.
"""
run_id: typing.Optional[str] = Field(default=None, init=False)
"""
- The ID of the Dataset Optimization run created on the server.
+ The ID of the Dataset QA run created on the server.
"""
status: typing.Optional[RunStatus] = None
@@ -229,52 +248,52 @@ def validate_dataset(self):
return self
@staticmethod
- def get_by_id(dataset_id: int) -> "OptimizationDataset":
+ def get_by_id(dataset_id: int) -> "QADataset":
"""
- Get a `OptimizationDataset` instance from the server by its ID
+ Get a `QADataset` instance from the server by its ID
Args:
- dataset_id: The ID of the `OptimizationDataset` instance to get
+ dataset_id: The ID of the `QADataset` instance to get
"""
response = requests.get(
- f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
+ f"{API_HOST}/dataset-qa/dataset/{dataset_id}",
headers=get_headers(),
timeout=READ_TIMEOUT,
)
raise_for_status_with_reason(response)
dataset = response.json()
- return OptimizationDataset(**dataset)
+ return QADataset(**dataset)
@staticmethod
- def get_by_name(name: str) -> "OptimizationDataset":
+ def get_by_name(name: str) -> "QADataset":
"""
- Get a `OptimizationDataset` instance from the server by its name
+ Get a `QADataset` instance from the server by its name
Args:
- name: The name of the `OptimizationDataset` instance to get
+ name: The name of the `QADataset` instance to get
"""
response = requests.get(
- f"{API_HOST}/dataset-optimization/dataset/by-name/{name}",
+ f"{API_HOST}/dataset-qa/dataset/by-name/{name}",
headers=get_headers(),
timeout=READ_TIMEOUT,
)
raise_for_status_with_reason(response)
dataset = response.json()
- return OptimizationDataset(**dataset)
+ return QADataset(**dataset)
@staticmethod
def list_datasets(
organization_id: typing.Optional[int] = None,
- ) -> list["DataOptimizationDatasetOut"]:
+ ) -> list["QADatasetOut"]:
"""
- Lists all the optimization datasets created by user's default organization
+ Lists all the datasets created by user's default organization
or the `organization_id` passed
Args:
organization_id: The ID of the organization to list the datasets for.
"""
response = requests.get(
- f"{API_HOST}/dataset-optimization/dataset/",
+ f"{API_HOST}/dataset-qa/dataset/",
params={"dataset_organization_id": organization_id},
headers=get_headers(),
timeout=READ_TIMEOUT,
@@ -282,7 +301,7 @@ def list_datasets(
raise_for_status_with_reason(response)
datasets = response.json()
return [
- DataOptimizationDatasetOut(
+ QADatasetOut(
**ds,
)
for ds in datasets
@@ -291,17 +310,17 @@ def list_datasets(
@staticmethod
def list_runs(
organization_id: typing.Optional[int] = None,
- ) -> list["DataOptimizationRunOut"]:
+ ) -> list["DataQARunOut"]:
"""
- Lists all the `OptimizationDataset` instances created by user's default organization
+ Lists all the dataset QA runs created by user's default organization
or the `organization_id` passed
- Note: The return type is `list[dict]` and not `list[OptimizationDataset]`
+ Note: The return type is `list[DataQARunOut]` and not `list[QADataset]`
Args:
organization_id: The ID of the organization to list the datasets for.
"""
response = requests.get(
- f"{API_HOST}/dataset-optimization/run/list",
+ f"{API_HOST}/dataset-qa/run/list",
params={"dataset_organization_id": organization_id},
headers=get_headers(),
timeout=READ_TIMEOUT,
@@ -309,7 +328,7 @@ def list_runs(
raise_for_status_with_reason(response)
runs = response.json()
return [
- DataOptimizationRunOut(
+ DataQARunOut(
**run,
)
for run in runs
@@ -318,13 +337,13 @@ def list_runs(
@staticmethod
def delete_by_id(dataset_id: int) -> None:
"""
- Deletes a `OptimizationDataset` instance from the server by its ID
+ Deletes a `QADataset` instance from the server by its ID
Args:
- dataset_id: The ID of the `OptimizationDataset` instance to delete
+ dataset_id: The ID of the `QADataset` instance to delete
"""
response = requests.delete(
- f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
+ f"{API_HOST}/dataset-qa/dataset/{dataset_id}",
headers=get_headers(),
timeout=MODIFY_TIMEOUT,
)
@@ -333,14 +352,14 @@ def delete_by_id(dataset_id: int) -> None:
def delete(self, storage_config=True) -> None:
"""
- Deletes the active `OptimizationDataset` instance from the server.
- It can only be used on a `OptimizationDataset` instance that has been created.
+ Deletes the active `QADataset` instance from the server.
+ It can only be used on a `QADataset` instance that has been created.
Args:
- storage_config: If True, the `OptimizationDataset`'s `StorageConfig` will also be deleted
+ storage_config: If True, the `QADataset`'s `StorageConfig` will also be deleted
Note: If `storage_config` is not set to `False` then the `storage_config_id` must be set
- This can either be set manually or by creating the `StorageConfig` instance via the `OptimizationDataset`'s
+ This can either be set manually or by creating the `StorageConfig` instance via the `QADataset`'s
`create` method
"""
if storage_config:
@@ -357,7 +376,7 @@ def create(
replace_if_exists: bool = False,
) -> int:
"""
- Create a `OptimizationDataset` instance on the server.
+ Create a `QADataset` instance on the server.
If the `storage_config_id` field is not set, the storage config will also be created and the field will be set.
Args:
@@ -366,7 +385,7 @@ def create(
(this is determined by a dataset of the same name in the same organization).
Returns:
- The ID of the created `OptimizationDataset` instance
+ The ID of the created `QADataset` instance
"""
if self.storage_config is None and self.storage_config_id is None:
raise ValueError("No dataset storage has been provided")
@@ -391,7 +410,7 @@ def create(
model_dict = self.model_dump(mode="json")
# ⬆️ Get dict of model fields from Pydantic model instance
dataset_response = requests.post(
- f"{API_HOST}/dataset-optimization/dataset/",
+ f"{API_HOST}/dataset-qa/dataset/",
json={
**{k: model_dict[k] for k in model_dict.keys() - {"storage_config"}},
"organization_id": organization_id,
@@ -408,17 +427,17 @@ def create(
return self.id
@staticmethod
- def launch_optimization_run(
+ def launch_qa_run(
dataset_id: int,
organization_id: typing.Optional[int] = None,
run_args: typing.Optional[RunArgs] = None,
) -> str:
"""
- Run the dataset optimization process on the server using the dataset with the given ID
+ Run the dataset QA process on the server using the dataset with the given ID
i.e. `dataset_id`.
Args:
- dataset_id: The ID of the dataset to run optimization on.
+ dataset_id: The ID of the dataset to run QA on.
Returns:
ID of the run (`run_id`).
@@ -429,7 +448,7 @@ def launch_optimization_run(
if run_args:
run_info["run_args"] = run_args.model_dump(mode="json")
run_response = requests.post(
- f"{API_HOST}/dataset-optimization/run/{dataset_id}",
+ f"{API_HOST}/dataset-qa/run/{dataset_id}",
json=run_info if len(run_info) > 0 else None,
headers=get_headers(),
timeout=MODIFY_TIMEOUT,
@@ -440,12 +459,16 @@ def launch_optimization_run(
def _validate_run_args(self, run_args: RunArgs) -> None:
if self.labeling_type == LabelingType.SPEECH_TO_TEXT:
raise Exception("Speech to text cannot have `run_args` set")
- if self.labeling_type != LabelingType.OBJECT_DETECTION and any(
- (
- run_args.min_abs_bbox_size != 0,
- run_args.min_abs_bbox_area != 0,
- run_args.min_rel_bbox_size != 0,
- run_args.min_rel_bbox_area != 0,
+ if (
+ self.labeling_type != LabelingType.OBJECT_DETECTION
+ and isinstance(run_args, ObjectDetectionRunArgs)
+ and any(  # bbox thresholds now default to None, so both None and 0 mean "not set"
+ (
+ run_args.min_abs_bbox_size not in (None, 0),
+ run_args.min_abs_bbox_area not in (None, 0),
+ run_args.min_rel_bbox_size not in (None, 0),
+ run_args.min_rel_bbox_area not in (None, 0),
+ )
)
):
raise Exception(
@@ -454,7 +477,7 @@ def _validate_run_args(self, run_args: RunArgs) -> None:
+ f"labeling type {self.labeling_type}"
)
- def run_optimization(
+ def run_qa(
self,
organization_id: typing.Optional[int] = None,
replace_dataset_if_exists: bool = False,
@@ -462,13 +485,13 @@ def run_optimization(
) -> str:
"""
If the dataset was not created on the server yet, it is created.
- Run the dataset optimization process on the server using the active `OptimizationDataset` instance
+ Run the dataset QA process on the server using the active `QADataset` instance
Args:
- organization_id: The ID of the organization to run the optimization for.
+ organization_id: The ID of the organization to run the QA for.
replace_dataset_if_exists: If True, the dataset will be replaced if it already exists
(this is determined by a dataset of the same name in the same organization).
- run_args: The run arguments to use for the optimization run
+ run_args: The run arguments to use for the QA run
Returns:
An ID of the run (`run_id`) and stores that `run_id` on the instance
@@ -478,7 +501,7 @@ def run_optimization(
self.id = self.create(replace_if_exists=replace_dataset_if_exists)
if run_args is not None:
self._validate_run_args(run_args)
- run_id = self.launch_optimization_run(self.id, organization_id, run_args)
+ run_id = self.launch_qa_run(self.id, organization_id, run_args)
self.run_id = run_id
logger.info("Started the run with ID: %s", run_id)
return run_id
@@ -516,7 +539,7 @@ def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
for sse in iter_sse_retrying(
client,
"GET",
- f"{API_HOST}/dataset-optimization/run/{run_id}",
+ f"{API_HOST}/dataset-qa/run/{run_id}",
headers=get_headers(),
):
if sse.event == "ping":
@@ -542,50 +565,46 @@ def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
raise HirundoError("Unknown error")
yield data
if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
- OptimizationDataset._check_run_by_id(run_id, retry + 1)
+ QADataset._check_run_by_id(run_id, retry + 1)
@staticmethod
def _handle_failure(iteration: dict):
if iteration["result"]:
- raise HirundoError(
- f"Optimization run failed with error: {iteration['result']}"
- )
+ raise HirundoError(f"QA run failed with error: {iteration['result']}")
else:
- raise HirundoError(
- "Optimization run failed with an unknown error in _handle_failure"
- )
+ raise HirundoError("QA run failed with an unknown error in _handle_failure")
@staticmethod
@overload
def check_run_by_id(
run_id: str, stop_on_manual_approval: typing.Literal[True]
- ) -> typing.Optional[DatasetOptimizationResults]: ...
+ ) -> typing.Optional[DatasetQAResults]: ...
@staticmethod
@overload
def check_run_by_id(
run_id: str, stop_on_manual_approval: typing.Literal[False] = False
- ) -> DatasetOptimizationResults: ...
+ ) -> DatasetQAResults: ...
@staticmethod
@overload
def check_run_by_id(
run_id: str, stop_on_manual_approval: bool
- ) -> typing.Optional[DatasetOptimizationResults]: ...
+ ) -> typing.Optional[DatasetQAResults]: ...
@staticmethod
def check_run_by_id(
run_id: str, stop_on_manual_approval: bool = False
- ) -> typing.Optional[DatasetOptimizationResults]:
+ ) -> typing.Optional[DatasetQAResults]:
"""
Check the status of a run given its ID
Args:
- run_id: The `run_id` produced by a `run_optimization` call
+ run_id: The `run_id` produced by a `run_qa` call
stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval
Returns:
- A DatasetOptimizationResults object with the results of the optimization run
+ A DatasetQAResults object with the results of the QA run
Raises:
HirundoError: If the maximum number of retries is reached or if the run fails
@@ -593,7 +612,7 @@ def check_run_by_id(
logger.debug("Checking run with ID: %s", run_id)
with logging_redirect_tqdm():
t = tqdm(total=100.0)
- for iteration in OptimizationDataset._check_run_by_id(run_id):
+ for iteration in QADataset._check_run_by_id(run_id):
if iteration["state"] in STATUS_TO_PROGRESS_MAP:
t.set_description(STATUS_TO_TEXT_MAP[iteration["state"]])
t.n = STATUS_TO_PROGRESS_MAP[iteration["state"]]
@@ -608,11 +627,11 @@ def check_run_by_id(
"State is failure, rejected, or revoked: %s",
iteration["state"],
)
- OptimizationDataset._handle_failure(iteration)
+ QADataset._handle_failure(iteration)
elif iteration["state"] == RunStatus.SUCCESS.value:
t.close()
zip_temporary_url = iteration["result"]
- logger.debug("Optimization run completed. Downloading results")
+ logger.debug("QA run completed. Downloading results")
return download_and_extract_zip(
run_id,
@@ -644,7 +663,7 @@ def check_run_by_id(
stage = "Unknown progress state"
current_progress_percentage = t.n # Keep the same progress
desc = (
- "Optimization run completed. Uploading results"
+ "QA run completed. Uploading results"
if current_progress_percentage == 100.0
else stage
)
@@ -652,28 +671,26 @@ def check_run_by_id(
t.n = current_progress_percentage
logger.debug("Setting progress to %s", t.n)
t.refresh()
- raise HirundoError(
- "Optimization run failed with an unknown error in check_run_by_id"
- )
+ raise HirundoError("QA run failed with an unknown error in check_run_by_id")
@overload
def check_run(
self, stop_on_manual_approval: typing.Literal[True]
- ) -> typing.Optional[DatasetOptimizationResults]: ...
+ ) -> typing.Optional[DatasetQAResults]: ...
@overload
def check_run(
self, stop_on_manual_approval: typing.Literal[False] = False
- ) -> DatasetOptimizationResults: ...
+ ) -> DatasetQAResults: ...
def check_run(
self, stop_on_manual_approval: bool = False
- ) -> typing.Optional[DatasetOptimizationResults]:
+ ) -> typing.Optional[DatasetQAResults]:
"""
Check the status of the current active instance's run.
Returns:
- A pandas DataFrame with the results of the optimization run
+ A `DatasetQAResults` object with the results of the QA run
"""
if not self.run_id:
@@ -690,7 +707,7 @@ async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]:
This generator will produce values to show progress of the run.
Args:
- run_id: The `run_id` produced by a `run_optimization` call
+ run_id: The `run_id` produced by a `run_qa` call
retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually.
Yields:
@@ -709,7 +726,7 @@ async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]:
async_iterator = await aiter_sse_retrying(
client,
"GET",
- f"{API_HOST}/dataset-optimization/run/{run_id}",
+ f"{API_HOST}/dataset-qa/run/{run_id}",
headers=get_headers(),
)
async for sse in async_iterator:
@@ -725,7 +742,7 @@ async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]:
last_event = json.loads(sse.data)
yield last_event["data"]
if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
- OptimizationDataset.acheck_run_by_id(run_id, retry + 1)
+ QADataset.acheck_run_by_id(run_id, retry + 1)
async def acheck_run(self) -> AsyncGenerator[dict, None]:
"""
@@ -749,14 +766,14 @@ async def acheck_run(self) -> AsyncGenerator[dict, None]:
@staticmethod
def cancel_by_id(run_id: str) -> None:
"""
- Cancel the dataset optimization run for the given `run_id`.
+ Cancel the dataset QA run for the given `run_id`.
Args:
run_id: The ID of the run to cancel
"""
logger.info("Cancelling run with ID: %s", run_id)
response = requests.delete(
- f"{API_HOST}/dataset-optimization/run/{run_id}",
+ f"{API_HOST}/dataset-qa/run/{run_id}",
headers=get_headers(),
timeout=MODIFY_TIMEOUT,
)
@@ -773,14 +790,14 @@ def cancel(self) -> None:
@staticmethod
def archive_run_by_id(run_id: str) -> None:
"""
- Archive the dataset optimization run for the given `run_id`.
+ Archive the dataset QA run for the given `run_id`.
Args:
run_id: The ID of the run to archive
"""
logger.info("Archiving run with ID: %s", run_id)
response = requests.patch(
- f"{API_HOST}/dataset-optimization/run/archive/{run_id}",
+ f"{API_HOST}/dataset-qa/run/archive/{run_id}",
headers=get_headers(),
timeout=MODIFY_TIMEOUT,
)
@@ -795,7 +812,7 @@ def archive(self) -> None:
self.archive_run_by_id(self.run_id)
-class DataOptimizationDatasetOut(BaseModel):
+class QADatasetOut(BaseModel):
id: int
name: str
@@ -814,7 +831,7 @@ class DataOptimizationDatasetOut(BaseModel):
updated_at: datetime.datetime
-class DataOptimizationRunOut(BaseModel):
+class DataQARunOut(BaseModel):
id: int
name: str
dataset_id: int
diff --git a/hirundo/dataset_optimization_results.py b/hirundo/dataset_qa_results.py
similarity index 82%
rename from hirundo/dataset_optimization_results.py
rename to hirundo/dataset_qa_results.py
index c962ee9f..5c776324 100644
--- a/hirundo/dataset_optimization_results.py
+++ b/hirundo/dataset_qa_results.py
@@ -21,7 +21,7 @@
T = typing.TypeVar("T")
-class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
+class DatasetQAResults(BaseModel, typing.Generic[T]):
model_config = {"arbitrary_types_allowed": True}
cached_zip_path: Path
@@ -30,13 +30,13 @@ class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
"""
suspects: T
"""
- A polars/pandas DataFrame containing the results of the optimization run
+ A polars/pandas DataFrame containing the results of the data QA run
"""
object_suspects: typing.Optional[T]
"""
- A polars/pandas DataFrame containing the object-level results of the optimization run
+ A polars/pandas DataFrame containing the object-level results of the data QA run
"""
warnings_and_errors: T
"""
- A polars/pandas DataFrame containing the warnings and errors of the optimization run
+ A polars/pandas DataFrame containing the warnings and errors of the data QA run
"""
diff --git a/hirundo/unzip.py b/hirundo/unzip.py
index aba2a85b..30fd40ba 100644
--- a/hirundo/unzip.py
+++ b/hirundo/unzip.py
@@ -19,9 +19,9 @@
from hirundo._env import API_HOST
from hirundo._headers import _get_auth_headers
from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT
-from hirundo.dataset_optimization_results import (
+from hirundo.dataset_qa_results import (
DataFrameType,
- DatasetOptimizationResults,
+ DatasetQAResults,
)
from hirundo.logger import get_logger
@@ -117,7 +117,7 @@ def get_mislabel_suspect_filename(filenames: list[str]):
def download_and_extract_zip(
run_id: str, zip_url: str
-) -> DatasetOptimizationResults[DataFrameType]:
+) -> DatasetQAResults[DataFrameType]:
"""
Download and extract the zip file from the given URL.
@@ -127,11 +127,11 @@ def download_and_extract_zip(
and `warnings_and_errors.csv` files from the zip file.
Args:
- run_id: The ID of the optimization run.
+ run_id: The ID of the dataset QA run.
zip_url: The URL of the zip file to download.
Returns:
- The dataset optimization results object.
+ The dataset QA results object.
"""
# Define the local file path
cache_dir = Path.home() / ".hirundo" / "cache"
@@ -140,9 +140,8 @@ def download_and_extract_zip(
headers = None
if Url(zip_url).scheme == "file":
- zip_url = (
- f"{API_HOST}/dataset-optimization/run/local-download"
- + zip_url.replace("file://", "")
+ zip_url = f"{API_HOST}/dataset-qa/run/local-download" + zip_url.replace(
+ "file://", ""
)
headers = _get_auth_headers()
# Stream the zip file download
@@ -217,7 +216,7 @@ def download_and_extract_zip(
"Failed to load warnings and errors into DataFrame", exc_info=e
)
- return DatasetOptimizationResults[DataFrameType](
+ return DatasetQAResults[DataFrameType](
cached_zip_path=zip_file_path,
suspects=suspects_df,
object_suspects=object_suspects_df,
diff --git a/notebooks/Cancel_Dataset_Optimization_Run.ipynb b/notebooks/Cancel_Dataset_QA_Run.ipynb
similarity index 74%
rename from notebooks/Cancel_Dataset_Optimization_Run.ipynb
rename to notebooks/Cancel_Dataset_QA_Run.ipynb
index d6a1dfc0..4b2f7a38 100644
--- a/notebooks/Cancel_Dataset_Optimization_Run.ipynb
+++ b/notebooks/Cancel_Dataset_QA_Run.ipynb
@@ -7,7 +7,7 @@
"id": "view-in-github"
},
"source": [
- "
"
+ "
"
]
},
{
@@ -16,7 +16,7 @@
"id": "WOHjhp9ed6LM"
},
"source": [
- "# How to use cancel a Hirundo's Dataset Optimization run\n",
+ "# How to cancel a Hirundo Dataset QA run\n",
"\n",
"---\n",
"\n",
@@ -51,7 +51,7 @@
"id": "szD58TSCivAN"
},
"source": [
- "2. Import `OptimizeDataset` class from `hirundo`"
+ "2. Import the `QADataset` class from `hirundo`"
]
},
{
@@ -63,7 +63,7 @@
"outputs": [],
"source": [
"from hirundo import (\n",
- " OptimizationDataset,\n",
+ " QADataset,\n",
")"
]
},
@@ -73,7 +73,7 @@
"id": "eJhmScbels65"
},
"source": [
- "3. Cancel dataset optimization run by calling `cancel_by_id` with the dataset optimization run ID (a string)"
+ "3. Cancel the dataset QA run by calling `cancel_by_id` with the dataset QA run ID (a string)"
]
},
{
@@ -84,12 +84,12 @@
},
"outputs": [],
"source": [
- "dataset_optimization_run_id = \"\" # @param {type:\"string\"}\n",
+ "dataset_qa_run_id = \"\" # @param {type:\"string\"}\n",
"\n",
- "if not dataset_optimization_run_id:\n",
- " raise ValueError(\"You must enter a dataset optimization run ID\")\n",
+ "if not dataset_qa_run_id:\n",
+ " raise ValueError(\"You must enter a dataset QA run ID\")\n",
"\n",
- "OptimizationDataset.cancel_by_id(dataset_optimization_run_id)"
+ "QADataset.cancel_by_id(dataset_qa_run_id)"
]
}
],
diff --git a/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb b/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb
index 0e490015..f63d3b41 100644
--- a/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb
+++ b/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb
@@ -203,7 +203,7 @@
" \"bbox_id\": str(\n",
" i\n",
" ), # Box index in image used for box ID (since no unique ID is provided)\n",
- " \"label\": label[\"category\"],\n",
+ " \"class_name\": label[\"category\"],\n",
" \"xmin\": int(label[\"box2d\"][\"x1\"]),\n",
" \"ymin\": int(label[\"box2d\"][\"y1\"]),\n",
" \"xmax\": int(label[\"box2d\"][\"x2\"]),\n",
diff --git a/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb b/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb
index fdf79e88..f2d310cd 100644
--- a/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb
+++ b/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb
@@ -150,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -188,7 +188,7 @@
" \"bbox_id\": str(\n",
" i\n",
" ), # Box index in image used for box ID (since no unique ID is provided)\n",
- " \"label\": label[\"category\"],\n",
+ " \"class_name\": label[\"category\"],\n",
" \"xmin\": int(label[\"box2d\"][\"x1\"]),\n",
" \"ymin\": int(label[\"box2d\"][\"y1\"]),\n",
" \"xmax\": int(label[\"box2d\"][\"x2\"]),\n",
diff --git a/notebooks/Create_cifar100_dataset_GCP.ipynb b/notebooks/Create_cifar100_dataset_GCP.ipynb
index 5e47fd50..8d766887 100644
--- a/notebooks/Create_cifar100_dataset_GCP.ipynb
+++ b/notebooks/Create_cifar100_dataset_GCP.ipynb
@@ -112,13 +112,13 @@
"img_dir = temp_dir_dataset_path / \"images\"\n",
"img_dir.mkdir()\n",
"\n",
- "csv = pd.DataFrame(columns=[\"image_path\", \"label\"], index=range(len(cifar100)))\n",
+ "csv = pd.DataFrame(columns=[\"image_path\", \"class_name\"], index=range(len(cifar100)))\n",
"for i, item in enumerate(cifar100):\n",
" image, target = item\n",
" image_path = img_dir / f\"{i}.png\"\n",
" csv.loc[i] = {\n",
" \"image_path\": image_path.relative_to(temp_dir_dataset_path),\n",
- " \"label\": cifar100.classes[target],\n",
+ " \"class_name\": cifar100.classes[target],\n",
" }\n",
" image.save(image_path)\n",
"\n",
diff --git a/notebooks/Hirundo_Dataset_Optimization_S3.ipynb b/notebooks/Hirundo_Dataset_Optimization_S3.ipynb
index 418026bf..6b8364c9 100644
--- a/notebooks/Hirundo_Dataset_Optimization_S3.ipynb
+++ b/notebooks/Hirundo_Dataset_Optimization_S3.ipynb
@@ -7,7 +7,7 @@
"id": "view-in-github"
},
"source": [
- "
"
+ "
"
]
},
{
@@ -16,7 +16,7 @@
"id": "WOHjhp9ed6LM"
},
"source": [
- "# How to use Hirundo's Dataset Optimization (S3)\n",
+ "# How to use Hirundo's Dataset QA (S3)\n",
"\n",
"---\n",
"\n",
@@ -60,7 +60,7 @@
"id": "3qSUfS1liadM"
},
"source": [
- "2. We import the `OptimizationDataset` class, as well as the `LabelingType` enum, the `StorageConfig` (to indicate where the dataset files are saved) class, `the StorageTypes` enum, and the `StorageS3` storage class"
+ "2. We import the `QADataset` class, as well as the `LabelingType` enum, the `StorageConfig` class (to indicate where the dataset files are saved), the `StorageTypes` enum, and the `StorageS3` storage class"
]
},
{
@@ -74,7 +74,7 @@
"from hirundo import (\n",
" HirundoCSV,\n",
" LabelingType,\n",
- " OptimizationDataset,\n",
+ " QADataset,\n",
" StorageConfig,\n",
" StorageS3,\n",
" StorageTypes,\n",
@@ -87,7 +87,7 @@
"id": "8KhP3Cw6gqlB"
},
"source": [
- "3. First we create the `OptimizationDataset` object"
+ "3. First we create the `QADataset` object"
]
},
{
@@ -104,7 +104,7 @@
" access_key_id=os.environ[\"AWS_ACCESS_KEY\"],\n",
" secret_access_key=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
")\n",
- "test_dataset = OptimizationDataset(\n",
+ "test_dataset = QADataset(\n",
" name=\"AWS-test-OD-BDD-validation-dataset\",\n",
" labeling_type=LabelingType.OBJECT_DETECTION,\n",
" storage_config=StorageConfig(\n",
@@ -129,7 +129,7 @@
"id": "eJhmScbels65"
},
"source": [
- "4. Now that we have created our dataset, we can launch a dataset optimization run"
+ "4. Now that we have created our dataset, we can launch a dataset QA run"
]
},
{
@@ -140,8 +140,8 @@
},
"outputs": [],
"source": [
- "run_id = test_dataset.run_optimization(replace_dataset_if_exists=True)\n",
- "print(\"Running optimization. Run ID is \", run_id)\n",
+ "run_id = test_dataset.run_qa(replace_dataset_if_exists=True)\n",
+ "print(\"Running dataset QA. Run ID is \", run_id)\n",
"test_dataset.check_run()"
]
}
diff --git a/notebooks/Hirundo_Dataset_Optimization_HuggingFace.ipynb b/notebooks/Hirundo_QA_Dataset_HuggingFace.ipynb
similarity index 83%
rename from notebooks/Hirundo_Dataset_Optimization_HuggingFace.ipynb
rename to notebooks/Hirundo_QA_Dataset_HuggingFace.ipynb
index e4bcd2da..781a1f14 100644
--- a/notebooks/Hirundo_Dataset_Optimization_HuggingFace.ipynb
+++ b/notebooks/Hirundo_QA_Dataset_HuggingFace.ipynb
@@ -7,7 +7,7 @@
"id": "view-in-github"
},
"source": [
- "
"
+ "
"
]
},
{
@@ -16,7 +16,7 @@
"id": "WOHjhp9ed6LM"
},
"source": [
- "# How to use Hirundo's Dataset Optimization (HuggingFace)\n",
+ "# How to use Hirundo's Dataset QA (HuggingFace)\n",
"\n",
"---\n",
"\n",
@@ -57,7 +57,7 @@
"id": "pywDlD47if67"
},
"source": [
- "2. We import the `GitRepo` class, the `OptimizationDataset` class, as well as the `LabelingType` enum, the `StorageConfig` class (to indicate where the dataset files are saved), `the StorageTypes` enum, and the `StorageGit` storage class"
+ "2. We import the `GitRepo` class, the `QADataset` class, as well as the `LabelingType` enum, the `StorageConfig` class (to indicate where the dataset files are saved), the `StorageTypes` enum, and the `StorageGit` storage class"
]
},
{
@@ -72,7 +72,7 @@
" GitRepo,\n",
" HirundoCSV,\n",
" LabelingType,\n",
- " OptimizationDataset,\n",
+ " QADataset,\n",
" StorageConfig,\n",
" StorageGit,\n",
" StorageTypes,\n",
@@ -85,7 +85,7 @@
"id": "eJhmScbels65"
},
"source": [
- "3. First we create the `OptimizationDataset` object"
+ "3. First we create the `QADataset` object"
]
},
{
@@ -103,7 +103,7 @@
" ),\n",
" branch=\"main\",\n",
")\n",
- "test_dataset = OptimizationDataset(\n",
+ "test_dataset = QADataset(\n",
" name=\"HuggingFace-test-OD-BDD-validation dataset\",\n",
" labeling_type=LabelingType.OBJECT_DETECTION,\n",
" storage_config=StorageConfig(\n",
@@ -126,7 +126,7 @@
"id": "FT50g4x7rsaW"
},
"source": [
- "4. Now that we have created our dataset, we can launch a dataset optimization run"
+ "4. Now that we have created our dataset, we can launch a dataset QA run"
]
},
{
@@ -137,8 +137,8 @@
},
"outputs": [],
"source": [
- "run_id = test_dataset.run_optimization()\n",
- "print(\"Running optimization. Run ID is \", run_id)\n",
+ "run_id = test_dataset.run_qa()\n",
+ "print(\"Running dataset QA. Run ID is \", run_id)\n",
"test_dataset.check_run()"
]
}
diff --git a/on-prem/on_prem_audio_ar_test_notebook.ipynb b/on-prem/on_prem_audio_ar_test_notebook.ipynb
index 7f945aa8..45eacffa 100644
--- a/on-prem/on_prem_audio_ar_test_notebook.ipynb
+++ b/on-prem/on_prem_audio_ar_test_notebook.ipynb
@@ -10,11 +10,12 @@
"import os\n",
"\n",
"from hirundo import (\n",
+ " Domain,\n",
" GitPlainAuth,\n",
" GitRepo,\n",
" HirundoCSV,\n",
" LabelingType,\n",
- " OptimizationDataset,\n",
+ " QADataset,\n",
" StorageConfig,\n",
" StorageGit,\n",
" StorageTypes,\n",
@@ -26,18 +27,18 @@
"\n",
"run_ids = [\n",
" run.run_id\n",
- " for run in OptimizationDataset.list_runs()\n",
+ " for run in QADataset.list_runs()\n",
" if run.run_id and run.name.startswith(\"TEST-STT-MASC-dataset\")\n",
"]\n",
"for run_id in run_ids:\n",
- " OptimizationDataset.cancel_by_id(run_id)\n",
+ " QADataset.cancel_by_id(run_id)\n",
"dataset_ids = [\n",
" dataset.id\n",
- " for dataset in OptimizationDataset.list_datasets()\n",
+ " for dataset in QADataset.list_datasets()\n",
" if dataset.name.startswith(\"TEST-STT-MASC-dataset\")\n",
"]\n",
"for dataset_id in dataset_ids:\n",
- " OptimizationDataset.delete_by_id(dataset_id)\n",
+ " QADataset.delete_by_id(dataset_id)\n",
"storage_ids = [\n",
" storage_config.id\n",
" for storage_config in StorageConfig.list()\n",
@@ -64,8 +65,9 @@
" ),\n",
" branch=\"main\",\n",
")\n",
- "test_dataset = OptimizationDataset(\n",
+ "test_dataset = QADataset(\n",
" name=f\"TEST-STT-MASC-dataset{unique_id}\",\n",
+ " domain=Domain.SPEECH,\n",
" labeling_type=LabelingType.SPEECH_TO_TEXT,\n",
" language=\"ar\",\n",
" storage_config=StorageConfig(\n",
@@ -79,59 +81,14 @@
" ),\n",
")\n",
"\n",
- "# test_dataset.run_optimization()"
+ "# test_dataset.run_qa()"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Optimization run queued and not yet started: 0%| | 0.0/100.0 [00:00, ?it/s]"
- ]
- },
- {
- "ename": "RemoteProtocolError",
- "evalue": "peer closed connection without sending complete message body (incomplete chunked read)",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mRemoteProtocolError\u001b[0m Traceback (most recent call last)",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:69\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:113\u001b[0m, in \u001b[0;36mResponseStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m part \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_httpcore_stream:\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m part\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/connection_pool.py:367\u001b[0m, in \u001b[0;36mPoolByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m--> 367\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/connection_pool.py:363\u001b[0m, in \u001b[0;36mPoolByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m part \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stream:\n\u001b[1;32m 364\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m part\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:349\u001b[0m, in \u001b[0;36mHTTP11ConnectionByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m--> 349\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:341\u001b[0m, in \u001b[0;36mHTTP11ConnectionByteStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreceive_response_body\u001b[39m\u001b[38;5;124m\"\u001b[39m, logger, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request, kwargs):\n\u001b[0;32m--> 341\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39m_receive_response_body(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m chunk\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:210\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_response_body\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 210\u001b[0m event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_receive_event\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 211\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(event, h11\u001b[38;5;241m.\u001b[39mData):\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_sync/http11.py:221\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_exceptions({h11\u001b[38;5;241m.\u001b[39mRemoteProtocolError: RemoteProtocolError}):\n\u001b[0;32m--> 221\u001b[0m event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_h11_state\u001b[38;5;241m.\u001b[39mnext_event()\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m event \u001b[38;5;129;01mis\u001b[39;00m h11\u001b[38;5;241m.\u001b[39mNEED_DATA:\n",
- "File \u001b[0;32m~/.pyenv/versions/3.9.19/lib/python3.9/contextlib.py:137\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpcore/_exceptions.py:14\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[0;34m(map)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exc, from_exc):\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m to_exc(exc) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mexc\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n",
- "\u001b[0;31mRemoteProtocolError\u001b[0m: peer closed connection without sending complete message body (incomplete chunked read)",
- "\nThe above exception was the direct cause of the following exception:\n",
- "\u001b[0;31mRemoteProtocolError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mtest_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m results\u001b[38;5;241m.\u001b[39msuspects\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mar-on-prem-audio-test.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
- "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:507\u001b[0m, in \u001b[0;36mOptimizationDataset.check_run\u001b[0;34m(self, stop_on_manual_approval)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun_id:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo run has been started\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 507\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_run_by_id\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_on_manual_approval\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:440\u001b[0m, in \u001b[0;36mOptimizationDataset.check_run_by_id\u001b[0;34m(run_id, stop_on_manual_approval)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m logging_redirect_tqdm():\n\u001b[1;32m 439\u001b[0m t \u001b[38;5;241m=\u001b[39m tqdm(total\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100.0\u001b[39m)\n\u001b[0;32m--> 440\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m iteration \u001b[38;5;129;01min\u001b[39;00m OptimizationDataset\u001b[38;5;241m.\u001b[39m_check_run_by_id(run_id):\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m iteration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstate\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;129;01min\u001b[39;00m STATUS_TO_PROGRESS_MAP:\n\u001b[1;32m 442\u001b[0m t\u001b[38;5;241m.\u001b[39mset_description(STATUS_TO_TEXT_MAP[iteration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstate\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n",
- "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:370\u001b[0m, in \u001b[0;36mOptimizationDataset._check_run_by_id\u001b[0;34m(run_id, retry)\u001b[0m\n\u001b[1;32m 368\u001b[0m last_event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m httpx\u001b[38;5;241m.\u001b[39mClient(timeout\u001b[38;5;241m=\u001b[39mhttpx\u001b[38;5;241m.\u001b[39mTimeout(\u001b[38;5;28;01mNone\u001b[39;00m, connect\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5.0\u001b[39m)) \u001b[38;5;28;01mas\u001b[39;00m client:\n\u001b[0;32m--> 370\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sse \u001b[38;5;129;01min\u001b[39;00m iter_sse_retrying(\n\u001b[1;32m 371\u001b[0m client,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGET\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 373\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mAPI_HOST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/dataset-optimization/run/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrun_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 374\u001b[0m headers\u001b[38;5;241m=\u001b[39mget_auth_headers(),\n\u001b[1;32m 375\u001b[0m ):\n\u001b[1;32m 376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sse\u001b[38;5;241m.\u001b[39mevent \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mping\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 377\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n",
- "File \u001b[0;32m~/Programming/hirundo-client/hirundo/_iter_sse_retrying.py:62\u001b[0m, in \u001b[0;36miter_sse_retrying.._iter_sse\u001b[0;34m()\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m connect_sse(client, method, url, headers\u001b[38;5;241m=\u001b[39mconnect_headers) \u001b[38;5;28;01mas\u001b[39;00m event_source:\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sse \u001b[38;5;129;01min\u001b[39;00m event_source\u001b[38;5;241m.\u001b[39miter_sse():\n\u001b[1;32m 63\u001b[0m last_event_id \u001b[38;5;241m=\u001b[39m sse\u001b[38;5;241m.\u001b[39mid\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sse\u001b[38;5;241m.\u001b[39mretry \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx_sse/_api.py:30\u001b[0m, in \u001b[0;36mEventSource.iter_sse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_content_type()\n\u001b[1;32m 29\u001b[0m decoder \u001b[38;5;241m=\u001b[39m SSEDecoder()\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_response\u001b[38;5;241m.\u001b[39miter_lines():\n\u001b[1;32m 31\u001b[0m line \u001b[38;5;241m=\u001b[39m line\u001b[38;5;241m.\u001b[39mrstrip(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 32\u001b[0m sse \u001b[38;5;241m=\u001b[39m decoder\u001b[38;5;241m.\u001b[39mdecode(line)\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:861\u001b[0m, in \u001b[0;36mResponse.iter_lines\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 859\u001b[0m decoder \u001b[38;5;241m=\u001b[39m LineDecoder()\n\u001b[1;32m 860\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 861\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m text \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miter_text():\n\u001b[1;32m 862\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m decoder\u001b[38;5;241m.\u001b[39mdecode(text):\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m line\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:848\u001b[0m, in \u001b[0;36mResponse.iter_text\u001b[0;34m(self, chunk_size)\u001b[0m\n\u001b[1;32m 846\u001b[0m chunker \u001b[38;5;241m=\u001b[39m TextChunker(chunk_size\u001b[38;5;241m=\u001b[39mchunk_size)\n\u001b[1;32m 847\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m byte_content \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miter_bytes():\n\u001b[1;32m 849\u001b[0m text_content \u001b[38;5;241m=\u001b[39m decoder\u001b[38;5;241m.\u001b[39mdecode(byte_content)\n\u001b[1;32m 850\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunker\u001b[38;5;241m.\u001b[39mdecode(text_content):\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:829\u001b[0m, in \u001b[0;36mResponse.iter_bytes\u001b[0;34m(self, chunk_size)\u001b[0m\n\u001b[1;32m 827\u001b[0m chunker \u001b[38;5;241m=\u001b[39m ByteChunker(chunk_size\u001b[38;5;241m=\u001b[39mchunk_size)\n\u001b[1;32m 828\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 829\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m raw_bytes \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miter_raw():\n\u001b[1;32m 830\u001b[0m decoded \u001b[38;5;241m=\u001b[39m decoder\u001b[38;5;241m.\u001b[39mdecode(raw_bytes)\n\u001b[1;32m 831\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunker\u001b[38;5;241m.\u001b[39mdecode(decoded):\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_models.py:883\u001b[0m, in \u001b[0;36mResponse.iter_raw\u001b[0;34m(self, chunk_size)\u001b[0m\n\u001b[1;32m 880\u001b[0m chunker \u001b[38;5;241m=\u001b[39m ByteChunker(chunk_size\u001b[38;5;241m=\u001b[39mchunk_size)\n\u001b[1;32m 882\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request):\n\u001b[0;32m--> 883\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m raw_stream_bytes \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstream:\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_bytes_downloaded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(raw_stream_bytes)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunker\u001b[38;5;241m.\u001b[39mdecode(raw_stream_bytes):\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_client.py:126\u001b[0m, in \u001b[0;36mBoundSyncStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mIterator[\u001b[38;5;28mbytes\u001b[39m]:\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stream:\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m chunk\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:114\u001b[0m, in \u001b[0;36mResponseStream.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m part \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_httpcore_stream:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m part\n",
- "File \u001b[0;32m~/.pyenv/versions/3.9.19/lib/python3.9/contextlib.py:137\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 135\u001b[0m value \u001b[38;5;241m=\u001b[39m typ()\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m exc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m value\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/httpx/_transports/default.py:86\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 85\u001b[0m message \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(exc)\n\u001b[0;32m---> 86\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mapped_exc(message) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mexc\u001b[39;00m\n",
- "\u001b[0;31mRemoteProtocolError\u001b[0m: peer closed connection without sending complete message body (incomplete chunked read)"
- ]
- }
- ],
+ "outputs": [],
"source": [
"results = test_dataset.check_run()\n",
"results.suspects.to_csv(\"ar-on-prem-audio-test.csv\")"
diff --git a/on-prem/on_prem_audio_he_small_test_notebook.ipynb b/on-prem/on_prem_audio_he_small_test_notebook.ipynb
index aaa6b7a5..7546037d 100644
--- a/on-prem/on_prem_audio_he_small_test_notebook.ipynb
+++ b/on-prem/on_prem_audio_he_small_test_notebook.ipynb
@@ -10,11 +10,12 @@
"import os\n",
"\n",
"from hirundo import (\n",
+ " Domain,\n",
" GitPlainAuth,\n",
" GitRepo,\n",
" HirundoCSV,\n",
" LabelingType,\n",
- " OptimizationDataset,\n",
+ " QADataset,\n",
" StorageConfig,\n",
" StorageGit,\n",
" StorageTypes,\n",
@@ -24,12 +25,12 @@
"\n",
"unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
"\n",
- "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n",
+ "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n",
"for run_id in run_ids:\n",
- " OptimizationDataset.cancel_by_id(run_id)\n",
- "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n",
+ " QADataset.cancel_by_id(run_id)\n",
+ "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n",
"for dataset_id in dataset_ids:\n",
- " OptimizationDataset.delete_by_id(dataset_id)\n",
+ " QADataset.delete_by_id(dataset_id)\n",
"storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
"for storage_id in storage_ids:\n",
" StorageConfig.delete_by_id(storage_id)\n",
@@ -48,8 +49,9 @@
" ),\n",
" branch=\"main\",\n",
")\n",
- "test_dataset = OptimizationDataset(\n",
+ "test_dataset = QADataset(\n",
" name=f\"TEST-STT-RoboShaulGolden-dataset{unique_id}\",\n",
+ " domain=Domain.SPEECH,\n",
" labeling_type=LabelingType.SPEECH_TO_TEXT,\n",
" language=\"he\",\n",
" storage_config=StorageConfig(\n",
@@ -63,7 +65,7 @@
" ),\n",
")\n",
"\n",
- "test_dataset.run_optimization()"
+ "test_dataset.run_qa()"
]
},
{
diff --git a/on-prem/on_prem_audio_he_test_notebook.ipynb b/on-prem/on_prem_audio_he_test_notebook.ipynb
index dc407217..636fa5e4 100644
--- a/on-prem/on_prem_audio_he_test_notebook.ipynb
+++ b/on-prem/on_prem_audio_he_test_notebook.ipynb
@@ -4,40 +4,18 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- },
- {
- "ename": "ValidationError",
- "evalue": "5 validation errors for DataOptimizationDatasetOut\nlabeling_type\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstorage_integration\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\ndata_root_url\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nlabeling_info\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstatus\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[2], line 20\u001b[0m\n\u001b[1;32m 15\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m 17\u001b[0m unique_id \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mgetenv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNIQUE_ID\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 19\u001b[0m run_ids \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m---> 20\u001b[0m dataset\u001b[38;5;241m.\u001b[39mrun_id \u001b[38;5;28;01mfor\u001b[39;00m dataset \u001b[38;5;129;01min\u001b[39;00m \u001b[43mOptimizationDataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m dataset\u001b[38;5;241m.\u001b[39mrun_id\n\u001b[1;32m 21\u001b[0m ]\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m run_id \u001b[38;5;129;01min\u001b[39;00m run_ids:\n\u001b[1;32m 23\u001b[0m OptimizationDataset\u001b[38;5;241m.\u001b[39mcancel_by_id(run_id)\n",
- "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:298\u001b[0m, in \u001b[0;36mOptimizationDataset.list\u001b[0;34m(organization_id)\u001b[0m\n\u001b[1;32m 296\u001b[0m raise_for_status_with_reason(response)\n\u001b[1;32m 297\u001b[0m datasets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()\n\u001b[0;32m--> 298\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[1;32m 299\u001b[0m DataOptimizationDatasetOut(\n\u001b[1;32m 300\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mds,\n\u001b[1;32m 301\u001b[0m )\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m datasets\n\u001b[1;32m 303\u001b[0m ]\n",
- "File \u001b[0;32m~/Programming/hirundo-client/hirundo/dataset_optimization.py:299\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 296\u001b[0m raise_for_status_with_reason(response)\n\u001b[1;32m 297\u001b[0m datasets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()\n\u001b[1;32m 298\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[0;32m--> 299\u001b[0m \u001b[43mDataOptimizationDatasetOut\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 300\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 301\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m datasets\n\u001b[1;32m 303\u001b[0m ]\n",
- "File \u001b[0;32m~/Programming/hirundo-client/.venv/lib/python3.9/site-packages/pydantic/main.py:193\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 192\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n",
- "\u001b[0;31mValidationError\u001b[0m: 5 validation errors for DataOptimizationDatasetOut\nlabeling_type\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstorage_integration\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\ndata_root_url\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nlabeling_info\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing\nstatus\n Field required [type=missing, input_value={'name': 'Keymakr data4 -...1500, 'path': '/data4'}}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.8/v/missing"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import logging\n",
"import os\n",
"\n",
"from hirundo import (\n",
+ " Domain,\n",
" GitPlainAuth,\n",
" GitRepo,\n",
" HirundoCSV,\n",
" LabelingType,\n",
- " OptimizationDataset,\n",
+ " QADataset,\n",
" StorageConfig,\n",
" StorageGit,\n",
" StorageTypes,\n",
@@ -47,12 +25,12 @@
"\n",
"unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
"\n",
- "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n",
+ "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n",
"for run_id in run_ids:\n",
- " OptimizationDataset.cancel_by_id(run_id)\n",
- "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n",
+ " QADataset.cancel_by_id(run_id)\n",
+ "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n",
"for dataset_id in dataset_ids:\n",
- " OptimizationDataset.delete_by_id(dataset_id)\n",
+ " QADataset.delete_by_id(dataset_id)\n",
"storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
"for storage_id in storage_ids:\n",
" StorageConfig.delete_by_id(storage_id)\n",
@@ -71,8 +49,9 @@
" ),\n",
" branch=\"main\",\n",
")\n",
- "test_dataset = OptimizationDataset(\n",
+ "test_dataset = QADataset(\n",
" name=f\"TEST-STT-RoboShaul-dataset{unique_id}\",\n",
+ " domain=Domain.SPEECH,\n",
" labeling_type=LabelingType.SPEECH_TO_TEXT,\n",
" language=\"he\",\n",
" storage_config=StorageConfig(\n",
@@ -86,7 +65,7 @@
" ),\n",
")\n",
"\n",
- "test_dataset.run_optimization()"
+ "test_dataset.run_qa()"
]
},
{
@@ -99,13 +78,6 @@
"results.suspects.to_csv(\"he-on-prem-audio-test.csv\")"
]
},
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/on-prem/on_prem_audio_sanity_test_notebook.ipynb b/on-prem/on_prem_audio_sanity_test_notebook.ipynb
index 7ba0b2a1..d82a9f0a 100644
--- a/on-prem/on_prem_audio_sanity_test_notebook.ipynb
+++ b/on-prem/on_prem_audio_sanity_test_notebook.ipynb
@@ -1,179 +1,181 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
+ "cells": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n",
- "Deleted storage integration with ID: 9\n",
- "Created storage integration with ID: 11\n",
- "Created dataset with ID: 8\n",
- "Started the run with ID: 2f7e0a41-75c4-4568-95fc-a177411c5162\n",
- "Optimization run completed successfully: 100%|██████████| 100.0/100.0 [08:35<00:00, 5.16s/it] "
- ]
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "Deleted storage integration with ID: 9\n",
+ "Created storage integration with ID: 11\n",
+ "Created dataset with ID: 8\n",
+ "Started the run with ID: 2f7e0a41-75c4-4568-95fc-a177411c5162\n",
+ "Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [08:35<00:00, 5.16s/it] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "suspects= audio_segment_id audio_path \\\n",
+ "0 6 wavs/automatic_0012 \n",
+ "1 20 wavs/automatic_0017 \n",
+ "2 15 wavs/automatic_0018 \n",
+ "3 36 wavs/automatic_0020 \n",
+ "4 37 wavs/automatic_0010 \n",
+ ".. ... ... \n",
+ "120 32 wavs/automatic_0004 \n",
+ "121 34 wavs/automatic_0012 \n",
+ "122 39 wavs/automatic_0017 \n",
+ "123 39 wavs/automatic_0017 \n",
+ "124 3 wavs/automatic_0017 \n",
+ "\n",
+ " transcript \\\n",
+ "0 וגם אם לא תהיה פה השעה בסופו של דבר \n",
+ "1 פרשה שהתחילה בחקירה שנפתחה יום אחד ב2017 בעצם... \n",
+ "2 ללשכת ראש הממשלה הדבר הזה נולד רק דרך אילן יש... \n",
+ "3 ה \n",
+ "4 הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... \n",
+ ".. ... \n",
+ "120 היום אנחנו יוצאים אה נו לא אני גאון אני יושב ... \n",
+ "121 הסיפור הזה נותן לנו הצצה לדברים שאנחנו אף פעם... \n",
+ "122 כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n",
+ "123 כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n",
+ "124 הוא האיש האחראי לזה שתיק 4000 בכלל קיים ובא ל... \n",
+ "\n",
+ " suspect_region_start_index suspect_region_end_index \\\n",
+ "0 0 6 \n",
+ "1 0 12 \n",
+ "2 0 9 \n",
+ "3 0 -2 \n",
+ "4 138 157 \n",
+ ".. ... ... \n",
+ "120 20 32 \n",
+ "121 0 10 \n",
+ "122 17 33 \n",
+ "123 58 -2 \n",
+ "124 0 8 \n",
+ "\n",
+ " suspect_region suspect_level \\\n",
+ "0 וגם אם » לא תהיה פה השעה בסופו של דבר 1.0 \n",
+ "1 פרשה שהתחילה » בחקירה שנפתחה יום אחד ב2017 בע... 1.0 \n",
+ "2 ללשכת ראש » הממשלה הדבר הזה נולד רק דרך אילן ... 1.0 \n",
+ "3 ה 1.0 \n",
+ "4 הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... 1.0 \n",
+ ".. ... ... \n",
+ "120 היום אנחנו יוצאים אה « נו לא אני » גאון אני י... 0.5 \n",
+ "121 הסיפור הזה » נותן לנו הצצה לדברים שאנחנו אף פ... 0.5 \n",
+ "122 כי במקור זו הייתה « פרשה שקראו לה » פרשת בזק ... 0.5 \n",
+ "123 כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... 0.5 \n",
+ "124 הוא האיש » האחראי לזה שתיק 4000 בכלל קיים ובא... 0.5 \n",
+ "\n",
+ " suspect_score rank start_time end_time \n",
+ "0 1.000000 1 0.00 2.96 \n",
+ "1 0.946653 2 58.64 68.00 \n",
+ "2 0.794076 3 8.00 16.00 \n",
+ "3 0.741773 4 0.00 1.00 \n",
+ "4 0.728718 5 17.00 34.00 \n",
+ ".. ... ... ... ... \n",
+ "120 0.001446 121 0.00 5.20 \n",
+ "121 0.001355 122 2.96 6.64 \n",
+ "122 0.001330 123 50.96 58.64 \n",
+ "123 0.001073 124 50.96 58.64 \n",
+ "124 0.001006 125 44.48 50.96 \n",
+ "\n",
+ "[125 rows x 11 columns] warnings_and_errors=Empty DataFrame\n",
+ "Columns: [audio_path, status]\n",
+ "Index: []\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import logging\n",
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from hirundo import (\n",
+ " Domain,\n",
+ " HirundoCSV,\n",
+ " LabelingType,\n",
+ " QADataset,\n",
+ " StorageConfig,\n",
+ " StorageTypes,\n",
+ ")\n",
+ "from pydantic_core import Url\n",
+ "\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
+ "\n",
+ "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n",
+ "for run_id in run_ids:\n",
+ " QADataset.cancel_by_id(run_id)\n",
+ "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n",
+ "for dataset_id in dataset_ids:\n",
+ " QADataset.delete_by_id(dataset_id)\n",
+ "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
+ "for storage_id in storage_ids:\n",
+ " StorageConfig.delete_by_id(storage_id)\n",
+ "\n",
+ "test_dataset = QADataset(\n",
+ " name=f\"TEST-STT-RoboShaulTiny-dataset{unique_id}\",\n",
+ " domain=Domain.SPEECH,\n",
+ " labeling_type=LabelingType.SPEECH_TO_TEXT,\n",
+ " language=\"he\",\n",
+ " storage_config=StorageConfig(\n",
+ " name=f\"STT-RoboShaulTiny-dataset{unique_id}\",\n",
+ " type=StorageTypes.LOCAL,\n",
+ " ),\n",
+ " data_root_url=Url(Path(\"/datasets/RoboShaulTiny/wavs\").as_uri()),\n",
+ " labeling_info=HirundoCSV(\n",
+ " csv_url=Url(Path(\"/datasets/RoboShaulTiny/meta.csv\").as_uri()),\n",
+ " ),\n",
+ ")\n",
+ "\n",
+ "test_dataset.run_qa()\n",
+ "results = test_dataset.check_run()\n",
+ "print(results)"
+ ]
},
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "suspects= audio_segment_id audio_path \\\n",
- "0 6 wavs/automatic_0012 \n",
- "1 20 wavs/automatic_0017 \n",
- "2 15 wavs/automatic_0018 \n",
- "3 36 wavs/automatic_0020 \n",
- "4 37 wavs/automatic_0010 \n",
- ".. ... ... \n",
- "120 32 wavs/automatic_0004 \n",
- "121 34 wavs/automatic_0012 \n",
- "122 39 wavs/automatic_0017 \n",
- "123 39 wavs/automatic_0017 \n",
- "124 3 wavs/automatic_0017 \n",
- "\n",
- " transcript \\\n",
- "0 וגם אם לא תהיה פה השעה בסופו של דבר \n",
- "1 פרשה שהתחילה בחקירה שנפתחה יום אחד ב2017 בעצם... \n",
- "2 ללשכת ראש הממשלה הדבר הזה נולד רק דרך אילן יש... \n",
- "3 ה \n",
- "4 הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... \n",
- ".. ... \n",
- "120 היום אנחנו יוצאים אה נו לא אני גאון אני יושב ... \n",
- "121 הסיפור הזה נותן לנו הצצה לדברים שאנחנו אף פעם... \n",
- "122 כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n",
- "123 כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... \n",
- "124 הוא האיש האחראי לזה שתיק 4000 בכלל קיים ובא ל... \n",
- "\n",
- " suspect_region_start_index suspect_region_end_index \\\n",
- "0 0 6 \n",
- "1 0 12 \n",
- "2 0 9 \n",
- "3 0 -2 \n",
- "4 138 157 \n",
- ".. ... ... \n",
- "120 20 32 \n",
- "121 0 10 \n",
- "122 17 33 \n",
- "123 58 -2 \n",
- "124 0 8 \n",
- "\n",
- " suspect_region suspect_level \\\n",
- "0 וגם אם » לא תהיה פה השעה בסופו של דבר 1.0 \n",
- "1 פרשה שהתחילה » בחקירה שנפתחה יום אחד ב2017 בע... 1.0 \n",
- "2 ללשכת ראש » הממשלה הדבר הזה נולד רק דרך אילן ... 1.0 \n",
- "3 ה 1.0 \n",
- "4 הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור... 1.0 \n",
- ".. ... ... \n",
- "120 היום אנחנו יוצאים אה « נו לא אני » גאון אני י... 0.5 \n",
- "121 הסיפור הזה » נותן לנו הצצה לדברים שאנחנו אף פ... 0.5 \n",
- "122 כי במקור זו הייתה « פרשה שקראו לה » פרשת בזק ... 0.5 \n",
- "123 כי במקור זו הייתה פרשה שקראנו לה פרשת בזק שענ... 0.5 \n",
- "124 הוא האיש » האחראי לזה שתיק 4000 בכלל קיים ובא... 0.5 \n",
- "\n",
- " suspect_score rank start_time end_time \n",
- "0 1.000000 1 0.00 2.96 \n",
- "1 0.946653 2 58.64 68.00 \n",
- "2 0.794076 3 8.00 16.00 \n",
- "3 0.741773 4 0.00 1.00 \n",
- "4 0.728718 5 17.00 34.00 \n",
- ".. ... ... ... ... \n",
- "120 0.001446 121 0.00 5.20 \n",
- "121 0.001355 122 2.96 6.64 \n",
- "122 0.001330 123 50.96 58.64 \n",
- "123 0.001073 124 50.96 58.64 \n",
- "124 0.001006 125 44.48 50.96 \n",
- "\n",
- "[125 rows x 11 columns] warnings_and_errors=Empty DataFrame\n",
- "Columns: [audio_path, status]\n",
- "Index: []\n"
- ]
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
},
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
}
- ],
- "source": [
- "import logging\n",
- "import os\n",
- "from pathlib import Path\n",
- "\n",
- "from hirundo import (\n",
- " HirundoCSV,\n",
- " LabelingType,\n",
- " OptimizationDataset,\n",
- " StorageConfig,\n",
- " StorageTypes,\n",
- ")\n",
- "from pydantic_core import Url\n",
- "\n",
- "logger = logging.getLogger(__name__)\n",
- "\n",
- "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
- "\n",
- "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n",
- "for run_id in run_ids:\n",
- " OptimizationDataset.cancel_by_id(run_id)\n",
- "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n",
- "for dataset_id in dataset_ids:\n",
- " OptimizationDataset.delete_by_id(dataset_id)\n",
- "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
- "for storage_id in storage_ids:\n",
- " StorageConfig.delete_by_id(storage_id)\n",
- "\n",
- "test_dataset = OptimizationDataset(\n",
- " name=f\"TEST-STT-RoboShaulTiny-dataset{unique_id}\",\n",
- " labeling_type=LabelingType.SPEECH_TO_TEXT,\n",
- " language=\"he\",\n",
- " storage_config=StorageConfig(\n",
- " name=f\"STT-RoboShaulTiny-dataset{unique_id}\",\n",
- " type=StorageTypes.LOCAL,\n",
- " ),\n",
- " data_root_url=Url(Path(\"/datasets/RoboShaulTiny/wavs\").as_uri()),\n",
- " labeling_info=HirundoCSV(\n",
- " csv_url=Url(Path(\"/datasets/RoboShaulTiny/meta.csv\").as_uri()),\n",
- " ),\n",
- ")\n",
- "\n",
- "test_dataset.run_optimization()\n",
- "results = test_dataset.check_run()\n",
- "print(results)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.19"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat": 4,
+ "nbformat_minor": 2
}
diff --git a/on-prem/on_prem_test_notebook.ipynb b/on-prem/on_prem_test_notebook.ipynb
index 807fc511..62742cb7 100644
--- a/on-prem/on_prem_test_notebook.ipynb
+++ b/on-prem/on_prem_test_notebook.ipynb
@@ -1,153 +1,153 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
+ "cells": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n",
- "Cancelling run with ID: e2ee490c-99de-444b-8f34-83caf2d715c4\n",
- "Deleted dataset with ID: 135\n",
- "Deleted storage integration with ID: 140\n",
- "Created storage integration with ID: 141\n",
- "Created dataset with ID: 136\n",
- "Started the run with ID: efd28fc7-d295-4a8b-9d95-b4d3bbceadc8\n",
- "Optimization run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] "
- ]
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/blewis/Programming/hirundo-client/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "Cancelling run with ID: e2ee490c-99de-444b-8f34-83caf2d715c4\n",
+ "Deleted dataset with ID: 135\n",
+ "Deleted storage integration with ID: 140\n",
+ "Created storage integration with ID: 141\n",
+ "Created dataset with ID: 136\n",
+ "Started the run with ID: efd28fc7-d295-4a8b-9d95-b4d3bbceadc8\n",
+ "Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "suspects= image_path label split suspect_score suspect_level \\\n",
+ "3632 /images/36320.png cat train 1.000000e+00 1.0 \n",
+ "1479 /images/14790.png ship train 9.710029e-01 1.0 \n",
+ "894 /images/8940.png ship train 7.296918e-01 1.0 \n",
+ "2034 /images/20340.png deer train 7.133279e-01 1.0 \n",
+ "709 /images/7090.png dog train 4.025281e-01 1.0 \n",
+ "... ... ... ... ... ... \n",
+ "2216 /images/22160.png truck train 1.899035e-13 0.0 \n",
+ "1814 /images/18140.png ship train 9.983570e-14 0.0 \n",
+ "1679 /images/16790.png bird train 7.334427e-14 0.0 \n",
+ "288 /images/2880.png truck train 5.965531e-14 0.0 \n",
+ "2204 /images/22040.png airplane train 2.194576e-14 0.0 \n",
+ "\n",
+ " suggested_label suggested_label_conf rank \n",
+ "3632 cat 0.999084 1 \n",
+ "1479 ship 1.000000 2 \n",
+ "894 ship 0.999997 3 \n",
+ "2034 deer 0.999986 4 \n",
+ "709 dog 1.000000 5 \n",
+ "... ... ... ... \n",
+ "2216 NaN NaN 4996 \n",
+ "1814 NaN NaN 4997 \n",
+ "1679 NaN NaN 4998 \n",
+ "288 NaN NaN 4999 \n",
+ "2204 NaN NaN 5000 \n",
+ "\n",
+ "[5000 rows x 8 columns] warnings_and_errors=Empty DataFrame\n",
+ "Columns: [image_path, label, status, split]\n",
+ "Index: []\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import logging\n",
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from hirundo import (\n",
+ " HirundoCSV,\n",
+ " LabelingType,\n",
+ " QADataset,\n",
+ " StorageConfig,\n",
+ " StorageTypes,\n",
+ ")\n",
+ "from pydantic_core import Url\n",
+ "\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
+ "\n",
+ "run_ids = [run.run_id for run in QADataset.list_runs() if run.run_id]\n",
+ "for run_id in run_ids:\n",
+ " QADataset.cancel_by_id(run_id)\n",
+ "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n",
+ "for dataset_id in dataset_ids:\n",
+ " QADataset.delete_by_id(dataset_id)\n",
+ "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
+ "for storage_id in storage_ids:\n",
+ " StorageConfig.delete_by_id(storage_id)\n",
+ "\n",
+ "test_dataset = QADataset(\n",
+ " name=f\"TEST-GCP sanity dataset{unique_id}\",\n",
+ " labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,\n",
+ " storage_config=StorageConfig(\n",
+ " name=f\"local{unique_id}\",\n",
+ " type=StorageTypes.LOCAL,\n",
+ " ),\n",
+ " data_root_url=Url(Path(\"/datasets/pytorch-cifar/data\").as_uri()),\n",
+ " labeling_info=HirundoCSV(\n",
+ " csv_url=Url(Path(\"/datasets/pytorch-cifar/cifar1.csv\").as_uri()),\n",
+ " ),\n",
+ " classes=[\n",
+ " \"airplane\",\n",
+ " \"automobile\",\n",
+ " \"bird\",\n",
+ " \"cat\",\n",
+ " \"deer\",\n",
+ " \"dog\",\n",
+ " \"frog\",\n",
+ " \"horse\",\n",
+ " \"ship\",\n",
+ " \"truck\",\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "test_dataset.run_qa()\n",
+ "results = test_dataset.check_run()\n",
+ "print(results)"
+ ]
},
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "suspects= image_path label split suspect_score suspect_level \\\n",
- "3632 /images/36320.png cat train 1.000000e+00 1.0 \n",
- "1479 /images/14790.png ship train 9.710029e-01 1.0 \n",
- "894 /images/8940.png ship train 7.296918e-01 1.0 \n",
- "2034 /images/20340.png deer train 7.133279e-01 1.0 \n",
- "709 /images/7090.png dog train 4.025281e-01 1.0 \n",
- "... ... ... ... ... ... \n",
- "2216 /images/22160.png truck train 1.899035e-13 0.0 \n",
- "1814 /images/18140.png ship train 9.983570e-14 0.0 \n",
- "1679 /images/16790.png bird train 7.334427e-14 0.0 \n",
- "288 /images/2880.png truck train 5.965531e-14 0.0 \n",
- "2204 /images/22040.png airplane train 2.194576e-14 0.0 \n",
- "\n",
- " suggested_label suggested_label_conf rank \n",
- "3632 cat 0.999084 1 \n",
- "1479 ship 1.000000 2 \n",
- "894 ship 0.999997 3 \n",
- "2034 deer 0.999986 4 \n",
- "709 dog 1.000000 5 \n",
- "... ... ... ... \n",
- "2216 NaN NaN 4996 \n",
- "1814 NaN NaN 4997 \n",
- "1679 NaN NaN 4998 \n",
- "288 NaN NaN 4999 \n",
- "2204 NaN NaN 5000 \n",
- "\n",
- "[5000 rows x 8 columns] warnings_and_errors=Empty DataFrame\n",
- "Columns: [image_path, label, status, split]\n",
- "Index: []\n"
- ]
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
},
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
}
- ],
- "source": [
- "import logging\n",
- "import os\n",
- "from pathlib import Path\n",
- "\n",
- "from hirundo import (\n",
- " HirundoCSV,\n",
- " LabelingType,\n",
- " OptimizationDataset,\n",
- " StorageConfig,\n",
- " StorageTypes,\n",
- ")\n",
- "from pydantic_core import Url\n",
- "\n",
- "logger = logging.getLogger(__name__)\n",
- "\n",
- "unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
- "\n",
- "run_ids = [run.run_id for run in OptimizationDataset.list_runs() if run.run_id]\n",
- "for run_id in run_ids:\n",
- " OptimizationDataset.cancel_by_id(run_id)\n",
- "dataset_ids = [dataset.id for dataset in OptimizationDataset.list_datasets()]\n",
- "for dataset_id in dataset_ids:\n",
- " OptimizationDataset.delete_by_id(dataset_id)\n",
- "storage_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
- "for storage_id in storage_ids:\n",
- " StorageConfig.delete_by_id(storage_id)\n",
- "\n",
- "test_dataset = OptimizationDataset(\n",
- " name=f\"TEST-GCP sanity dataset{unique_id}\",\n",
- " labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,\n",
- " storage_config=StorageConfig(\n",
- " name=f\"local{unique_id}\",\n",
- " type=StorageTypes.LOCAL,\n",
- " ),\n",
- " data_root_url=Url(Path(\"/datasets/pytorch-cifar/data\").as_uri()),\n",
- " labeling_info=HirundoCSV(\n",
- " csv_url=Url(Path(\"/datasets/pytorch-cifar/cifar1.csv\").as_uri()),\n",
- " ),\n",
- " classes=[\n",
- " \"airplane\",\n",
- " \"automobile\",\n",
- " \"bird\",\n",
- " \"cat\",\n",
- " \"deer\",\n",
- " \"dog\",\n",
- " \"frog\",\n",
- " \"horse\",\n",
- " \"ship\",\n",
- " \"truck\",\n",
- " ],\n",
- ")\n",
- "\n",
- "test_dataset.run_optimization()\n",
- "results = test_dataset.check_run()\n",
- "print(results)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.19"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat": 4,
+ "nbformat_minor": 2
}
diff --git a/on_prem_test_notebook.ipynb b/on_prem_test_notebook.ipynb
index f3fc76b5..759479ba 100644
--- a/on_prem_test_notebook.ipynb
+++ b/on_prem_test_notebook.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -17,7 +17,7 @@
"Created storage integration with ID: 141\n",
"Created dataset with ID: 136\n",
"Started the run with ID: efd28fc7-d295-4a8b-9d95-b4d3bbceadc8\n",
- "Optimization run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] "
+ "Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [05:16<00:00, 3.17s/it] "
]
},
{
@@ -67,39 +67,34 @@
"import logging\n",
"import os\n",
"\n",
- "from hirundo.dataset_optimization import OptimizationDataset\n",
- "from hirundo.enum import LabellingType\n",
- "from hirundo.storage import StorageIntegration, StorageLink, StorageTypes\n",
+ "from hirundo import HirundoCSV, LabelingType, QADataset, StorageConfig, StorageTypes\n",
+ "from pydantic_core import Url\n",
"\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"unique_id = os.getenv(\"UNIQUE_ID\", \"\").replace(\".\", \"-\").replace(\"/\", \"-\")\n",
"\n",
- "run_ids = [\n",
- " dataset[\"run_id\"] for dataset in OptimizationDataset.list() if dataset[\"run_id\"]\n",
- "]\n",
+ "run_ids = [dataset.run_id for dataset in QADataset.list_runs() if dataset.run_id]\n",
"for run_id in run_ids:\n",
- " OptimizationDataset.cancel_by_id(run_id)\n",
- "dataset_ids = [dataset[\"id\"] for dataset in OptimizationDataset.list()]\n",
+ " QADataset.cancel_by_id(run_id)\n",
+ "dataset_ids = [dataset.id for dataset in QADataset.list_datasets()]\n",
"for dataset_id in dataset_ids:\n",
- " OptimizationDataset.delete_by_id(dataset_id)\n",
- "storage_integration_ids = [\n",
- " storage_integration[\"id\"] for storage_integration in StorageIntegration.list()\n",
- "]\n",
- "for storage_integration_id in storage_integration_ids:\n",
- " StorageIntegration.delete_by_id(storage_integration_id)\n",
+ " QADataset.delete_by_id(dataset_id)\n",
+ "storage_config_ids = [storage_config.id for storage_config in StorageConfig.list()]\n",
+ "for storage_config_id in storage_config_ids:\n",
+ " StorageConfig.delete_by_id(storage_config_id)\n",
"\n",
- "test_dataset = OptimizationDataset(\n",
+ "test_dataset = QADataset(\n",
" name=f\"TEST-GCP sanity dataset{unique_id}\",\n",
- " labelling_type=LabellingType.SingleLabelClassification,\n",
- " dataset_storage=StorageLink(\n",
- " storage_integration=StorageIntegration(\n",
- " name=f\"local{unique_id}\",\n",
- " type=StorageTypes.LOCAL,\n",
- " ),\n",
- " path=\"/datasets/pytorch-cifar/data\",\n",
+ " labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,\n",
+ " storage_config=StorageConfig(\n",
+ " name=f\"local{unique_id}\",\n",
+ " type=StorageTypes.LOCAL,\n",
+ " ),\n",
+ " data_root_url=Url(\"file:///datasets/pytorch-cifar/data\"),\n",
+ " labeling_info=HirundoCSV(\n",
+ " csv_url=Url(\"file:///datasets/pytorch-cifar/data/cifar1.csv\"),\n",
" ),\n",
- " dataset_metadata_path=\"cifar1.csv\",\n",
" classes=[\n",
" \"airplane\",\n",
" \"automobile\",\n",
@@ -114,7 +109,7 @@
" ],\n",
")\n",
"\n",
- "test_dataset.run_optimization()\n",
+ "test_dataset.run_qa()\n",
"results = test_dataset.check_run()\n",
"print(results)"
]
diff --git a/pyproject.toml b/pyproject.toml
index 9a603e13..c7db1848 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,6 @@
[build-system]
-requires = ["setuptools", "wheel"]
+requires = ["setuptools>=78.1.1", "wheel"]
+# ⬆️ Required to fix vulnerability GHSA-5rjg-fvgr-3xxf
build-backend = "setuptools.build_meta"
[tool.setuptools]
@@ -31,6 +32,12 @@ dependencies = [
"stamina>=24.2.0",
"httpx-sse>=0.4.0",
"tqdm>=4.66.5",
+ "h11>=0.16.0",
+ # ⬆️ Required to fix vulnerability GHSA-vqfr-h8mv-ghfj
+ "requests>=2.32.4",
+ # ⬆️ Required to fix vulnerability GHSA-9hjg-9r4m-mvj7
+ "urllib3>=2.5.0",
+ # ⬆️ Required to fix vulnerabilities GHSA-pq67-6m6q-mj2v and GHSA-48p4-8xcf-vxj5
]
[project.scripts]
@@ -58,10 +65,15 @@ dev = [
"pre-commit>=3.7.1",
- "virtualenv>=20.6.6",
+ "virtualenv>=20.26.6",
# ⬆️ Needed for `pre-commit` and locking version for `safety-cli`
+ # `virtualenv` 20.26.6 or later is required to fix vulnerability GHSA-rqc4-2hc7-8c8v
"ruff>=0.12.0",
"bumpver",
"platformdirs>=4.3.6",
"safety>=3.2.13",
+ "cryptography>=44.0.1",
+ # ⬆️ Required to fix vulnerability GHSA-79v4-65xg-pq4g
+ "jinja2>=3.1.6",
+ # ⬆️ Required to fix vulnerabilities GHSA-cpwx-vrp4-4pq7, GHSA-gmj6-6f8f-6699 & GHSA-q2x7-8rv6-6q7h
]
docs = [
"sphinx>=7.4.7",
@@ -71,9 +83,12 @@ docs = [
"furo",
"sphinx-multiversion",
"esbonio",
- "starlette>0.40.0",
+ "starlette>=0.47.2",
+ # ⬆️ Required to fix vulnerability GHSA-2c2j-9gv5-cj73
"markupsafe>=3.0.2",
# Force `starlette` and `markupsafe` to versions compatible with `dev` dependencies.
+ "jinja2>=3.1.6",
+ # ⬆️ Required to fix vulnerabilities GHSA-cpwx-vrp4-4pq7, GHSA-gmj6-6f8f-6699 & GHSA-q2x7-8rv6-6q7h
]
pandas = ["pandas>=2.2.3"]
polars = ["polars>=1.0.0"]
@@ -172,4 +187,3 @@ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
[tool.pyright]
typeCheckingMode = "standard"
-autoSearchPaths = true
diff --git a/requirements/dev.txt b/requirements/dev.txt
index b93dbd03..4236766c 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -37,14 +37,11 @@ click==8.1.7
# safety
# typer
colorama==0.4.6
+ # via bumpver
+cryptography==45.0.5
# via
- # -c requirements/requirements.txt
- # bumpver
- # click
- # pytest
- # tqdm
-cryptography==44.0.0
- # via authlib
+ # hirundo (pyproject.toml)
+ # authlib
distlib==0.3.8
# via virtualenv
docutils==0.21.2
@@ -64,11 +61,12 @@ filelock==3.12.4
# via
# safety
# virtualenv
-h11==0.14.0
+h11==0.16.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# httpcore
-httpcore==1.0.5
+httpcore==1.0.9
# via
# -c requirements/requirements.txt
# httpx
@@ -107,8 +105,10 @@ jaraco-functools==4.0.1
# via
# -c requirements/requirements.txt
# keyring
-jinja2==3.1.4
- # via safety
+jinja2==3.1.6
+ # via
+ # hirundo (pyproject.toml)
+ # safety
keyring==25.2.1
# via
# -c requirements/requirements.txt
@@ -188,10 +188,6 @@ python-dotenv==1.0.1
# via
# -c requirements/requirements.txt
# hirundo (pyproject.toml)
-pywin32-ctypes==0.2.3
- # via
- # -c requirements/requirements.txt
- # keyring
pyyaml==6.0.1
# via
# -c requirements/requirements.txt
@@ -201,9 +197,10 @@ readme-renderer==44.0
# via
# -c requirements/requirements.txt
# twine
-requests==2.32.3
+requests==2.32.4
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests-toolbelt
# safety
# twine
@@ -291,9 +288,10 @@ typing-extensions==4.12.2
# safety-schemas
# stamina
# typer
-urllib3==2.2.2
+urllib3==2.5.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests
# safety
# twine
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 554fb74c..3267a9f8 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -47,12 +47,7 @@ click==8.1.7
# typer
# uvicorn
colorama==0.4.6
- # via
- # -c requirements/requirements.txt
- # click
- # sphinx
- # sphinx-autobuild
- # tqdm
+ # via sphinx-autobuild
docutils==0.21.2
# via
# -c requirements/requirements.txt
@@ -68,12 +63,13 @@ exceptiongroup==1.2.2
# cattrs
furo==2024.7.18
# via hirundo (pyproject.toml)
-h11==0.14.0
+h11==0.16.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# httpcore
# uvicorn
-httpcore==1.0.5
+httpcore==1.0.9
# via
# -c requirements/requirements.txt
# httpx
@@ -111,8 +107,10 @@ jaraco-functools==4.0.1
# via
# -c requirements/requirements.txt
# keyring
-jinja2==3.1.4
- # via sphinx
+jinja2==3.1.6
+ # via
+ # hirundo (pyproject.toml)
+ # sphinx
keyring==25.2.1
# via
# -c requirements/requirements.txt
@@ -176,10 +174,6 @@ python-dotenv==1.0.1
# -c requirements/requirements.txt
# hirundo (pyproject.toml)
# pydantic-settings
-pywin32-ctypes==0.2.3
- # via
- # -c requirements/requirements.txt
- # keyring
pyyaml==6.0.1
# via
# -c requirements/requirements.txt
@@ -188,9 +182,10 @@ readme-renderer==44.0
# via
# -c requirements/requirements.txt
# twine
-requests==2.32.3
+requests==2.32.4
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests-toolbelt
# sphinx
# twine
@@ -254,7 +249,7 @@ stamina==24.2.0
# via
# -c requirements/requirements.txt
# hirundo (pyproject.toml)
-starlette==0.41.2
+starlette==0.47.2
# via
# hirundo (pyproject.toml)
# sphinx-autobuild
@@ -295,9 +290,10 @@ typing-extensions==4.12.2
# starlette
# typer
# uvicorn
-urllib3==2.2.2
+urllib3==2.5.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests
# twine
# types-requests
diff --git a/requirements/pandas.txt b/requirements/pandas.txt
index d7b65866..fbf1120e 100644
--- a/requirements/pandas.txt
+++ b/requirements/pandas.txt
@@ -26,11 +26,6 @@ click==8.1.7
# via
# -c requirements/requirements.txt
# typer
-colorama==0.4.6
- # via
- # -c requirements/requirements.txt
- # click
- # tqdm
docutils==0.21.2
# via
# -c requirements/requirements.txt
@@ -39,11 +34,12 @@ exceptiongroup==1.2.2
# via
# -c requirements/requirements.txt
# anyio
-h11==0.14.0
+h11==0.16.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# httpcore
-httpcore==1.0.5
+httpcore==1.0.9
# via
# -c requirements/requirements.txt
# httpx
@@ -128,10 +124,6 @@ python-dotenv==1.0.1
# hirundo (pyproject.toml)
pytz==2024.1
# via pandas
-pywin32-ctypes==0.2.3
- # via
- # -c requirements/requirements.txt
- # keyring
pyyaml==6.0.1
# via
# -c requirements/requirements.txt
@@ -140,9 +132,10 @@ readme-renderer==44.0
# via
# -c requirements/requirements.txt
# twine
-requests==2.32.3
+requests==2.32.4
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests-toolbelt
# twine
requests-toolbelt==1.0.0
@@ -207,9 +200,10 @@ typing-extensions==4.12.2
# typer
tzdata==2024.1
# via pandas
-urllib3==2.2.2
+urllib3==2.5.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests
# twine
# types-requests
diff --git a/requirements/polars.txt b/requirements/polars.txt
index e2ab3bdf..e9d8e619 100644
--- a/requirements/polars.txt
+++ b/requirements/polars.txt
@@ -26,11 +26,6 @@ click==8.1.7
# via
# -c requirements/requirements.txt
# typer
-colorama==0.4.6
- # via
- # -c requirements/requirements.txt
- # click
- # tqdm
docutils==0.21.2
# via
# -c requirements/requirements.txt
@@ -39,11 +34,12 @@ exceptiongroup==1.2.2
# via
# -c requirements/requirements.txt
# anyio
-h11==0.14.0
+h11==0.16.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# httpcore
-httpcore==1.0.5
+httpcore==1.0.9
# via
# -c requirements/requirements.txt
# httpx
@@ -122,10 +118,6 @@ python-dotenv==1.0.1
# via
# -c requirements/requirements.txt
# hirundo (pyproject.toml)
-pywin32-ctypes==0.2.3
- # via
- # -c requirements/requirements.txt
- # keyring
pyyaml==6.0.1
# via
# -c requirements/requirements.txt
@@ -134,9 +126,10 @@ readme-renderer==44.0
# via
# -c requirements/requirements.txt
# twine
-requests==2.32.3
+requests==2.32.4
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests-toolbelt
# twine
requests-toolbelt==1.0.0
@@ -197,9 +190,10 @@ typing-extensions==4.12.2
# pydantic-core
# stamina
# typer
-urllib3==2.2.2
+urllib3==2.5.0
# via
# -c requirements/requirements.txt
+ # hirundo (pyproject.toml)
# requests
# twine
# types-requests
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index b261d5e0..af7cf92b 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -15,17 +15,15 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via typer
-colorama==0.4.6
- # via
- # click
- # tqdm
docutils==0.21.2
# via readme-renderer
exceptiongroup==1.2.2
# via anyio
-h11==0.14.0
- # via httpcore
-httpcore==1.0.5
+h11==0.16.0
+ # via
+ # hirundo (pyproject.toml)
+ # httpcore
+httpcore==1.0.9
# via httpx
httpx==0.27.0
# via hirundo (pyproject.toml)
@@ -70,14 +68,13 @@ pygments==2.18.0
# rich
python-dotenv==1.0.1
# via hirundo (pyproject.toml)
-pywin32-ctypes==0.2.3
- # via keyring
pyyaml==6.0.1
# via hirundo (pyproject.toml)
readme-renderer==44.0
# via twine
-requests==2.32.3
+requests==2.32.4
# via
+ # hirundo (pyproject.toml)
# requests-toolbelt
# twine
requests-toolbelt==1.0.0
@@ -115,8 +112,9 @@ typing-extensions==4.12.2
# pydantic-core
# stamina
# typer
-urllib3==2.2.2
+urllib3==2.5.0
# via
+ # hirundo (pyproject.toml)
# requests
# twine
# types-requests
diff --git a/tests/classification/classification_aws_test.py b/tests/classification/classification_aws_test.py
index c1ea0c2d..c133a1e5 100644
--- a/tests/classification/classification_aws_test.py
+++ b/tests/classification/classification_aws_test.py
@@ -5,14 +5,14 @@
from hirundo import (
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageS3,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_sync_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -25,7 +25,7 @@
access_key_id=os.environ["AWS_ACCESS_KEY"],
secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-AWS cifar10 classification dataset{unique_id}",
labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
storage_config=StorageConfig(
@@ -59,13 +59,11 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
- test_dataset, "RUN_CLASSIFICATION_AWS_OPTIMIZATION"
- )
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(test_dataset, "RUN_CLASSIFICATION_AWS_DATA_QA")
if full_run is not None:
assert full_run.warnings_and_errors is not None
assert full_run.warnings_and_errors.shape[0] == 0
# TODO: Add more assertions for results
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
diff --git a/tests/classification/classification_gcp_test.py b/tests/classification/classification_gcp_test.py
index 90490103..d3c444f0 100644
--- a/tests/classification/classification_gcp_test.py
+++ b/tests/classification/classification_gcp_test.py
@@ -6,15 +6,15 @@
from hirundo import (
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGCP,
StorageTypes,
)
from tests.classification.cifar100_classes import cifar100_classes
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_sync_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -26,7 +26,7 @@
project="Hirundo-global",
credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-GCP cifar 100 classification dataset{unique_id}",
labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
storage_config=StorageConfig(
@@ -49,13 +49,11 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
- test_dataset, "RUN_CLASSIFICATION_GCP_OPTIMIZATION"
- )
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(test_dataset, "RUN_CLASSIFICATION_GCP_DATA_QA")
if full_run is not None:
assert full_run.warnings_and_errors is not None
assert full_run.warnings_and_errors.shape[0] == 0
# TODO: Add more assertions for results
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
diff --git a/tests/classification/sanity_gcp_test.py b/tests/classification/sanity_gcp_test.py
index 17d6975b..8b1fc1d5 100644
--- a/tests/classification/sanity_gcp_test.py
+++ b/tests/classification/sanity_gcp_test.py
@@ -6,16 +6,16 @@
from hirundo import (
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGCP,
StorageTypes,
)
-from hirundo.dataset_optimization import AugmentationName
-from tests.dataset_optimization_shared import (
+from hirundo.dataset_qa import AugmentationName
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_async_test,
- dataset_optimization_sync_test,
+ dataset_qa_async_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -27,7 +27,7 @@
project="Hirundo-global",
credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-GCP sanity dataset{unique_id}",
labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
storage_config=StorageConfig(
@@ -66,11 +66,11 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(
test_dataset,
sanity=True,
- alternative_env="RUN_CLASSIFICATION_GCP_SANITY_OPTIMIZATION",
+ alternative_env="RUN_CLASSIFICATION_GCP_SANITY_DATA_QA",
)
if full_run is not None:
assert full_run.warnings_and_errors is not None
@@ -78,11 +78,9 @@ def test_dataset_optimization():
assert full_run.suspects is not None
assert full_run.suspects.shape[0] >= 5_000
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
@pytest.mark.asyncio
-async def test_async_dataset_optimization():
- await dataset_optimization_async_test(
- test_dataset, "RUN_CLASSIFICATION_GCP_SANITY_OPTIMIZATION"
- )
+async def test_async_dataset_qa():
+ await dataset_qa_async_test(test_dataset, "RUN_CLASSIFICATION_GCP_SANITY_DATA_QA")
diff --git a/tests/dataset_optimization_shared.py b/tests/dataset_qa_shared.py
similarity index 80%
rename from tests/dataset_optimization_shared.py
rename to tests/dataset_qa_shared.py
index c59780e6..754290b8 100644
--- a/tests/dataset_optimization_shared.py
+++ b/tests/dataset_qa_shared.py
@@ -6,11 +6,11 @@
import requests
from hirundo import (
GitRepo,
- OptimizationDataset,
+ QADataset,
RunArgs,
StorageConfig,
)
-from hirundo.dataset_optimization import RunStatus
+from hirundo.dataset_qa import RunStatus
from hirundo.logger import get_logger
logger = get_logger(__name__)
@@ -25,9 +25,9 @@ def get_unique_id():
def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]):
if not unique_id:
return
- runs = OptimizationDataset.list_runs()
+ runs = QADataset.list_runs()
conflicting_run_ids = [run.run_id for run in runs if unique_id in run.name]
- datasets = OptimizationDataset.list_datasets()
+ datasets = QADataset.list_datasets()
conflicting_dataset_ids = [
dataset.id for dataset in datasets if unique_id in dataset.name
]
@@ -41,7 +41,7 @@ def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]):
]
for conflicting_run_id in conflicting_run_ids:
try:
- OptimizationDataset.archive_run_by_id(conflicting_run_id)
+ QADataset.archive_run_by_id(conflicting_run_id)
except Exception as e:
logger.warning(
"Failed to archive run with ID %s and exception %s",
@@ -50,7 +50,7 @@ def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]):
)
for conflicting_dataset_id in conflicting_dataset_ids:
try:
- OptimizationDataset.delete_by_id(conflicting_dataset_id)
+ QADataset.delete_by_id(conflicting_dataset_id)
except Exception as e:
logger.warning(
"Failed to delete dataset with ID %s and exception %s",
@@ -78,13 +78,13 @@ def cleanup_conflict_by_unique_id(unique_id: typing.Optional[str]):
@contextmanager
-def _handle_not_found_error(dataset: OptimizationDataset):
+def _handle_not_found_error(dataset: QADataset):
try:
yield
except requests.HTTPError as e:
if e.response.status_code == 404:
logger.info(
- "Optimization dataset with name %s not found, skipping cleanup",
+ "QA dataset with name %s not found, skipping cleanup",
dataset.name,
)
return
@@ -93,7 +93,7 @@ def _handle_not_found_error(dataset: OptimizationDataset):
def _get_runs_by_dataset():
- runs = OptimizationDataset.list_runs()
+ runs = QADataset.list_runs()
runs_by_dataset = defaultdict(list)
for run in runs:
if run.dataset_id is not None and run.run_id is not None:
@@ -101,17 +101,17 @@ def _get_runs_by_dataset():
return runs_by_dataset
-def cleanup(test_dataset: OptimizationDataset):
+def cleanup(test_dataset: QADataset):
logger.info("Started cleanup")
with _handle_not_found_error(test_dataset):
- dataset = OptimizationDataset.get_by_name(test_dataset.name)
+ dataset = QADataset.get_by_name(test_dataset.name)
storage_config_id = (
dataset.storage_config.id if dataset.storage_config is not None else None
)
runs_by_dataset = _get_runs_by_dataset()
if dataset.id is not None:
logger.debug(
- "Found optimization dataset with the same name, deleting it",
+ "Found QA dataset with the same name, deleting it",
)
logger.debug(
"Note: If I am not the owner, I will not be able to delete them"
@@ -119,14 +119,12 @@ def cleanup(test_dataset: OptimizationDataset):
try:
if dataset.id in runs_by_dataset:
for run_id in runs_by_dataset[dataset.id]:
- logger.debug(
- "Archiving optimization dataset with run ID %s", run_id
- )
- OptimizationDataset.archive_run_by_id(run_id)
- OptimizationDataset.delete_by_id(dataset.id)
+ logger.debug("Archiving QA dataset with run ID %s", run_id)
+ QADataset.archive_run_by_id(run_id)
+ QADataset.delete_by_id(dataset.id)
except Exception as e:
logger.warning(
- "Unable to delete optimization dataset with ID %s and exception %s",
+ "Unable to delete QA dataset with ID %s and exception %s",
dataset.id,
e,
)
@@ -170,8 +168,8 @@ def cleanup(test_dataset: OptimizationDataset):
logger.info("Finished cleanup")
-def dataset_optimization_sync_test(
- test_dataset: OptimizationDataset,
+def dataset_qa_sync_test(
+ test_dataset: QADataset,
alternative_env: typing.Optional[str] = None,
sanity=False,
run_args: typing.Optional[RunArgs] = None,
@@ -180,10 +178,8 @@ def dataset_optimization_sync_test(
if (os.getenv("FULL_TEST", "false") == "true" and sanity) or (
alternative_env and os.getenv(alternative_env, "false") == "true"
):
- run_id = test_dataset.run_optimization(
- replace_dataset_if_exists=True, run_args=run_args
- )
- logger.info("Sync: Started dataset optimization run with run ID %s", run_id)
+ run_id = test_dataset.run_qa(replace_dataset_if_exists=True, run_args=run_args)
+ logger.info("Sync: Started dataset QA run with run ID %s", run_id)
logger.info("Sync: Checking run progress")
result = test_dataset.check_run(stop_on_manual_approval=True)
logger.info("Sync: Results %s", result)
@@ -194,17 +190,15 @@ def dataset_optimization_sync_test(
return None
-async def dataset_optimization_async_test(
- test_dataset: OptimizationDataset,
+async def dataset_qa_async_test(
+ test_dataset: QADataset,
env: str,
run_args: typing.Optional[RunArgs] = None,
):
logger.info("Async: Finished cleanup")
if os.getenv(env, "false") == "true":
- run_id = test_dataset.run_optimization(
- replace_dataset_if_exists=True, run_args=run_args
- )
- logger.info("Async: Started dataset optimization run with run ID %s", run_id)
+ run_id = test_dataset.run_qa(replace_dataset_if_exists=True, run_args=run_args)
+ logger.info("Async: Started dataset QA run with run ID %s", run_id)
events_generator = test_dataset.acheck_run()
logger.info("Async: Checking run progress")
last_event = {}
diff --git a/tests/get_by_name_test.py b/tests/get_by_name_test.py
index 65f13c41..78348f2c 100644
--- a/tests/get_by_name_test.py
+++ b/tests/get_by_name_test.py
@@ -7,20 +7,20 @@
GitRepo,
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGCP,
StorageGit,
StorageTypes,
)
-from tests.dataset_optimization_shared import get_unique_id
+from tests.dataset_qa_shared import get_unique_id
unique_id = get_unique_id()
gcp_storage_config_name = f"T-cifar1bucket_get_by_name{unique_id}"
-gcp_optimization_dataset_name = f"T-cifar1_get_by_name{unique_id}"
+gcp_qa_dataset_name = f"T-cifar1_get_by_name{unique_id}"
git_storage_config_name = f"T-BDD-100k-validation-git_get_by_name{unique_id}"
git_repository_name = f"T-BDD-100k-validation-git-repo_get_by_name{unique_id}"
-git_optimization_dataset_name = f"T-BDD-100k-validation-dataset_get_by_name{unique_id}"
+git_qa_dataset_name = f"T-BDD-100k-validation-dataset_get_by_name{unique_id}"
new_storage_config: typing.Optional[StorageConfig] = None
new_dataset = None
@@ -56,8 +56,8 @@ def test_get_by_name_gcp():
assert new_storage_config.gcp is not None
storage_gcp = new_storage_config.gcp
- OptimizationDataset(
- name=gcp_optimization_dataset_name,
+ QADataset(
+ name=gcp_qa_dataset_name,
labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
storage_config=new_storage_config,
labeling_info=HirundoCSV(
@@ -66,7 +66,7 @@ def test_get_by_name_gcp():
data_root_url=storage_gcp.get_url("/pytorch-cifar/data"),
).create(replace_if_exists=True)
- dataset = OptimizationDataset.get_by_name(gcp_optimization_dataset_name)
+ dataset = QADataset.get_by_name(gcp_qa_dataset_name)
assert dataset is not None
@@ -90,8 +90,8 @@ def test_get_by_name_git():
assert new_storage_config.git is not None
storage_git = new_storage_config.git
- OptimizationDataset(
- name=git_optimization_dataset_name,
+ QADataset(
+ name=git_qa_dataset_name,
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=new_storage_config,
labeling_info=HirundoCSV(
@@ -102,5 +102,5 @@ def test_get_by_name_git():
data_root_url=storage_git.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
).create(replace_if_exists=True)
- new_dataset = OptimizationDataset.get_by_name(git_optimization_dataset_name)
+ new_dataset = QADataset.get_by_name(git_qa_dataset_name)
assert new_dataset is not None
diff --git a/tests/object-detection/od_aws_test.py b/tests/object-detection/od_aws_test.py
index 5632c582..da74d18d 100644
--- a/tests/object-detection/od_aws_test.py
+++ b/tests/object-detection/od_aws_test.py
@@ -5,14 +5,14 @@
from hirundo import (
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageS3,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_sync_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -25,7 +25,7 @@
access_key_id=os.environ["AWS_ACCESS_KEY"],
secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-AWS-BDD-100k-validation-OD-dataset{unique_id}",
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=StorageConfig(
@@ -62,10 +62,10 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(test_dataset, "RUN_AWS_OD_OPTIMIZATION")
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(test_dataset, "RUN_AWS_OD_DATA_QA")
if full_run is not None:
pass
- # TODO: Add add assertion for result
+ # TODO: Add assertion for result
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
diff --git a/tests/object-detection/od_git_test.py b/tests/object-detection/od_git_test.py
index 8f2af855..9836ff62 100644
--- a/tests/object-detection/od_git_test.py
+++ b/tests/object-detection/od_git_test.py
@@ -5,14 +5,14 @@
GitRepo,
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGit,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_sync_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -26,7 +26,7 @@
),
branch="main",
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset{unique_id}",
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=StorageConfig(
@@ -65,10 +65,10 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(test_dataset, "RUN_OD_GIT_OPTIMIZATION")
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(test_dataset, "RUN_OD_GIT_DATA_QA")
if full_run is not None:
pass
- # TODO: Add add assertion for result
+ # TODO: Add assertion for result
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
diff --git a/tests/object-detection/rockpaperscisssors_yolo_test.py b/tests/object-detection/rockpaperscisssors_yolo_test.py
index 7a6683b6..6bca0b34 100644
--- a/tests/object-detection/rockpaperscisssors_yolo_test.py
+++ b/tests/object-detection/rockpaperscisssors_yolo_test.py
@@ -6,15 +6,15 @@
from hirundo import (
YOLO,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGCP,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_async_test,
- dataset_optimization_sync_test,
+ dataset_qa_async_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -26,7 +26,7 @@
project="Hirundo-global",
credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-GCP sanity YOLO dataset{unique_id}",
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=StorageConfig(
@@ -55,10 +55,10 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(
test_dataset,
- alternative_env="RUN_YOLO_OD_GCP_SANITY_OPTIMIZATION",
+ alternative_env="RUN_YOLO_OD_GCP_SANITY_DATA_QA",
)
if full_run is not None:
assert full_run.warnings_and_errors is not None
@@ -67,11 +67,9 @@ def test_dataset_optimization():
assert full_run.suspects.shape[0] >= 30_000
# TODO: Add more assertions for results
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
@pytest.mark.asyncio
-async def test_async_dataset_optimization():
- await dataset_optimization_async_test(
- test_dataset, "RUN_YOLO_OD_GCP_SANITY_OPTIMIZATION"
- )
+async def test_async_dataset_qa():
+ await dataset_qa_async_test(test_dataset, "RUN_YOLO_OD_GCP_SANITY_DATA_QA")
diff --git a/tests/object-detection/sama_coco_test.py b/tests/object-detection/sama_coco_test.py
index 0d70b361..50fb23dd 100644
--- a/tests/object-detection/sama_coco_test.py
+++ b/tests/object-detection/sama_coco_test.py
@@ -6,15 +6,15 @@
from hirundo import (
COCO,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGCP,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_async_test,
- dataset_optimization_sync_test,
+ dataset_qa_async_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -26,7 +26,7 @@
project="Hirundo-global",
credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-GCP sanity COCO dataset{unique_id}",
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=StorageConfig(
@@ -50,10 +50,10 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(
test_dataset,
- alternative_env="RUN_COCO_OD_GCP_SANITY_OPTIMIZATION",
+ alternative_env="RUN_COCO_OD_GCP_SANITY_DATA_QA",
)
if full_run is not None:
assert full_run.warnings_and_errors is not None
@@ -62,11 +62,9 @@ def test_dataset_optimization():
assert full_run.suspects.shape[0] >= 30_000
# TODO: Add more assertions for results
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
@pytest.mark.asyncio
-async def test_async_dataset_optimization():
- await dataset_optimization_async_test(
- test_dataset, "RUN_COCO_OD_GCP_SANITY_OPTIMIZATION"
- )
+async def test_async_dataset_qa():
+ await dataset_qa_async_test(test_dataset, "RUN_COCO_OD_GCP_SANITY_DATA_QA")
diff --git a/tests/object-detection/sanity_aws_test.py b/tests/object-detection/sanity_aws_test.py
index 038b690d..f5375a16 100644
--- a/tests/object-detection/sanity_aws_test.py
+++ b/tests/object-detection/sanity_aws_test.py
@@ -5,16 +5,16 @@
from hirundo import (
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ ObjectDetectionRunArgs,
+ QADataset,
StorageConfig,
StorageS3,
StorageTypes,
- VisionRunArgs,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_async_test,
- dataset_optimization_sync_test,
+ dataset_qa_async_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -27,7 +27,7 @@
access_key_id=os.environ["AWS_ACCESS_KEY"],
secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-AWS-BDD-100k-subset-1000-OD-dataset{unique_id}",
labeling_type=LabelingType.OBJECT_DETECTION,
storage_config=StorageConfig(
@@ -66,12 +66,12 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(
test_dataset,
sanity=True,
- alternative_env="RUN_OD_AWS_SANITY_OPTIMIZATION",
- run_args=VisionRunArgs(
+ alternative_env="RUN_OD_AWS_SANITY_DATA_QA",
+ run_args=ObjectDetectionRunArgs(
upsample=True,
min_abs_bbox_size=11,
min_abs_bbox_area=121,
@@ -87,11 +87,9 @@ def test_dataset_optimization():
assert full_run.suspects.shape[0] == 1_107
# TODO: Add more assertions for results
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
@pytest.mark.asyncio
-async def test_async_dataset_optimization():
- await dataset_optimization_async_test(
- test_dataset, "RUN_AWS_OD_SANITY_OPTIMIZATION"
- )
+async def test_async_dataset_qa():
+ await dataset_qa_async_test(test_dataset, "RUN_OD_AWS_SANITY_DATA_QA")
diff --git a/tests/speech-to-text/sanity_stt_git_test.py b/tests/speech-to-text/sanity_stt_git_test.py
index 0e7cfec0..8c7bff95 100644
--- a/tests/speech-to-text/sanity_stt_git_test.py
+++ b/tests/speech-to-text/sanity_stt_git_test.py
@@ -3,18 +3,19 @@
import pytest
from hirundo import (
+ Domain,
GitPlainAuth,
GitRepo,
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGit,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_sync_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -32,8 +33,9 @@
),
branch="main",
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-STT-RoboShaulTiny-dataset{unique_id}",
+ domain=Domain.SPEECH,
labeling_type=LabelingType.SPEECH_TO_TEXT,
language="he",
storage_config=StorageConfig(
@@ -55,9 +57,9 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(
- test_dataset, sanity=True, alternative_env="RUN_STT_GIT_OPTIMIZATION"
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(
+ test_dataset, sanity=True, alternative_env="RUN_STT_GIT_DATA_QA"
)
if full_run is not None:
assert full_run.warnings_and_errors is not None
@@ -66,4 +68,4 @@ def test_dataset_optimization():
assert full_run.suspects.shape[0] > 45
assert full_run.suspects.shape[0] < 100
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")
diff --git a/tests/speech-to-text/stt_git_test.py b/tests/speech-to-text/stt_git_test.py
index e8f5054e..eeaaf2f9 100644
--- a/tests/speech-to-text/stt_git_test.py
+++ b/tests/speech-to-text/stt_git_test.py
@@ -3,18 +3,19 @@
import pytest
from hirundo import (
+ Domain,
GitPlainAuth,
GitRepo,
HirundoCSV,
LabelingType,
- OptimizationDataset,
+ QADataset,
StorageConfig,
StorageGit,
StorageTypes,
)
-from tests.dataset_optimization_shared import (
+from tests.dataset_qa_shared import (
cleanup,
- dataset_optimization_sync_test,
+ dataset_qa_sync_test,
get_unique_id,
)
@@ -32,8 +33,9 @@
),
branch="main",
)
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
name=f"TEST-STT-MASC-dataset{unique_id}",
+ domain=Domain.SPEECH,
labeling_type=LabelingType.SPEECH_TO_TEXT,
language="ar",
storage_config=StorageConfig(
@@ -55,8 +57,8 @@ def cleanup_tests():
cleanup(test_dataset)
-def test_dataset_optimization():
- full_run = dataset_optimization_sync_test(test_dataset, "RUN_STT_GIT_OPTIMIZATION")
+def test_dataset_qa():
+ full_run = dataset_qa_sync_test(test_dataset, "RUN_STT_GIT_DATA_QA")
if full_run is not None:
assert full_run.warnings_and_errors is not None
assert (
@@ -65,4 +67,4 @@ def test_dataset_optimization():
assert full_run.suspects is not None
assert full_run.suspects.shape[0] <= 10_000
else:
- logger.info("Full dataset optimization was not run!")
+ logger.info("Full dataset QA was not run!")