diff --git a/assets/ann_group_index.sql b/assets/ann_group_index.sql index 5410f23..6beda5a 100644 --- a/assets/ann_group_index.sql +++ b/assets/ann_group_index.sql @@ -63,7 +63,7 @@ SELECT group_item.AnnotationGroupAlgorithmIdentificationSequence[SAFE_OFFSET(0)].AlgorithmName AS AlgorithmName FROM - `bigquery-public-data.idc_v23.dicom_all` AS ann + `bigquery-public-data.idc_v24.dicom_all` AS ann CROSS JOIN UNNEST(ann.AnnotationGroupSequence) AS group_item WHERE diff --git a/assets/ann_index.sql b/assets/ann_index.sql index 13a4a31..ed30364 100644 --- a/assets/ann_index.sql +++ b/assets/ann_index.sql @@ -21,7 +21,7 @@ SELECT ReferencedSeriesSequence[SAFE_OFFSET(0)].SeriesInstanceUID AS referenced_SeriesInstanceUID FROM - `bigquery-public-data.idc_v23.dicom_all` AS ann + `bigquery-public-data.idc_v24.dicom_all` AS ann WHERE # Microscopy Bulk Simple Annotations SOP Class UID - more reliable than Modality = "ANN" SOPClassUID = "1.2.840.10008.5.1.4.1.1.91.1" diff --git a/assets/clinical_index.sql b/assets/clinical_index.sql index 752649e..caef2ae 100644 --- a/assets/clinical_index.sql +++ b/assets/clinical_index.sql @@ -25,6 +25,6 @@ SELECT # values encountered in the column `values` FROM - `bigquery-public-data.idc_v23_clinical.column_metadata` + `bigquery-public-data.idc_v24_clinical.column_metadata` ORDER BY collection_id, table_name diff --git a/assets/contrast_index.sql b/assets/contrast_index.sql index 7174ccc..c3ef84f 100644 --- a/assets/contrast_index.sql +++ b/assets/contrast_index.sql @@ -11,7 +11,7 @@ WITH contrast_data AS ( ARRAY_AGG(DISTINCT ContrastBolusAgent IGNORE NULLS ORDER BY ContrastBolusAgent) AS ContrastBolusAgent, ARRAY_AGG(DISTINCT ContrastBolusIngredient IGNORE NULLS ORDER BY ContrastBolusIngredient) AS ContrastBolusIngredient, ARRAY_AGG(DISTINCT ContrastBolusRoute IGNORE NULLS ORDER BY ContrastBolusRoute) AS ContrastBolusRoute - FROM `bigquery-public-data.idc_v23.dicom_all` + FROM `bigquery-public-data.idc_v24.dicom_all` WHERE Modality IN 
('CT', 'MR', 'PT', 'XA', 'RF') GROUP BY SeriesInstanceUID ) diff --git a/assets/rtstruct_index.sql b/assets/rtstruct_index.sql index d380d88..ef2a457 100644 --- a/assets/rtstruct_index.sql +++ b/assets/rtstruct_index.sql @@ -16,7 +16,7 @@ WITH roi.ROIName, roi.ROIGenerationAlgorithm FROM - `bigquery-public-data.idc_v23.dicom_all` + `bigquery-public-data.idc_v24.dicom_all` CROSS JOIN UNNEST(StructureSetROISequence) AS roi WHERE @@ -29,7 +29,7 @@ WITH obs.ReferencedROINumber, obs.RTROIInterpretedType FROM - `bigquery-public-data.idc_v23.dicom_all` + `bigquery-public-data.idc_v24.dicom_all` CROSS JOIN UNNEST(RTROIObservationsSequence) AS obs WHERE @@ -40,7 +40,7 @@ WITH SOPInstanceUID, rt_series.SeriesInstanceUID AS referenced_SeriesInstanceUID FROM - `bigquery-public-data.idc_v23.dicom_all` + `bigquery-public-data.idc_v24.dicom_all` CROSS JOIN UNNEST(ReferencedFrameOfReferenceSequence) AS ref_frame CROSS JOIN diff --git a/assets/seg_index.sql b/assets/seg_index.sql index f820f7a..08ece4c 100644 --- a/assets/seg_index.sql +++ b/assets/seg_index.sql @@ -25,7 +25,7 @@ WITH SegmentSequence, SegmentationType FROM - `bigquery-public-data.idc_v23.dicom_metadata` + `bigquery-public-data.idc_v24.dicom_metadata` WHERE # more reliable than Modality = "SEG" SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" @@ -101,7 +101,7 @@ WITH SourceImageSequence[SAFE_OFFSET(0)].ReferencedSOPInstanceUID AS sis_one FROM - `bigquery-public-data.idc_v23.dicom_all` + `bigquery-public-data.idc_v24.dicom_all` WHERE Modality = "SEG" AND SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" @@ -126,7 +126,7 @@ WITH FROM coalesced_ref JOIN - `bigquery-public-data.idc_v23.dicom_all` AS dicom_all + `bigquery-public-data.idc_v24.dicom_all` AS dicom_all ON coalesced_ref.referenced_sop = dicom_all.SOPInstanceUID RIGHT JOIN diff --git a/assets/sm_index.sql b/assets/sm_index.sql index c2ce66e..f9bc3d7 100644 --- a/assets/sm_index.sql +++ b/assets/sm_index.sql @@ -5,7 +5,7 @@ -- SELECT -- * EXCEPT(Modality) -- FROM --- 
`bigquery-public-data.idc_v19.dicom_metadata_curated_series_level` +-- `bigquery-public-data.idc_v24.dicom_metadata_curated_series_level` -- WHERE -- Modality = "SM" @@ -38,7 +38,7 @@ WITH FROM - `bigquery-public-data.idc_v23.dicom_all` AS dicom_all + `bigquery-public-data.idc_v24.dicom_all` AS dicom_all GROUP BY SeriesInstanceUID ), @@ -52,7 +52,7 @@ SpecimenPreparationSequence_unnested AS ( concept_code_sequence.CodeMeaning AS ccs_cm, concept_code_sequence.CodingSchemeDesignator AS ccs_csd, concept_code_sequence.CodeValue AS ccs_val, - FROM `bigquery-public-data.idc_v23.dicom_all`, + FROM `bigquery-public-data.idc_v24.dicom_all`, UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1, UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2, UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence, diff --git a/assets/sm_instance_index.sql b/assets/sm_instance_index.sql index 61cbf56..930b7fa 100644 --- a/assets/sm_instance_index.sql +++ b/assets/sm_instance_index.sql @@ -16,7 +16,7 @@ WITH concept_code_sequence.CodingSchemeDesignator AS ccs_csd, concept_code_sequence.CodeValue AS ccs_val, FROM - `bigquery-public-data.idc_v23.dicom_all`, + `bigquery-public-data.idc_v24.dicom_all`, UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1, UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2, UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence, @@ -144,7 +144,7 @@ SELECT # unique identifier of the instance within the IDC dicom_all.crdc_instance_uuid AS crdc_instance_uuid FROM - `bigquery-public-data.idc_v23.dicom_all` AS dicom_all + `bigquery-public-data.idc_v24.dicom_all` AS dicom_all LEFT JOIN slide_embedding ON diff --git a/collections_index_schema.json b/collections_index_schema.json new file mode 
100644 index 0000000..7f3484e --- /dev/null +++ b/collections_index_schema.json @@ -0,0 +1,159 @@ +{ + "table_description": "This table contains metadata about the collections available in IDC. Each row corresponds to a collection, and contains attributes such as the collection name, types of cancer represented, number of subjects, and pointers to the resources to learn more about the content of the collection.", + "columns": [ + { + "name": "collection_name", + "type": "STRING", + "mode": "NULLABLE", + "description": "name of the collection" + }, + { + "name": "collection_id", + "type": "STRING", + "mode": "NULLABLE", + "description": "unique identifier of the collection" + }, + { + "name": "cancer_types", + "type": "STRING", + "mode": "NULLABLE", + "description": "types of cancer represented in the collection" + }, + { + "name": "tumor_locations", + "type": "STRING", + "mode": "NULLABLE", + "description": "locations of tumors represented in the collection" + }, + { + "name": "subjects", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "number of subjects in the collection" + }, + { + "name": "species", + "type": "STRING", + "mode": "NULLABLE", + "description": "species represented in the collection" + }, + { + "name": "sources", + "type": "RECORD", + "mode": "REPEATED", + "description": "sources of data for the collection", + "fields": [ + { + "name": "source_id", + "type": "STRING", + "mode": "NULLABLE", + "description": "unique identifier of the source" + }, + { + "name": "source_title", + "type": "STRING", + "mode": "NULLABLE", + "description": "title of the source" + }, + { + "name": "source_type", + "type": "STRING", + "mode": "NULLABLE", + "description": "type of the source" + }, + { + "name": "source_doi", + "type": "STRING", + "mode": "NULLABLE", + "description": "DOI of the source" + }, + { + "name": "source_url", + "type": "STRING", + "mode": "NULLABLE", + "description": "URL of the source" + }, + { + "name": "modalities", + "type": "STRING", 
+ "mode": "NULLABLE", + "description": "imaging modalities in this source" + }, + { + "name": "license", + "type": "RECORD", + "mode": "NULLABLE", + "description": "license information for the source", + "fields": [ + { + "name": "license_url", + "type": "STRING", + "mode": "NULLABLE", + "description": "URL of the license" + }, + { + "name": "license_long_name", + "type": "STRING", + "mode": "NULLABLE", + "description": "full name of the license" + }, + { + "name": "license_short_name", + "type": "STRING", + "mode": "NULLABLE", + "description": "abbreviated name of the license" + } + ] + }, + { + "name": "citation", + "type": "STRING", + "mode": "NULLABLE", + "description": "citation text for the source" + }, + { + "name": "Access", + "type": "STRING", + "mode": "NULLABLE", + "description": "access level of the source" + }, + { + "name": "ImageTypes", + "type": "STRING", + "mode": "NULLABLE", + "description": "image types available in the source" + } + ] + }, + { + "name": "supporting_data", + "type": "STRING", + "mode": "NULLABLE", + "description": "additional data supporting the collection available in IDC" + }, + { + "name": "program_id", + "type": "STRING", + "mode": "NULLABLE", + "description": "broader initiative/category under which this collection is being shared" + }, + { + "name": "status", + "type": "STRING", + "mode": "NULLABLE", + "description": "status of the collection (Completed or Ongoing)" + }, + { + "name": "updated", + "type": "DATE", + "mode": "NULLABLE", + "description": "date of the last update to the collection" + }, + { + "name": "description", + "type": "STRING", + "mode": "NULLABLE", + "description": "detailed information about the collection" + } + ] +} diff --git a/scripts/python/idc_index_data_manager.py b/scripts/python/idc_index_data_manager.py index 1e52efe..95f9613 100644 --- a/scripts/python/idc_index_data_manager.py +++ b/scripts/python/idc_index_data_manager.py @@ -254,6 +254,14 @@ def execute_sql_query( """ with 
Path(file_path).open("r") as file: sql_query = file.read() + # run the query exactly once; on failure, log the source SQL file for debugging and re-raise + try: + query_job_result = self.client.query(sql_query).result() + except Exception as e: + logger.error( + "Error occurred while executing SQL query from file: %s", file_path + ) + logger.error(e) + raise e - query_job_result = self.client.query(sql_query).result() schema = query_job_result.schema # Get schema from BigQuery QueryJob index_df = query_job_result.to_dataframe() @@ -454,7 +462,7 @@ def retrieve_latest_idc_release_version(self) -> int: SELECT MAX(idc_version) AS latest_idc_release_version FROM - `bigquery-public-data.idc_v23.version_metadata` + `bigquery-public-data.idc_v24.version_metadata` """ query_job = self.client.query(query) result = query_job.result() diff --git a/scripts/sql/analysis_results_index.sql b/scripts/sql/analysis_results_index.sql index bf36b25..b05e6cc 100644 --- a/scripts/sql/analysis_results_index.sql +++ b/scripts/sql/analysis_results_index.sql @@ -43,6 +43,6 @@ SELECT # citation for the analysis results collection that should be used for acknowledgment Citation FROM - `bigquery-public-data.idc_v23.analysis_results_metadata` + `bigquery-public-data.idc_v24.analysis_results_metadata` ORDER BY analysis_result_id diff --git a/scripts/sql/collections_index.sql b/scripts/sql/collections_index.sql index 6ac75ee..5e310e5 100644 --- a/scripts/sql/collections_index.sql +++ b/scripts/sql/collections_index.sql @@ -11,35 +11,35 @@ SELECT collection_id, # description: # types of cancer represented in the collection - CancerTypes, + cancer_types, # description: # locations of tumors represented in the collection - TumorLocations, + tumor_locations, # description: # number of subjects in the collection - Subjects, + subjects, # description: # species represented in the collection - Species, + species, # description: # sources of data for the collection - Sources, + sources, # description: # additional data 
supporting the collection available in IDC - SupportingData, + supporting_data, # description: # broader initiative/category under which this collection is being shared - Program, + program_id, # description: # status of the collection (Completed or Ongoing) - Status, + status, # description: - # timestamp of the last update to the collection - SAFE_CAST(Updated AS STRING) AS Updated, + # date of the last update to the collection + SAFE_CAST(updated AS STRING) AS updated, # description: # detailed information about the collection - Description + description FROM - `bigquery-public-data.idc_v23.original_collections_metadata` + `bigquery-public-data.idc_v24.original_collections_metadata` ORDER BY collection_id diff --git a/scripts/sql/idc_index.sql b/scripts/sql/idc_index.sql index e51ca3f..72bc564 100644 --- a/scripts/sql/idc_index.sql +++ b/scripts/sql/idc_index.sql @@ -163,9 +163,9 @@ SELECT # total size of the series in megabytes SUM(SAFE_CAST(instance_size AS float64))/1000000. AS series_size_MB, FROM - `bigquery-public-data.idc_v23.dicom_all` AS dicom_all + `bigquery-public-data.idc_v24.dicom_all` AS dicom_all LEFT JOIN - `bigquery-public-data.idc_v23.dicom_metadata_curated` AS dicom_curated + `bigquery-public-data.idc_v24.dicom_metadata_curated` AS dicom_curated ON dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID GROUP BY diff --git a/scripts/sql/prior_versions_index.sql b/scripts/sql/prior_versions_index.sql index 563fa99..2c89410 100644 --- a/scripts/sql/prior_versions_index.sql +++ b/scripts/sql/prior_versions_index.sql @@ -3,7 +3,7 @@ -- -- Step 1: Declare variables DECLARE idc_versions ARRAY; -DECLARE latest_idc_version INT64 DEFAULT 23; +DECLARE latest_idc_version INT64 DEFAULT 24; DECLARE union_all_query STRING; --Step 2 diff --git a/scripts/sql/volume_geometry_index.sql b/scripts/sql/volume_geometry_index.sql index 5665880..56ff653 100644 --- a/scripts/sql/volume_geometry_index.sql +++ b/scripts/sql/volume_geometry_index.sql @@ -70,7 +70,7 @@ # 
used when all values in the group are expected to be the same. # To use a specific IDC version instead of idc_current, replace -# `bigquery-public-data.idc_current.dicom_all` with e.g. `bigquery-public-data.idc_v18.dicom_all` +# `bigquery-public-data.idc_current.dicom_all` with e.g. `bigquery-public-data.idc_v24.dicom_all` # Configurable parameters DECLARE relativeSliceTolerance FLOAT64 DEFAULT 0.01; # max allowed variation in slice spacing as a fraction diff --git a/tests/test_package.py b/tests/test_package.py index 12c97a7..c73441e 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -4,21 +4,16 @@ from pathlib import Path import pandas as pd -from packaging.version import Version import idc_index_data as m -EXPECTED_IDC_INDEX_VERSION = 23 +EXPECTED_IDC_INDEX_VERSION = 24 def test_version(): assert importlib.metadata.version("idc_index_data") == m.__version__ -def test_idc_index_version(): - assert Version(m.__version__).major == EXPECTED_IDC_INDEX_VERSION - - def test_filepath(): if m.IDC_INDEX_CSV_ARCHIVE_FILEPATH is not None: assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file() diff --git a/tests/test_real_sql_parsing.py b/tests/test_real_sql_parsing.py index e0c06e3..b2201f1 100644 --- a/tests/test_real_sql_parsing.py +++ b/tests/test_real_sql_parsing.py @@ -28,16 +28,16 @@ def test_real_sql_files() -> None: expected_columns = [ "collection_name", "collection_id", - "CancerTypes", - "TumorLocations", - "Subjects", - "Species", - "Sources", - "SupportingData", - "Program", - "Status", - "Updated", - "Description", + "cancer_types", + "tumor_locations", + "subjects", + "species", + "sources", + "supporting_data", + "program_id", + "status", + "updated", + "description", ] for col in expected_columns: assert col in descriptions, (