Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion assets/ann_group_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ SELECT
group_item.AnnotationGroupAlgorithmIdentificationSequence[SAFE_OFFSET(0)].AlgorithmName AS AlgorithmName

FROM
`bigquery-public-data.idc_v23.dicom_all` AS ann
`bigquery-public-data.idc_v24.dicom_all` AS ann
CROSS JOIN
UNNEST(ann.AnnotationGroupSequence) AS group_item
WHERE
Expand Down
2 changes: 1 addition & 1 deletion assets/ann_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ SELECT
ReferencedSeriesSequence[SAFE_OFFSET(0)].SeriesInstanceUID AS referenced_SeriesInstanceUID

FROM
`bigquery-public-data.idc_v23.dicom_all` AS ann
`bigquery-public-data.idc_v24.dicom_all` AS ann
WHERE
# Microscopy Bulk Simple Annotations SOP Class UID - more reliable than Modality = "ANN"
SOPClassUID = "1.2.840.10008.5.1.4.1.1.91.1"
Expand Down
2 changes: 1 addition & 1 deletion assets/clinical_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ SELECT
# values encountered in the column
`values`
FROM
`bigquery-public-data.idc_v23_clinical.column_metadata`
`bigquery-public-data.idc_v24_clinical.column_metadata`
ORDER BY
collection_id, table_name
2 changes: 1 addition & 1 deletion assets/contrast_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ WITH contrast_data AS (
ARRAY_AGG(DISTINCT ContrastBolusAgent IGNORE NULLS ORDER BY ContrastBolusAgent) AS ContrastBolusAgent,
ARRAY_AGG(DISTINCT ContrastBolusIngredient IGNORE NULLS ORDER BY ContrastBolusIngredient) AS ContrastBolusIngredient,
ARRAY_AGG(DISTINCT ContrastBolusRoute IGNORE NULLS ORDER BY ContrastBolusRoute) AS ContrastBolusRoute
FROM `bigquery-public-data.idc_v23.dicom_all`
FROM `bigquery-public-data.idc_v24.dicom_all`
WHERE Modality IN ('CT', 'MR', 'PT', 'XA', 'RF')
GROUP BY SeriesInstanceUID
)
Expand Down
6 changes: 3 additions & 3 deletions assets/rtstruct_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ WITH
roi.ROIName,
roi.ROIGenerationAlgorithm
FROM
`bigquery-public-data.idc_v23.dicom_all`
`bigquery-public-data.idc_v24.dicom_all`
CROSS JOIN
UNNEST(StructureSetROISequence) AS roi
WHERE
Expand All @@ -29,7 +29,7 @@ WITH
obs.ReferencedROINumber,
obs.RTROIInterpretedType
FROM
`bigquery-public-data.idc_v23.dicom_all`
`bigquery-public-data.idc_v24.dicom_all`
CROSS JOIN
UNNEST(RTROIObservationsSequence) AS obs
WHERE
Expand All @@ -40,7 +40,7 @@ WITH
SOPInstanceUID,
rt_series.SeriesInstanceUID AS referenced_SeriesInstanceUID
FROM
`bigquery-public-data.idc_v23.dicom_all`
`bigquery-public-data.idc_v24.dicom_all`
CROSS JOIN
UNNEST(ReferencedFrameOfReferenceSequence) AS ref_frame
CROSS JOIN
Expand Down
6 changes: 3 additions & 3 deletions assets/seg_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ WITH
SegmentSequence,
SegmentationType
FROM
`bigquery-public-data.idc_v23.dicom_metadata`
`bigquery-public-data.idc_v24.dicom_metadata`
WHERE
# Segmentation Storage SOP Class UID - more reliable than Modality = "SEG"
SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4"
Expand Down Expand Up @@ -101,7 +101,7 @@ WITH
SourceImageSequence[SAFE_OFFSET(0)].ReferencedSOPInstanceUID
AS sis_one
FROM
`bigquery-public-data.idc_v23.dicom_all`
`bigquery-public-data.idc_v24.dicom_all`
WHERE
Modality = "SEG"
AND SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4"
Expand All @@ -126,7 +126,7 @@ WITH
FROM
coalesced_ref
JOIN
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
`bigquery-public-data.idc_v24.dicom_all` AS dicom_all
ON
coalesced_ref.referenced_sop = dicom_all.SOPInstanceUID
RIGHT JOIN
Expand Down
6 changes: 3 additions & 3 deletions assets/sm_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
-- SELECT
-- * EXCEPT(Modality)
-- FROM
-- `bigquery-public-data.idc_v19.dicom_metadata_curated_series_level`
-- `bigquery-public-data.idc_v24.dicom_metadata_curated_series_level`
-- WHERE
-- Modality = "SM"

Expand Down Expand Up @@ -38,7 +38,7 @@ WITH


FROM
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
`bigquery-public-data.idc_v24.dicom_all` AS dicom_all
GROUP BY
SeriesInstanceUID
),
Expand All @@ -52,7 +52,7 @@ SpecimenPreparationSequence_unnested AS (
concept_code_sequence.CodeMeaning AS ccs_cm,
concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
concept_code_sequence.CodeValue AS ccs_val,
FROM `bigquery-public-data.idc_v23.dicom_all`,
FROM `bigquery-public-data.idc_v24.dicom_all`,
UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1,
UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2,
UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence,
Expand Down
4 changes: 2 additions & 2 deletions assets/sm_instance_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ WITH
concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
concept_code_sequence.CodeValue AS ccs_val,
FROM
`bigquery-public-data.idc_v23.dicom_all`,
`bigquery-public-data.idc_v24.dicom_all`,
UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1,
UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2,
UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence,
Expand Down Expand Up @@ -144,7 +144,7 @@ SELECT
# unique identifier of the instance within the IDC
dicom_all.crdc_instance_uuid AS crdc_instance_uuid
FROM
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
`bigquery-public-data.idc_v24.dicom_all` AS dicom_all
LEFT JOIN
slide_embedding
ON
Expand Down
159 changes: 159 additions & 0 deletions collections_index_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
"table_description": "This table contains metadata about the collections available in IDC. Each row corresponds to a collection, and contains attributes such as the collection name, types of cancer represented, number of subjects, and pointers to the resources to learn more about the content of the collection.",
"columns": [
{
"name": "collection_name",
"type": "STRING",
"mode": "NULLABLE",
"description": "name of the collection"
},
{
"name": "collection_id",
"type": "STRING",
"mode": "NULLABLE",
"description": "unique identifier of the collection"
},
{
"name": "cancer_types",
"type": "STRING",
"mode": "NULLABLE",
"description": "types of cancer represented in the collection"
},
{
"name": "tumor_locations",
"type": "STRING",
"mode": "NULLABLE",
"description": "locations of tumors represented in the collection"
},
{
"name": "subjects",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "number of subjects in the collection"
},
{
"name": "species",
"type": "STRING",
"mode": "NULLABLE",
"description": "species represented in the collection"
},
{
"name": "sources",
"type": "RECORD",
"mode": "REPEATED",
"description": "sources of data for the collection",
"fields": [
{
"name": "source_id",
"type": "STRING",
"mode": "NULLABLE",
"description": "unique identifier of the source"
},
{
"name": "source_title",
"type": "STRING",
"mode": "NULLABLE",
"description": "title of the source"
},
{
"name": "source_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "type of the source"
},
{
"name": "source_doi",
"type": "STRING",
"mode": "NULLABLE",
"description": "DOI of the source"
},
{
"name": "source_url",
"type": "STRING",
"mode": "NULLABLE",
"description": "URL of the source"
},
{
"name": "modalities",
"type": "STRING",
"mode": "NULLABLE",
"description": "imaging modalities in this source"
},
{
"name": "license",
"type": "RECORD",
"mode": "NULLABLE",
"description": "license information for the source",
"fields": [
{
"name": "license_url",
"type": "STRING",
"mode": "NULLABLE",
"description": "URL of the license"
},
{
"name": "license_long_name",
"type": "STRING",
"mode": "NULLABLE",
"description": "full name of the license"
},
{
"name": "license_short_name",
"type": "STRING",
"mode": "NULLABLE",
"description": "abbreviated name of the license"
}
]
},
{
"name": "citation",
"type": "STRING",
"mode": "NULLABLE",
"description": "citation text for the source"
},
{
"name": "Access",
"type": "STRING",
"mode": "NULLABLE",
"description": "access level of the source"
},
{
"name": "ImageTypes",
"type": "STRING",
"mode": "NULLABLE",
"description": "image types available in the source"
}
]
},
{
"name": "supporting_data",
"type": "STRING",
"mode": "NULLABLE",
"description": "additional data supporting the collection available in IDC"
},
{
"name": "program_id",
"type": "STRING",
"mode": "NULLABLE",
"description": "broader initiative/category under which this collection is being shared"
},
{
"name": "status",
"type": "STRING",
"mode": "NULLABLE",
"description": "status of the collection (Completed or Ongoing)"
},
{
"name": "updated",
"type": "DATE",
"mode": "NULLABLE",
"description": "date of the last update to the collection"
},
{
"name": "description",
"type": "STRING",
"mode": "NULLABLE",
"description": "detailed information about the collection"
}
]
}
11 changes: 10 additions & 1 deletion scripts/python/idc_index_data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,15 @@ def execute_sql_query(
"""
with Path(file_path).open("r") as file:
sql_query = file.read()
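# Run the query inside try/except; on failure, log the SQL file path and the error to help with debugging, then re-raise.
# NOTE(review): the same query is executed again immediately after this try/except block - this looks like a leftover duplicate call; confirm and remove it.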
try:
query_job_result = self.client.query(sql_query).result()
except Exception as e:
logger.error(
"Error occurred while executing SQL query from file: %s", file_path
)
logger.error(e)
raise e
query_job_result = self.client.query(sql_query).result()
schema = query_job_result.schema # Get schema from BigQuery QueryJob
index_df = query_job_result.to_dataframe()
Expand Down Expand Up @@ -454,7 +463,7 @@ def retrieve_latest_idc_release_version(self) -> int:
SELECT
MAX(idc_version) AS latest_idc_release_version
FROM
`bigquery-public-data.idc_v23.version_metadata`
`bigquery-public-data.idc_v24.version_metadata`
"""
query_job = self.client.query(query)
result = query_job.result()
Expand Down
2 changes: 1 addition & 1 deletion scripts/sql/analysis_results_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ SELECT
# citation for the analysis results collection that should be used for acknowledgment
Citation
FROM
`bigquery-public-data.idc_v23.analysis_results_metadata`
`bigquery-public-data.idc_v24.analysis_results_metadata`
ORDER BY
analysis_result_id
24 changes: 12 additions & 12 deletions scripts/sql/collections_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,35 @@ SELECT
collection_id,
# description:
# types of cancer represented in the collection
CancerTypes,
cancer_types,
# description:
# locations of tumors represented in the collection
TumorLocations,
tumor_locations,
# description:
# number of subjects in the collection
Subjects,
subjects,
# description:
# species represented in the collection
Species,
species,
# description:
# sources of data for the collection
Sources,
sources,
# description:
# additional data supporting the collection available in IDC
SupportingData,
supporting_data,
# description:
# broader initiative/category under which this collection is being shared
Program,
program_id,
# description:
# status of the collection (Completed or Ongoing)
Status,
status,
# description:
# timestamp of the last update to the collection
SAFE_CAST(Updated AS STRING) AS Updated,
# date of the last update to the collection
SAFE_CAST(updated AS STRING) AS updated,
# description:
# detailed information about the collection
Description
description
FROM
`bigquery-public-data.idc_v23.original_collections_metadata`
`bigquery-public-data.idc_v24.original_collections_metadata`
ORDER BY
collection_id
4 changes: 2 additions & 2 deletions scripts/sql/idc_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,9 @@ SELECT
# total size of the series in megabytes
SUM(SAFE_CAST(instance_size AS float64))/1000000. AS series_size_MB,
FROM
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
`bigquery-public-data.idc_v24.dicom_all` AS dicom_all
LEFT JOIN
`bigquery-public-data.idc_v23.dicom_metadata_curated` AS dicom_curated
`bigquery-public-data.idc_v24.dicom_metadata_curated` AS dicom_curated
ON
dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
GROUP BY
Expand Down
2 changes: 1 addition & 1 deletion scripts/sql/prior_versions_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
--
-- Step 1: Declare variables
DECLARE idc_versions ARRAY<INT64>;
DECLARE latest_idc_version INT64 DEFAULT 23;
DECLARE latest_idc_version INT64 DEFAULT 24;
DECLARE union_all_query STRING;

-- Step 2
Expand Down
Loading
Loading