diff --git a/src/valenspy/ancilliary_data/dataset_info.yml b/src/valenspy/ancilliary_data/dataset_info.yml index 9dc1a9c6..ae49d519 100644 --- a/src/valenspy/ancilliary_data/dataset_info.yml +++ b/src/valenspy/ancilliary_data/dataset_info.yml @@ -40,7 +40,10 @@ hortense: ERA5: #Note that there is an aggregation variable here! What to do with this? This is messy as it can not be uniquely identified by the filename! root: /dodrio/scratch/projects/2022_200/external/era5/ - pattern: ///_era5____.nc + pattern: #Warning: patterns should be mutually exclusive to ensure correct parsing of the information from the path. A path should not be able to match two patterns! + - ///_era5__monthly__.nc + - ///_era5__daily__.nc + - ///_era5__hourly_.nc meta_data: source_id: "ERA5" source_type: "reanalysis" diff --git a/src/valenspy/input/esm_catalog_builder.py b/src/valenspy/input/esm_catalog_builder.py index f894109f..5616aa9e 100644 --- a/src/valenspy/input/esm_catalog_builder.py +++ b/src/valenspy/input/esm_catalog_builder.py @@ -110,10 +110,12 @@ def _validate_dataset_info(self): for dataset_name, dataset_info in self.datasets_info.items(): key_set = set(dataset_info.get("meta_data", {}).keys()) - pattern = dataset_info.get("pattern", None) - if pattern: - for key in re.findall(r"<(.*?)>", pattern): - key_set.add(key) + + if pattern := dataset_info.get("pattern", None): + if isinstance(pattern, str): + pattern = [pattern] + #Add keys present in all patterns to the key_set by extracting the identifiers in the pattern. 
+            key_set.update(set.intersection(*[set(re.findall(r"<(.*?)>", pat)) for pat in pattern])) # Check if all required identifiers are present for identifier in required_identifiers: @@ -172,12 +174,19 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info): """ dataset_root = Path(dataset_info.get("root")) - regex_pattern = create_named_regex(dataset_info.get("pattern", None)) - regex = re.compile(dataset_root.as_posix() + r"/" + regex_pattern) - dataset_meta_data = dataset_info.get("meta_data", {}) + if pattern := dataset_info.get("pattern", None): + if isinstance(pattern, str): + pattern = [pattern] + + regex = [re.compile(dataset_root.as_posix() + r"/" + create_named_regex(pat)) for pat in pattern] - IC = INPUT_CONVERTORS.get(dataset_name, None) + dataset_meta_data = dataset_info.get("meta_data", {}) + if dataset_IC := dataset_info.get("input_convertor", None): + IC = INPUT_CONVERTORS.get(dataset_IC, None) #Use the specified ICs + else: + IC = INPUT_CONVERTORS.get(dataset_name, None) #Use the dataset name to find the IC + if IC: CORDEX_variable_set = IC.cordex_variables variable_set = IC.raw_variables @@ -188,13 +197,13 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info): for file in files: if file.endswith(".nc"): file_path = os.path.join(root, file) - if match := regex.match(file_path): - file_metadata = match.groupdict() - else: - #Add the skipped file to the skipped files dictionary (create the entry if it does not exist) - if dataset_name not in self.skipped_files: - self.skipped_files[dataset_name] = [] - self.skipped_files[dataset_name].append(file_path) + for reg in regex: + if match := reg.match(file_path): + file_metadata = match.groupdict() + break + else: + #No pattern matched this file: record it as skipped. for/else avoids reusing a stale `match` from a previously processed file. + self.skipped_files.setdefault(dataset_name, []).append(file_path) continue # Add the file path to the metadata