Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/valenspy/ancilliary_data/dataset_info.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@
hortense:
ERA5: #Note that there is an aggregation variable here! What to do with this? This is messy as it cannot be uniquely identified by the filename!
root: /dodrio/scratch/projects/2022_200/external/era5/
pattern: <domain_id>/<variable_id>/<frequency>/<variable_id_2>_era5_<domain_id_2>_<frequency_2>_<aggregation>_<time_period>.nc
pattern: #Warning: patterns should be mutually exclusive to ensure correct parsing of the information from the path. A path should not be able to match two patterns!
- <domain_id>/<variable_id>/<frequency>/<variable_id_2>_era5_<domain_id_2>_monthly_<aggregation>_<time_period>.nc
- <domain_id>/<variable_id>/<frequency>/<variable_id_2>_era5_<domain_id_2>_daily_<aggregation>_<time_period>.nc
- <domain_id>/<variable_id>/<frequency>/<variable_id_2>_era5_<domain_id_2>_hourly_<time_period>.nc
meta_data:
source_id: "ERA5"
source_type: "reanalysis"
Expand Down
39 changes: 24 additions & 15 deletions src/valenspy/input/esm_catalog_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,12 @@ def _validate_dataset_info(self):

for dataset_name, dataset_info in self.datasets_info.items():
key_set = set(dataset_info.get("meta_data", {}).keys())
pattern = dataset_info.get("pattern", None)
if pattern:
for key in re.findall(r"<(.*?)>", pattern):
key_set.add(key)

if pattern := dataset_info.get("pattern", None):
if isinstance(pattern, str):
pattern = [pattern]
#Add keys present in all patterns to the key_set by extracting the identifiers in the pattern.
key_set.update(set.intersection(*[set(re.findall(r"<(.*?)>", pat)) for pat in pattern]))

# Check if all required identifiers are present
for identifier in required_identifiers:
Expand Down Expand Up @@ -172,12 +174,19 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info):

"""
dataset_root = Path(dataset_info.get("root"))
regex_pattern = create_named_regex(dataset_info.get("pattern", None))
regex = re.compile(dataset_root.as_posix() + r"/" + regex_pattern)

dataset_meta_data = dataset_info.get("meta_data", {})
if pattern := dataset_info.get("pattern", None):
if isinstance(pattern, str):
pattern = [pattern]

regex = [re.compile(dataset_root.as_posix() + r"/" + create_named_regex(pat)) for pat in pattern]

IC = INPUT_CONVERTORS.get(dataset_name, None)
dataset_meta_data = dataset_info.get("meta_data", {})
if dataset_IC := dataset_info.get("input_convertor", None):
IC = INPUT_CONVERTORS.get(dataset_IC, None) #Use the specified ICs
else:
IC = INPUT_CONVERTORS.get(dataset_name, None) #Use the dataset name to find the IC

if IC:
CORDEX_variable_set = IC.cordex_variables
variable_set = IC.raw_variables
Expand All @@ -188,13 +197,13 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info):
for file in files:
if file.endswith(".nc"):
file_path = os.path.join(root, file)
if match := regex.match(file_path):
file_metadata = match.groupdict()
else:
#Add the skipped file to the skipped files dictionary (create the entry if it does not exist)
if dataset_name not in self.skipped_files:
self.skipped_files[dataset_name] = []
self.skipped_files[dataset_name].append(file_path)
for reg in regex:
if match := reg.match(file_path):
file_metadata = match.groupdict()
break

if not match:
self.skipped_files.setdefault(dataset_name, []).append(file_path)
continue

# Add the file path to the metadata
Expand Down
Loading