@@ -30,27 +30,27 @@ def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
3030 :param input_dir: base path where source files are stored
3131 :return: list of good files in chronological order, and a dictionary mapping each failed file name to its list of failed tests
3232 """
33-
34- file_list_dataframe = pd .DataFrame (columns = ["url" , "deployment_date" ])
33+ rows = []
3534 error_dict = {}
3635
3736 for file in file_list :
3837 with xr .open_dataset (os .path .join (input_dir , file )) as nc :
3938 error_list = check_file (nc , site_code , parameter_names_accepted )
4039 if error_list :
41- error_dict . update ({ file : error_list })
40+ error_dict [ file ] = error_list
4241 else :
43- file_list_dataframe = file_list_dataframe .append ({'url' : file ,
44- 'deployment_date' : parse (nc .time_deployment_start )},
45- ignore_index = True )
42+ rows .append ({
43+ 'url' : file ,
44+ 'deployment_date' : parse (nc .time_deployment_start )
45+ })
4646
47+ file_list_dataframe = pd .DataFrame (rows , columns = ["url" , "deployment_date" ])
4748 file_list_dataframe = file_list_dataframe .sort_values (by = 'deployment_date' )
48- file_list = file_list_dataframe ['url' ].to_list ()
49- if file_list == [] :
49+ sorted_files = file_list_dataframe ['url' ].to_list ()
50+ if not sorted_files :
5051 raise NoInputFilesError ("no valid input files to aggregate" )
5152
52- return file_list , error_dict
53-
53+ return sorted_files , error_dict
5454
5555
5656def get_parameter_names (nc ):
@@ -308,7 +308,7 @@ def PDresample_by_hour(df, function_dict, function_stats):
308308 df_data = pd .DataFrame (index = pd .DatetimeIndex ([]))
309309 for variable in varnames :
310310 ds_var = df [variable ]
311- ds_var_resample = ds_var .resample ('1H ' , base = 0.5 ) # shift by half hour to centre bin on the hour
311+ ds_var_resample = ds_var .resample ('1h ' , offset = '30min' ) # shift by half hour to centre bin on the hour
312312 ds_var_mean = ds_var_resample .apply (function_dict [variable ]).astype (np .float32 )
313313 df_data = pd .concat ([df_data , ds_var_mean ], axis = 1 , sort = False )
314314 for stat_method in function_stats :
@@ -366,8 +366,6 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
366366 variable_attribute_dictionary = json .load (json_file )['_variables' ]
367367
368368 df_data = pd .DataFrame ()
369-
370-
371369 ## create empty DF with dtypes
372370 metadata_df_types = [('source_file' , str ),
373371 ('instrument_id' , str ),
@@ -380,6 +378,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
380378 parameter_names_all = []
381379 applied_offset = []
382380 qc_count_all = {}
381+ metadata_rows = []
383382
384383 for file_index , file in enumerate (files_to_aggregate ):
385384 print (file_index )
@@ -398,13 +397,16 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
398397 qc_count = get_QCcount (nc_clean , qcflags )
399398 qc_count_all = update_QCcount (qc_count_all , qc_count )
400399 nc_clean = good_data_only (nc_clean , qcflags ) # good quality data only
401- df_metadata = df_metadata .append ({'source_file' : file ,
402- 'instrument_id' : utils .get_instrument_id (nc ),
403- 'LONGITUDE' : nc .LONGITUDE .squeeze ().values ,
404- 'LATITUDE' : nc .LATITUDE .squeeze ().values ,
405- 'NOMINAL_DEPTH' : get_nominal_depth (nc )},
406- ignore_index = True )
407-
400+
401+ # Append a new row as a dictionary to the list.
402+ metadata_rows .append ({
403+ 'source_file' : file ,
404+ 'instrument_id' : utils .get_instrument_id (nc ),
405+ 'LONGITUDE' : nc .LONGITUDE .squeeze ().values ,
406+ 'LATITUDE' : nc .LATITUDE .squeeze ().values ,
407+ 'NOMINAL_DEPTH' : get_nominal_depth (nc )
408+ })
409+
408410 # If TIME had out-of-range values before cleaning, nc_clean would now have a CFTimeIndex, which
409411 # breaks the resampling further down. Here we reset it to a DatetimeIndex as suggested here:
410412 # https://stackoverflow.com/questions/55786995/converting-cftime-datetimejulian-to-datetime/55787899#55787899
@@ -421,6 +423,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
421423 df_temp ['instrument_index' ] = np .repeat (file_index , len (df_temp )).astype (np .int32 )
422424 df_data = pd .concat ([df_data , df_temp .reset_index ()], ignore_index = True , sort = False )
423425
426+ df_metadata = pd .DataFrame (metadata_rows , columns = ['source_file' , 'instrument_id' , 'LONGITUDE' , 'LATITUDE' , 'NOMINAL_DEPTH' ])
424427 df_metadata .index .rename ('INSTRUMENT' , inplace = True )
425428 df_data .index .rename ('OBSERVATION' , inplace = True )
426429 ## rename index to TIME
0 commit comments