From 14e623dd620bc31c600928b004995aa47f4e9768 Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:27:33 +0000
Subject: [PATCH 01/13] implement `read_binary_lazy()`

---
 packs/proc/processing_utils.py | 38 +++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index c7ec2ed..da93abb 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -13,8 +13,11 @@
 from typing import BinaryIO
 from typing import Generic
 from typing import Optional
-from datetime import datetime
 from typing import List
+from typing import Generator
+
+from datetime import datetime
+
 
 # imports start from MULE/
 from packs.core.core_utils import flatten
@@ -231,6 +234,39 @@ def process_header(file_path : str,
 
     return wdtype, samples, sampling_period, channels
 
+def read_binary_lazy(file : BinaryIO,
+                     wdtype : np.dtype) -> Generator:
+    '''
+    Reads the binary in with the expected format/offset, lazily,
+    depending on counts to break the data up.
+
+
+    Parameters
+    ----------
+
+    file (BufferedReader) : Opened file
+    wdtype (ndtype)       : Custom data type for extracting information from
+                            binary files
+    counts (int)          : How many events you want to read in. -1 sets it to take all events.
+    offset (int)          : Offset at which to start reading the data. Used for chunking purposes
+                            and so should by default be set to zero if not chunking.
+
+    Returns
+    -------
+    data (ndarray) : Unformatted data from binary file
+
+    '''
+    # initialise data to start the loop
+    data = (np.fromfile(file, dtype=wdtype, count = counts))
+    while len(data) != 0:
+        yield (True, data)
+        # ensure data is loaded in after the yield, so the while check is done
+        data = (np.fromfile(file, dtype=wdtype, count = counts))
+    # yield False when finished
+    print('Processing Finished!')
+    yield (False, np.zeros(shape = (1,)))
+
+
 def read_binary(file : BinaryIO,
                 wdtype : np.dtype,
                 counts : Optional[int] = -1,

From 808625e3a3de59831716413494f99991f6177f9b Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:28:00 +0000
Subject: [PATCH 02/13] implement `process_bin_WD2_lazy()`

---
 packs/proc/processing_utils.py | 64 ++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index da93abb..27896fa 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -547,6 +547,70 @@ def process_bin_WD1(file_path : str,
 
         write('rwf', waveforms, (True, num_of_events, i))
 
+
+def process_bin_WD2_lazy(file_path : str,
+                         save_path : str,
+                         overwrite : Optional[bool] = False,
+                         print_mod : Optional[int] = -1):
+
+    '''
+    WAVEDUMP 2: Takes a binary file and outputs the containing waveform information in a h5 file.
+
+    For particularly large waveforms/number of events. You can 'chunk' the data such that
+    each dataset holds `counts` events.
+
+    Parameters
+    ----------
+
+    file_path (str)  : Path to binary file
+    save_path (str)  : Path to saved file
+    overwrite (bool) : Boolean for overwriting pre-existing files
+    print_mod (int)  : Readout frequency for number of events, -1 implies no readout
+
+    Returns
+    -------
+    None
+    '''
+
+    # Ensure save path is clear
+    save_path = check_save_path(save_path, overwrite)
+    print(f'\nData input : {file_path}\nData output : {save_path}')
+
+    # collect header info
+    wdtype, samples, sampling_period, channels = process_header(file_path)
+
+    # create header length (bytes) for processing
+    if channels == 1:
+        header_size = 24
+    else:
+        header_size = 28
+
+    # open file for reading
+    with open(file_path, 'rb') as file:
+        with writer(save_path, 'RAW', overwrite) as write:
+
+            for i, (flag, array) in enumerate(read_binary_lazy(file, wdtype)):
+
+                if (i % print_mod == 0) and (print_mod != -1):
+                    print(f"Event {i}")
+
+                # catch, once done, rwf should be empty
+                if flag:
+
+                    evt_info, rwf = format_wfs(array, wdtype, samples, channels)
+
+
+                    # first run-through, collect the header information to extract table size
+                    if i == 0:
+                        file_size = os.path.getsize(file_path)
+                        waveform_size = ((samples * channels * 4 ) + header_size) # can't remember why *2, will need to test this
+                        num_of_events = int(file_size / waveform_size)
+
+                    write('event_info', evt_info, (True, num_of_events, i))
+                    # writer only takes one row at a time, can't broadcast all three at once
+                    for j, wfs in enumerate(rwf):
+                        write('rwf', wfs, (True, num_of_events * channels, i + ((channels-1)*i) + j))
+
 def process_bin_WD2(file_path : str,
                     save_path : str,
                     overwrite : Optional[bool] = False,

From af59f09007c414d5e953e399d644b8bc3a9101df Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:28:17 +0000
Subject: [PATCH 03/13] alter config to match correct format

---
 packs/configs/process_WD2_3channel.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packs/configs/process_WD2_3channel.conf b/packs/configs/process_WD2_3channel.conf
index 07190a6..c3ab45a 100644
--- a/packs/configs/process_WD2_3channel.conf
+++ b/packs/configs/process_WD2_3channel.conf
@@ -8,4 +8,4 @@ save_path = '/path/to/file.h5'
 
 [optional]
 overwrite = True
-counts = -1
\ No newline at end of file
+print_mod = -1

From c27954202725b8f45574dd0122893e40ced62b9f Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:28:57 +0000
Subject: [PATCH 04/13] introduce `process_bin_WD2_lazy()` into conf_dict setup

---
 packs/proc/proc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packs/proc/proc.py b/packs/proc/proc.py
index 863e24d..3fc4850 100644
--- a/packs/proc/proc.py
+++ b/packs/proc/proc.py
@@ -4,7 +4,7 @@
 from packs.core.io import read_config_file
 
 from packs.proc.processing_utils import process_csv_lecroy
-from packs.proc.processing_utils import process_bin_WD2
+from packs.proc.processing_utils import process_bin_WD2_lazy
 from packs.proc.processing_utils import process_bin_WD1
 from packs.proc.calibration_utils import calibrate
 from packs.core.core_utils import check_test
@@ -36,7 +36,7 @@ def proc(config_file):
             case 1:
                 process_bin_WD1(**conf_dict)
             case 2:
-                process_bin_WD2(**conf_dict)
+                process_bin_WD2_lazy(**conf_dict)
             case other:
                 raise RuntimeError(f"wavedump edition {other} decoding isn't currently implemented.")
         else:

From fdd1711035536f58e93ac04c7bb7155d9b6a8d88 Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:29:18 +0000
Subject: [PATCH 05/13] cosmetics

---
 packs/proc/proc.py             | 2 +-
 packs/proc/processing_utils.py | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/packs/proc/proc.py b/packs/proc/proc.py
index 3fc4850..3a507e5 100644
--- a/packs/proc/proc.py
+++ b/packs/proc/proc.py
@@ -48,4 +48,4 @@ def proc(config_file):
     except KeyError as e:
         print(f"\nError in the configuration file, incorrect or missing argument: {e} \n")
         traceback.print_exc()
-        sys.exit(2)
\ No newline at end of file
+        sys.exit(2)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index 27896fa..2151a33 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -199,9 +199,9 @@ def process_header(file_path : str,
 
     # open file
    if not os.path.exists(file_path):
-        raise FileNotFoundError(2, 'Path or file not found', file_path)
+        raise FileNotFoundError(2, 'Path or file not found', file_path)
 
-    with open(file_path, 'rb') as file:
+    with open(file_path, 'rb') as file:
         event_number, timestamp, samples, sampling_period = read_defaults_WD2(file, byte_order)
 
         # attempt to read channels
@@ -450,7 +450,6 @@ def process_event_lazy_WD1(file_object : BinaryIO):
 
     # header to check against
     sanity_header = header.copy()
-
     # continue only if data exists
     while len(header) > 0:
 
@@ -460,10 +459,8 @@
         # collect waveform, no of samples and timestamp
         yield (np.fromfile(file_object, dtype = np.dtype('

Date: Sat, 28 Mar 2026 23:46:15 +0000
Subject: [PATCH 06/13] force chunking to be set to 1

This is suboptimal; as described in the function's comment, an issue to
resolve this will be opened.
---
 packs/proc/processing_utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index 2151a33..3b1e52a 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -240,6 +240,11 @@ def read_binary_lazy(file : BinaryIO,
     Reads the binary in with the expected format/offset, lazily,
     depending on counts to break the data up.
 
+    NOTE:
+    The counts are hard-set to 1, making this function relatively inefficient.
+    In the future, the logic should be revised to allow `np.fromfile`'s count
+    value to be set based on optimal read-in speed. The logic of the WD2 function
+    will have to accommodate this when indexing the files.
 
     Parameters
     ----------
 
@@ -257,11 +262,11 @@ def read_binary_lazy(file : BinaryIO,
 
     '''
     # initialise data to start the loop
-    data = (np.fromfile(file, dtype=wdtype, count = counts))
+    data = (np.fromfile(file, dtype=wdtype, count = 1))
     while len(data) != 0:
         yield (True, data)
         # ensure data is loaded in after the yield, so the while check is done
-        data = (np.fromfile(file, dtype=wdtype, count = counts))
+        data = (np.fromfile(file, dtype=wdtype, count = 1))
     # yield False when finished
     print('Processing Finished!')
     yield (False, np.zeros(shape = (1,)))

From bfcc2e9c7abf1030460da4bbab89095b9d7f0299 Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:47:03 +0000
Subject: [PATCH 07/13] alter configs to match new style

---
 packs/tests/data/configs/process_WD2_1channel.conf | 2 +-
 packs/tests/data/configs/process_WD2_3channel.conf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packs/tests/data/configs/process_WD2_1channel.conf b/packs/tests/data/configs/process_WD2_1channel.conf
index 82dfe2a..b7b3404 100644
--- a/packs/tests/data/configs/process_WD2_1channel.conf
+++ b/packs/tests/data/configs/process_WD2_1channel.conf
@@ -6,4 +6,4 @@ save_path = '/home/e78368jw/Documents/MULE/packs/tests/data/one_channel_tmp.h5'
 
 [optional]
 overwrite = True
-
+print_mod = -1
diff --git a/packs/tests/data/configs/process_WD2_3channel.conf b/packs/tests/data/configs/process_WD2_3channel.conf
index e82645e..205576e 100644
--- a/packs/tests/data/configs/process_WD2_3channel.conf
+++ b/packs/tests/data/configs/process_WD2_3channel.conf
@@ -6,5 +6,5 @@ save_path = '/home/e78368jw/Documents/MULE/packs/tests/data/three_channels_tmp.h
 
 [optional]
 overwrite = True
-counts = 10
+print_mod = -1
 

From 58b20fb801b02584388c39cff48fd6ff75887e3a Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Sat, 28 Mar 2026 23:47:26 +0000
Subject: [PATCH 08/13] update gitignore to ignore temporary files

As should have been addressed in #67, but appears to have reared its head,
temporary files are now being created once again, with increasing numbers.
This needs to be resolved, as the initial issue is no longer fixed!
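
As an illustration of the pattern change (the file names here are
hypothetical, for example only): `run_tmp.h5` was already matched by the
existing `*tmp.h5` pattern, but a numbered variant such as `run_tmp_3.h5`
slipped through; the broader `*tmp*.h5` pattern added below catches both.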
---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 7c146c8..31f244b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,4 +165,5 @@ cython_debug/
 # ignore the new .txt files generated in test
 /packs/tests/data/repetitive_data/test_*.txt
 # temporary h5 files get ignored
-*tmp.h5
\ No newline at end of file
+*tmp.h5
+*tmp*.h5

From 1718bba5c8c33496a0ebd654dfce18633f9dee9e Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Mon, 30 Mar 2026 16:32:13 +0100
Subject: [PATCH 09/13] include basic lazy vs eager test

---
 packs/tests/processing_test.py | 35 ++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/packs/tests/processing_test.py b/packs/tests/processing_test.py
index 11912ae..3d63984 100644
--- a/packs/tests/processing_test.py
+++ b/packs/tests/processing_test.py
@@ -19,6 +19,7 @@
 from packs.proc.processing_utils import read_defaults_WD2
 from packs.proc.processing_utils import process_header
 from packs.proc.processing_utils import read_binary
+from packs.proc.processing_utils import read_binary_lazy
 from packs.proc.processing_utils import format_wfs
 from packs.proc.processing_utils import check_save_path
 from packs.proc.processing_utils import save_data
@@ -64,7 +65,7 @@ def test_header_components_read_as_expected(wd2_3ch_bin):
 
 
 def test_nonexistent_file_raises_error():
-
+
     fake_path = '/this/path/does/not/exist.bin'
 
     with raises(FileNotFoundError):
@@ -152,7 +153,7 @@ def test_formatting_works(data_dir, wd2_3ch_bin):
 def test_save_path_exists():
 
     data_path = 'some/fake/path/three_channels_WD2.h5'
-
+
     with raises(FileNotFoundError):
         check_save_path(data_path, overwrite = False)
 
@@ -277,3 +278,33 @@ def test_lazy_loading_short_header_WD1(MULE_dir):
         a = process_event_lazy_WD1(file)
         next(a)
 
+
+@mark.parametrize("inpt", [("one_channel_WD2.bin"),("three_channels_WD2.bin")])
+def test_lazy_eager_WD2_match(data_dir, inpt):
+    '''
+    Test to ensure that lazy and eager WD2
+    provide the same result
+    '''
+
+    # how many events are we looking at?
+    counts = 30
+
+    # extract directory
+    file_path = data_dir + inpt
+
+    # collect header info
+    wdtype, samples, sampling_period, channels = process_header(file_path)
+
+    # collect lazy data
+    lazy_data = []
+    with open(file_path, 'rb') as f:
+        binary_lazy_readout = read_binary_lazy(f, wdtype)
+        for i in range(0, counts):
+            _, lazy_wf = next(binary_lazy_readout)
+            lazy_data.append(lazy_wf)
+
+    # open eager data
+    with open(file_path, 'rb') as f:
+        data = read_binary(f, wdtype, counts)
+
+    for i in range(0, counts):
+        assert data[i] == lazy_data[i]

From cd3e9fbb633e467c0bac8b4a670af4328d1b49da Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Wed, 13 May 2026 16:27:24 +0100
Subject: [PATCH 10/13] fix docstrings

---
 packs/proc/processing_utils.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index 3b1e52a..4c7552e 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -252,9 +252,6 @@ def read_binary_lazy(file : BinaryIO,
 
     file (BufferedReader) : Opened file
     wdtype (ndtype)       : Custom data type for extracting information from
                             binary files
-    counts (int)          : How many events you want to read in. -1 sets it to take all events.
-    offset (int)          : Offset at which to start reading the data. Used for chunking purposes
-                            and so should by default be set to zero if not chunking.
     Returns
     -------
     data (ndarray) : Unformatted data from binary file
 
@@ -557,9 +554,6 @@ def process_bin_WD2_lazy(file_path : str,
     '''
     WAVEDUMP 2: Takes a binary file and outputs the containing waveform information in a h5 file.
 
-    For particularly large waveforms/number of events. You can 'chunk' the data such that
-    each dataset holds `counts` events.
-
     Parameters
     ----------

From f0bef210d212d55c37accc384687390c10e19ffa Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Wed, 13 May 2026 16:28:26 +0100
Subject: [PATCH 11/13] functionalise `num_of_events` calculation

---
 packs/proc/processing_utils.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index 4c7552e..776c322 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -296,6 +296,15 @@ def read_binary(file : BinaryIO,
 
     return data
 
+
+def number_of_events_WD2(file_path, samples, channels, header_size):
+    file_size = os.path.getsize(file_path)
+    waveform_size = ((samples * channels * 4 ) + header_size) # can't remember why *2, will need to test this
+    num_of_events = int(file_size / waveform_size)
+
+    return num_of_events
+
+
 def format_wfs(data : np.ndarray,
                wdtype : np.dtype,
                samples : int,
@@ -597,9 +606,7 @@ def process_bin_WD2_lazy(file_path : str,
 
                     # first run-through, collect the header information to extract table size
                     if i == 0:
-                        file_size = os.path.getsize(file_path)
-                        waveform_size = ((samples * channels * 4 ) + header_size) # can't remember why *2, will need to test this
-                        num_of_events = int(file_size / waveform_size)
+                        num_of_events = number_of_events_WD2(file_path, samples, channels, header_size)
 
                     write('event_info', evt_info, (True, num_of_events, i))
                     # writer only takes one row at a time, can't broadcast all three at once

From bd63930e95d66d2e63fd73f7bfd6351f078d0575 Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Wed, 13 May 2026 16:48:56 +0100
Subject: [PATCH 12/13] implement `test_number_of_events_correct()`

---
 packs/tests/data/10000bytes.bin | Bin 0 -> 10000 bytes
 packs/tests/data/100bytes.bin   | Bin 0 -> 100 bytes
 packs/tests/processing_test.py  | 10 ++++++++++
 3 files changed, 10 insertions(+)
 create mode 100644 packs/tests/data/10000bytes.bin
 create mode 100644 packs/tests/data/100bytes.bin

diff --git a/packs/tests/data/10000bytes.bin b/packs/tests/data/10000bytes.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e64c723ad5aeec49f2d1447b9f523fe09c522566
GIT binary patch
literal 10000
zcmeIu0Sy2E0K%a6Pi+o2h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM
O7%*VKfB^#r{s#sp5C8!H

literal 0
HcmV?d00001

diff --git a/packs/tests/data/100bytes.bin b/packs/tests/data/100bytes.bin
new file mode 100644
index 0000000000000000000000000000000000000000..eeb576070df6ab6d3f9dfdf278414e6c3f3ca6b7
GIT binary patch
literal 100
LcmZQzpgjNp0Av6G

literal 0
HcmV?d00001

diff --git a/packs/tests/processing_test.py b/packs/tests/processing_test.py
index 3d63984..05ef170 100644
--- a/packs/tests/processing_test.py
+++ b/packs/tests/processing_test.py
@@ -23,6 +23,7 @@
 from packs.proc.processing_utils import format_wfs
 from packs.proc.processing_utils import check_save_path
 from packs.proc.processing_utils import save_data
+from packs.proc.processing_utils import number_of_events_WD2
 from packs.types.types import generate_wfdtype
 from packs.types.types import rwf_type
@@ -277,6 +278,15 @@ def test_lazy_loading_short_header_WD1(MULE_dir):
         a = process_event_lazy_WD1(file)
         next(a)
 
+@mark.parametrize("file, samples, channels, header_size, output", [('100bytes.bin', 1, 1, 0, 25), ('100bytes.bin', 1, 1, 46, 2), ('100bytes.bin', 2, 10, 20, 1), ('10000bytes.bin', 4, 8, 72, 50)])
+def test_number_of_events_correct(data_dir, file, samples, channels, header_size, output):
+    '''
+    Simple test to ensure the logic returns the number of events we expect.
+    '''
+    file_path = data_dir + file
+
+    assert output == number_of_events_WD2(file_path, samples, channels, header_size)
+
 
 @mark.parametrize("inpt", [("one_channel_WD2.bin"),("three_channels_WD2.bin")])
 def test_lazy_eager_WD2_match(data_dir, inpt):
     '''

From 5262429c6e955ab4865dc73ae1a6b4d35dcf441e Mon Sep 17 00:00:00 2001
From: jwaiton
Date: Wed, 13 May 2026 17:11:22 +0100
Subject: [PATCH 13/13] cosmetics and typing

---
 packs/proc/processing_utils.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/packs/proc/processing_utils.py b/packs/proc/processing_utils.py
index 776c322..469ec31 100644
--- a/packs/proc/processing_utils.py
+++ b/packs/proc/processing_utils.py
@@ -297,7 +297,10 @@ def read_binary(file : BinaryIO,
     return data
 
 
-def number_of_events_WD2(file_path, samples, channels, header_size):
+def number_of_events_WD2(file_path : str,
+                         samples : int,
+                         channels : int,
+                         header_size : int) -> int:
     file_size = os.path.getsize(file_path)
     waveform_size = ((samples * channels * 4 ) + header_size) # can't remember why *2, will need to test this
     num_of_events = int(file_size / waveform_size)
@@ -713,28 +716,28 @@ def read_header_lecroy(file_obj : io.TextIOWrapper):
     - Segment number, date and time, time since first sample recorded
     - ...
     '''
-
+
    oscilloscope_model = int((next(file_obj).split(','))[1])
 
    file_heading = next(file_obj).split(',')
    segments = int(file_heading[1])
-    segment_size = int(file_heading[3])
+    segment_size = int(file_heading[3])
 
    evt_info_heading = next(file_obj).split(',')
 
    for evt_info_line_idx in range(segments):
        _ = next(file_obj).split(',')
-
+
    data_heading = next(file_obj).split(',')
-
+
    time1 = float((next(file_obj).split(','))[0])
    time2 = float((next(file_obj).split(','))[0])
 
    return ((np.diff([time1, time2]))[0], segments, segment_size)
 
 
 def get_batch(reader : '_csv.reader',
-              batch_size : int) -> List:
+              batch_size : int) -> List:
     '''
     Outputs a list of all the second elements
     of a row for each batch then goes to the next row
@@ -783,14 +786,14 @@ def process_event_lazy_lecroy(file_obj : io.TextIOWrapper):
         # time since first sample recorded
         evt_info_times[evt_info_line_idx] = evt_info_line[2]
     # end of header
-
+
     # start of data
     data_heading = next(file_obj).split(',')
 
     reader = csv.reader(file_obj)
     wf_num = 0
     while batch := get_batch(reader, segment_size):
-        yield (batch, evt_info_times[wf_num])
+        yield (batch, evt_info_times[wf_num])
         wf_num += 1
     # end of data
@@ -799,7 +802,7 @@ def process_csv_lecroy(file_path : str,
                        save_path : str,
                        overwrite : Optional[bool] = False,
-                       print_mod : Optional[int] = -1):
+                       print_mod : Optional[int] = -1):
     """
     Process a Lecroy CSV waveform file and write the parsed events to a structured output file.
     This only works for individual channels at the moment, as Lecroy oscilloscopes save one file per channel.
@@ -839,4 +842,4 @@ def process_csv_lecroy(file_path : str,
 
             # add data to df
             write('event_info', event_info, (True, num_of_events, i))
-            write('rwf', waveforms, (True, num_of_events, i))
\ No newline at end of file
+            write('rwf', waveforms, (True, num_of_events, i))