Skip to content
Open
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,5 @@ cython_debug/
# ignore the new .txt files generated in test
/packs/tests/data/repetitive_data/test_*.txt
# temporary h5 files get ignored
*tmp.h5
*tmp.h5
*tmp*.h5
2 changes: 1 addition & 1 deletion packs/configs/process_WD2_3channel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ save_path = '/path/to/file.h5'
[optional]

overwrite = True
counts = -1
print_mod = -1
6 changes: 3 additions & 3 deletions packs/proc/proc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from packs.core.io import read_config_file
from packs.proc.processing_utils import process_csv_lecroy
from packs.proc.processing_utils import process_bin_WD2
from packs.proc.processing_utils import process_bin_WD2_lazy
from packs.proc.processing_utils import process_bin_WD1
from packs.proc.calibration_utils import calibrate
from packs.core.core_utils import check_test
Expand Down Expand Up @@ -36,7 +36,7 @@ def proc(config_file):
case 1:
process_bin_WD1(**conf_dict)
case 2:
process_bin_WD2(**conf_dict)
process_bin_WD2_lazy(**conf_dict)
case other:
raise RuntimeError(f"wavedump edition {other} decoding isn't currently implemented.")
else:
Expand All @@ -48,4 +48,4 @@ def proc(config_file):
except KeyError as e:
print(f"\nError in the configuration file, incorrect or missing argument: {e} \n")
traceback.print_exc()
sys.exit(2)
sys.exit(2)
137 changes: 121 additions & 16 deletions packs/proc/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
from typing import BinaryIO
from typing import Generic
from typing import Optional
from datetime import datetime
from typing import List
from typing import Generator

from datetime import datetime


# imports start from MULE/
from packs.core.core_utils import flatten
Expand Down Expand Up @@ -196,9 +199,9 @@ def process_header(file_path : str,

# open file
if not os.path.exists(file_path):
raise FileNotFoundError(2, 'Path or file not found', file_path)
raise FileNotFoundError(2, 'Path or file not found', file_path)

with open(file_path, 'rb') as file:
with open(file_path, 'rb') as file:

event_number, timestamp, samples, sampling_period = read_defaults_WD2(file, byte_order)
# attempt to read channels
Expand Down Expand Up @@ -231,6 +234,41 @@ def process_header(file_path : str,
return wdtype, samples, sampling_period, channels


def read_binary_lazy(file : BinaryIO,
                     wdtype : np.dtype) -> Generator:
    '''
    Lazily read binary data from an open file using the provided numpy dtype.

    Repeatedly calls ``np.fromfile`` with ``count=1`` to read one record at a
    time and yields each record as it is read, followed by a final sentinel.

    NOTE:
    The count is hardset to 1, making this function relatively inefficient.
    In the future, the logic should be revised to allow `np.fromfile`'s count
    value to be set based on optimal read-in speed. The logic of the WD2
    function will have to accommodate this when indexing the files.

    Parameters
    ----------

    file (BufferedReader) : Opened binary file
    wdtype (np.dtype)     : Custom data type for extracting information from
                            binary files

    Yields
    ------
    (flag, data) (tuple) : ``flag`` is True while records are being read and
                           ``data`` is the length-1 record array; once the
                           file is exhausted a final ``(False, zeros)``
                           sentinel is yielded so callers can detect the end.
    '''
    # initialise data to start the loop
    data = np.fromfile(file, dtype=wdtype, count = 1)
    while len(data) != 0:
        yield (True, data)
        # ensure data is loaded in after the yield, so the while check is done
        data = np.fromfile(file, dtype=wdtype, count = 1)
    # signal completion with a sentinel tuple (flag = False)
    print('Processing Finished!')
    yield (False, np.zeros(shape = (1,)))
Comment on lines +267 to +269
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

read_binary_lazy prints "Processing Finished!" unconditionally and yields a final (False, zeros) sentinel. This introduces side effects even when callers set print_mod=-1 and makes the generator harder to consume (callers must remember to ignore the last value). Prefer ending the generator with return/StopIteration and let the caller decide whether/when to print completion.

Suggested change
# yield 1 when finished
print('Processing Finished!')
yield (False, np.zeros(shape = (1,)))
# when no more data is available, the generator stops naturally
return

Copilot uses AI. Check for mistakes.


def read_binary(file : BinaryIO,
wdtype : np.dtype,
counts : Optional[int] = -1,
Expand Down Expand Up @@ -258,6 +296,18 @@ def read_binary(file : BinaryIO,

return data


def number_of_events_WD2(file_path : str,
                         samples : int,
                         channels : int,
                         header_size : int) -> int:
    '''
    Calculate the number of complete events in a WAVEDUMP 2 binary file
    from its on-disk size.

    Parameters
    ----------

    file_path (str)   : Path to binary file
    samples (int)     : Number of samples per waveform
    channels (int)    : Number of channels per event
    header_size (int) : Size of the per-event header in bytes

    Returns
    -------
    num_of_events (int) : Number of complete events contained in the file
    '''
    file_size = os.path.getsize(file_path)
    # each sample is stored as a 4-byte value; one header per event
    waveform_size = (samples * channels * 4) + header_size
    # floor division: any trailing partial event is ignored
    num_of_events = file_size // waveform_size

    return num_of_events


def format_wfs(data : np.ndarray,
wdtype : np.dtype,
samples : int,
Expand Down Expand Up @@ -414,7 +464,6 @@ def process_event_lazy_WD1(file_object : BinaryIO):

# header to check against
sanity_header = header.copy()

# continue only if data exists
while len(header) > 0:

Expand All @@ -424,10 +473,8 @@ def process_event_lazy_WD1(file_object : BinaryIO):

# collect waveform, no of samples and timestamp
yield (np.fromfile(file_object, dtype = np.dtype('<H'), count = event_size), event_size, header[-1])

# collect next header
header = np.fromfile(file_object, dtype = 'i', count = 6)

# check if header has correct number of elements and correct information ONCE.
if sanity_header is not None:
if len(header) == 6:
Expand Down Expand Up @@ -492,7 +539,6 @@ def process_bin_WD1(file_path : str,

if (i % print_mod == 0) and (print_mod != -1):
print(f"Event {i}")

# enforce stucture upon data
e_dtype = types.event_info_type
wf_dtype = types.rwf_type_WD1(samples)
Expand All @@ -511,6 +557,65 @@ def process_bin_WD1(file_path : str,
write('rwf', waveforms, (True, num_of_events, i))



def process_bin_WD2_lazy(file_path : str,
                         save_path : str,
                         overwrite : Optional[bool] = False,
                         print_mod : Optional[int] = -1):

    '''
    WAVEDUMP 2: Takes a binary file and outputs the containing waveform information in a h5 file.

    Events are read lazily (one record at a time) via `read_binary_lazy`,
    formatted with `format_wfs`, and written row-by-row to the output file.

    Parameters
    ----------

    file_path (str) : Path to binary file
    save_path (str) : Path to saved file
    overwrite (bool) : Boolean for overwriting pre-existing files
    print_mod (int) : Readout frequency for number of events, -1 implies no readout

    Returns
    -------
    None
    '''

    # Ensure save path is clear
    save_path = check_save_path(save_path, overwrite)
    print(f'\nData input : {file_path}\nData output : {save_path}')

    # collect header info
    wdtype, samples, sampling_period, channels = process_header(file_path)

    # create header length (bytes) for processing
    if channels == 1:
        header_size = 24
    else:
        header_size = 28

    # open file for reading
    with open(file_path, 'rb') as file:
        with writer(save_path, 'RAW', overwrite) as write:

            for i, (flag, array) in enumerate(read_binary_lazy(file, wdtype)):

                # only positive print_mod values enable periodic readout;
                # this also guards against ZeroDivisionError when print_mod == 0
                if (print_mod > 0) and (i % print_mod == 0):
                    print(f"Event {i}")

                # catch, once done, rwf should be empty
                if flag:

                    evt_info, rwf = format_wfs(array, wdtype, samples, channels)

                    # first run-through, collect the header information to extract table size
                    if i == 0:
                        num_of_events = number_of_events_WD2(file_path, samples, channels, header_size)

                    write('event_info', evt_info, (True, num_of_events, i))
                    # writer only takes one row at a time, can't broadcast all three at once
                    for j, wfs in enumerate(rwf):
                        write('rwf', wfs, (True, num_of_events * channels, i + ((channels-1)*i) + j))

def process_bin_WD2(file_path : str,
save_path : str,
overwrite : Optional[bool] = False,
Expand Down Expand Up @@ -611,28 +716,28 @@ def read_header_lecroy(file_obj : io.TextIOWrapper):
- Segment number, date and time, time since first sample recorded
- ...
'''

oscilloscope_model = int((next(file_obj).split(','))[1])

file_heading = next(file_obj).split(',')
segments = int(file_heading[1])
segment_size = int(file_heading[3])
segment_size = int(file_heading[3])

evt_info_heading = next(file_obj).split(',')
for evt_info_line_idx in range(segments):
_ = next(file_obj).split(',')


data_heading = next(file_obj).split(',')


time1 = float((next(file_obj).split(','))[0])
time2 = float((next(file_obj).split(','))[0])

return ((np.diff([time1, time2]))[0], segments, segment_size)

def get_batch(reader : '_csv.reader',
batch_size : int) -> List:
batch_size : int) -> List:
'''
Outputs a list of all the second elements of a row for each batch
then goes to the next row
Expand Down Expand Up @@ -681,14 +786,14 @@ def process_event_lazy_lecroy(file_obj : io.TextIOWrapper):
# time since first sample recorded
evt_info_times[evt_info_line_idx] = evt_info_line[2]
# end of header

# start of data
data_heading = next(file_obj).split(',')
reader = csv.reader(file_obj)
wf_num = 0
while batch := get_batch(reader, segment_size):

yield (batch, evt_info_times[wf_num])
yield (batch, evt_info_times[wf_num])
wf_num += 1
# end of data

Expand All @@ -697,7 +802,7 @@ def process_event_lazy_lecroy(file_obj : io.TextIOWrapper):
def process_csv_lecroy(file_path : str,
save_path : str,
overwrite : Optional[bool] = False,
print_mod : Optional[int] = -1):
print_mod : Optional[int] = -1):
"""
Process a Lecroy CSV waveform file and write the parsed events to a structured output file.
This only works for individual channels at the moment, as Lecroy oscilloscopes save one file per channel.
Expand Down Expand Up @@ -737,4 +842,4 @@ def process_csv_lecroy(file_path : str,

# add data to df
write('event_info', event_info, (True, num_of_events, i))
write('rwf', waveforms, (True, num_of_events, i))
write('rwf', waveforms, (True, num_of_events, i))
Binary file added packs/tests/data/10000bytes.bin
Binary file not shown.
Binary file added packs/tests/data/100bytes.bin
Binary file not shown.
2 changes: 1 addition & 1 deletion packs/tests/data/configs/process_WD2_1channel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ save_path = '/home/e78368jw/Documents/MULE/packs/tests/data/one_channel_tmp.h5'

[optional]
overwrite = True

print_mod = -1
2 changes: 1 addition & 1 deletion packs/tests/data/configs/process_WD2_3channel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ save_path = '/home/e78368jw/Documents/MULE/packs/tests/data/three_channels_tmp.h

[optional]
overwrite = True
counts = 10
print_mod = -1

45 changes: 43 additions & 2 deletions packs/tests/processing_test.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for completeness, parameters could be added to test_decode_produces_expected_output for WD2 as well. This has been done for the Lecroy scope as a precedent.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's done as test_WD1_decode_produces_expected_output. We can keep combining these together, but actually looking at the @mark.parametrize for test_decode_produces_expected_output, I think they should all be separated (it's looking messy).

Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
from packs.proc.processing_utils import read_defaults_WD2
from packs.proc.processing_utils import process_header
from packs.proc.processing_utils import read_binary
from packs.proc.processing_utils import read_binary_lazy
from packs.proc.processing_utils import format_wfs
from packs.proc.processing_utils import check_save_path
from packs.proc.processing_utils import save_data
from packs.proc.processing_utils import number_of_events_WD2

from packs.types.types import generate_wfdtype
from packs.types.types import rwf_type
Expand Down Expand Up @@ -64,7 +66,7 @@ def test_header_components_read_as_expected(wd2_3ch_bin):


def test_nonexistent_file_raises_error():

fake_path = '/this/path/does/not/exist.bin'

with raises(FileNotFoundError):
Expand Down Expand Up @@ -152,7 +154,7 @@ def test_formatting_works(data_dir, wd2_3ch_bin):
def test_save_path_exists():

data_path = 'some/fake/path/three_channels_WD2.h5'

with raises(FileNotFoundError):
check_save_path(data_path, overwrite = False)

Expand Down Expand Up @@ -276,4 +278,43 @@ def test_lazy_loading_short_header_WD1(MULE_dir):
a = process_event_lazy_WD1(file)
next(a)

@mark.parametrize("file, samples, channels, header_size, output", [('100bytes.bin', 1, 1, 0, 25), ('100bytes.bin', 1, 1, 46, 2), ('100bytes.bin', 2, 10, 20, 1), ('10000bytes.bin', 4, 8, 72, 50)])
def test_number_of_events_correct(data_dir, file, samples, channels, header_size, output):
    '''
    Simple test to ensure the logic returns the number of events we expect.

    Each case checks number_of_events_WD2 (file_size divided by the
    per-event size (samples * channels * 4) + header_size, floored)
    against a hand-computed value for the fixture binaries.
    '''
    # data_dir is a pytest fixture pointing at the test-data directory
    file_path = data_dir + file

    assert output == number_of_events_WD2(file_path, samples, channels, header_size)


@mark.parametrize("inpt", [("one_channel_WD2.bin"),("three_channels_WD2.bin")])
def test_lazy_eager_WD2_match(data_dir, inpt):
    '''
    test to ensure that lazy and eager WD2
    provide the same result
    '''

    # how many events are we looking at?
    counts = 30

    # extract directory
    file_path = data_dir + inpt

    # collect header info
    wdtype, samples, sampling_period, channels = process_header(file_path)

    # collect lazy data
    lazy_data = []
    # open in binary mode: np.fromfile requires a binary file handle
    with open(file_path, 'rb') as f:
        binary_lazy_readout = read_binary_lazy(f, wdtype)
        for i in range(0, counts):
            _, lazy_wf = next(binary_lazy_readout)
            lazy_data.append(lazy_wf)

    # open eager data
    with open(file_path, 'rb') as f:
        data = read_binary(f, wdtype, counts)

    for i in range(0, counts):
        # lazy reads yield length-1 arrays; compare the single record inside
        assert data[i] == lazy_data[i][0]
Comment on lines +308 to +320
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test opens the binary files in text mode (open(file_path)), but np.fromfile expects a binary file handle; on some platforms this can corrupt reads. Also, read_binary_lazy yields an ndarray of length 1, so lazy_data[i] is not comparable to data[i] (a single structured record). Open with 'rb' and compare data[i] to the scalar record from lazy loading (e.g., lazy_wf[0]).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

@MattZur MattZur May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Type hinting in def read_binary_lazy(file : BinaryIO, should be <class '_io.BufferedReader'>

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The BinaryIO typing object represents the io.BufferedReader class. Whoops!

Loading