Skip to content
Open
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,5 @@ cython_debug/
# ignore the new .txt files generated in test
/packs/tests/data/repetitive_data/test_*.txt
# temporary h5 files get ignored
*tmp.h5
*tmp.h5
*tmp*.h5
2 changes: 1 addition & 1 deletion packs/configs/process_WD2_3channel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ save_path = '/path/to/file.h5'
[optional]

overwrite = True
counts = -1
print_mod = -1
6 changes: 3 additions & 3 deletions packs/proc/proc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from packs.core.io import read_config_file
from packs.proc.processing_utils import process_csv_lecroy
from packs.proc.processing_utils import process_bin_WD2
from packs.proc.processing_utils import process_bin_WD2_lazy
from packs.proc.processing_utils import process_bin_WD1
from packs.proc.calibration_utils import calibrate
from packs.core.core_utils import check_test
Expand Down Expand Up @@ -36,7 +36,7 @@ def proc(config_file):
case 1:
process_bin_WD1(**conf_dict)
case 2:
process_bin_WD2(**conf_dict)
process_bin_WD2_lazy(**conf_dict)
case other:
raise RuntimeError(f"wavedump edition {other} decoding isn't currently implemented.")
else:
Expand All @@ -48,4 +48,4 @@ def proc(config_file):
except KeyError as e:
print(f"\nError in the configuration file, incorrect or missing argument: {e} \n")
traceback.print_exc()
sys.exit(2)
sys.exit(2)
137 changes: 121 additions & 16 deletions packs/proc/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
from typing import BinaryIO
from typing import Generic
from typing import Optional
from datetime import datetime
from typing import List
from typing import Generator

from datetime import datetime


# imports start from MULE/
from packs.core.core_utils import flatten
Expand Down Expand Up @@ -196,9 +199,9 @@ def process_header(file_path : str,

# open file
if not os.path.exists(file_path):
raise FileNotFoundError(2, 'Path or file not found', file_path)
raise FileNotFoundError(2, 'Path or file not found', file_path)

with open(file_path, 'rb') as file:
with open(file_path, 'rb') as file:

event_number, timestamp, samples, sampling_period = read_defaults_WD2(file, byte_order)
# attempt to read channels
Expand Down Expand Up @@ -231,6 +234,41 @@ def process_header(file_path : str,
return wdtype, samples, sampling_period, channels


def read_binary_lazy(file : BinaryIO,
                     wdtype : np.dtype) -> Generator:
    '''
    Lazily read binary data from an open file using the provided numpy dtype.

    Repeatedly calls ``np.fromfile`` with ``count=1`` to read one record at a
    time and yields each record as it is read, followed by a final sentinel.

    NOTE:
    The count is hardset to 1, making this function relatively inefficient.
    In the future, the logic should be revised to allow `np.fromfile`'s count
    value to be set based on optimal read-in speed. The logic of the WD2
    function will have to accommodate this when indexing the files.

    Parameters
    ----------

    file (BufferedReader) : Opened binary file
    wdtype (np.dtype)     : Custom data type for extracting information from
                            binary files

    Yields
    ------
    (flag, data) (tuple) : ``flag`` is True while records are being read and
                           ``data`` is the length-1 record array; once the
                           file is exhausted a final ``(False, zeros)``
                           sentinel is yielded so callers can detect the end.
    '''
    # initialise data to start the loop
    data = np.fromfile(file, dtype=wdtype, count = 1)
    while len(data) != 0:
        yield (True, data)
        # ensure data is loaded in after the yield, so the while check is done
        data = np.fromfile(file, dtype=wdtype, count = 1)
    # signal completion with a sentinel tuple (flag = False)
    print('Processing Finished!')
    yield (False, np.zeros(shape = (1,)))
Comment on lines +267 to +269
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

read_binary_lazy prints "Processing Finished!" unconditionally and yields a final (False, zeros) sentinel. This introduces side effects even when callers set print_mod=-1 and makes the generator harder to consume (callers must remember to ignore the last value). Prefer ending the generator with return/StopIteration and let the caller decide whether/when to print completion.

Suggested change
# yield 1 when finished
print('Processing Finished!')
yield (False, np.zeros(shape = (1,)))
# when no more data is available, the generator stops naturally
return

Copilot uses AI. Check for mistakes.


def read_binary(file : BinaryIO,
wdtype : np.dtype,
counts : Optional[int] = -1,
Expand Down Expand Up @@ -258,6 +296,18 @@ def read_binary(file : BinaryIO,

return data


def number_of_events_WD2(file_path : str,
                         samples : int,
                         channels : int,
                         header_size : int) -> int:
    '''
    Calculate the number of complete events in a WAVEDUMP 2 binary file
    from its on-disk size.

    Parameters
    ----------

    file_path (str)   : Path to binary file
    samples (int)     : Number of samples per waveform
    channels (int)    : Number of channels per event
    header_size (int) : Size of the per-event header in bytes

    Returns
    -------
    num_of_events (int) : Number of complete events contained in the file
    '''
    file_size = os.path.getsize(file_path)
    # each sample is stored as a 4-byte value; one header per event
    waveform_size = (samples * channels * 4) + header_size
    # floor division: any trailing partial event is ignored
    num_of_events = file_size // waveform_size

    return num_of_events


def format_wfs(data : np.ndarray,
wdtype : np.dtype,
samples : int,
Expand Down Expand Up @@ -414,7 +464,6 @@ def process_event_lazy_WD1(file_object : BinaryIO):

# header to check against
sanity_header = header.copy()

# continue only if data exists
while len(header) > 0:

Expand All @@ -424,10 +473,8 @@ def process_event_lazy_WD1(file_object : BinaryIO):

# collect waveform, no of samples and timestamp
yield (np.fromfile(file_object, dtype = np.dtype('<H'), count = event_size), event_size, header[-1])

# collect next header
header = np.fromfile(file_object, dtype = 'i', count = 6)

# check if header has correct number of elements and correct information ONCE.
if sanity_header is not None:
if len(header) == 6:
Expand Down Expand Up @@ -492,7 +539,6 @@ def process_bin_WD1(file_path : str,

if (i % print_mod == 0) and (print_mod != -1):
print(f"Event {i}")

# enforce stucture upon data
e_dtype = types.event_info_type
wf_dtype = types.rwf_type_WD1(samples)
Expand All @@ -511,6 +557,65 @@ def process_bin_WD1(file_path : str,
write('rwf', waveforms, (True, num_of_events, i))



def process_bin_WD2_lazy(file_path : str,
                         save_path : str,
                         overwrite : Optional[bool] = False,
                         print_mod : Optional[int] = -1):

    '''
    WAVEDUMP 2: Takes a binary file and outputs the containing waveform information in a h5 file.

    Events are read lazily (one record at a time) via `read_binary_lazy`,
    formatted with `format_wfs`, and written row-by-row to the output file.

    Parameters
    ----------

    file_path (str) : Path to binary file
    save_path (str) : Path to saved file
    overwrite (bool) : Boolean for overwriting pre-existing files
    print_mod (int) : Readout frequency for number of events, -1 implies no readout

    Returns
    -------
    None
    '''

    # Ensure save path is clear
    save_path = check_save_path(save_path, overwrite)
    print(f'\nData input : {file_path}\nData output : {save_path}')

    # collect header info
    wdtype, samples, sampling_period, channels = process_header(file_path)

    # create header length (bytes) for processing
    if channels == 1:
        header_size = 24
    else:
        header_size = 28

    # open file for reading
    with open(file_path, 'rb') as file:
        with writer(save_path, 'RAW', overwrite) as write:

            for i, (flag, array) in enumerate(read_binary_lazy(file, wdtype)):

                # only positive print_mod values enable periodic readout;
                # this also guards against ZeroDivisionError when print_mod == 0
                if (print_mod > 0) and (i % print_mod == 0):
                    print(f"Event {i}")

                # catch, once done, rwf should be empty
                if flag:

                    evt_info, rwf = format_wfs(array, wdtype, samples, channels)

                    # first run-through, collect the header information to extract table size
                    if i == 0:
                        num_of_events = number_of_events_WD2(file_path, samples, channels, header_size)

                    write('event_info', evt_info, (True, num_of_events, i))
                    # writer only takes one row at a time, can't broadcast all three at once
                    for j, wfs in enumerate(rwf):
                        write('rwf', wfs, (True, num_of_events * channels, i + ((channels-1)*i) + j))

def process_bin_WD2(file_path : str,
save_path : str,
overwrite : Optional[bool] = False,
Expand Down Expand Up @@ -611,28 +716,28 @@ def read_header_lecroy(file_obj : io.TextIOWrapper):
- Segment number, date and time, time since first sample recorded
- ...
'''

oscilloscope_model = int((next(file_obj).split(','))[1])

file_heading = next(file_obj).split(',')
segments = int(file_heading[1])
segment_size = int(file_heading[3])
segment_size = int(file_heading[3])

evt_info_heading = next(file_obj).split(',')
for evt_info_line_idx in range(segments):
_ = next(file_obj).split(',')


data_heading = next(file_obj).split(',')


time1 = float((next(file_obj).split(','))[0])
time2 = float((next(file_obj).split(','))[0])

return ((np.diff([time1, time2]))[0], segments, segment_size)

def get_batch(reader : '_csv.reader',
batch_size : int) -> List:
batch_size : int) -> List:
'''
Outputs a list of all the second elements of a row for each batch
then goes to the next row
Expand Down Expand Up @@ -681,14 +786,14 @@ def process_event_lazy_lecroy(file_obj : io.TextIOWrapper):
# time since first sample recorded
evt_info_times[evt_info_line_idx] = evt_info_line[2]
# end of header

# start of data
data_heading = next(file_obj).split(',')
reader = csv.reader(file_obj)
wf_num = 0
while batch := get_batch(reader, segment_size):

yield (batch, evt_info_times[wf_num])
yield (batch, evt_info_times[wf_num])
wf_num += 1
# end of data

Expand All @@ -697,7 +802,7 @@ def process_event_lazy_lecroy(file_obj : io.TextIOWrapper):
def process_csv_lecroy(file_path : str,
save_path : str,
overwrite : Optional[bool] = False,
print_mod : Optional[int] = -1):
print_mod : Optional[int] = -1):
"""
Process a Lecroy CSV waveform file and write the parsed events to a structured output file.
This only works for individual channels at the moment, as Lecroy oscilloscopes save one file per channel.
Expand Down Expand Up @@ -737,4 +842,4 @@ def process_csv_lecroy(file_path : str,

# add data to df
write('event_info', event_info, (True, num_of_events, i))
write('rwf', waveforms, (True, num_of_events, i))
write('rwf', waveforms, (True, num_of_events, i))
Binary file added packs/tests/data/10000bytes.bin
Binary file not shown.
Binary file added packs/tests/data/100bytes.bin
Binary file not shown.
2 changes: 1 addition & 1 deletion packs/tests/data/configs/process_WD2_1channel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ save_path = '/home/e78368jw/Documents/MULE/packs/tests/data/one_channel_tmp.h5'

[optional]
overwrite = True

print_mod = -1
2 changes: 1 addition & 1 deletion packs/tests/data/configs/process_WD2_3channel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ save_path = '/home/e78368jw/Documents/MULE/packs/tests/data/three_channels_tmp.h

[optional]
overwrite = True
counts = 10
print_mod = -1

45 changes: 43 additions & 2 deletions packs/tests/processing_test.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for completeness, parameters could be added to test_decode_produces_expected_output for WD2 as well. This has been done for the Lecroy scope as a precedent.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's done as test_WD1_decode_produces_expected_output. We can keep combining these together, but actually looking at the @mark.parametrize for test_decode_produces_expected_output, I think they should all be separated (it's looking messy).

Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
from packs.proc.processing_utils import read_defaults_WD2
from packs.proc.processing_utils import process_header
from packs.proc.processing_utils import read_binary
from packs.proc.processing_utils import read_binary_lazy
from packs.proc.processing_utils import format_wfs
from packs.proc.processing_utils import check_save_path
from packs.proc.processing_utils import save_data
from packs.proc.processing_utils import number_of_events_WD2

from packs.types.types import generate_wfdtype
from packs.types.types import rwf_type
Expand Down Expand Up @@ -64,7 +66,7 @@ def test_header_components_read_as_expected(wd2_3ch_bin):


def test_nonexistent_file_raises_error():

fake_path = '/this/path/does/not/exist.bin'

with raises(FileNotFoundError):
Expand Down Expand Up @@ -152,7 +154,7 @@ def test_formatting_works(data_dir, wd2_3ch_bin):
def test_save_path_exists():

data_path = 'some/fake/path/three_channels_WD2.h5'

with raises(FileNotFoundError):
check_save_path(data_path, overwrite = False)

Expand Down Expand Up @@ -276,4 +278,43 @@ def test_lazy_loading_short_header_WD1(MULE_dir):
a = process_event_lazy_WD1(file)
next(a)

@mark.parametrize("file, samples, channels, header_size, output", [('100bytes.bin', 1, 1, 0, 25), ('100bytes.bin', 1, 1, 46, 2), ('100bytes.bin', 2, 10, 20, 1), ('10000bytes.bin', 4, 8, 72, 50)])
def test_number_of_events_correct(data_dir, file, samples, channels, header_size, output):
    '''
    Simple test to ensure the logic returns the number of events we expect.

    Each case checks number_of_events_WD2 (file_size divided by the
    per-event size (samples * channels * 4) + header_size, floored)
    against a hand-computed value for the fixture binaries.
    '''
    # data_dir is a pytest fixture pointing at the test-data directory
    file_path = data_dir + file

    assert output == number_of_events_WD2(file_path, samples, channels, header_size)


@mark.parametrize("inpt", [("one_channel_WD2.bin"),("three_channels_WD2.bin")])
def test_lazy_eager_WD2_match(data_dir, inpt):
    '''
    test to ensure that lazy and eager WD2
    provide the same result
    '''

    # how many events are we looking at?
    counts = 30

    # extract directory
    file_path = data_dir + inpt

    # collect header info
    wdtype, samples, sampling_period, channels = process_header(file_path)

    # collect lazy data
    lazy_data = []
    # open in binary mode: np.fromfile requires a binary file handle
    with open(file_path, 'rb') as f:
        binary_lazy_readout = read_binary_lazy(f, wdtype)
        for i in range(0, counts):
            _, lazy_wf = next(binary_lazy_readout)
            lazy_data.append(lazy_wf)

    # open eager data
    with open(file_path, 'rb') as f:
        data = read_binary(f, wdtype, counts)

    for i in range(0, counts):
        # lazy reads yield length-1 arrays; compare the single record inside
        assert data[i] == lazy_data[i][0]
Comment on lines +308 to +320
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test opens the binary files in text mode (open(file_path)), but np.fromfile expects a binary file handle; on some platforms this can corrupt reads. Also, read_binary_lazy yields an ndarray of length 1, so lazy_data[i] is not comparable to data[i] (a single structured record). Open with 'rb' and compare data[i] to the scalar record from lazy loading (e.g., lazy_wf[0]).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

@MattZur MattZur May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Type hinting in def read_binary_lazy(file : BinaryIO, should be <class '_io.BufferedReader'>

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The BinaryIO typing object represents the io.BufferedReader class. Whoops!

Loading