diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 975fefb406..8b996598d0 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -48,7 +48,9 @@ This document explains the changes made to Iris for this release 🚀 Performance Enhancements =========================== -#. N/A +#. `@trexfeathers`_ improved the speed of field iteration when reading PP files. + Up to 3x speed up has been seen, depending on the circumstances. + (:pull:`7089`) 🔥 Deprecations diff --git a/lib/iris/fileformats/pp.py b/lib/iris/fileformats/pp.py index 971c0ff7ef..5dc15383da 100644 --- a/lib/iris/fileformats/pp.py +++ b/lib/iris/fileformats/pp.py @@ -1870,19 +1870,32 @@ def _field_gen(filename, read_data_bytes, little_ended=False): pp_file_read = pp_file.read field_count = 0 + # Total bytes for the full header record: + # leading length word + long headers + float headers + trailing length word + _HEADER_BYTES = PP_WORD_DEPTH * (1 + NUM_LONG_HEADERS + NUM_FLOAT_HEADERS + 1) + _LONGS_OFFSET = PP_WORD_DEPTH # bytes: skip leading length word + _FLOATS_OFFSET = _LONGS_OFFSET + NUM_LONG_HEADERS * PP_WORD_DEPTH + dtype_longs = np.dtype("%ci%d" % (dtype_endian_char, PP_WORD_DEPTH)) + dtype_floats = np.dtype("%cf%d" % (dtype_endian_char, PP_WORD_DEPTH)) # Keep reading until we reach the end of file while True: - # Move past the leading header length word - pp_file_seek(PP_WORD_DEPTH, os.SEEK_CUR) - # Get the LONG header entries - dtype = "%ci%d" % (dtype_endian_char, PP_WORD_DEPTH) - header_longs = np.fromfile(pp_file, dtype=dtype, count=NUM_LONG_HEADERS) + # Read the entire header record in one go + header_buf = pp_file_read(_HEADER_BYTES) # Nothing returned => EOF - if len(header_longs) == 0: + if len(header_buf) == 0: break - # Get the FLOAT header entries - dtype = "%cf%d" % (dtype_endian_char, PP_WORD_DEPTH) - header_floats = np.fromfile(pp_file, dtype=dtype, count=NUM_FLOAT_HEADERS) + header_longs = np.frombuffer( + header_buf, 
dtype=dtype_longs, + count=NUM_LONG_HEADERS, + offset=_LONGS_OFFSET, + ) + header_floats = np.frombuffer( + header_buf, + dtype=dtype_floats, + count=NUM_FLOAT_HEADERS, + offset=_FLOATS_OFFSET, + ) header = tuple(header_longs) + tuple(header_floats) # Make a PPField of the appropriate sub-class (depends on header @@ -1900,9 +1913,6 @@ def _field_gen(filename, read_data_bytes, little_ended=False): ) break - # Skip the trailing 4-byte word containing the header length - pp_file_seek(PP_WORD_DEPTH, os.SEEK_CUR) - # Read the word telling me how long the data + extra data is # This value is # of bytes len_of_data_plus_extra = struct.unpack_from( diff --git a/lib/iris/tests/unit/fileformats/pp/test__field_gen.py b/lib/iris/tests/unit/fileformats/pp/test__field_gen.py index 6618c79a38..1b3f79bbf9 100644 --- a/lib/iris/tests/unit/fileformats/pp/test__field_gen.py +++ b/lib/iris/tests/unit/fileformats/pp/test__field_gen.py @@ -6,12 +6,20 @@ import contextlib import io +import struct import numpy as np import pytest import iris.fileformats.pp as pp +# Byte size of the full header record that _field_gen reads in one go: +# leading length word + long headers + float headers + trailing length word +_HEADER_BYTES = pp.PP_WORD_DEPTH * (1 + pp.NUM_LONG_HEADERS + pp.NUM_FLOAT_HEADERS + 1) +# A valid data-length word: 4 bytes encoding the value 4 (big-endian uint32), +# matching lblrec=1 * PP_WORD_DEPTH=4 so LBLREC validation passes. +_DATA_LEN_WORD = struct.pack(">L", 4) + class Test: @pytest.fixture @@ -21,24 +29,31 @@ def _mock_for_field_gen(fields): side_effect_fields = list(fields)[:] def make_pp_field_override(*args): - # Iterates over the fields passed to this context manager, - # until there are no more, upon which the np.fromfile - # returns an empty list and the while loop in load() is - # broken. 
- result = side_effect_fields.pop(0) - if not side_effect_fields: - np.fromfile.return_value = [] - return result - - open_func = "builtins.open" - mocker.patch("numpy.fromfile", return_value=[0]) - mocker.patch(open_func) + return side_effect_fields.pop(0) + + # Build the sequence of bytes that pp_file.read() will return: + # For each field: a _HEADER_BYTES-sized buffer (all zeros is fine + # for our purposes — make_pp_field is fully mocked), followed by + # a 4-byte data-length word. + # After all fields: b"" to signal EOF on the next header read. + read_side_effects = [] + for _ in fields: + read_side_effects.append(bytes(_HEADER_BYTES)) # header read + read_side_effects.append(_DATA_LEN_WORD) # data-len word + read_side_effects.append(b"") # EOF + + mock_file = mocker.MagicMock(spec=io.RawIOBase) + mock_file.__enter__ = mocker.Mock(return_value=mock_file) + mock_file.__exit__ = mocker.Mock(return_value=False) + mock_file.read.side_effect = read_side_effects + + mocker.patch("builtins.open", return_value=mock_file) mocker.patch("struct.unpack_from", return_value=[4]) mocker.patch( "iris.fileformats.pp.make_pp_field", side_effect=make_pp_field_override, ) - yield + yield mock_file return _mock_for_field_gen @@ -57,40 +72,34 @@ def test_lblrec_invalid(self, mocker, mock_for_field_gen): assert len(warn) == 1 def test_read_headers_call(self, mocker, mock_for_field_gen): - # Checks that the two calls to np.fromfile are called in the - # expected way. + # Checks that the file is read in a single call of _HEADER_BYTES and + # that np.frombuffer is used to parse longs and floats from that buffer. 
pp_field = mocker.Mock(lblrec=1, lbext=0, lbuser=[0]) - with mock_for_field_gen([pp_field]): - open_fh = mocker.MagicMock(spec=io.RawIOBase) - open.return_value = open_fh + mock_frombuffer = mocker.patch("numpy.frombuffer", wraps=np.frombuffer) + with mock_for_field_gen([pp_field]) as mock_file: next(pp._field_gen("mocked", read_data_bytes=False)) - with open_fh as open_fh_ctx: - calls = [ - mocker.call(open_fh_ctx, count=45, dtype=">i4"), - mocker.call(open_fh_ctx, count=19, dtype=">f4"), - ] - np.fromfile.assert_has_calls(calls) - with open_fh as open_fh_ctx: - expected_deferred_bytes = ( - "mocked", - open_fh_ctx.tell(), - 4, - np.dtype(">f4"), - ) - assert pp_field.data == expected_deferred_bytes + # The first read() call should request exactly _HEADER_BYTES bytes. + first_read_call = mock_file.read.call_args_list[0] + assert first_read_call == mocker.call(_HEADER_BYTES) + + # frombuffer should have been called twice: once for longs, once for floats. + assert mock_frombuffer.call_count == 2 + calls = mock_frombuffer.call_args_list + assert calls[0].kwargs["count"] == pp.NUM_LONG_HEADERS + assert calls[0].kwargs["dtype"] == np.dtype(">i4") + assert calls[1].kwargs["count"] == pp.NUM_FLOAT_HEADERS + assert calls[1].kwargs["dtype"] == np.dtype(">f4") def test_read_data_call(self, mocker, mock_for_field_gen): # Checks that data is read if read_data is True. pp_field = mocker.Mock(lblrec=1, lbext=0, lbuser=[0]) - with mock_for_field_gen([pp_field]): - open_fh = mocker.MagicMock(spec=io.RawIOBase) - open.return_value = open_fh + with mock_for_field_gen([pp_field]) as mock_file: next(pp._field_gen("mocked", read_data_bytes=True)) - with open_fh as open_fh_ctx: - expected_loaded_bytes = pp.LoadedArrayBytes( - open_fh_ctx.read(), np.dtype(">f4") - ) - assert pp_field.data == expected_loaded_bytes + # The third read() call (index 2) should be the data payload read + # with data_len = lblrec*PP_WORD_DEPTH - lbext*PP_WORD_DEPTH = 4 bytes. 
+ data_read_call = mock_file.read.call_args_list[2] + assert data_read_call == mocker.call(4) + assert isinstance(pp_field.data, pp.LoadedArrayBytes) def test_invalid_header_release(self, tmp_path): # Check that an unknown LBREL value just results in a warning