Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/src/whatsnew/latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ This document explains the changes made to Iris for this release
🚀 Performance Enhancements
===========================

#. N/A
#. `@trexfeathers`_ improved the speed of field iteration when reading PP files.
A speed-up of up to 3x has been seen, depending on the circumstances.
(:pull:`7089`)


🔥 Deprecations
Expand Down
34 changes: 22 additions & 12 deletions lib/iris/fileformats/pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1870,19 +1870,32 @@ def _field_gen(filename, read_data_bytes, little_ended=False):
pp_file_read = pp_file.read

field_count = 0
# Total bytes for the full header record:
# leading length word + long headers + float headers + trailing length word
_HEADER_BYTES = PP_WORD_DEPTH * (1 + NUM_LONG_HEADERS + NUM_FLOAT_HEADERS + 1)
_LONGS_OFFSET = PP_WORD_DEPTH # bytes: skip leading length word
_FLOATS_OFFSET = _LONGS_OFFSET + NUM_LONG_HEADERS * PP_WORD_DEPTH
dtype_longs = np.dtype("%ci%d" % (dtype_endian_char, PP_WORD_DEPTH))
dtype_floats = np.dtype("%cf%d" % (dtype_endian_char, PP_WORD_DEPTH))
# Keep reading until we reach the end of file
while True:
# Move past the leading header length word
pp_file_seek(PP_WORD_DEPTH, os.SEEK_CUR)
# Get the LONG header entries
dtype = "%ci%d" % (dtype_endian_char, PP_WORD_DEPTH)
header_longs = np.fromfile(pp_file, dtype=dtype, count=NUM_LONG_HEADERS)
# Read the entire header record in one go
header_buf = pp_file_read(_HEADER_BYTES)
# Nothing returned => EOF
if len(header_longs) == 0:
if len(header_buf) == 0:
break
# Get the FLOAT header entries
dtype = "%cf%d" % (dtype_endian_char, PP_WORD_DEPTH)
header_floats = np.fromfile(pp_file, dtype=dtype, count=NUM_FLOAT_HEADERS)
header_longs = np.frombuffer(
header_buf,
dtype=dtype_longs,
count=NUM_LONG_HEADERS,
offset=_LONGS_OFFSET,
)
header_floats = np.frombuffer(
header_buf,
dtype=dtype_floats,
count=NUM_FLOAT_HEADERS,
offset=_FLOATS_OFFSET,
)
Comment thread
ukmo-ccbunney marked this conversation as resolved.
header = tuple(header_longs) + tuple(header_floats)

# Make a PPField of the appropriate sub-class (depends on header
Expand All @@ -1900,9 +1913,6 @@ def _field_gen(filename, read_data_bytes, little_ended=False):
)
break

# Skip the trailing 4-byte word containing the header length
pp_file_seek(PP_WORD_DEPTH, os.SEEK_CUR)

# Read the word telling me how long the data + extra data is
# This value is # of bytes
len_of_data_plus_extra = struct.unpack_from(
Expand Down
89 changes: 49 additions & 40 deletions lib/iris/tests/unit/fileformats/pp/test__field_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,20 @@

import contextlib
import io
import struct

import numpy as np
import pytest

import iris.fileformats.pp as pp

# Byte size of the full header record that _field_gen reads in one go:
# leading length word + long headers + float headers + trailing length word
_HEADER_BYTES = pp.PP_WORD_DEPTH * (1 + pp.NUM_LONG_HEADERS + pp.NUM_FLOAT_HEADERS + 1)
# A valid data-length word: 4 bytes encoding the value 4 (big-endian uint32),
# matching lblrec=1 * PP_WORD_DEPTH=4 so LBLREC validation passes.
_DATA_LEN_WORD = struct.pack(">L", 4)


class Test:
@pytest.fixture
Expand All @@ -21,24 +29,31 @@ def _mock_for_field_gen(fields):
side_effect_fields = list(fields)[:]

def make_pp_field_override(*args):
# Iterates over the fields passed to this context manager,
# until there are no more, upon which the np.fromfile
# returns an empty list and the while loop in load() is
# broken.
result = side_effect_fields.pop(0)
if not side_effect_fields:
np.fromfile.return_value = []
return result

open_func = "builtins.open"
mocker.patch("numpy.fromfile", return_value=[0])
mocker.patch(open_func)
return side_effect_fields.pop(0)

# Build the sequence of bytes that pp_file.read() will return:
# For each field: a _HEADER_BYTES-sized buffer (all zeros is fine
# for our purposes — make_pp_field is fully mocked), followed by
# a 4-byte data-length word.
# After all fields: b"" to signal EOF on the next header read.
read_side_effects = []
for _ in fields:
read_side_effects.append(bytes(_HEADER_BYTES)) # header read
read_side_effects.append(_DATA_LEN_WORD) # data-len word
read_side_effects.append(b"") # EOF

mock_file = mocker.MagicMock(spec=io.RawIOBase)
mock_file.__enter__ = mocker.Mock(return_value=mock_file)
mock_file.__exit__ = mocker.Mock(return_value=False)
mock_file.read.side_effect = read_side_effects

mocker.patch("builtins.open", return_value=mock_file)
mocker.patch("struct.unpack_from", return_value=[4])
mocker.patch(
"iris.fileformats.pp.make_pp_field",
side_effect=make_pp_field_override,
)
yield
yield mock_file

return _mock_for_field_gen

Expand All @@ -57,40 +72,34 @@ def test_lblrec_invalid(self, mocker, mock_for_field_gen):
assert len(warn) == 1

def test_read_headers_call(self, mocker, mock_for_field_gen):
# Checks that the two calls to np.fromfile are called in the
# expected way.
# Checks that the file is read in a single call of _HEADER_BYTES and
# that np.frombuffer is used to parse longs and floats from that buffer.
pp_field = mocker.Mock(lblrec=1, lbext=0, lbuser=[0])
with mock_for_field_gen([pp_field]):
open_fh = mocker.MagicMock(spec=io.RawIOBase)
open.return_value = open_fh
mock_frombuffer = mocker.patch("numpy.frombuffer", wraps=np.frombuffer)
with mock_for_field_gen([pp_field]) as mock_file:
next(pp._field_gen("mocked", read_data_bytes=False))
with open_fh as open_fh_ctx:
calls = [
mocker.call(open_fh_ctx, count=45, dtype=">i4"),
mocker.call(open_fh_ctx, count=19, dtype=">f4"),
]
np.fromfile.assert_has_calls(calls)
with open_fh as open_fh_ctx:
expected_deferred_bytes = (
"mocked",
open_fh_ctx.tell(),
4,
np.dtype(">f4"),
)
assert pp_field.data == expected_deferred_bytes
# The first read() call should request exactly _HEADER_BYTES bytes.
first_read_call = mock_file.read.call_args_list[0]
assert first_read_call == mocker.call(_HEADER_BYTES)

# frombuffer should have been called twice: once for longs, once for floats.
assert mock_frombuffer.call_count == 2
calls = mock_frombuffer.call_args_list
assert calls[0].kwargs["count"] == pp.NUM_LONG_HEADERS
assert calls[0].kwargs["dtype"] == np.dtype(">i4")
assert calls[1].kwargs["count"] == pp.NUM_FLOAT_HEADERS
assert calls[1].kwargs["dtype"] == np.dtype(">f4")

def test_read_data_call(self, mocker, mock_for_field_gen):
# Checks that data is read if read_data is True.
pp_field = mocker.Mock(lblrec=1, lbext=0, lbuser=[0])
with mock_for_field_gen([pp_field]):
open_fh = mocker.MagicMock(spec=io.RawIOBase)
open.return_value = open_fh
with mock_for_field_gen([pp_field]) as mock_file:
next(pp._field_gen("mocked", read_data_bytes=True))
with open_fh as open_fh_ctx:
expected_loaded_bytes = pp.LoadedArrayBytes(
open_fh_ctx.read(), np.dtype(">f4")
)
assert pp_field.data == expected_loaded_bytes
# The third read() call (index 2) should be the data payload read
# with data_len = lblrec*PP_WORD_DEPTH - lbext*PP_WORD_DEPTH = 4 bytes.
data_read_call = mock_file.read.call_args_list[2]
assert data_read_call == mocker.call(4)
assert isinstance(pp_field.data, pp.LoadedArrayBytes)

def test_invalid_header_release(self, tmp_path):
# Check that an unknown LBREL value just results in a warning
Expand Down
Loading