From 5e358a08b6b67b875cd3562421186b012d2c92f4 Mon Sep 17 00:00:00 2001 From: George Robertson <50412379+georgeRobertson@users.noreply.github.com> Date: Tue, 24 Feb 2026 10:35:52 +0000 Subject: [PATCH 1/5] fix: deal with case sensitivity on file extension derivation (#50) --- src/dve/core_engine/backends/base/contract.py | 2 +- src/dve/pipeline/utils.py | 2 +- tests/features/books.feature | 4 ++-- tests/test_core_engine/test_engine.py | 4 ++-- tests/testdata/books/{nested_books.xml => nested_books.XML} | 0 5 files changed, 6 insertions(+), 6 deletions(-) rename tests/testdata/books/{nested_books.xml => nested_books.XML} (100%) diff --git a/src/dve/core_engine/backends/base/contract.py b/src/dve/core_engine/backends/base/contract.py index a431120..fc7da4d 100644 --- a/src/dve/core_engine/backends/base/contract.py +++ b/src/dve/core_engine/backends/base/contract.py @@ -339,7 +339,7 @@ def read_raw_entities( reader_metadata = contract_metadata.reader_metadata[entity_name] extension = "." + ( get_file_suffix(resource) or "" - ) # Already checked that extension supported. + ).lower() # Already checked that extension supported. 
reader_config = reader_metadata[extension] reader_type = get_reader(reader_config.reader) diff --git a/src/dve/pipeline/utils.py b/src/dve/pipeline/utils.py index a7e88aa..37f0cc7 100644 --- a/src/dve/pipeline/utils.py +++ b/src/dve/pipeline/utils.py @@ -47,7 +47,7 @@ def load_config( def load_reader(dataset: Dataset, model_name: str, file_extension: str): """Loads the readers for the diven feed, model name and file extension""" - reader_config = dataset[model_name].reader_config[f".{file_extension}"] + reader_config = dataset[model_name].reader_config[f".{file_extension.lower()}"] reader = _READER_REGISTRY[reader_config.reader](**reader_config.kwargs_) return reader diff --git a/tests/features/books.feature b/tests/features/books.feature index 9bc0611..f13658a 100644 --- a/tests/features/books.feature +++ b/tests/features/books.feature @@ -5,7 +5,7 @@ Feature: Pipeline tests using the books dataset introduces more complex transformations that require aggregation. Scenario: Validate complex nested XML data (spark) - Given I submit the books file nested_books.xml for processing + Given I submit the books file nested_books.XML for processing And A spark pipeline is configured with schema file 'nested_books.dischema.json' And I add initial audit entries for the submission Then the latest audit record for the submission is marked with processing status file_transformation @@ -32,7 +32,7 @@ Feature: Pipeline tests using the books dataset | number_warnings | 0 | Scenario: Validate complex nested XML data (duckdb) - Given I submit the books file nested_books.xml for processing + Given I submit the books file nested_books.XML for processing And A duckdb pipeline is configured with schema file 'nested_books_ddb.dischema.json' And I add initial audit entries for the submission Then the latest audit record for the submission is marked with processing status file_transformation diff --git a/tests/test_core_engine/test_engine.py b/tests/test_core_engine/test_engine.py index 
5118cbd..ef23d71 100644 --- a/tests/test_core_engine/test_engine.py +++ b/tests/test_core_engine/test_engine.py @@ -99,8 +99,8 @@ def test_dummy_books_run(self, spark, temp_dir: str): with test_instance: _, errors_uri = test_instance.run_pipeline( entity_locations={ - "header": get_test_file_path("books/nested_books.xml").as_posix(), - "nested_books": get_test_file_path("books/nested_books.xml").as_posix(), + "header": get_test_file_path("books/nested_books.XML").as_posix(), + "nested_books": get_test_file_path("books/nested_books.XML").as_posix(), } ) diff --git a/tests/testdata/books/nested_books.xml b/tests/testdata/books/nested_books.XML similarity index 100% rename from tests/testdata/books/nested_books.xml rename to tests/testdata/books/nested_books.XML From 70425c87474b197974fec84a6c40066a15d2740f Mon Sep 17 00:00:00 2001 From: stevenhsd <56357022+stevenhsd@users.noreply.github.com> Date: Fri, 27 Feb 2026 16:25:25 +0000 Subject: [PATCH 2/5] fix: amend arrow file read to use streams --- .../backends/implementations/duckdb/reference_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dve/core_engine/backends/implementations/duckdb/reference_data.py b/src/dve/core_engine/backends/implementations/duckdb/reference_data.py index c10aed7..af815ce 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +++ b/src/dve/core_engine/backends/implementations/duckdb/reference_data.py @@ -46,4 +46,4 @@ def load_parquet_file(self, uri: str) -> DuckDBPyRelation: @mark_refdata_file_extension("arrow") def load_arrow_file(self, uri: str) -> DuckDBPyRelation: """Load an arrow ipc file into a duckdb relation""" - return self.connection.from_arrow(ipc.open_file(uri).read_all()) # type:ignore + return self.connection.from_arrow(ipc.open_stream(uri).read_all()) # type:ignore From 2d1083257b0afc6edaa427213709eb395fd4807b Mon Sep 17 00:00:00 2001 From: "george.robertson1" <50412379+georgeRobertson@users.noreply.github.com> Date: Wed, 
4 Mar 2026 17:14:27 +0000 Subject: [PATCH 3/5] test: fix arrow format used in tests --- .../movies/refdata/movies_sequels.arrow | Bin 1114 -> 1032 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/testdata/movies/refdata/movies_sequels.arrow b/tests/testdata/movies/refdata/movies_sequels.arrow index 89ec37fa8fe721dad065af359e8ad7bb1e10d97b..9978fc8e42b344bec563c0eb772f4749e5f2526a 100644 GIT binary patch literal 1032 zcmb7Cy-or_5FX@E#EZdy3X8R~u&^?85)(sZM`LBqu*f9`r^iZSN9il*)9?WJ8YU)I zR>m05Z}w&(7|=MI-EZcbpPQY%G)>cLj>sdTH|U!{k3dj~R;Y;6qj&H=m?9Q?^EgM? z_tqkMh5s6Q6T5<)+afB{IwH%oW^;U7k-x#Qhd_mGV^Qp9EL`+~xCL6o^00_PqdUxh zf@fw4dJ>qhd)2cw4`3^7TN9Uo6VC|aqJ1YyUmMtd_yNmco}JR7WYGaqK(msY6YDz zx>Fsd&b3iNH#TkXahIQLrtAp2FF2mn%;*W(XlqqZbYOJPWId?UzE0|K+?;A-y3a|z s)7FSAn65?tz)w)m{R}_A zgF+Afgg>Fa+3Z3a4}O5J%*?*o+1c6IO*_5b>G>YfH$V=NPmCG~Dw9W5i~z?4C$K8C zYcx>_Yz`S{FVP+6AV+M1H;oS|iFpN*MJk?LPbop%JdZ8$el`|EWG9N19254hLdxu< zHTuZoy<+@2W~cJzRwkD!#rPF*K6d1_6mgpnDv_|R{^fmWqiC*~A*hZ_ z+zFwxQOTtS{0lwHss=7}SGZR%5U!DXW1)L!Nx^a~!`j`PQfRC-mfN^9GjU+-gAFWj z8~<3Cx4gaN=iV>Z%~OAIzSkl);Mj)nS^6@|;;TdkC+RGfM^Y)71;c0@C-=daT^CXX z(IizP_|pkr1#LlE5I$RfB!g}yBPFLQ&k-EYWY$g7!AhFdeXtPV+WC&T#-Ya7;qw#Y z(^xNje`5s3HbaC0b;BDWo)g}5em?@Qf?uV;ZSnWOuVx>5N00FGVSQFWXzU`VTK|Xd Lr-gHO^nc0^48WSO From 7f0bf5fd135e4faa35d94e087bd109816f689806 Mon Sep 17 00:00:00 2001 From: George Robertson <50412379+georgeRobertson@users.noreply.github.com> Date: Thu, 5 Mar 2026 11:56:06 +0000 Subject: [PATCH 4/5] fix: fix postcode type to raise error when postcode submitted without space (#53) --- src/dve/metadata_parser/domain_types.py | 54 +++++++++++++++---- .../test_domain_types.py | 18 +++++++ 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/src/dve/metadata_parser/domain_types.py b/src/dve/metadata_parser/domain_types.py index 3153d26..63616c0 100644 --- a/src/dve/metadata_parser/domain_types.py +++ b/src/dve/metadata_parser/domain_types.py @@ -173,33 +173,67 @@ def 
permissive_nhs_number(warn_on_test_numbers: bool = False): return type("NHSNumber", (NHSNumber, *NHSNumber.__bases__), dict_) -# TODO: Make the spacing configurable. Not all downstream consumers want a single space class Postcode(types.ConstrainedStr): """Postcode constrained string""" regex: re.Pattern = POSTCODE_REGEX strip_whitespace = True + apply_normalize = True @staticmethod - def normalize(postcode: str) -> Optional[str]: + def normalize(_postcode: str) -> Optional[str]: """Strips internal and external spaces""" - postcode = postcode.replace(" ", "") - if not postcode or postcode.lower() in NULL_POSTCODES: + _postcode = _postcode.replace(" ", "") + if not _postcode or _postcode.lower() in NULL_POSTCODES: return None - postcode = postcode.replace(" ", "") - return " ".join((postcode[0:-3], postcode[-3:])).upper() + _postcode = _postcode.replace(" ", "") + return " ".join((_postcode[0:-3], _postcode[-3:])).upper() @classmethod def validate(cls, value: str) -> Optional[str]: # type: ignore """Validates the given postcode""" - stripped = cls.normalize(value) - if not stripped: + if cls.apply_normalize and value: + value = cls.normalize(value) # type: ignore + + if not value: return None - if not cls.regex.match(stripped): + if not cls.regex.match(value): raise ValueError("Invalid Postcode submitted") - return stripped + return value + + +@lru_cache() +@validate_arguments +def postcode( + # pylint: disable=R0913 + strip_whitespace: Optional[bool] = True, + to_upper: Optional[bool] = False, + to_lower: Optional[bool] = False, + strict: Optional[bool] = False, + min_length: Optional[int] = None, + max_length: Optional[int] = None, + curtail_length: Optional[int] = None, + regex: Optional[str] = POSTCODE_REGEX, # type: ignore + apply_normalize: Optional[bool] = True, +) -> type[Postcode]: + """Return a Postcode subclass with the configured constraint + and normalization settings applied. 
+ + """ + dict_ = Postcode.__dict__.copy() + dict_["strip_whitespace"] = strip_whitespace + dict_["to_upper"] = to_upper + dict_["to_lower"] = to_lower + dict_["strict"] = strict + dict_["min_length"] = min_length + dict_["max_length"] = max_length + dict_["curtail_length"] = curtail_length + dict_["regex"] = regex + dict_["apply_normalize"] = apply_normalize + + return type("Postcode", (Postcode, *Postcode.__bases__), dict_) class OrgID(_SimpleRegexValidator): diff --git a/tests/test_model_generation/test_domain_types.py b/tests/test_model_generation/test_domain_types.py index 6ceee74..18d0c73 100644 --- a/tests/test_model_generation/test_domain_types.py +++ b/tests/test_model_generation/test_domain_types.py @@ -98,6 +98,24 @@ def test_postcode(postcode, expected): assert model.postcode == expected +@pytest.mark.parametrize( + ("postcode", "should_error"), + [ + ("LS479AJ", True), + ("PostcodeIamNot", True), + ("LS47 9AJ", False) + ] +) +def test_postcode_errors_with_apply_normalize_disabled(postcode: str, should_error: bool): + postcode_type = hct.postcode(apply_normalize=False) + + if should_error: + with pytest.raises(ValueError, match="Invalid Postcode submitted"): + assert postcode_type.validate(postcode) + else: + assert postcode_type.validate(postcode) + + @pytest.mark.parametrize(("org_id", "expected"), [("AB123", "AB123"), ("ABCDE", "ABCDE")]) def test_org_id_passes(org_id, expected): model = ATestModel(org_id=org_id) From aa3a2664696dadd05e9bc107bb0469c402a09b2e Mon Sep 17 00:00:00 2001 From: George Robertson <50412379+georgeRobertson@users.noreply.github.com> Date: Thu, 5 Mar 2026 11:59:47 +0000 Subject: [PATCH 5/5] fix: fix issue where formattedtime type not triggering validate correctly (#54) --- src/dve/metadata_parser/domain_types.py | 5 ++++ .../test_duckdb/test_data_contract.py | 2 +- .../test_domain_types.py | 24 ++++++++++++++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/dve/metadata_parser/domain_types.py 
b/src/dve/metadata_parser/domain_types.py index 63616c0..6c4a5c4 100644 --- a/src/dve/metadata_parser/domain_types.py +++ b/src/dve/metadata_parser/domain_types.py @@ -516,6 +516,11 @@ def validate(cls, value: Union[dt.time, dt.datetime, str]) -> dt.time | None: return new_time + @classmethod + def __get_validators__(cls) -> Iterator[classmethod]: + """Gets all validators""" + yield cls.validate # type: ignore + @lru_cache() @validate_arguments diff --git a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py index 9e49338..0300808 100644 --- a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py +++ b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py @@ -50,7 +50,7 @@ def test_duckdb_data_contract_csv(temp_csv_file): "description": "test", "callable": "formattedtime", "constraints": { - "time_format": "%Y-%m-%d", + "time_format": "%H:%M:%S", "timezone_treatment": "forbid" } } diff --git a/tests/test_model_generation/test_domain_types.py b/tests/test_model_generation/test_domain_types.py index 18d0c73..a494494 100644 --- a/tests/test_model_generation/test_domain_types.py +++ b/tests/test_model_generation/test_domain_types.py @@ -365,7 +365,8 @@ def test_formattedtime( ["23:00:00", "%H:%M:%S", "require",], ["23:00:00Z", "%I:%M:%S", "forbid",], [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",], - [dt.time(13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",] + [dt.time(13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",], + ["12:00", "%H:%M:%S", "forbid",], ] ) def test_formattedtime_raises( @@ -378,3 +379,24 @@ def test_formattedtime_raises( time_type = hct.formattedtime(time_format, timezone_treatment) with pytest.raises(ValueError): time_type.validate(time_to_validate) # pylint: disable=W0106 + + +class StrictTimeModel(BaseModel): + time_val: 
hct.formattedtime(time_format="%H:%M:%S", timezone_treatment="forbid") + + +@pytest.mark.parametrize( + ["time_to_validate", "expected_to_error"], + [ + ("12:00:00", False), + ("120000", True), + ("12:00", True), + ("12", True), + ] +) +def test_formattedtime_against_model(time_to_validate: str, expected_to_error: bool): + if expected_to_error: + with pytest.raises(ValueError): + StrictTimeModel(time_val=time_to_validate) + else: + StrictTimeModel(time_val=time_to_validate)