From 5e358a08b6b67b875cd3562421186b012d2c92f4 Mon Sep 17 00:00:00 2001
From: George Robertson <50412379+georgeRobertson@users.noreply.github.com>
Date: Tue, 24 Feb 2026 10:35:52 +0000
Subject: [PATCH 1/4] fix: deal with case sensitivity on file extension
 derivation (#50)

---
 src/dve/core_engine/backends/base/contract.py               | 2 +-
 src/dve/pipeline/utils.py                                   | 2 +-
 tests/features/books.feature                                | 4 ++--
 tests/test_core_engine/test_engine.py                       | 4 ++--
 tests/testdata/books/{nested_books.xml => nested_books.XML} | 0
 5 files changed, 6 insertions(+), 6 deletions(-)
 rename tests/testdata/books/{nested_books.xml => nested_books.XML} (100%)

diff --git a/src/dve/core_engine/backends/base/contract.py b/src/dve/core_engine/backends/base/contract.py
index a431120..fc7da4d 100644
--- a/src/dve/core_engine/backends/base/contract.py
+++ b/src/dve/core_engine/backends/base/contract.py
@@ -339,7 +339,7 @@ def read_raw_entities(
         reader_metadata = contract_metadata.reader_metadata[entity_name]
         extension = "." + (
             get_file_suffix(resource) or ""
-        )  # Already checked that extension supported.
+        ).lower()  # Already checked that extension supported.
         reader_config = reader_metadata[extension]
         reader_type = get_reader(reader_config.reader)
 
diff --git a/src/dve/pipeline/utils.py b/src/dve/pipeline/utils.py
index a7e88aa..37f0cc7 100644
--- a/src/dve/pipeline/utils.py
+++ b/src/dve/pipeline/utils.py
@@ -47,7 +47,7 @@ def load_config(
 
 def load_reader(dataset: Dataset, model_name: str, file_extension: str):
     """Loads the readers for the given feed, model name and file extension"""
-    reader_config = dataset[model_name].reader_config[f".{file_extension}"]
+    reader_config = dataset[model_name].reader_config[f".{file_extension.lower()}"]
     reader = _READER_REGISTRY[reader_config.reader](**reader_config.kwargs_)
     return reader
 
diff --git a/tests/features/books.feature b/tests/features/books.feature
index 9bc0611..f13658a 100644
--- a/tests/features/books.feature
+++ b/tests/features/books.feature
@@ -5,7 +5,7 @@ Feature: Pipeline tests using the books dataset
     introduces more complex transformations that require aggregation.
 
   Scenario: Validate complex nested XML data (spark)
-    Given I submit the books file nested_books.xml for processing
+    Given I submit the books file nested_books.XML for processing
     And A spark pipeline is configured with schema file 'nested_books.dischema.json'
     And I add initial audit entries for the submission
     Then the latest audit record for the submission is marked with processing status file_transformation
@@ -32,7 +32,7 @@ Feature: Pipeline tests using the books dataset
       | number_warnings | 0 |
 
   Scenario: Validate complex nested XML data (duckdb)
-    Given I submit the books file nested_books.xml for processing
+    Given I submit the books file nested_books.XML for processing
     And A duckdb pipeline is configured with schema file 'nested_books_ddb.dischema.json'
     And I add initial audit entries for the submission
     Then the latest audit record for the submission is marked with processing status file_transformation
diff --git a/tests/test_core_engine/test_engine.py b/tests/test_core_engine/test_engine.py
index 5118cbd..ef23d71 100644
--- a/tests/test_core_engine/test_engine.py
+++ b/tests/test_core_engine/test_engine.py
@@ -99,8 +99,8 @@ def test_dummy_books_run(self, spark, temp_dir: str):
         with test_instance:
             _, errors_uri = test_instance.run_pipeline(
                 entity_locations={
-                    "header": get_test_file_path("books/nested_books.xml").as_posix(),
-                    "nested_books": get_test_file_path("books/nested_books.xml").as_posix(),
+                    "header": get_test_file_path("books/nested_books.XML").as_posix(),
+                    "nested_books": get_test_file_path("books/nested_books.XML").as_posix(),
                 }
             )
 
diff --git a/tests/testdata/books/nested_books.xml b/tests/testdata/books/nested_books.XML
similarity index 100%
rename from tests/testdata/books/nested_books.xml
rename to tests/testdata/books/nested_books.XML

From dc0b68d02e24b5d52992cef5369578f7c07e8def Mon Sep 17 00:00:00 2001
From: stevenhsd <56357022+stevenhsd@users.noreply.github.com>
Date: Thu, 5 Mar 2026 16:46:44 +0000
Subject: [PATCH 2/4] feat: add row index at file transform; if not using
 file transform, add during data contract or business rules (but likely
 non-deterministic)

---
 poetry.lock                                   | 600 +++++++++++++++++-
 pyproject.toml                                |   1 +
 src/dve/core_engine/backends/base/backend.py  |   5 +-
 src/dve/core_engine/backends/base/contract.py |   8 +
 src/dve/core_engine/backends/base/reader.py   |   8 +
 src/dve/core_engine/backends/base/rules.py    |  14 +-
 .../core_engine/backends/base/utilities.py    |   4 +
 .../implementations/duckdb/contract.py        |   9 +-
 .../implementations/duckdb/duckdb_helpers.py  |  19 +-
 .../implementations/duckdb/readers/csv.py     |  21 +-
 .../implementations/duckdb/readers/json.py    |   5 +-
 .../implementations/duckdb/readers/xml.py     |   8 +-
 .../backends/implementations/duckdb/rules.py  |  18 +-
 .../backends/implementations/spark/backend.py |   4 +-
 .../implementations/spark/contract.py         |  14 +-
 .../implementations/spark/readers/csv.py      |   4 +-
 .../implementations/spark/readers/json.py     |   5 +-
 .../implementations/spark/readers/xml.py      |  11 +-
 .../backends/implementations/spark/rules.py   |  17 +-
 .../implementations/spark/spark_helpers.py    |  22 +
 src/dve/core_engine/backends/readers/csv.py   |   8 +-
 .../core_engine/backends/readers/utilities.py |   6 +-
 src/dve/core_engine/backends/readers/xml.py   |   9 +-
 src/dve/core_engine/backends/utilities.py     |  16 +
 src/dve/core_engine/constants.py              |   4 +-
 src/dve/core_engine/engine.py                 |   4 +-
 src/dve/core_engine/message.py                |   4 +-
 src/dve/metadata_parser/models.py             |   1 +
 src/dve/pipeline/pipeline.py                  |   5 +-
 .../test_duckdb/test_data_contract.py         |  13 +-
.../test_spark/test_data_contract.py | 3 + .../test_backends/test_readers/test_csv.py | 21 +- .../test_readers/test_ddb_csv.py | 21 +- .../test_readers/test_ddb_json.py | 17 +- .../test_readers/test_ddb_xml.py | 30 +- .../test_readers/test_spark_json.py | 15 +- tests/test_core_engine/test_message.py | 6 +- tests/testdata/planets/planets.dischema.json | 4 +- .../planets/planets_ddb.dischema.json | 2 +- 39 files changed, 845 insertions(+), 141 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7b1987a..38d8df1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,18 @@ # This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +[[package]] +name = "appnope" +version = "0.1.4" +description = "Disable App Nap on macOS >= 10.9" +optional = false +python-versions = ">=3.6" +groups = ["dev"] +markers = "platform_system == \"Darwin\"" +files = [ + {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, + {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, +] + [[package]] name = "argcomplete" version = "3.6.3" @@ -35,6 +48,22 @@ wrapt = [ {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] +[[package]] +name = "asttokens" +version = "3.0.1" +description = "Annotate AST trees with source code positions" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a"}, + {file = "asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7"}, +] + +[package.extras] +astroid = ["astroid (>=2,<5)"] +test = ["astroid (>=2,<5)", "pytest (<9.0)", "pytest-cov", "pytest-xdist"] + [[package]] name = "behave" version = "1.3.3" @@ -551,7 +580,6 @@ description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.9" groups = ["dev", "test"] -markers = "platform_python_implementation != \"PyPy\"" files = [ {file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"}, {file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"}, @@ -638,6 +666,7 @@ files = [ {file = "cffi-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9"}, {file = "cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529"}, ] +markers = {dev = "platform_python_implementation != \"PyPy\" or implementation_name == \"pypy\"", test = "platform_python_implementation != \"PyPy\""} [package.dependencies] pycparser = {version = "*", markers = "implementation_name != \"PyPy\""} @@ -805,6 +834,21 @@ files = [ ] markers = {lint = "platform_system == \"Windows\" or sys_platform == \"win32\""} +[[package]] +name = "comm" +version = "0.2.3" +description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417"}, + {file = "comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971"}, +] + +[package.extras] +test = ["pytest"] + [[package]] name = "commitizen" version = "4.9.1" @@ -1034,6 +1078,46 @@ files = [ {file = "cucumber_tag_expressions-9.0.0.tar.gz", hash = "sha256:731302c12bd602309596b35e733c1021b517d4948329803c23ca026e26ef4e99"}, ] +[[package]] +name = "debugpy" +version = "1.8.20" +description = "An implementation of the Debug Adapter Protocol for Python" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "debugpy-1.8.20-cp310-cp310-macosx_15_0_x86_64.whl", hash = "sha256:157e96ffb7f80b3ad36d808646198c90acb46fdcfd8bb1999838f0b6f2b59c64"}, + {file = "debugpy-1.8.20-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:c1178ae571aff42e61801a38b007af504ec8e05fde1c5c12e5a7efef21009642"}, + {file = "debugpy-1.8.20-cp310-cp310-win32.whl", hash = "sha256:c29dd9d656c0fbd77906a6e6a82ae4881514aa3294b94c903ff99303e789b4a2"}, + {file = "debugpy-1.8.20-cp310-cp310-win_amd64.whl", hash = "sha256:3ca85463f63b5dd0aa7aaa933d97cbc47c174896dcae8431695872969f981893"}, + {file = "debugpy-1.8.20-cp311-cp311-macosx_15_0_universal2.whl", hash = "sha256:eada6042ad88fa1571b74bd5402ee8b86eded7a8f7b827849761700aff171f1b"}, + {file = "debugpy-1.8.20-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:7de0b7dfeedc504421032afba845ae2a7bcc32ddfb07dae2c3ca5442f821c344"}, + {file = "debugpy-1.8.20-cp311-cp311-win32.whl", hash = "sha256:773e839380cf459caf73cc533ea45ec2737a5cc184cf1b3b796cd4fd98504fec"}, + {file = "debugpy-1.8.20-cp311-cp311-win_amd64.whl", hash = "sha256:1f7650546e0eded1902d0f6af28f787fa1f1dbdbc97ddabaf1cd963a405930cb"}, + {file = "debugpy-1.8.20-cp312-cp312-macosx_15_0_universal2.whl", hash = "sha256:4ae3135e2089905a916909ef31922b2d733d756f66d87345b3e5e52b7a55f13d"}, + {file = "debugpy-1.8.20-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:88f47850a4284b88bd2bfee1f26132147d5d504e4e86c22485dfa44b97e19b4b"}, + {file = "debugpy-1.8.20-cp312-cp312-win32.whl", hash = "sha256:4057ac68f892064e5f98209ab582abfee3b543fb55d2e87610ddc133a954d390"}, + {file = "debugpy-1.8.20-cp312-cp312-win_amd64.whl", hash = "sha256:a1a8f851e7cf171330679ef6997e9c579ef6dd33c9098458bd9986a0f4ca52e3"}, + {file = "debugpy-1.8.20-cp313-cp313-macosx_15_0_universal2.whl", hash = "sha256:5dff4bb27027821fdfcc9e8f87309a28988231165147c31730128b1c983e282a"}, + {file = "debugpy-1.8.20-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:84562982dd7cf5ebebfdea667ca20a064e096099997b175fe204e86817f64eaf"}, + {file = "debugpy-1.8.20-cp313-cp313-win32.whl", hash = "sha256:da11dea6447b2cadbf8ce2bec59ecea87cc18d2c574980f643f2d2dfe4862393"}, + {file = "debugpy-1.8.20-cp313-cp313-win_amd64.whl", hash = "sha256:eb506e45943cab2efb7c6eafdd65b842f3ae779f020c82221f55aca9de135ed7"}, + {file = "debugpy-1.8.20-cp314-cp314-macosx_15_0_universal2.whl", hash = "sha256:9c74df62fc064cd5e5eaca1353a3ef5a5d50da5eb8058fcef63106f7bebe6173"}, + {file = "debugpy-1.8.20-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:077a7447589ee9bc1ff0cdf443566d0ecf540ac8aa7333b775ebcb8ce9f4ecad"}, + {file = "debugpy-1.8.20-cp314-cp314-win32.whl", hash = "sha256:352036a99dd35053b37b7803f748efc456076f929c6a895556932eaf2d23b07f"}, + {file = "debugpy-1.8.20-cp314-cp314-win_amd64.whl", hash = 
"sha256:a98eec61135465b062846112e5ecf2eebb855305acc1dfbae43b72903b8ab5be"}, + {file = "debugpy-1.8.20-cp38-cp38-macosx_15_0_x86_64.whl", hash = "sha256:b773eb026a043e4d9c76265742bc846f2f347da7e27edf7fe97716ea19d6bfc5"}, + {file = "debugpy-1.8.20-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:20d6e64ea177ab6732bffd3ce8fc6fb8879c60484ce14c3b3fe183b1761459ca"}, + {file = "debugpy-1.8.20-cp38-cp38-win32.whl", hash = "sha256:0dfd9adb4b3c7005e9c33df430bcdd4e4ebba70be533e0066e3a34d210041b66"}, + {file = "debugpy-1.8.20-cp38-cp38-win_amd64.whl", hash = "sha256:60f89411a6c6afb89f18e72e9091c3dfbcfe3edc1066b2043a1f80a3bbb3e11f"}, + {file = "debugpy-1.8.20-cp39-cp39-macosx_15_0_x86_64.whl", hash = "sha256:bff8990f040dacb4c314864da95f7168c5a58a30a66e0eea0fb85e2586a92cd6"}, + {file = "debugpy-1.8.20-cp39-cp39-manylinux_2_34_x86_64.whl", hash = "sha256:70ad9ae09b98ac307b82c16c151d27ee9d68ae007a2e7843ba621b5ce65333b5"}, + {file = "debugpy-1.8.20-cp39-cp39-win32.whl", hash = "sha256:9eeed9f953f9a23850c85d440bf51e3c56ed5d25f8560eeb29add815bd32f7ee"}, + {file = "debugpy-1.8.20-cp39-cp39-win_amd64.whl", hash = "sha256:760813b4fff517c75bfe7923033c107104e76acfef7bda011ffea8736e9a66f8"}, + {file = "debugpy-1.8.20-py2.py3-none-any.whl", hash = "sha256:5be9bed9ae3be00665a06acaa48f8329d2b9632f15fd09f6a9a8c8d9907e54d7"}, + {file = "debugpy-1.8.20.tar.gz", hash = "sha256:55bc8701714969f1ab89a6d5f2f3d40c36f91b2cbe2f65d98bf8196f6a6a2c33"}, +] + [[package]] name = "decli" version = "0.6.3" @@ -1046,6 +1130,18 @@ files = [ {file = "decli-0.6.3.tar.gz", hash = "sha256:87f9d39361adf7f16b9ca6e3b614badf7519da13092f2db3c80ca223c53c7656"}, ] +[[package]] +name = "decorator" +version = "5.2.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"}, + {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"}, +] + [[package]] name = "delta-spark" version = "2.4.0" @@ -1201,6 +1297,21 @@ typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "executing" +version = "2.2.1" +description = "Get the currently executing AST node of a frame, and other information" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017"}, + {file = "executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4"}, +] + +[package.extras] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] + [[package]] name = "faker" version = "18.11.1" @@ -1294,6 +1405,130 @@ files = [ {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, ] +[[package]] +name = "ipykernel" +version = "7.2.0" +description = "IPython Kernel for Jupyter" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "ipykernel-7.2.0-py3-none-any.whl", hash = "sha256:3bbd4420d2b3cc105cbdf3756bfc04500b1e52f090a90716851f3916c62e1661"}, + {file = "ipykernel-7.2.0.tar.gz", hash = "sha256:18ed160b6dee2cbb16e5f3575858bc19d8f1fe6046a9a680c708494ce31d909e"}, +] + +[package.dependencies] +appnope = 
{version = ">=0.1.2", markers = "platform_system == \"Darwin\""} +comm = ">=0.1.1" +debugpy = ">=1.6.5" +ipython = ">=7.23.1" +jupyter-client = ">=8.8.0" +jupyter-core = ">=5.1,<6.0.dev0 || >=6.1.dev0" +matplotlib-inline = ">=0.1" +nest-asyncio = ">=1.4" +packaging = ">=22" +psutil = ">=5.7" +pyzmq = ">=25" +tornado = ">=6.4.1" +traitlets = ">=5.4.0" + +[package.extras] +cov = ["coverage[toml]", "matplotlib", "pytest-cov", "trio"] +docs = ["intersphinx-registry", "myst-parser", "pydata-sphinx-theme", "sphinx (<8.2.0)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "trio"] +pyqt5 = ["pyqt5"] +pyside6 = ["pyside6"] +test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0,<10)", "pytest-asyncio (>=0.23.5)", "pytest-cov", "pytest-timeout"] + +[[package]] +name = "ipython" +version = "8.38.0" +description = "IPython: Productive Interactive Computing" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +markers = "python_version == \"3.10\"" +files = [ + {file = "ipython-8.38.0-py3-none-any.whl", hash = "sha256:750162629d800ac65bb3b543a14e7a74b0e88063eac9b92124d4b2aa3f6d8e86"}, + {file = "ipython-8.38.0.tar.gz", hash = "sha256:9cfea8c903ce0867cc2f23199ed8545eb741f3a69420bfcf3743ad1cec856d39"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\""} +prompt_toolkit = ">=3.0.41,<3.1.0" +pygments = ">=2.4.0" +stack_data = "*" +traitlets = ">=5.13.0" +typing_extensions = {version = ">=4.6", markers = "python_version < \"3.12\""} + +[package.extras] +all = ["ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole]", "ipython[test,test-extra]"] +black = ["black"] +doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "tomli ; python_version < \"3.11\"", "typing_extensions"] +kernel = ["ipykernel"] +matplotlib = ["matplotlib"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["ipywidgets", "notebook"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["packaging", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] +test-extra = ["curio", "ipython[test]", "jupyter_ai", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"] + +[[package]] +name = "ipython" +version = "9.10.0" +description = "IPython: Productive Interactive Computing" +optional = false +python-versions = ">=3.11" +groups = ["dev"] +markers = "python_version == \"3.11\"" +files = [ + {file = "ipython-9.10.0-py3-none-any.whl", hash = "sha256:c6ab68cc23bba8c7e18e9b932797014cc61ea7fd6f19de180ab9ba73e65ee58d"}, + {file = "ipython-9.10.0.tar.gz", hash = "sha256:cd9e656be97618a0676d058134cd44e6dc7012c0e5cb36a9ce96a8c904adaf77"}, +] + +[package.dependencies] +colorama = {version = ">=0.4.4", markers = "sys_platform == \"win32\""} +decorator = ">=4.3.2" +ipython-pygments-lexers = ">=1.0.0" +jedi = ">=0.18.1" +matplotlib-inline = ">=0.1.5" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\""} +prompt_toolkit = ">=3.0.41,<3.1.0" +pygments = ">=2.11.0" +stack_data = ">=0.6.0" +traitlets = ">=5.13.0" +typing_extensions = {version = ">=4.6", markers = 
"python_version < \"3.12\""} + +[package.extras] +all = ["argcomplete (>=3.0)", "ipython[doc,matplotlib,terminal,test,test-extra]"] +black = ["black"] +doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[matplotlib,test]", "setuptools (>=70.0)", "sphinx (>=8.0)", "sphinx-rtd-theme (>=0.1.8)", "sphinx_toml (==0.0.4)", "typing_extensions"] +matplotlib = ["matplotlib (>3.9)"] +test = ["packaging (>=20.1.0)", "pytest (>=7.0.0)", "pytest-asyncio (>=1.0.0)", "setuptools (>=61.2)", "testpath (>=0.2)"] +test-extra = ["curio", "ipykernel (>6.30)", "ipython[matplotlib]", "ipython[test]", "jupyter_ai", "nbclient", "nbformat", "numpy (>=1.27)", "pandas (>2.1)", "trio (>=0.1.0)"] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +description = "Defines a variety of Pygments lexers for highlighting IPython code." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version == \"3.11\"" +files = [ + {file = "ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c"}, + {file = "ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81"}, +] + +[package.dependencies] +pygments = "*" + [[package]] name = "isort" version = "5.11.5" @@ -1312,6 +1547,26 @@ pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib" plugins = ["setuptools"] requirements-deprecated-finder = ["pip-api", "pipreqs"] +[[package]] +name = "jedi" +version = "0.19.2" +description = "An autocompletion tool for Python that can be used for text editors." +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, + {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, +] + +[package.dependencies] +parso = ">=0.8.4,<0.9.0" + +[package.extras] +docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] +qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] +testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] + [[package]] name = "jinja2" version = "3.1.6" @@ -1342,6 +1597,50 @@ files = [ {file = "jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d"}, ] +[[package]] +name = "jupyter-client" +version = "8.8.0" +description = "Jupyter protocol implementation and client libraries" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "jupyter_client-8.8.0-py3-none-any.whl", hash = "sha256:f93a5b99c5e23a507b773d3a1136bd6e16c67883ccdbd9a829b0bbdb98cd7d7a"}, + {file = "jupyter_client-8.8.0.tar.gz", hash = "sha256:d556811419a4f2d96c869af34e854e3f059b7cc2d6d01a9cd9c85c267691be3e"}, +] + +[package.dependencies] +jupyter-core = ">=5.1" +python-dateutil = 
">=2.8.2" +pyzmq = ">=25.0" +tornado = ">=6.4.1" +traitlets = ">=5.3" + +[package.extras] +docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] +orjson = ["orjson"] +test = ["anyio", "coverage", "ipykernel (>=6.14)", "msgpack", "mypy ; platform_python_implementation != \"PyPy\"", "paramiko ; sys_platform == \"win32\"", "pre-commit", "pytest", "pytest-cov", "pytest-jupyter[client] (>=0.6.2)", "pytest-timeout"] + +[[package]] +name = "jupyter-core" +version = "5.9.1" +description = "Jupyter core package. A base package on which Jupyter projects rely." +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407"}, + {file = "jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508"}, +] + +[package.dependencies] +platformdirs = ">=2.5" +traitlets = ">=5.3" + +[package.extras] +docs = ["intersphinx-registry", "myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-spelling", "traitlets"] +test = ["ipykernel", "pre-commit", "pytest (<9)", "pytest-cov", "pytest-timeout"] + [[package]] name = "lazy-object-proxy" version = "1.12.0" @@ -1604,6 +1903,24 @@ files = [ {file = "markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698"}, ] +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +description = "Inline Matplotlib backend for Jupyter" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76"}, + {file = "matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe"}, +] + +[package.dependencies] +traitlets = "*" + +[package.extras] +test = ["flake8", "nbdime", "nbval", "notebook", "pytest"] + [[package]] name = "mccabe" version = "0.7.0" @@ -1812,6 +2129,18 @@ files = [ {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, ] +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = false +python-versions = ">=3.5" +groups = ["dev"] +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nodeenv" version = "1.10.0" @@ -2144,6 +2473,22 @@ develop = ["build (>=0.5.1)", "coverage (>=4.4)", "pylint", "pytest (<5.0) ; pyt docs = ["Sphinx (>=1.6)", "sphinx_bootstrap_theme (>=0.6.0)"] testing = ["pytest (<5.0) ; python_version < \"3.0\"", "pytest (>=5.0) ; python_version >= \"3.0\"", "pytest-html (>=1.19.0)"] +[[package]] +name = "parso" +version = "0.8.6" +description = "A Python Parser" +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "parso-0.8.6-py2.py3-none-any.whl", hash = "sha256:2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff"}, + {file = "parso-0.8.6.tar.gz", hash = "sha256:2b9a0332696df97d454fa67b81618fd69c35a7b90327cbe6ba5c92d2c68a7bfd"}, +] + +[package.extras] +qa = ["flake8 (==5.0.4)", 
"types-setuptools (==67.2.0.1)", "zuban (==0.5.1)"] +testing = ["docopt", "pytest"] + [[package]] name = "pathspec" version = "1.0.4" @@ -2162,6 +2507,22 @@ optional = ["typing-extensions (>=4)"] re2 = ["google-re2 (>=1.1)"] tests = ["pytest (>=9)", "typing-extensions (>=4.15)"] +[[package]] +name = "pexpect" +version = "4.9.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +groups = ["dev"] +markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\"" +files = [ + {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, + {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "platformdirs" version = "4.7.0" @@ -2264,6 +2625,69 @@ files = [ [package.dependencies] wcwidth = "*" +[[package]] +name = "psutil" +version = "7.2.2" +description = "Cross-platform lib for process and system monitoring." +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b"}, + {file = "psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312"}, + {file = "psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b"}, + {file = "psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf"}, + {file = "psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1"}, + {file = "psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9"}, + {file = 
"psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc"}, + {file = "psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988"}, + {file = "psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee"}, + {file = "psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372"}, +] + +[package.extras] +dev = ["abi3audit", "black", "check-manifest", "colorama ; os_name == \"nt\"", "coverage", "packaging", "psleak", "pylint", "pyperf", "pypinfo", "pyreadline3 ; os_name == \"nt\"", "pytest", "pytest-cov", "pytest-instafail", "pytest-xdist", "pywin32 ; os_name == \"nt\" and implementation_name != \"pypy\"", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "validate-pyproject[all]", "virtualenv", "vulture", "wheel", "wheel ; os_name == \"nt\" and implementation_name != \"pypy\"", "wmi ; os_name == \"nt\" and implementation_name != \"pypy\""] +test = ["psleak", "pytest", "pytest-instafail", "pytest-xdist", "pywin32 ; os_name == \"nt\" and implementation_name != \"pypy\"", "setuptools", "wheel ; os_name == \"nt\" and implementation_name != \"pypy\"", "wmi ; os_name == \"nt\" and implementation_name != \"pypy\""] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +groups = ["dev"] +markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\"" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +description = "Safely evaluate AST nodes without side effects" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, + {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "py4j" version = "0.10.9.7" @@ -2335,11 +2759,11 @@ description = "C parser in Python" optional = false python-versions = ">=3.10" groups = ["dev", "test"] -markers = "platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\"" files = [ {file = "pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992"}, {file = "pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29"}, ] +markers = {dev = "(platform_python_implementation != \"PyPy\" or implementation_name == \"pypy\") and implementation_name != \"PyPy\"", test = "platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\""} [[package]] name = "pydantic" @@ -2533,6 +2957,13 @@ optional = false 
python-versions = ">=3.8" groups = ["dev", "test"] files = [ + {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"}, + {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"}, + {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"}, + {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"}, {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"}, {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"}, {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"}, @@ -2601,6 +3032,111 @@ files = [ {file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"}, ] +[[package]] +name = "pyzmq" +version = "27.1.0" +description = "Python bindings for 0MQ" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pyzmq-27.1.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:508e23ec9bc44c0005c4946ea013d9317ae00ac67778bd47519fdf5a0e930ff4"}, + {file = "pyzmq-27.1.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:507b6f430bdcf0ee48c0d30e734ea89ce5567fd7b8a0f0044a369c176aa44556"}, + {file = "pyzmq-27.1.0-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf7b38f9fd7b81cb6d9391b2946382c8237fd814075c6aa9c3b746d53076023b"}, + {file = "pyzmq-27.1.0-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:03ff0b279b40d687691a6217c12242ee71f0fba28bf8626ff50e3ef0f4410e1e"}, + {file = "pyzmq-27.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:677e744fee605753eac48198b15a2124016c009a11056f93807000ab11ce6526"}, + {file = "pyzmq-27.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd2fec2b13137416a1c5648b7009499bcc8fea78154cd888855fa32514f3dad1"}, + {file = "pyzmq-27.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:08e90bb4b57603b84eab1d0ca05b3bbb10f60c1839dc471fc1c9e1507bef3386"}, + {file = "pyzmq-27.1.0-cp310-cp310-win32.whl", hash = "sha256:a5b42d7a0658b515319148875fcb782bbf118dd41c671b62dae33666c2213bda"}, + {file = "pyzmq-27.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:c0bb87227430ee3aefcc0ade2088100e528d5d3298a0a715a64f3d04c60ba02f"}, + {file = "pyzmq-27.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:9a916f76c2ab8d045b19f2286851a38e9ac94ea91faf65bd64735924522a8b32"}, + {file = "pyzmq-27.1.0-cp311-cp311-macosx_10_15_universal2.whl", hash = 
"sha256:226b091818d461a3bef763805e75685e478ac17e9008f49fce2d3e52b3d58b86"}, + {file = "pyzmq-27.1.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0790a0161c281ca9723f804871b4027f2e8b5a528d357c8952d08cd1a9c15581"}, + {file = "pyzmq-27.1.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c895a6f35476b0c3a54e3eb6ccf41bf3018de937016e6e18748317f25d4e925f"}, + {file = "pyzmq-27.1.0-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bbf8d3630bf96550b3be8e1fc0fea5cbdc8d5466c1192887bd94869da17a63e"}, + {file = "pyzmq-27.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:15c8bd0fe0dabf808e2d7a681398c4e5ded70a551ab47482067a572c054c8e2e"}, + {file = "pyzmq-27.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bafcb3dd171b4ae9f19ee6380dfc71ce0390fefaf26b504c0e5f628d7c8c54f2"}, + {file = "pyzmq-27.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e829529fcaa09937189178115c49c504e69289abd39967cd8a4c215761373394"}, + {file = "pyzmq-27.1.0-cp311-cp311-win32.whl", hash = "sha256:6df079c47d5902af6db298ec92151db82ecb557af663098b92f2508c398bb54f"}, + {file = "pyzmq-27.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:190cbf120fbc0fc4957b56866830def56628934a9d112aec0e2507aa6a032b97"}, + {file = "pyzmq-27.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:eca6b47df11a132d1745eb3b5b5e557a7dae2c303277aa0e69c6ba91b8736e07"}, + {file = "pyzmq-27.1.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc"}, + {file = "pyzmq-27.1.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113"}, + {file = "pyzmq-27.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233"}, + {file = "pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31"}, + {file = "pyzmq-27.1.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28"}, + {file = "pyzmq-27.1.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856"}, + {file = "pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496"}, + {file = "pyzmq-27.1.0-cp312-abi3-win32.whl", hash = "sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd"}, + {file = "pyzmq-27.1.0-cp312-abi3-win_amd64.whl", hash = "sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf"}, + {file = "pyzmq-27.1.0-cp312-abi3-win_arm64.whl", hash = "sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f"}, + {file = "pyzmq-27.1.0-cp313-cp313-android_24_arm64_v8a.whl", hash = "sha256:93ad4b0855a664229559e45c8d23797ceac03183c7b6f5b4428152a6b06684a5"}, + {file = "pyzmq-27.1.0-cp313-cp313-android_24_x86_64.whl", hash = "sha256:fbb4f2400bfda24f12f009cba62ad5734148569ff4949b1b6ec3b519444342e6"}, + {file = "pyzmq-27.1.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:e343d067f7b151cfe4eb3bb796a7752c9d369eed007b91231e817071d2c2fec7"}, + {file = "pyzmq-27.1.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:08363b2011dec81c354d694bdecaef4770e0ae96b9afea70b3f47b973655cc05"}, + {file = 
"pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d54530c8c8b5b8ddb3318f481297441af102517602b569146185fa10b63f4fa9"}, + {file = "pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f3afa12c392f0a44a2414056d730eebc33ec0926aae92b5ad5cf26ebb6cc128"}, + {file = "pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c65047adafe573ff023b3187bb93faa583151627bc9c51fc4fb2c561ed689d39"}, + {file = "pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:90e6e9441c946a8b0a667356f7078d96411391a3b8f80980315455574177ec97"}, + {file = "pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:add071b2d25f84e8189aaf0882d39a285b42fa3853016ebab234a5e78c7a43db"}, + {file = "pyzmq-27.1.0-cp313-cp313t-win32.whl", hash = "sha256:7ccc0700cfdf7bd487bea8d850ec38f204478681ea02a582a8da8171b7f90a1c"}, + {file = "pyzmq-27.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:8085a9fba668216b9b4323be338ee5437a235fe275b9d1610e422ccc279733e2"}, + {file = "pyzmq-27.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:6bb54ca21bcfe361e445256c15eedf083f153811c37be87e0514934d6913061e"}, + {file = "pyzmq-27.1.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:ce980af330231615756acd5154f29813d553ea555485ae712c491cd483df6b7a"}, + {file = "pyzmq-27.1.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1779be8c549e54a1c38f805e56d2a2e5c009d26de10921d7d51cfd1c8d4632ea"}, + {file = "pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7200bb0f03345515df50d99d3db206a0a6bee1955fbb8c453c76f5bf0e08fb96"}, + {file = "pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01c0e07d558b06a60773744ea6251f769cd79a41a97d11b8bf4ab8f034b0424d"}, + {file = "pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:80d834abee71f65253c91540445d37c4c561e293ba6e741b992f20a105d69146"}, + {file = "pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:544b4e3b7198dde4a62b8ff6685e9802a9a1ebf47e77478a5eb88eca2a82f2fd"}, + {file = "pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cedc4c68178e59a4046f97eca31b148ddcf51e88677de1ef4e78cf06c5376c9a"}, + {file = "pyzmq-27.1.0-cp314-cp314t-win32.whl", hash = "sha256:1f0b2a577fd770aa6f053211a55d1c47901f4d537389a034c690291485e5fe92"}, + {file = "pyzmq-27.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:19c9468ae0437f8074af379e986c5d3d7d7bfe033506af442e8c879732bedbe0"}, + {file = "pyzmq-27.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7"}, + {file = "pyzmq-27.1.0-cp38-cp38-macosx_10_15_universal2.whl", hash = "sha256:18339186c0ed0ce5835f2656cdfb32203125917711af64da64dbaa3d949e5a1b"}, + {file = "pyzmq-27.1.0-cp38-cp38-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:753d56fba8f70962cd8295fb3edb40b9b16deaa882dd2b5a3a2039f9ff7625aa"}, + {file = "pyzmq-27.1.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b721c05d932e5ad9ff9344f708c96b9e1a485418c6618d765fca95d4daacfbef"}, + {file = "pyzmq-27.1.0-cp38-cp38-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be883ff3d722e6085ee3f4afc057a50f7f2e0c72d289fd54df5706b4e3d3a50"}, + {file = "pyzmq-27.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:b2e592db3a93128daf567de9650a2f3859017b3f7a66bc4ed6e4779d6034976f"}, + {file = "pyzmq-27.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = 
"sha256:ad68808a61cbfbbae7ba26d6233f2a4aa3b221de379ce9ee468aa7a83b9c36b0"}, + {file = "pyzmq-27.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:e2687c2d230e8d8584fbea433c24382edfeda0c60627aca3446aa5e58d5d1831"}, + {file = "pyzmq-27.1.0-cp38-cp38-win32.whl", hash = "sha256:a1aa0ee920fb3825d6c825ae3f6c508403b905b698b6460408ebd5bb04bbb312"}, + {file = "pyzmq-27.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:df7cd397ece96cf20a76fae705d40efbab217d217897a5053267cd88a700c266"}, + {file = "pyzmq-27.1.0-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:96c71c32fff75957db6ae33cd961439f386505c6e6b377370af9b24a1ef9eafb"}, + {file = "pyzmq-27.1.0-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:49d3980544447f6bd2968b6ac913ab963a49dcaa2d4a2990041f16057b04c429"}, + {file = "pyzmq-27.1.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:849ca054d81aa1c175c49484afaaa5db0622092b5eccb2055f9f3bb8f703782d"}, + {file = "pyzmq-27.1.0-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3970778e74cb7f85934d2b926b9900e92bfe597e62267d7499acc39c9c28e345"}, + {file = "pyzmq-27.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:da96ecdcf7d3919c3be2de91a8c513c186f6762aa6cf7c01087ed74fad7f0968"}, + {file = "pyzmq-27.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:9541c444cfe1b1c0156c5c86ece2bb926c7079a18e7b47b0b1b3b1b875e5d098"}, + {file = "pyzmq-27.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e30a74a39b93e2e1591b58eb1acef4902be27c957a8720b0e368f579b82dc22f"}, + {file = "pyzmq-27.1.0-cp39-cp39-win32.whl", hash = "sha256:b1267823d72d1e40701dcba7edc45fd17f71be1285557b7fe668887150a14b78"}, + {file = "pyzmq-27.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0c996ded912812a2fcd7ab6574f4ad3edc27cb6510349431e4930d4196ade7db"}, + {file = "pyzmq-27.1.0-cp39-cp39-win_arm64.whl", hash = "sha256:346e9ba4198177a07e7706050f35d733e08c1c1f8ceacd5eb6389d653579ffbc"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c17e03cbc9312bee223864f1a2b13a99522e0dc9f7c5df0177cd45210ac286e6"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f328d01128373cb6763823b2b4e7f73bdf767834268c565151eacb3b7a392f90"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c1790386614232e1b3a40a958454bdd42c6d1811837b15ddbb052a032a43f62"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:448f9cb54eb0cee4732b46584f2710c8bc178b0e5371d9e4fc8125201e413a74"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:05b12f2d32112bf8c95ef2e74ec4f1d4beb01f8b5e703b38537f8849f92cb9ba"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:18770c8d3563715387139060d37859c02ce40718d1faf299abddcdcc6a649066"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ac25465d42f92e990f8d8b0546b01c391ad431c3bf447683fdc40565941d0604"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53b40f8ae006f2734ee7608d59ed661419f087521edbfc2149c3932e9c14808c"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f605d884e7c8be8fe1aa94e0a783bf3f591b84c24e4bc4f3e7564c82ac25e271"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = 
"sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:50081a4e98472ba9f5a02850014b4c9b629da6710f8f14f3b15897c666a28f1b"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:510869f9df36ab97f89f4cff9d002a89ac554c7ac9cadd87d444aa4cf66abd27"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1f8426a01b1c4098a750973c37131cf585f61c7911d735f729935a0c701b68d3"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:726b6a502f2e34c6d2ada5e702929586d3ac948a4dbbb7fed9854ec8c0466027"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:bd67e7c8f4654bef471c0b1ca6614af0b5202a790723a58b79d9584dc8022a78"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:722ea791aa233ac0a819fc2c475e1292c76930b31f1d828cb61073e2fe5e208f"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:01f9437501886d3a1dd4b02ef59fb8cc384fa718ce066d52f175ee49dd5b7ed8"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4a19387a3dddcc762bfd2f570d14e2395b2c9701329b266f83dd87a2b3cbd381"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c618fbcd069e3a29dcd221739cacde52edcc681f041907867e0f5cc7e85f172"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ff8d114d14ac671d88c89b9224c63d6c4e5a613fe8acd5594ce53d752a3aafe9"}, + {file = "pyzmq-27.1.0.tar.gz", hash = "sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540"}, +] + +[package.dependencies] +cffi = {version = "*", markers = "implementation_name == \"pypy\""} + [[package]] name = "questionary" version = "2.1.1" @@ -2688,6 +3224,26 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "stack-data" +version = "0.6.3" +description = "Extract data from python stack frames and tracebacks for informative displays" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, + {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, +] + +[package.dependencies] +asttokens = ">=2.1.0" +executing = ">=1.2.0" +pure-eval = "*" + +[package.extras] +tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] + [[package]] name = "termcolor" version = "3.3.0" @@ -2773,6 +3329,44 @@ files = [ {file = "tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064"}, ] +[[package]] +name = "tornado" +version = "6.5.4" +description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "tornado-6.5.4-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d6241c1a16b1c9e4cc28148b1cda97dd1c6cb4fb7068ac1bedc610768dff0ba9"}, + {file = "tornado-6.5.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2d50f63dda1d2cac3ae1fa23d254e16b5e38153758470e9956cbc3d813d40843"}, + {file = "tornado-6.5.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1cf66105dc6acb5af613c054955b8137e34a03698aa53272dbda4afe252be17"}, + {file = "tornado-6.5.4-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50ff0a58b0dc97939d29da29cd624da010e7f804746621c78d14b80238669335"}, + {file = "tornado-6.5.4-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5fb5e04efa54cf0baabdd10061eb4148e0be137166146fff835745f59ab9f7f"}, + {file = "tornado-6.5.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9c86b1643b33a4cd415f8d0fe53045f913bf07b4a3ef646b735a6a86047dda84"}, + {file = "tornado-6.5.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:6eb82872335a53dd063a4f10917b3efd28270b56a33db69009606a0312660a6f"}, + {file = "tornado-6.5.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6076d5dda368c9328ff41ab5d9dd3608e695e8225d1cd0fd1e006f05da3635a8"}, + {file = "tornado-6.5.4-cp39-abi3-win32.whl", hash = "sha256:1768110f2411d5cd281bac0a090f707223ce77fd110424361092859e089b38d1"}, + {file = "tornado-6.5.4-cp39-abi3-win_amd64.whl", hash = "sha256:fa07d31e0cd85c60713f2b995da613588aa03e1303d75705dca6af8babc18ddc"}, + {file = "tornado-6.5.4-cp39-abi3-win_arm64.whl", hash = "sha256:053e6e16701eb6cbe641f308f4c1a9541f91b6261991160391bfc342e8a551a1"}, + {file = "tornado-6.5.4.tar.gz", hash = "sha256:a22fa9047405d03260b483980635f0b041989d8bcc9a313f8fe18b411d84b1d7"}, +] + +[[package]] +name = "traitlets" +version = "5.14.3" +description = "Traitlets Python configuration system" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, + {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, +] + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] +test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] + [[package]] name = "types-awscrt" version = "0.19.1" @@ -3120,4 +3714,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.12" -content-hash = "08ea1eedf25a896fdc21f03d04f4403d47d655fc90eb5eb310ff7cde7e3b7a6d" +content-hash = "b4e51ac60f56e8e3775e145315a9e405f587674f1b8f271e39cae34e3fa0a04d" diff --git a/pyproject.toml b/pyproject.toml index 6036c9e..e9f0086 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ include-groups = [ [tool.poetry.group.dev.dependencies] commitizen = "4.9.1" pre-commit = "4.3.0" +ipykernel = "^7.2.0" [tool.poetry.group.test] optional = true diff --git a/src/dve/core_engine/backends/base/backend.py b/src/dve/core_engine/backends/base/backend.py index 9d6abaa..29e8644 100644 --- a/src/dve/core_engine/backends/base/backend.py +++ b/src/dve/core_engine/backends/base/backend.py @@ -163,7 +163,7 @@ def apply( return entities, get_parent(processing_errors_uri), successful for entity_name, entity in entities.items(): - 
entities[entity_name] = self.step_implementations.add_row_id(entity) + entities[entity_name] = self.step_implementations.add_record_index(entity) # TODO: Handle entity manager creation errors. entity_manager = EntityManager(entities, reference_data) @@ -172,9 +172,6 @@ def apply( # TODO: and return uri to errors _ = self.step_implementations.apply_rules(working_dir, entity_manager, rule_metadata) - for entity_name, entity in entity_manager.entities.items(): - entity_manager.entities[entity_name] = self.step_implementations.drop_row_id(entity) - return entity_manager.entities, get_parent(dc_feedback_errors_uri), True def process( diff --git a/src/dve/core_engine/backends/base/contract.py b/src/dve/core_engine/backends/base/contract.py index fc7da4d..0304580 100644 --- a/src/dve/core_engine/backends/base/contract.py +++ b/src/dve/core_engine/backends/base/contract.py @@ -368,6 +368,14 @@ def read_raw_entities( messages.extend(new_messages) return entities, dedup_messages(messages), successful + + def add_record_index(self, entity: EntityType, **kwargs) -> EntityType: + """Add a record index to the entity""" + raise NotImplementedError(f"add_record_index not implemented in {self.__class__}") + + def drop_record_index(self, entity: EntityType, **kwargs) -> EntityType: + """Drop a record index from the entity""" + raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}") @abstractmethod def apply_data_contract( diff --git a/src/dve/core_engine/backends/base/reader.py b/src/dve/core_engine/backends/base/reader.py index 54abaa9..498313a 100644 --- a/src/dve/core_engine/backends/base/reader.py +++ b/src/dve/core_engine/backends/base/reader.py @@ -126,6 +126,14 @@ def read_to_entity_type( raise ReaderLacksEntityTypeSupport(entity_type=entity_type) from err return reader_func(self, resource, entity_name, schema) + + def add_record_index(self, entity: EntityType, **kwargs) -> EntityType: + """Add a record index to the entity""" + raise NotImplementedError(f"add_record_index not implemented in {self.__class__}") + + def drop_record_index(self, entity: EntityType, **kwargs) -> EntityType: + """Drop a record index to the entity""" + raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}") def write_parquet( self, diff --git a/src/dve/core_engine/backends/base/rules.py b/src/dve/core_engine/backends/base/rules.py index 97a6b4d..b66b3ae 100644 --- a/src/dve/core_engine/backends/base/rules.py +++ b/src/dve/core_engine/backends/base/rules.py @@ -135,15 +135,13 @@ def register_udfs(cls, **kwargs): """Method to register all custom dve functions for use during business rules application""" raise NotImplementedError() - @staticmethod - def add_row_id(entity: EntityType) -> EntityType: - """Add a unique row id field to an entity""" - raise NotImplementedError() + def add_record_index(self, entity: EntityType, **kwargs) -> EntityType: + """Add a record index to the entity""" + raise NotImplementedError(f"add_record_index not implemented in {self.__class__}") - @staticmethod - def drop_row_id(entity: EntityType) -> EntityType: - """Add a unique row id field to an entity""" - raise NotImplementedError() + def drop_record_index(self, entity: EntityType) -> EntityType: + """Drop a unique row id field to an entity""" + raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}") @classmethod def _raise_notimplemented_error( diff --git a/src/dve/core_engine/backends/base/utilities.py b/src/dve/core_engine/backends/base/utilities.py index 
f55bc88..3cc2923 100644 --- a/src/dve/core_engine/backends/base/utilities.py +++ b/src/dve/core_engine/backends/base/utilities.py @@ -12,6 +12,10 @@ from dve.core_engine.type_hints import ExpressionArray, MultiExpression from dve.parser.type_hints import URI +import polars as pl + +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME + BRACKETS = {"(": ")", "{": "}", "[": "]", "<": ">"} """A mapping of opening brackets to their closing counterpart.""" STRING_START_CHARS = {'"', "'"} diff --git a/src/dve/core_engine/backends/implementations/duckdb/contract.py b/src/dve/core_engine/backends/implementations/duckdb/contract.py index 860f06b..3dcfa5f 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/contract.py +++ b/src/dve/core_engine/backends/implementations/duckdb/contract.py @@ -16,6 +16,7 @@ from pydantic import BaseModel from pydantic.fields import ModelField +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME import dve.parser.file_handling as fh from dve.common.error_utils import ( BackgroundMessageWriter, @@ -29,6 +30,7 @@ ) from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import ( duckdb_read_parquet, + duckdb_record_index, duckdb_write_parquet, get_duckdb_type_from_annotation, relation_is_empty, @@ -53,7 +55,7 @@ def __call__(self, row: pd.Series): self.errors.extend(self.row_validator(row.to_dict())[1]) # type: ignore return row # no op - +@duckdb_record_index @duckdb_write_parquet @duckdb_read_parquet class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]): @@ -144,10 +146,12 @@ def apply_data_contract( fld.name: get_duckdb_type_from_annotation(fld.annotation) for fld in entity_fields.values() } + ddb_schema[RECORD_INDEX_COLUMN_NAME] = get_duckdb_type_from_annotation(int) polars_schema: dict[str, PolarsType] = { fld.name: get_polars_type_from_annotation(fld.annotation) for fld in entity_fields.values() } + polars_schema[RECORD_INDEX_COLUMN_NAME] = get_polars_type_from_annotation(int) if relation_is_empty(relation): self.logger.warning(f"+ Empty relation for {entity_name}") empty_df = pl.DataFrame([], schema=polars_schema) # type: ignore # pylint: disable=W0612 @@ -169,6 +173,9 @@ msg_count += len(msgs) self.logger.info(f"Data contract found {msg_count} issues in {entity_name}") + + if RECORD_INDEX_COLUMN_NAME not in relation.columns: + relation = self.add_record_index(relation) casting_statements = [ ( diff --git a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py index 843ee40..8456d3d 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +++ b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py @@ -12,13 +12,14 @@ import duckdb.typing as ddbtyp import numpy as np -from duckdb import DuckDBPyConnection, DuckDBPyRelation +from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression from duckdb.typing import DuckDBPyType from pandas import DataFrame from pydantic import BaseModel from typing_extensions import Annotated, get_args, get_origin, get_type_hints from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import URI from dve.parser.file_handling.service import LocalFilesystemImplementation, _get_implementation @@ -286,3 +287,19 @@ def duckdb_rel_to_dictionaries( cols: tuple[str] = tuple(entity.columns) # type: ignore while rows
:= entity.fetchmany(batch_size): yield from (dict(zip(cols, rw)) for rw in rows) + +def _add_duckdb_record_index(self, entity: DuckDBPyRelation) -> DuckDBPyRelation: + if RECORD_INDEX_COLUMN_NAME in entity.columns: + return entity + + return entity.select(f"*, row_number() OVER () as {RECORD_INDEX_COLUMN_NAME}") + +def _drop_duckdb_record_index(self, entity: DuckDBPyRelation) -> DuckDBPyRelation: + if RECORD_INDEX_COLUMN_NAME not in entity.columns: + return entity + return entity.select(StarExpression(exclude=[RECORD_INDEX_COLUMN_NAME])) + +def duckdb_record_index(cls): + setattr(cls, "add_record_index", _add_duckdb_record_index) + setattr(cls, "drop_record_index", _drop_duckdb_record_index) + return cls diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py b/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py index ff65d9f..1844a9d 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +++ b/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py @@ -6,23 +6,25 @@ import duckdb as ddb import polars as pl -from duckdb import DuckDBPyConnection, DuckDBPyRelation, default_connection, read_csv +from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression, default_connection, read_csv from pydantic import BaseModel from dve.core_engine.backends.base.reader import BaseFileReader, read_function from dve.core_engine.backends.exceptions import EmptyFileError, MessageBearingError from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import ( + duckdb_record_index, duckdb_write_parquet, get_duckdb_type_from_annotation, ) from dve.core_engine.backends.implementations.duckdb.types import SQLType from dve.core_engine.backends.readers.utilities import check_csv_header_expected -from dve.core_engine.backends.utilities import get_polars_type_from_annotation +from dve.core_engine.backends.utilities import get_polars_type_from_annotation, polars_record_index +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.message import FeedbackMessage from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length - +@duckdb_record_index @duckdb_write_parquet class DuckDBCSVReader(BaseFileReader): """A reader for CSV files including the ability to compare the passed model @@ -109,9 +111,9 @@ def read_to_relation( # pylint: disable=unused-argument } reader_options["columns"] = ddb_schema - return read_csv(resource, **reader_options) - + return self.add_record_index(read_csv(resource, **reader_options, parallel=False)) +@polars_record_index class PolarsToDuckDBCSVReader(DuckDBCSVReader): """ Utilises the polars lazy csv reader which is then converted into a DuckDBPyRelation object. @@ -142,10 +144,11 @@ def read_to_relation( # pylint: disable=unused-argument for fld in schema.__fields__.values() } reader_options["dtypes"] = polars_types + # polars 0.18+ adds a raise_if_empty argument; for future reference when upgrading, adopting it would make L85
# redundant. - df = pl.scan_csv(resource, **reader_options).select(list(polars_types.keys())) # type: ignore # pylint: disable=W0612 + df = self.add_record_index(pl.scan_csv(resource, **reader_options).select(list(polars_types.keys()))) # type: ignore # pylint: disable=W0612 return ddb.sql("SELECT * FROM df") @@ -189,8 +192,8 @@ def __init__( def read_to_relation( # pylint: disable=unused-argument self, resource: URI, entity_name: EntityName, schema: type[BaseModel] ) -> DuckDBPyRelation: - entity = super().read_to_relation(resource=resource, entity_name=entity_name, schema=schema) - entity = entity.distinct() + entity: DuckDBPyRelation = super().read_to_relation(resource=resource, entity_name=entity_name, schema=schema) + entity = entity.select(StarExpression(exclude=[RECORD_INDEX_COLUMN_NAME])).distinct() no_records = entity.shape[0] if no_records != 1: @@ -219,4 +222,4 @@ def read_to_relation( # pylint: disable=unused-argument ], ) - return entity + return entity.select(f"*, 1 as {RECORD_INDEX_COLUMN_NAME}") diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/json.py b/src/dve/core_engine/backends/implementations/duckdb/readers/json.py index b1a3ad4..5d7df37 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +++ b/src/dve/core_engine/backends/implementations/duckdb/readers/json.py @@ -13,9 +13,10 @@ get_duckdb_type_from_annotation, ) from dve.core_engine.backends.implementations.duckdb.types import SQLType +from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_record_index from dve.core_engine.type_hints import URI, EntityName - +@duckdb_record_index @duckdb_write_parquet class DuckDBJSONReader(BaseFileReader): """A reader for JSON files""" @@ -47,4 +48,4 @@ def read_to_relation( # pylint: disable=unused-argument for fld in schema.__fields__.values() } - return read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore + return self.add_record_index(read_json(resource, columns=ddb_schema, format=self._json_format)) # type: ignore diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py b/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py index a955946..c4d7d7c 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +++ b/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py @@ -11,10 +11,10 @@ from dve.core_engine.backends.exceptions import MessageBearingError from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_write_parquet from dve.core_engine.backends.readers.xml import XMLStreamReader -from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model +from dve.core_engine.backends.utilities import get_polars_type_from_annotation, polars_record_index, stringify_model from dve.core_engine.type_hints import URI - +@polars_record_index @duckdb_write_parquet class DuckDBXMLStreamReader(XMLStreamReader): """A reader for XML files""" @@ -39,7 +39,7 @@ def read_to_relation(self, resource: URI, entity_name: str, schema: type[BaseMod for fld in stringify_model(schema).__fields__.values() } - _lazy_frame = pl.LazyFrame( + _lazy_frame = self.add_record_index(pl.LazyFrame( data=self.read_to_py_iterator(resource, entity_name, schema), schema=polars_schema - ) + )) return self.ddb_connection.sql("select * from _lazy_frame")
b/src/dve/core_engine/backends/implementations/duckdb/rules.py index e556c6b..dd252b1 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/rules.py +++ b/src/dve/core_engine/backends/implementations/duckdb/rules.py @@ -23,6 +23,7 @@ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import ( DDBStruct, duckdb_read_parquet, + duckdb_record_index, duckdb_rel_to_dictionaries, duckdb_write_parquet, get_all_registered_udfs, @@ -51,13 +52,12 @@ SemiJoin, TableUnion, ) -from dve.core_engine.constants import ROWID_COLUMN_NAME from dve.core_engine.functions import implementations as functions from dve.core_engine.message import FeedbackMessage from dve.core_engine.templating import template_object from dve.core_engine.type_hints import Messages - +@duckdb_record_index @duckdb_write_parquet @duckdb_read_parquet class DuckDBStepImplementations(BaseStepImplementations[DuckDBPyRelation]): @@ -106,20 +106,6 @@ def register_udfs( # type: ignore connection.sql(_sql) return cls(connection=connection, **kwargs) - @staticmethod - def add_row_id(entity: DuckDBPyRelation) -> DuckDBPyRelation: - """Adds a row identifier to the Relation""" - if ROWID_COLUMN_NAME not in entity.columns: - entity = entity.project(f"*, ROW_NUMBER() OVER () as {ROWID_COLUMN_NAME}") - return entity - - @staticmethod - def drop_row_id(entity: DuckDBPyRelation) -> DuckDBPyRelation: - """Drops the row identiifer from a Relation""" - if ROWID_COLUMN_NAME in entity.columns: - entity = entity.select(StarExpression(exclude=[ROWID_COLUMN_NAME])) - return entity - def add(self, entities: DuckDBEntities, *, config: ColumnAddition) -> Messages: """A transformation step which adds a column to an entity.""" entity: DuckDBPyRelation = entities[config.entity_name] diff --git a/src/dve/core_engine/backends/implementations/spark/backend.py b/src/dve/core_engine/backends/implementations/spark/backend.py index 742e9e3..3999b62 100644 --- a/src/dve/core_engine/backends/implementations/spark/backend.py +++ b/src/dve/core_engine/backends/implementations/spark/backend.py @@ -11,7 +11,7 @@ from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations from dve.core_engine.backends.implementations.spark.spark_helpers import get_type_from_annotation from dve.core_engine.backends.implementations.spark.types import SparkEntities -from dve.core_engine.constants import ROWID_COLUMN_NAME +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.loggers import get_child_logger, get_logger from dve.core_engine.models import SubmissionInfo from dve.core_engine.type_hints import URI, EntityParquetLocations @@ -58,7 +58,7 @@ def write_entities_to_parquet( locations = {} self.logger.info(f"Writing entities to the output location: {cache_prefix}") for entity_name, entity in entities.items(): - entity = entity.drop(ROWID_COLUMN_NAME) + entity = entity.drop(RECORD_INDEX_COLUMN_NAME) self.logger.info(f"Entity: {entity_name}") diff --git a/src/dve/core_engine/backends/implementations/spark/contract.py b/src/dve/core_engine/backends/implementations/spark/contract.py index d8078bd..330c73e 100644 --- a/src/dve/core_engine/backends/implementations/spark/contract.py +++ b/src/dve/core_engine/backends/implementations/spark/contract.py @@ -10,13 +10,14 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as sf from pyspark.sql.functions import col, lit -from pyspark.sql.types import ArrayType, DataType, MapType, StringType, StructType +from pyspark.sql.types import 
ArrayType, DataType, LongType, MapType, StructField, StructType from dve.common.error_utils import ( BackgroundMessageWriter, dump_processing_errors, get_feedback_errors_uri, ) + from dve.core_engine.backends.base.contract import BaseDataContract, reader_override from dve.core_engine.backends.base.utilities import generate_error_casting_entity_message from dve.core_engine.backends.exceptions import ( @@ -28,19 +29,20 @@ df_is_empty, get_type_from_annotation, spark_read_parquet, + spark_record_index, spark_write_parquet, ) from dve.core_engine.backends.implementations.spark.types import SparkEntities from dve.core_engine.backends.metadata.contract import DataContractMetadata from dve.core_engine.backends.readers import CSVFileReader from dve.core_engine.backends.types import StageSuccessful -from dve.core_engine.constants import ROWID_COLUMN_NAME +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import URI, EntityLocations, EntityName COMPLEX_TYPES: set[type[DataType]] = {StructType, ArrayType, MapType} """Spark types indicating complex types.""" - +@spark_record_index @spark_write_parquet @spark_read_parquet class SparkDataContract(BaseDataContract[DataFrame]): @@ -100,14 +102,16 @@ def apply_data_contract( successful = True for entity_name, record_df in entities.items(): spark_schema = get_type_from_annotation(contract_metadata.schemas[entity_name]) - + spark_schema.add(StructField(RECORD_INDEX_COLUMN_NAME, LongType())) if df_is_empty(record_df): self.logger.warning(f"+ Empty dataframe for {entity_name}") entities[entity_name] = self.spark_session.createDataFrame( # type: ignore [], schema=spark_schema - ).withColumn(ROWID_COLUMN_NAME, lit(None).cast(StringType())) + ) continue + if RECORD_INDEX_COLUMN_NAME not in record_df.columns: + record_df = self.add_record_index(record_df) if self.debug: # Note, the count will realise the dataframe, so only do this diff --git a/src/dve/core_engine/backends/implementations/spark/readers/csv.py b/src/dve/core_engine/backends/implementations/spark/readers/csv.py index 95db464..f3114bc 100644 --- a/src/dve/core_engine/backends/implementations/spark/readers/csv.py +++ b/src/dve/core_engine/backends/implementations/spark/readers/csv.py @@ -13,10 +13,12 @@ get_type_from_annotation, spark_write_parquet, ) +from dve.core_engine.backends.implementations.spark.spark_helpers import spark_record_index from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length +@spark_record_index @spark_write_parquet class SparkCSVReader(BaseFileReader): """A Spark reader for CSV files.""" @@ -70,7 +72,7 @@ def read_to_dataframe( "multiLine": self.multi_line, } - return ( + return self.add_record_index( self.spark_session.read.format("csv") .options(**kwargs) # type: ignore .load(resource, schema=spark_schema) diff --git a/src/dve/core_engine/backends/implementations/spark/readers/json.py b/src/dve/core_engine/backends/implementations/spark/readers/json.py index c336ee0..b3a5fa6 100644 --- a/src/dve/core_engine/backends/implementations/spark/readers/json.py +++ b/src/dve/core_engine/backends/implementations/spark/readers/json.py @@ -13,10 +13,11 @@ get_type_from_annotation, spark_write_parquet, ) +from dve.core_engine.backends.implementations.spark.spark_helpers import spark_record_index from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length - +@spark_record_index @spark_write_parquet class SparkJSONReader(BaseFileReader): """A
Spark reader for JSON files.""" @@ -59,7 +60,7 @@ def read_to_dataframe( "multiline": self.multi_line, } - return ( + return self.add_record_index( self.spark_session.read.format("json") .options(**kwargs) # type: ignore .load(resource, schema=spark_schema) diff --git a/src/dve/core_engine/backends/implementations/spark/readers/xml.py b/src/dve/core_engine/backends/implementations/spark/readers/xml.py index 30d6756..028a430 100644 --- a/src/dve/core_engine/backends/implementations/spark/readers/xml.py +++ b/src/dve/core_engine/backends/implementations/spark/readers/xml.py @@ -19,6 +19,7 @@ get_type_from_annotation, spark_write_parquet, ) +from dve.core_engine.backends.implementations.spark.spark_helpers import spark_record_index from dve.core_engine.backends.readers.xml import BasicXMLFileReader, XMLStreamReader from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length @@ -27,7 +28,7 @@ SparkXMLMode = Literal["PERMISSIVE", "FAILFAST", "DROPMALFORMED"] """The mode to use when parsing XML files with Spark.""" - +@spark_record_index @spark_write_parquet class SparkXMLStreamReader(XMLStreamReader): """An XML stream reader that adds a method to read to a dataframe""" @@ -45,12 +46,12 @@ def read_to_dataframe( if not self.spark: self.spark = SparkSession.builder.getOrCreate() # type: ignore spark_schema = get_type_from_annotation(schema) - return self.spark.createDataFrame( # type: ignore + return self.add_record_index(self.spark.createDataFrame( # type: ignore list(self.read_to_py_iterator(resource, entity_name, schema)), schema=spark_schema, - ) - + )) +@spark_record_index @spark_write_parquet class SparkXMLReader(BasicXMLFileReader): # pylint: disable=too-many-instance-attributes """A reader for XML files built atop Spark-XML.""" @@ -177,7 +178,7 @@ def read_to_dataframe( df = self._add_missing_columns(df, spark_schema) df = self._sanitise_columns(df) - return df + return self.add_record_index(df) def _add_missing_columns(self, df: DataFrame, fields: Iterable[StructField]) -> DataFrame: for field in fields: diff --git a/src/dve/core_engine/backends/implementations/spark/rules.py b/src/dve/core_engine/backends/implementations/spark/rules.py index 15afa09..c970f28 100644 --- a/src/dve/core_engine/backends/implementations/spark/rules.py +++ b/src/dve/core_engine/backends/implementations/spark/rules.py @@ -15,6 +15,7 @@ get_all_registered_udfs, object_to_spark_literal, spark_read_parquet, + spark_record_index, spark_write_parquet, ) from dve.core_engine.backends.implementations.spark.types import ( @@ -43,13 +44,13 @@ SemiJoin, TableUnion, ) -from dve.core_engine.constants import ROWID_COLUMN_NAME + from dve.core_engine.functions import implementations as functions from dve.core_engine.message import FeedbackMessage from dve.core_engine.templating import template_object from dve.core_engine.type_hints import Messages - +@spark_record_index @spark_write_parquet @spark_read_parquet class SparkStepImplementations(BaseStepImplementations[DataFrame]): @@ -100,18 +101,6 @@ def register_udfs( return cls(spark_session=spark_session, **kwargs) - @staticmethod - def add_row_id(entity: DataFrame) -> DataFrame: - if ROWID_COLUMN_NAME not in entity.columns: - entity = entity.withColumn(ROWID_COLUMN_NAME, sf.expr("uuid()")) - return entity - - @staticmethod - def drop_row_id(entity: DataFrame) -> DataFrame: - if ROWID_COLUMN_NAME in entity.columns: - entity = entity.drop(ROWID_COLUMN_NAME) - return entity - def add(self, entities: SparkEntities, *, config: 
ColumnAddition) -> Messages: entity: DataFrame = entities[config.entity_name] entity = entity.withColumn(config.column_name, sf.expr(config.expression)) diff --git a/src/dve/core_engine/backends/implementations/spark/spark_helpers.py b/src/dve/core_engine/backends/implementations/spark/spark_helpers.py index 7cb7b17..9272e96 100644 --- a/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +++ b/src/dve/core_engine/backends/implementations/spark/spark_helpers.py @@ -27,6 +27,11 @@ from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type from dve.core_engine.type_hints import URI +from pyspark.sql import DataFrame, Row +from pyspark.sql.types import LongType, StructField, StructType + +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME + # It would be really nice if there was a more parameterisable # way of doing this. OneArgWrappable = Callable[[Any], Any] @@ -410,3 +415,20 @@ def _inner(*args, **kwargs): return _inner return _wrapper + +def _add_spark_record_index(self, entity: DataFrame) -> DataFrame: + if RECORD_INDEX_COLUMN_NAME in entity.columns: + return entity + schema: StructType = entity.schema + schema.add(StructField(RECORD_INDEX_COLUMN_NAME, LongType())) + return entity.rdd.zipWithIndex().map(lambda x: Row(**{**x[0].asDict(True), RECORD_INDEX_COLUMN_NAME: x[1] + 1})).toDF(schema=schema) + +def _drop_spark_record_index(self, entity: DataFrame) -> DataFrame: + if RECORD_INDEX_COLUMN_NAME not in entity.columns: + return entity + return entity.drop(RECORD_INDEX_COLUMN_NAME) + +def spark_record_index(cls): + setattr(cls, "add_record_index", _add_spark_record_index) + setattr(cls, "drop_record_index", _drop_spark_record_index) + return cls diff --git a/src/dve/core_engine/backends/readers/csv.py b/src/dve/core_engine/backends/readers/csv.py index bc05b58..b77a89a 100644 --- a/src/dve/core_engine/backends/readers/csv.py +++ b/src/dve/core_engine/backends/readers/csv.py @@ -15,7 +15,9 @@ FieldCountMismatch, MissingHeaderError, ) + from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import EntityName from dve.parser.file_handling import get_content_length, open_stream from dve.parser.file_handling.implementations.file import file_uri_to_local_path @@ -204,7 +206,9 @@ ) coerce_func = partial(self._coerce, field_names=field_names) - yield from map(coerce_func, reader) + for idx, rw in enumerate(map(coerce_func, reader), start=1): + rw[RECORD_INDEX_COLUMN_NAME] = idx + yield rw def write_parquet( # type: ignore self, @@ -223,6 +227,8 @@ fld.name: get_polars_type_from_annotation(fld.annotation) for fld in stringify_model(schema).__fields__.values() } + polars_schema[RECORD_INDEX_COLUMN_NAME] = get_polars_type_from_annotation(int) + pl.LazyFrame(data=entity, schema=polars_schema).sink_parquet( path=target_location, compression="snappy"
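The Spark counterpart in spark_helpers.py above derives the same 1-based index from zipWithIndex rather than uuid(), so it is stable for a given partition ordering; a minimal sketch, assuming a local SparkSession and mirroring the patched helper:

from pyspark.sql import Row, SparkSession

RECORD_INDEX_COLUMN_NAME = "__record_index__"  # mirrors dve.core_engine.constants

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([Row(a="x"), Row(a="y")])

# zipWithIndex is 0-based, so shift to the 1-based convention used by the readers.
indexed = df.rdd.zipWithIndex().map(
    lambda x: Row(**{**x[0].asDict(True), RECORD_INDEX_COLUMN_NAME: x[1] + 1})
).toDF()
indexed.show()

diff --git a/src/dve/core_engine/backends/readers/utilities.py b/src/dve/core_engine/backends/readers/utilities.py index 642c0b2..2281432 100644 --- a/src/dve/core_engine/backends/readers/utilities.py +++ b/src/dve/core_engine/backends/readers/utilities.py @@ -2,12 +2,16 @@ from typing import Optional +from duckdb import DuckDBPyRelation +import polars as pl from pydantic import BaseModel +from pyspark.sql import DataFrame, Row +from pyspark.sql.types import LongType, StructField, StructType +from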
dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import URI from dve.parser.file_handling.service import open_stream - def check_csv_header_expected( resource: URI, expected_schema: type[BaseModel], diff --git a/src/dve/core_engine/backends/readers/xml.py b/src/dve/core_engine/backends/readers/xml.py index e7480f1..fa2835c 100644 --- a/src/dve/core_engine/backends/readers/xml.py +++ b/src/dve/core_engine/backends/readers/xml.py @@ -12,8 +12,10 @@ from dve.core_engine.backends.base.reader import BaseFileReader from dve.core_engine.backends.exceptions import EmptyFileError + from dve.core_engine.backends.readers.xml_linting import run_xmllint from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.loggers import get_logger from dve.core_engine.message import FeedbackMessage from dve.core_engine.type_hints import URI, EntityName @@ -310,8 +312,10 @@ def read_to_py_iterator( raise EmptyFileError(f"File at {resource!r} is empty") with open_stream(resource, "rb") as stream: - yield from self._parse_xml(stream, schema) - + for idx, rw in enumerate(self._parse_xml(stream, schema), start=1): + rw[RECORD_INDEX_COLUMN_NAME] = idx + yield rw + def write_parquet( # type: ignore self, entity: Iterator[dict[str, Any]], @@ -329,6 +333,7 @@ def write_parquet( # type: ignore fld.name: get_polars_type_from_annotation(fld.type_) for fld in stringify_model(schema).__fields__.values() } + polars_schema[RECORD_INDEX_COLUMN_NAME] = get_polars_type_from_annotation(int) pl.LazyFrame(data=entity, schema=polars_schema).sink_parquet( path=target_location, compression="snappy", **kwargs ) diff --git a/src/dve/core_engine/backends/utilities.py b/src/dve/core_engine/backends/utilities.py index 9261806..6a2918f 100644 --- a/src/dve/core_engine/backends/utilities.py +++ b/src/dve/core_engine/backends/utilities.py @@ -11,6 +11,7 @@ from pydantic import BaseModel, create_model from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import Messages # We need to rely on a Python typing implementation detail in Python <= 3.7. 
@@ -175,3 +176,18 @@ def get_polars_type_from_annotation(type_annotation: Any) -> PolarsType: if polars_type: return polars_type raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}") + +def _add_polars_record_index(self, entity: pl.LazyFrame) -> pl.LazyFrame: + if RECORD_INDEX_COLUMN_NAME in entity.columns: + return entity + return entity.with_row_index(name=RECORD_INDEX_COLUMN_NAME, offset=1) + +def _drop_polars_record_index(self, entity: pl.LazyFrame) -> pl.LazyFrame: + if RECORD_INDEX_COLUMN_NAME not in entity.columns: + return entity + return entity.drop(RECORD_INDEX_COLUMN_NAME) + +def polars_record_index(cls): + setattr(cls, "add_record_index", _add_polars_record_index) + setattr(cls, "drop_record_index", _drop_polars_record_index) + return cls diff --git a/src/dve/core_engine/constants.py b/src/dve/core_engine/constants.py index d452c9b..a2a4a65 100644 --- a/src/dve/core_engine/constants.py +++ b/src/dve/core_engine/constants.py @@ -1,7 +1,7 @@ """Constant values used in mutiple places.""" -ROWID_COLUMN_NAME: str = "__rowid__" -"""The name of the column containing the row ID for each entity.""" +RECORD_INDEX_COLUMN_NAME: str = "__record_index__" +"""The name of the column containing the record index for each entity.""" CONTRACT_ERROR_VALUE_FIELD_NAME: str = "__error_value" """The name of the field that can be used to extract the field value that caused diff --git a/src/dve/core_engine/engine.py b/src/dve/core_engine/engine.py index 28a2ac5..c5d1ba9 100644 --- a/src/dve/core_engine/engine.py +++ b/src/dve/core_engine/engine.py @@ -15,7 +15,7 @@ from dve.core_engine.backends.implementations.spark.types import SparkEntities from dve.core_engine.configuration.base import BaseEngineConfig from dve.core_engine.configuration.v1 import V1EngineConfig -from dve.core_engine.constants import ROWID_COLUMN_NAME +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.loggers import get_child_logger, get_logger from dve.core_engine.models import EngineRunValidation, SubmissionInfo from dve.core_engine.type_hints import EntityName, JSONstring @@ -200,7 +200,7 @@ def _write_entity_outputs(self, entities: SparkEntities) -> SparkEntities: self.main_log.info(f"Writing entities to the output location: {self.output_prefix_uri}") for entity_name, entity in entities.items(): - entity = entity.drop(ROWID_COLUMN_NAME) + entity = entity.drop(RECORD_INDEX_COLUMN_NAME) self.main_log.info(f"Entity: {entity_name} {type(entity)}") diff --git a/src/dve/core_engine/message.py b/src/dve/core_engine/message.py index f2a4e52..bfc1b3b 100644 --- a/src/dve/core_engine/message.py +++ b/src/dve/core_engine/message.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, ValidationError, validator from pydantic.dataclasses import dataclass -from dve.core_engine.constants import CONTRACT_ERROR_VALUE_FIELD_NAME, ROWID_COLUMN_NAME +from dve.core_engine.constants import CONTRACT_ERROR_VALUE_FIELD_NAME, RECORD_INDEX_COLUMN_NAME from dve.core_engine.templating import template_object from dve.core_engine.type_hints import ( EntityName, @@ -230,7 +230,7 @@ def _strip_rowid( # pylint: disable=no-self-argument ) -> Optional[dict[str, Any]]: """Strip the row ID column from the record, if present.""" if isinstance(value, dict): - value.pop(ROWID_COLUMN_NAME, None) + value.pop(RECORD_INDEX_COLUMN_NAME, None) return value @property
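The polars_record_index helpers added to backends/utilities.py above reduce to the sketch below (assuming a polars release where LazyFrame.with_row_index exists; older releases spell the equivalent call with_row_count):

import polars as pl

RECORD_INDEX_COLUMN_NAME = "__record_index__"  # mirrors dve.core_engine.constants

lf = pl.LazyFrame({"a": [10, 20, 30]})
# Prepend a 1-based row index, as the add_record_index helper does.
indexed = lf.with_row_index(name=RECORD_INDEX_COLUMN_NAME, offset=1)
print(indexed.collect())  # __record_index__ runs 1..3 alongside column "a"

diff --git a/src/dve/metadata_parser/models.py index 18cdc68..73e6b5c 100644 --- a/src/dve/metadata_parser/models.py +++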
b/src/dve/metadata_parser/models.py @@ -371,6 +371,7 @@ class Config(pyd.BaseConfig): fields = self.aliases # type: ignore anystr_strip_whitespace = True allow_population_by_field_name = True + extra = pyd.Extra.ignore return pyd.create_model( # type: ignore model_name, diff --git a/src/dve/pipeline/pipeline.py b/src/dve/pipeline/pipeline.py index 46a89c2..a1fb3b9 100644 --- a/src/dve/pipeline/pipeline.py +++ b/src/dve/pipeline/pipeline.py @@ -563,8 +563,9 @@ def apply_business_rules( for parquet_uri, _ in fh.iter_prefix(contract): file_name = fh.get_file_name(parquet_uri) - entities[file_name] = self.step_implementations.read_parquet(parquet_uri) # type: ignore - entities[file_name] = self.step_implementations.add_row_id(entities[file_name]) # type: ignore + entities[file_name] = self.step_implementations.add_record_index( + self.step_implementations.read_parquet(parquet_uri) # type: ignore + ) entities[f"Original{file_name}"] = self.step_implementations.read_parquet(parquet_uri) # type: ignore sub_info_entity = ( diff --git a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py index 9e49338..53ea791 100644 --- a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py +++ b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py @@ -14,6 +14,7 @@ from dve.core_engine.backends.implementations.duckdb.readers.xml import DuckDBXMLStreamReader from dve.core_engine.backends.metadata.contract import DataContractMetadata, ReaderConfig from dve.core_engine.backends.utilities import stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.message import UserMessage from dve.core_engine.type_hints import URI from dve.core_engine.validation import RowValidator @@ -93,10 +94,12 @@ def test_duckdb_data_contract_csv(temp_csv_file): data_contract: DuckDBDataContract = DuckDBDataContract(connection) entities, feedback_errors_uri, stage_successful = data_contract.apply_data_contract(get_parent(uri.as_posix()), entities, entity_locations, dc_meta) rel: DuckDBPyRelation = entities.get("test_ds") - assert dict(zip(rel.columns, rel.dtypes)) == { + expected_schema = { fld.name: str(get_duckdb_type_from_annotation(fld.annotation)) for fld in mdl.__fields__.values() } + expected_schema[RECORD_INDEX_COLUMN_NAME] = get_duckdb_type_from_annotation(int) + assert dict(zip(rel.columns, rel.dtypes)) == expected_schema assert not get_resource_exists(feedback_errors_uri) assert stage_successful @@ -195,10 +198,12 @@ def test_duckdb_data_contract_xml(temp_xml_file): fld.name: get_duckdb_type_from_annotation(fld.type_) for fld in header_model.__fields__.values() } + header_expected_schema[RECORD_INDEX_COLUMN_NAME] = get_duckdb_type_from_annotation(int) class_data_expected_schema: Dict[str, DuckDBPyType] = { fld.name: get_duckdb_type_from_annotation(fld.type_) for fld in class_model.__fields__.values() } + class_data_expected_schema[RECORD_INDEX_COLUMN_NAME] = get_duckdb_type_from_annotation(int) class_data_rel: DuckDBPyRelation = entities.get("test_class_info") assert not get_resource_exists(feedback_errors_uri) assert header_rel.count("*").fetchone()[0] == 1 @@ -223,7 +228,7 @@ def test_ddb_data_contract_read_and_write_basic_parquet( "id": "VARCHAR", "datefield": "VARCHAR", "strfield": "VARCHAR", - "datetimefield": "VARCHAR", + "datetimefield": "VARCHAR" } # check processes 
entity contract_dict = json.loads(contract_meta).get("contract") @@ -266,6 +271,7 @@ def test_ddb_data_contract_read_and_write_basic_parquet( "datefield": "DATE", "strfield": "VARCHAR", "datetimefield": "TIMESTAMP", + RECORD_INDEX_COLUMN_NAME: get_duckdb_type_from_annotation(int) } @@ -282,7 +288,7 @@ def test_ddb_data_contract_read_nested_parquet(nested_all_string_parquet): "id": "VARCHAR", "strfield": "VARCHAR", "datetimefield": "VARCHAR", - "subfield": "STRUCT(id VARCHAR, substrfield VARCHAR, subarrayfield VARCHAR[])[]", + "subfield": "STRUCT(id VARCHAR, substrfield VARCHAR, subarrayfield VARCHAR[])[]" } # check processes entity contract_dict = json.loads(contract_meta).get("contract") @@ -325,6 +331,7 @@ def test_ddb_data_contract_read_nested_parquet(nested_all_string_parquet): "strfield": "VARCHAR", "datetimefield": "TIMESTAMP", "subfield": "STRUCT(id BIGINT, substrfield VARCHAR, subarrayfield DATE[])[]", + RECORD_INDEX_COLUMN_NAME: get_duckdb_type_from_annotation(int) } def test_duckdb_data_contract_custom_error_details(nested_all_string_parquet_w_errors, diff --git a/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py index 921c9be..70c6b9c 100644 --- a/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py +++ b/tests/test_core_engine/test_backends/test_implementations/test_spark/test_data_contract.py @@ -16,6 +16,7 @@ from dve.core_engine.backends.implementations.spark.contract import SparkDataContract from dve.core_engine.backends.metadata.contract import DataContractMetadata, ReaderConfig +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.message import UserMessage from dve.core_engine.type_hints import URI from dve.core_engine.validation import RowValidator @@ -89,6 +90,7 @@ def test_spark_data_contract_read_and_write_basic_parquet( StructField("datefield", DateType()), StructField("strfield", StringType()), StructField("datetimefield", TimestampType()), + StructField(RECORD_INDEX_COLUMN_NAME, LongType()) ] ) @@ -173,6 +175,7 @@ def test_spark_data_contract_read_nested_parquet(nested_all_string_parquet): ) ), ), + StructField(RECORD_INDEX_COLUMN_NAME, LongType()) ] ) diff --git a/tests/test_core_engine/test_backends/test_readers/test_csv.py b/tests/test_core_engine/test_backends/test_readers/test_csv.py index 4cd7e07..0737ad2 100644 --- a/tests/test_core_engine/test_backends/test_readers/test_csv.py +++ b/tests/test_core_engine/test_backends/test_readers/test_csv.py @@ -12,6 +12,7 @@ from dve.core_engine.backends.exceptions import EmptyFileError, FieldCountMismatch from dve.core_engine.backends.readers import CSVFileReader +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from ....conftest import get_test_file_path from ....fixtures import temp_dir @@ -25,10 +26,13 @@ def planet_location() -> Iterator[str]: @pytest.fixture(scope="function") def planet_data() -> Iterator[Dict[str, Dict[str, str]]]: - """The planet data, as loaded by Python's default parser.""" + """The expected planet data after reading, as loaded by Python's default parser.""" with get_test_file_path("planets/planets.csv").open("r", encoding="utf-8") as file: reader = csv.DictReader(file) - yield {row["planet"]: row for row in reader} + data = {} + for idx, row in enumerate(reader, start=1): + data[row["planet"]] = {RECORD_INDEX_COLUMN_NAME: idx, **row} + yield data @pytest.fixture(scope="function") 
@@ -138,7 +142,7 @@ def test_csv_file_get_subset( # Keep only keys in the subset from the source subset_keys = set(PlanetsSubset.__fields__.keys()) for data in planet_data.values(): - to_pop = set(data.keys()) - subset_keys + to_pop = set(data.keys()) - subset_keys - {RECORD_INDEX_COLUMN_NAME} for key in to_pop: del data[key] @@ -160,7 +164,7 @@ def test_csv_file_get_subset_add_missing( # Keep only keys in the subset from the source subset_keys = set(PlanetsSubset.__fields__.keys()) for data in planet_data.values(): - to_pop = set(data.keys()) - subset_keys + to_pop = set(data.keys()) - subset_keys - {RECORD_INDEX_COLUMN_NAME} for key in to_pop: del data[key] data["random_null"] = None # type: ignore @@ -182,7 +186,10 @@ def test_csv_file_filled_from_provided( results = list(reader.read_to_py_iterator(planet_location, "", Planets)) parsed = {row["planet"]: row for row in results} del parsed["planet"] + for rec in parsed.values(): + rec[RECORD_INDEX_COLUMN_NAME] -= 1 assert parsed == planet_data + def test_csv_file_raises_missing_cols(self, planet_location: str): """ @@ -235,7 +242,7 @@ def test_csv_file_can_be_pipe_delimited( """Test that a pipe-delimited CSV file can be parsed.""" reader = CSVFileReader(delimiter="|") results = list(reader.read_to_py_iterator(pipe_delimited_location, "", BasicModel)) - assert results == [{"ColumnA": "1", "ColumnB": "2", "ColumnC": "3"}] + assert results == [{"ColumnA": "1", "ColumnB": "2", "ColumnC": "3", RECORD_INDEX_COLUMN_NAME: 1}] @pytest.mark.parametrize(["schema"], [(None,), (Planets,)]) def test_base_csv_reader_parquet_write( @@ -252,5 +259,5 @@ def test_base_csv_reader_parquet_write( reader.write_parquet(entity=entity, target_location=target_location, schema=schema) assert sorted( pd.read_parquet(target_location).to_dict(orient="records"), - key=lambda x: x.get("planet"), - ) == sorted([dict(val) for val in planet_data.values()], key=lambda x: x.get("planet")) + key=lambda x: x.get(RECORD_INDEX_COLUMN_NAME), + ) == sorted([dict(val) for val in planet_data.values()], key=lambda x: x.get(RECORD_INDEX_COLUMN_NAME)) diff --git a/tests/test_core_engine/test_backends/test_readers/test_ddb_csv.py b/tests/test_core_engine/test_backends/test_readers/test_ddb_csv.py index 8f9d40d..619f78f 100644 --- a/tests/test_core_engine/test_backends/test_readers/test_ddb_csv.py +++ b/tests/test_core_engine/test_backends/test_readers/test_ddb_csv.py @@ -16,6 +16,7 @@ PolarsToDuckDBCSVReader, ) from dve.core_engine.backends.utilities import stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from tests.test_core_engine.test_backends.fixtures import duckdb_connection # pylint: disable=C0116 @@ -69,21 +70,25 @@ def test_ddb_csv_reader_all_str(temp_csv_file): rel: DuckDBPyRelation = reader.read_to_entity_type( DuckDBPyRelation, str(uri), "test", stringify_model(mdl) ) - assert rel.columns == header.split(",") - assert dict(zip(rel.columns, rel.dtypes)) == {fld: "VARCHAR" for fld in header.split(",")} - assert rel.fetchall() == [tuple(str(val) for val in rw) for rw in data] + expected_dtypes = {**{fld: "VARCHAR" for fld in header.split(",")}, RECORD_INDEX_COLUMN_NAME: "BIGINT"} + expected_data = [(*[str(val) for val in rw], idx) for idx, rw in enumerate(data, start=1)] + assert rel.columns == header.split(",") + [RECORD_INDEX_COLUMN_NAME] + assert dict(zip(rel.columns, rel.dtypes)) == expected_dtypes + assert rel.fetchall() == expected_data def test_ddb_csv_reader_cast(temp_csv_file): uri, header, data, mdl = temp_csv_file reader = 
DuckDBCSVReader(header=True, delim=",", connection=default_connection) rel: DuckDBPyRelation = reader.read_to_entity_type(DuckDBPyRelation, str(uri), "test", mdl) - assert rel.columns == header.split(",") - assert dict(zip(rel.columns, rel.dtypes)) == { + expected_dtypes = {**{ fld.name: str(get_duckdb_type_from_annotation(fld.annotation)) for fld in mdl.__fields__.values() - } - assert rel.fetchall() == [tuple(rw) for rw in data] + }, RECORD_INDEX_COLUMN_NAME: get_duckdb_type_from_annotation(int)} + expected_data = [(*rw, idx) for idx, rw in enumerate(data, start=1)] + assert rel.columns == header.split(",") + [RECORD_INDEX_COLUMN_NAME] + assert dict(zip(rel.columns, rel.dtypes)) == expected_dtypes + assert rel.fetchall() == expected_data def test_ddb_csv_write_parquet(temp_csv_file): @@ -95,7 +100,7 @@ target_loc: Path = uri.parent.joinpath("test_parquet.parquet").as_posix() reader.write_parquet(rel, target_loc) parquet_rel = reader._connection.read_parquet(target_loc) - assert parquet_rel.df().to_dict(orient="records") == rel.df().to_dict(orient="records") + assert sorted(parquet_rel.df().to_dict(orient="records"), key=lambda x: x.get(RECORD_INDEX_COLUMN_NAME)) == sorted([{**rec, RECORD_INDEX_COLUMN_NAME: idx} for idx, rec in enumerate(rel.df().to_dict(orient="records"), start=1)], key=lambda x: x.get(RECORD_INDEX_COLUMN_NAME)) def test_ddb_csv_read_empty_file(temp_empty_csv_file): diff --git a/tests/test_core_engine/test_backends/test_readers/test_ddb_json.py b/tests/test_core_engine/test_backends/test_readers/test_ddb_json.py index c326fef..6942c6a 100644 --- a/tests/test_core_engine/test_backends/test_readers/test_ddb_json.py +++ b/tests/test_core_engine/test_backends/test_readers/test_ddb_json.py @@ -13,6 +13,7 @@ ) from dve.core_engine.backends.implementations.duckdb.readers.json import DuckDBJSONReader from dve.core_engine.backends.utilities import stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from tests.test_core_engine.test_backends.fixtures import duckdb_connection @@ -59,9 +60,9 @@ def test_ddb_json_reader_all_str(temp_json_file): rel: DuckDBPyRelation = reader.read_to_entity_type( DuckDBPyRelation, uri.as_posix(), "test", stringify_model(mdl) ) - assert rel.columns == expected_fields - assert dict(zip(rel.columns, rel.dtypes)) == {fld: "VARCHAR" for fld in expected_fields} - assert rel.fetchall() == [tuple(str(val) for val in rw.values()) for rw in data] + assert rel.columns == expected_fields + [RECORD_INDEX_COLUMN_NAME] + assert dict(zip(rel.columns, rel.dtypes)) == {**{fld: "VARCHAR" for fld in expected_fields}, RECORD_INDEX_COLUMN_NAME: "BIGINT"} + assert rel.fetchall() == [(*[str(val) for val in rw.values()], idx) for idx, rw in enumerate(data, start=1)] def test_ddb_json_reader_cast(temp_json_file): @@ -70,15 +71,15 @@ uri, _, mdl = temp_json_file reader = DuckDBJSONReader() rel: DuckDBPyRelation = reader.read_to_entity_type(DuckDBPyRelation, uri.as_posix(), "test", mdl) - assert rel.columns == expected_fields - assert dict(zip(rel.columns, rel.dtypes)) == { + assert rel.columns == expected_fields + [RECORD_INDEX_COLUMN_NAME] + assert dict(zip(rel.columns, rel.dtypes)) == {**{ fld.name: str(get_duckdb_type_from_annotation(fld.annotation)) for fld in mdl.__fields__.values() - } - assert rel.fetchall() == [tuple(rw.values()) for rw in data] + }, RECORD_INDEX_COLUMN_NAME: "BIGINT"} + assert rel.fetchall() == [(*rw.values(), idx) for idx, rw in enumerate(data, start=1)] -def
test_ddb_csv_write_parquet(temp_json_file): +def test_ddb_json_write_parquet(temp_json_file): uri, _, mdl = temp_json_file reader = DuckDBJSONReader() rel: DuckDBPyRelation = reader.read_to_entity_type( diff --git a/tests/test_core_engine/test_backends/test_readers/test_ddb_xml.py b/tests/test_core_engine/test_backends/test_readers/test_ddb_xml.py index dad5b06..585f7b7 100644 --- a/tests/test_core_engine/test_backends/test_readers/test_ddb_xml.py +++ b/tests/test_core_engine/test_backends/test_readers/test_ddb_xml.py @@ -9,6 +9,7 @@ from pydantic import BaseModel from dve.core_engine.backends.implementations.duckdb.readers.xml import DuckDBXMLStreamReader +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME @pytest.fixture @@ -19,15 +20,15 @@ def temp_dir(): @pytest.fixture def temp_xml_file(temp_dir: Path): - header_data: Dict[str, str] = { + header_data: list[dict[str, str]] = [{ "school_name": "Meadow Fields", "category": "Primary", "headteacher": "Mrs Smith", - } - class_data: Dict[str, Dict[str, str]] = { + }] + class_data: list[dict[str, Dict[str, str]]] = [{ "year_1": {"class_size": "10", "teacher": "Mrs Armitage"}, "year_2": {"class_size": "12", "teacher": "Mr Barney"}, - } + }] class HeaderModel(BaseModel): school_name: str @@ -44,16 +45,17 @@ class ClassDataModel(BaseModel): root = ET.Element("root") header = ET.SubElement(root, "Header") - for nm, val in header_data.items(): + for nm, val in header_data[0].items(): _tag = ET.SubElement(header, nm) _tag.text = val - data = ET.SubElement(root, "ClassData") - for nm, val in class_data.items(): - _parent_tag = ET.SubElement(data, nm) - for sub_nm, sub_val in val.items(): - _child_tag = ET.SubElement(_parent_tag, sub_nm) - _child_tag.text = sub_val + for dta in class_data: + data = ET.SubElement(root, "ClassData") + for nm, val in dta.items(): + _parent_tag = ET.SubElement(data, nm) + for sub_nm, sub_val in val.items(): + _child_tag = ET.SubElement(_parent_tag, sub_nm) + _child_tag.text = sub_val with open(temp_dir.joinpath("test.xml"), mode="wb") as xml_fle: xml_fle.write(ET.tostring(root)) @@ -76,10 +78,12 @@ def test_ddb_xml_reader_all_str(temp_xml_file): class_rel: DuckDBPyRelation = class_reader.read_to_relation( uri.as_uri(), "class_data", class_data_model ) + expected_header = [{**recs, RECORD_INDEX_COLUMN_NAME: idx} for idx, recs in enumerate(header_data, start=1)] + expected_class = [{**recs, RECORD_INDEX_COLUMN_NAME: idx} for idx, recs in enumerate(class_data, start=1)] assert header_rel.count("*").fetchone()[0] == 1 - assert header_rel.df().to_dict("records")[0] == header_data + assert header_rel.df().to_dict("records") == expected_header assert class_rel.count("*").fetchone()[0] == 1 - assert class_rel.df().to_dict("records")[0] == class_data + assert class_rel.df().to_dict("records") == expected_class def test_ddb_xml_reader_write_parquet(temp_xml_file): diff --git a/tests/test_core_engine/test_backends/test_readers/test_spark_json.py b/tests/test_core_engine/test_backends/test_readers/test_spark_json.py index 3cbecb8..24674ca 100644 --- a/tests/test_core_engine/test_backends/test_readers/test_spark_json.py +++ b/tests/test_core_engine/test_backends/test_readers/test_spark_json.py @@ -7,13 +7,14 @@ import pytest from pydantic import BaseModel from pyspark.sql import DataFrame -from pyspark.sql.types import StructType, StructField, StringType +from pyspark.sql.types import LongType, StructType, StructField, StringType from dve.core_engine.backends.implementations.spark.spark_helpers import ( 
get_type_from_annotation, ) from dve.core_engine.backends.implementations.spark.readers.json import SparkJSONReader from dve.core_engine.backends.utilities import stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME class SimpleModel(BaseModel): @@ -54,25 +55,25 @@ def test_spark_json_reader_all_str(temp_json_file): uri, data, mdl = temp_json_file - expected_fields = [fld for fld in mdl.__fields__] + expected_fields = [fld for fld in mdl.__fields__] + [RECORD_INDEX_COLUMN_NAME] reader = SparkJSONReader() df: DataFrame = reader.read_to_entity_type( DataFrame, uri.as_posix(), "test", stringify_model(mdl) ) assert df.columns == expected_fields - assert df.schema == StructType([StructField(nme, StringType()) for nme in expected_fields]) - assert [rw.asDict() for rw in df.collect()] == [{k: str(v) for k, v in rw.items()} for rw in data] + assert df.schema == StructType([StructField(nme, LongType() if nme == RECORD_INDEX_COLUMN_NAME else StringType()) for nme in expected_fields]) + assert [rw.asDict() for rw in df.collect()] == [{**{k: str(v) for k, v in rw.items()}, RECORD_INDEX_COLUMN_NAME: idx} for idx, rw in enumerate(data, start=1)] def test_spark_json_reader_cast(temp_json_file): uri, data, mdl = temp_json_file - expected_fields = [fld for fld in mdl.__fields__] + expected_fields = [fld for fld in mdl.__fields__] + [RECORD_INDEX_COLUMN_NAME] reader = SparkJSONReader() df: DataFrame = reader.read_to_entity_type(DataFrame, uri.as_posix(), "test", mdl) assert df.columns == expected_fields assert df.schema == StructType([StructField(fld.name, get_type_from_annotation(fld.annotation)) - for fld in mdl.__fields__.values()]) - assert [rw.asDict() for rw in df.collect()] == data + for fld in mdl.__fields__.values()] + [StructField(RECORD_INDEX_COLUMN_NAME, get_type_from_annotation(int))]) + assert [rw.asDict() for rw in df.collect()] == [{**rw, RECORD_INDEX_COLUMN_NAME: idx} for idx, rw in enumerate(data, start=1)] def test_spark_json_write_parquet(spark, temp_json_file): diff --git a/tests/test_core_engine/test_message.py b/tests/test_core_engine/test_message.py index edf89fc..c74af6b 100644 --- a/tests/test_core_engine/test_message.py +++ b/tests/test_core_engine/test_message.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, ValidationError import pytest -from dve.core_engine.constants import ROWID_COLUMN_NAME +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.message import DEFAULT_ERROR_DETAIL, DataContractErrorDetail, FeedbackMessage @@ -16,10 +16,10 @@ def test_rowid_column_stripped(): """Ensure that the rowID column is stripped from FeedbackMessages.""" message = FeedbackMessage( - entity="entity", record={"key": "value", ROWID_COLUMN_NAME: "some identifier"} + entity="entity", record={"key": "value", RECORD_INDEX_COLUMN_NAME: "some identifier"} ) - assert message.record.get(ROWID_COLUMN_NAME) is None + assert message.record.get(RECORD_INDEX_COLUMN_NAME) is None @pytest.mark.parametrize( diff --git a/tests/testdata/planets/planets.dischema.json b/tests/testdata/planets/planets.dischema.json index 7a0387c..b44bb2e 100644 --- a/tests/testdata/planets/planets.dischema.json +++ b/tests/testdata/planets/planets.dischema.json @@ -114,8 +114,8 @@ }, { "entity": "planets", - "name": "has_row_id", - "expression": "__rowid__ IS NOT NULL" + "name": "has_record_index", + "expression": "__record_index__ IS NOT NULL" }, { "entity": "planets", diff --git a/tests/testdata/planets/planets_ddb.dischema.json
b/tests/testdata/planets/planets_ddb.dischema.json index 51e6650..0869aad 100644 --- a/tests/testdata/planets/planets_ddb.dischema.json +++ b/tests/testdata/planets/planets_ddb.dischema.json @@ -115,7 +115,7 @@ { "entity": "planets", "name": "has_row_id", - "expression": "__rowid__ IS NOT NULL" + "expression": "__record_index__ IS NOT NULL" }, { "entity": "planets", From bf170f49b7254a875c60e2d240dbcddcffd6f5b4 Mon Sep 17 00:00:00 2001 From: stevenhsd <56357022+stevenhsd@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:47:09 +0000 Subject: [PATCH 3/4] feat: integrate record index into error report --- src/dve/core_engine/message.py | 13 +++------ src/dve/pipeline/pipeline.py | 5 +++- src/dve/reporting/error_report.py | 1 + tests/features/books.feature | 27 ------------------ tests/features/movies.feature | 28 +++++++++---------- tests/features/steps/utilities.py | 1 + tests/test_core_engine/test_message.py | 12 -------- tests/test_pipeline/pipeline_helpers.py | 7 +++-- tests/test_pipeline/test_spark_pipeline.py | 7 +++++ .../movies/movies_ddb_rule_store.json | 2 +- .../movies/movies_spark_rule_store.json | 3 +- 11 files changed, 39 insertions(+), 67 deletions(-) diff --git a/src/dve/core_engine/message.py b/src/dve/core_engine/message.py index bfc1b3b..9e15de4 100644 --- a/src/dve/core_engine/message.py +++ b/src/dve/core_engine/message.py @@ -116,6 +116,8 @@ class UserMessage: "The offending values" Category: ErrorCategory "The category of error" + RecordIndex: Optional[int] = None + "The record index where the error occurred (if applicable)" @property def is_informational(self) -> bool: @@ -187,6 +189,7 @@ class FeedbackMessage: # pylint: disable=too-many-instance-attributes "ErrorMessage", "ErrorCode", "ReportingField", + "RecordIndex", "Value", "Category", ] @@ -224,15 +227,6 @@ def _validate_error_location(cls, value: Any) -> Optional[str]: return str(value) - @validator("record") - def _strip_rowid( # pylint: disable=no-self-argument - cls, value: Optional[dict[str, Any]] - ) -> Optional[dict[str, Any]]: - """Strip the row ID column from the record, if present.""" - if isinstance(value, dict): - value.pop(RECORD_INDEX_COLUMN_NAME, None) - return value - @property def is_critical(self) -> bool: """Whether the error is unrecoverable.""" @@ -333,6 +327,7 @@ def to_row( error_message, self.error_code, self.reporting_field_name or reporting_field, + self.record.get(RECORD_INDEX_COLUMN_NAME), value, self.category, ) diff --git a/src/dve/pipeline/pipeline.py b/src/dve/pipeline/pipeline.py index a1fb3b9..04ca8ed 100644 --- a/src/dve/pipeline/pipeline.py +++ b/src/dve/pipeline/pipeline.py @@ -432,7 +432,9 @@ def apply_data_contract( for path, _ in fh.iter_prefix(read_from): entity_locations[fh.get_file_name(path)] = path - entities[fh.get_file_name(path)] = self.data_contract.read_parquet(path) + entities[fh.get_file_name(path)] = self.data_contract.add_record_index( + self.data_contract.read_parquet(path) + ) key_fields = {model: conf.reporting_fields for model, conf in model_config.items()} @@ -743,6 +745,7 @@ def _get_error_dataframes(self, submission_id: str): pl.col("ErrorCode").alias("Error_Code"), # type: ignore pl.col("ReportingField").alias("Data_Item"), # type: ignore pl.col("ErrorMessage").alias("Error"), # type: ignore + pl.col("RecordIndex").alias("Record_Index"), pl.col("Value"), # type: ignore pl.col("Key").alias("ID"), # type: ignore pl.col("Category"), # type: ignore diff --git a/src/dve/reporting/error_report.py b/src/dve/reporting/error_report.py index 8852fcb..9e947bf 
100644 --- a/src/dve/reporting/error_report.py +++ b/src/dve/reporting/error_report.py @@ -18,6 +18,7 @@ "Error_Code": Utf8(), "Data_Item": Utf8(), "Error": Utf8(), + "Record_Index": pl.UInt32(), "Value": Utf8(), "ID": Utf8(), "Category": Utf8(), diff --git a/tests/features/books.feature b/tests/features/books.feature index f13658a..60cc5db 100644 --- a/tests/features/books.feature +++ b/tests/features/books.feature @@ -4,33 +4,6 @@ Feature: Pipeline tests using the books dataset This tests submissions using nested, complex JSON datasets with arrays, and introduces more complex transformations that require aggregation. - Scenario: Validate complex nested XML data (spark) - Given I submit the books file nested_books.XML for processing - And A spark pipeline is configured with schema file 'nested_books.dischema.json' - And I add initial audit entries for the submission - Then the latest audit record for the submission is marked with processing status file_transformation - When I run the file transformation phase - Then the header entity is stored as a parquet after the file_transformation phase - And the nested_books entity is stored as a parquet after the file_transformation phase - And the latest audit record for the submission is marked with processing status data_contract - When I run the data contract phase - Then there is 1 record rejection from the data_contract phase - And the header entity is stored as a parquet after the data_contract phase - And the nested_books entity is stored as a parquet after the data_contract phase - And the latest audit record for the submission is marked with processing status business_rules - When I run the business rules phase - Then The rules restrict "nested_books" to 3 qualifying records - And The entity "nested_books" contains an entry for "17.85" in column "total_value_of_books" - And the nested_books entity is stored as a parquet after the business_rules phase - And the latest audit record for the submission is marked with processing status error_report - When I run the error report phase - Then An error report is produced - And The statistics entry for the submission shows the following information - | parameter | value | - | record_count | 4 | - | number_record_rejections | 2 | - | number_warnings | 0 | - Scenario: Validate complex nested XML data (duckdb) Given I submit the books file nested_books.XML for processing And A duckdb pipeline is configured with schema file 'nested_books_ddb.dischema.json' diff --git a/tests/features/movies.feature b/tests/features/movies.feature index d737574..fa041ea 100644 --- a/tests/features/movies.feature +++ b/tests/features/movies.feature @@ -21,18 +21,18 @@ Feature: Pipeline tests using the movies dataset When I run the data contract phase Then there are 3 record rejections from the data_contract phase And there are errors with the following details and associated error_count from the data_contract phase - | Entity | ErrorCode | ErrorMessage | error_count | - | movies | BLANKYEAR | year not provided | 1 | - | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid | 1 | - | movies | DODGYDATE | date_joined value is not valid: daft_date | 1 | + | Entity | ErrorCode | ErrorMessage | RecordIndex | error_count | + | movies | BLANKYEAR | year not provided | 2 | 1 | + | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid | 1 | 1 | + | movies | DODGYDATE | date_joined value is not valid: daft_date | 1 | 1 | And the movies entity is stored as a parquet after the data_contract phase And 
the latest audit record for the submission is marked with processing status business_rules When I run the business rules phase Then The rules restrict "movies" to 4 qualifying records And there are errors with the following details and associated error_count from the business_rules phase - | ErrorCode | ErrorMessage | error_count | - | LIMITED_RATINGS | Movie has too few ratings ([6.5]) | 1 | - | RUBBISH_SEQUEL | The movie The Greatest Movie Ever has a rubbish sequel | 1 | + | ErrorCode | ErrorMessage | RecordIndex | error_count | + | LIMITED_RATINGS | Movie has too few ratings ([6.5]) | 4 | 1 | + | RUBBISH_SEQUEL | The movie The Greatest Movie Ever has a rubbish sequel | 1 | 1 | And the latest audit record for the submission is marked with processing status error_report When I run the error report phase Then An error report is produced @@ -57,18 +57,18 @@ Feature: Pipeline tests using the movies dataset When I run the data contract phase Then there are 3 record rejections from the data_contract phase And there are errors with the following details and associated error_count from the data_contract phase - | Entity | ErrorCode | ErrorMessage | error_count | - | movies | BLANKYEAR | year not provided | 1 | - | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid | 1 | - | movies | DODGYDATE | date_joined value is not valid: daft_date | 1 | + | Entity | ErrorCode | ErrorMessage | RecordIndex | error_count | + | movies | BLANKYEAR | year not provided | 2 | 1 | + | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid | 1 | 1 | + | movies | DODGYDATE | date_joined value is not valid: daft_date | 1 | 1 | And the movies entity is stored as a parquet after the data_contract phase And the latest audit record for the submission is marked with processing status business_rules When I run the business rules phase Then The rules restrict "movies" to 4 qualifying records And there are errors with the following details and associated error_count from the business_rules phase - | ErrorCode | ErrorMessage | error_count | - | LIMITED_RATINGS | Movie has too few ratings ([6.5]) | 1 | - | RUBBISH_SEQUEL | The movie The Greatest Movie Ever has a rubbish sequel | 1 | + | ErrorCode | ErrorMessage | RecordIndex | error_count | + | LIMITED_RATINGS | Movie has too few ratings ([6.5]) | 4 | 1 | + | RUBBISH_SEQUEL | The movie The Greatest Movie Ever has a rubbish sequel | 1 | 1 | And the latest audit record for the submission is marked with processing status error_report When I run the error report phase Then An error report is produced diff --git a/tests/features/steps/utilities.py b/tests/features/steps/utilities.py index aa9adc1..58edc67 100644 --- a/tests/features/steps/utilities.py +++ b/tests/features/steps/utilities.py @@ -23,6 +23,7 @@ "ErrorType", "ErrorLocation", "ErrorMessage", + "RecordIndex", "ReportingField", "Category", ] diff --git a/tests/test_core_engine/test_message.py b/tests/test_core_engine/test_message.py index c74af6b..ccb6736 100644 --- a/tests/test_core_engine/test_message.py +++ b/tests/test_core_engine/test_message.py @@ -8,20 +8,8 @@ from pydantic import BaseModel, ValidationError import pytest -from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.message import DEFAULT_ERROR_DETAIL, DataContractErrorDetail, FeedbackMessage - -def test_rowid_column_stripped(): - """Ensure that the rowID column is stripped from FeedbackMessages.""" - - message = FeedbackMessage( - entity="entity", record={"key": "value", 
RECORD_INDEX_COLUMN_NAME: "some identifier"} - ) - - assert message.record.get(RECORD_INDEX_COLUMN_NAME) is None - - @pytest.mark.parametrize( ("derived_column", "expected"), [ diff --git a/tests/test_pipeline/pipeline_helpers.py b/tests/test_pipeline/pipeline_helpers.py index ddd4ef8..b13bef3 100644 --- a/tests/test_pipeline/pipeline_helpers.py +++ b/tests/test_pipeline/pipeline_helpers.py @@ -152,6 +152,7 @@ def dodgy_planet_data_after_file_transformation() -> Iterator[Tuple[SubmissionIn "numberOfMoons": "-1", "hasRingSystem": "false", "hasGlobalMagneticField": "sometimes", + "__record_index__": "1" } planet_contract_df = pl.DataFrame( planet_contract_data, {k: pl.Utf8() for k in planet_contract_data} @@ -381,7 +382,8 @@ def error_data_after_business_rules() -> Iterator[Tuple[SubmissionInfo, str]]: "ErrorCode": "LONG_ORBIT", "ReportingField": "orbitalPeriod", "Value": "365.20001220703125", - "Category": "Bad value" + "Category": "Bad value", + "RecordIndex": "1" }, { "Entity": "planets", @@ -394,7 +396,8 @@ def error_data_after_business_rules() -> Iterator[Tuple[SubmissionInfo, str]]: "ErrorCode": "STRONG_GRAVITY", "ReportingField": "gravity", "Value": "9.800000190734863", - "Category": "Bad value" + "Category": "Bad value", + "RecordIndex": "1" } ]""" ) diff --git a/tests/test_pipeline/test_spark_pipeline.py b/tests/test_pipeline/test_spark_pipeline.py index 910626a..262d84f 100644 --- a/tests/test_pipeline/test_spark_pipeline.py +++ b/tests/test_pipeline/test_spark_pipeline.py @@ -175,6 +175,7 @@ def test_apply_data_contract_failed( # pylint: disable=redefined-outer-name "ErrorMessage": "is invalid", "ErrorCode": "BadValue", "ReportingField": "planet", + "RecordIndex": "1", "Value": "EarthEarthEarthEarthEarthEarthEarthEarthEarth", "Category": "Bad value", }, @@ -188,6 +189,7 @@ def test_apply_data_contract_failed( # pylint: disable=redefined-outer-name "ErrorMessage": "is invalid", "ErrorCode": "BadValue", "ReportingField": "numberOfMoons", + "RecordIndex": "1", "Value": "-1", "Category": "Bad value", }, @@ -201,6 +203,7 @@ def test_apply_data_contract_failed( # pylint: disable=redefined-outer-name "ErrorMessage": "is invalid", "ErrorCode": "BadValue", "ReportingField": "hasGlobalMagneticField", + "RecordIndex": "1", "Value": "sometimes", "Category": "Bad value", }, @@ -347,6 +350,7 @@ def test_apply_business_rules_with_data_errors( # pylint: disable=redefined-out "ReportingField": "orbitalPeriod", "Value": "365.20001220703125", "Category": "Bad value", + "RecordIndex": "1" }, { "Entity": "planets", @@ -360,6 +364,7 @@ def test_apply_business_rules_with_data_errors( # pylint: disable=redefined-out "ReportingField": "gravity", "Value": "9.800000190734863", "Category": "Bad value", + "RecordIndex": "1" }, ] @@ -504,6 +509,7 @@ def test_error_report_where_report_is_expected( # pylint: disable=redefined-out "Error Code": "LONG_ORBIT", "Data Item Submission Name": "orbitalPeriod", "Errors and Warnings": "Planet has long orbital period", + "Record Index": 1, "Value": 365.20001220703125, "ID": None, "Category": "Bad value", @@ -516,6 +522,7 @@ def test_error_report_where_report_is_expected( # pylint: disable=redefined-out "Error Code": "STRONG_GRAVITY", "Data Item Submission Name": "gravity", "Errors and Warnings": "Planet has too strong gravity", + "Record Index": 1, "Value": 9.800000190734863, "ID": None, "Category": "Bad value", diff --git a/tests/testdata/movies/movies_ddb_rule_store.json b/tests/testdata/movies/movies_ddb_rule_store.json index 843d4fa..6a51fd6 100644 --- 
a/tests/testdata/movies/movies_ddb_rule_store.json +++ b/tests/testdata/movies/movies_ddb_rule_store.json @@ -61,7 +61,7 @@ "name": "Get median sequel rating", "operation": "group_by", "entity": "with_sequels", - "group_by": "title", + "group_by": ["__record_index__", "title"], "agg_columns": { "list_aggregate(sequel_rating, 'median')": "median_sequel_rating" } diff --git a/tests/testdata/movies/movies_spark_rule_store.json b/tests/testdata/movies/movies_spark_rule_store.json index 08ad641..e8204c5 100644 --- a/tests/testdata/movies/movies_spark_rule_store.json +++ b/tests/testdata/movies/movies_spark_rule_store.json @@ -63,6 +63,7 @@ "entity": "with_sequels", "columns": { "title": "title", + "__record_index__": "__record_index__", "explode(sequel_rating)": "sequel_rating" } }, @@ -70,7 +71,7 @@ "name": "Get median sequel rating", "operation": "group_by", "entity": "with_sequels", - "group_by": "title", + "group_by": ["__record_index__","title"], "agg_columns": { "percentile_approx(sequel_rating, 0.5)": "median_sequel_rating" } From a0ec92d97058c23da97bf5d50494aa7996e164e6 Mon Sep 17 00:00:00 2001 From: stevenhsd <56357022+stevenhsd@users.noreply.github.com> Date: Fri, 6 Mar 2026 17:06:24 +0000 Subject: [PATCH 4/4] style: sort out formatting, linting and static typing --- src/dve/core_engine/backends/base/contract.py | 10 ++++---- src/dve/core_engine/backends/base/reader.py | 4 +-- .../core_engine/backends/base/utilities.py | 4 --- .../implementations/duckdb/contract.py | 5 ++-- .../implementations/duckdb/duckdb_helpers.py | 14 +++++++++-- .../implementations/duckdb/readers/csv.py | 21 +++++++++++++--- .../implementations/duckdb/readers/json.py | 7 ++++-- .../implementations/duckdb/readers/xml.py | 15 ++++++++--- .../backends/implementations/duckdb/rules.py | 1 + .../implementations/spark/contract.py | 3 ++- .../implementations/spark/readers/csv.py | 2 +- .../implementations/spark/readers/json.py | 3 ++- .../implementations/spark/readers/xml.py | 14 +++++++---- .../backends/implementations/spark/rules.py | 2 +- .../implementations/spark/spark_helpers.py | 25 ++++++++++++------- src/dve/core_engine/backends/readers/csv.py | 8 +++--- .../core_engine/backends/readers/utilities.py | 6 +---- src/dve/core_engine/backends/readers/xml.py | 9 +++---- src/dve/core_engine/backends/utilities.py | 10 ++++++-- src/dve/core_engine/message.py | 2 +- src/dve/core_engine/type_hints.py | 3 +++ src/dve/pipeline/pipeline.py | 8 +++--- 22 files changed, 111 insertions(+), 65 deletions(-) diff --git a/src/dve/core_engine/backends/base/contract.py b/src/dve/core_engine/backends/base/contract.py index 0304580..948ff77 100644 --- a/src/dve/core_engine/backends/base/contract.py +++ b/src/dve/core_engine/backends/base/contract.py @@ -337,9 +337,9 @@ def read_raw_entities( successful = True for entity_name, resource in entity_locations.items(): reader_metadata = contract_metadata.reader_metadata[entity_name] - extension = "." + ( - get_file_suffix(resource) or "" - ).lower() # Already checked that extension supported. + extension = ( + "." + (get_file_suffix(resource) or "").lower() + ) # Already checked that extension supported.
reader_config = reader_metadata[extension] reader_type = get_reader(reader_config.reader) @@ -368,11 +368,11 @@ def read_raw_entities( messages.extend(new_messages) return entities, dedup_messages(messages), successful - + def add_record_index(self, entity: EntityType, **kwargs) -> EntityType: """Add a record index to the entity""" raise NotImplementedError(f"add_record_index not implemented in {self.__class__}") - + def drop_record_index(self, entity: EntityType, **kwargs) -> EntityType: """Drop a record index from the entity""" raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}") diff --git a/src/dve/core_engine/backends/base/reader.py b/src/dve/core_engine/backends/base/reader.py index 498313a..ac30111 100644 --- a/src/dve/core_engine/backends/base/reader.py +++ b/src/dve/core_engine/backends/base/reader.py @@ -126,11 +126,11 @@ def read_to_entity_type( raise ReaderLacksEntityTypeSupport(entity_type=entity_type) from err return reader_func(self, resource, entity_name, schema) - + def add_record_index(self, entity: EntityType, **kwargs) -> EntityType: """Add a record index to the entity""" raise NotImplementedError(f"add_record_index not implemented in {self.__class__}") - + def drop_record_index(self, entity: EntityType, **kwargs) -> EntityType: """Drop a record index from the entity""" raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}") diff --git a/src/dve/core_engine/backends/base/utilities.py b/src/dve/core_engine/backends/base/utilities.py index 3cc2923..f55bc88 100644 --- a/src/dve/core_engine/backends/base/utilities.py +++ b/src/dve/core_engine/backends/base/utilities.py @@ -12,10 +12,6 @@ from dve.core_engine.type_hints import ExpressionArray, MultiExpression from dve.parser.type_hints import URI -import polars as pl - -from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME BRACKETS = {"(": ")", "{": "}", "[": "]", "<": ">"} """A mapping of opening brackets to their closing counterpart.""" STRING_START_CHARS = {'"', "'"} diff --git a/src/dve/core_engine/backends/implementations/duckdb/contract.py b/src/dve/core_engine/backends/implementations/duckdb/contract.py index 3dcfa5f..075573d 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/contract.py +++ b/src/dve/core_engine/backends/implementations/duckdb/contract.py @@ -16,7 +16,6 @@ from pydantic import BaseModel from pydantic.fields import ModelField -from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME import dve.parser.file_handling as fh from dve.common.error_utils import ( BackgroundMessageWriter, @@ -39,6 +38,7 @@ from dve.core_engine.backends.metadata.contract import DataContractMetadata from dve.core_engine.backends.types import StageSuccessful from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.message import FeedbackMessage from dve.core_engine.type_hints import URI, EntityLocations from dve.core_engine.validation import RowValidator, apply_row_validator_helper @@ -55,6 +55,7 @@ def __call__(self, row: pd.Series): self.errors.extend(self.row_validator(row.to_dict())[1]) # type: ignore return row # no op + @duckdb_record_index @duckdb_write_parquet @duckdb_read_parquet @@ -173,7 +174,7 @@ def apply_data_contract( msg_count += len(msgs) self.logger.info(f"Data contract found {msg_count} issues in {entity_name}") - + if not RECORD_INDEX_COLUMN_NAME in relation.columns: relation =
self.add_record_index(relation) diff --git a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py index 8456d3d..f5b0fe9 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +++ b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py @@ -288,18 +288,28 @@ def duckdb_rel_to_dictionaries( while rows := entity.fetchmany(batch_size): yield from (dict(zip(cols, rw)) for rw in rows) -def _add_duckdb_record_index(self, entity: DuckDBPyRelation) -> DuckDBPyRelation: + +def _add_duckdb_record_index( + self, entity: DuckDBPyRelation # pylint: disable=W0613 +) -> DuckDBPyRelation: + """Add record index to duckdb relation""" if RECORD_INDEX_COLUMN_NAME in entity.columns: return entity return entity.select(f"*, row_number() OVER () as {RECORD_INDEX_COLUMN_NAME}") -def _drop_duckdb_record_index(self, entity: DuckDBPyRelation) -> DuckDBPyRelation: + +def _drop_duckdb_record_index( + self, entity: DuckDBPyRelation # pylint: disable=W0613 +) -> DuckDBPyRelation: + """Drop record index from duckdb relation""" if RECORD_INDEX_COLUMN_NAME not in entity.columns: return entity return entity.select(StarExpression(exclude=[RECORD_INDEX_COLUMN_NAME])) + def duckdb_record_index(cls): + """Class decorator to add record index methods for duckdb implementations""" setattr(cls, "add_record_index", _add_duckdb_record_index) setattr(cls, "drop_record_index", _drop_duckdb_record_index) return cls diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py b/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py index 1844a9d..2d17f8a 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +++ b/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py @@ -6,7 +6,13 @@ import duckdb as ddb import polars as pl -from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression, default_connection, read_csv +from duckdb import ( + DuckDBPyConnection, + DuckDBPyRelation, + StarExpression, + default_connection, + read_csv, +) from pydantic import BaseModel from dve.core_engine.backends.base.reader import BaseFileReader, read_function @@ -24,6 +30,7 @@ from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length + @duckdb_record_index @duckdb_write_parquet class DuckDBCSVReader(BaseFileReader): @@ -113,6 +120,7 @@ def read_to_relation( # pylint: disable=unused-argument reader_options["columns"] = ddb_schema return self.add_record_index(read_csv(resource, **reader_options, parallel=False)) + @polars_record_index class PolarsToDuckDBCSVReader(DuckDBCSVReader): """ @@ -144,11 +152,14 @@ def read_to_relation( # pylint: disable=unused-argument for fld in schema.__fields__.values() } reader_options["dtypes"] = polars_types - # there is a raise_if_empty arg for 0.18+. Future reference when upgrading. 
Makes L85 # redundant - df = self.add_record_index(pl.scan_csv(resource, **reader_options).select(list(polars_types.keys()))) # type: ignore # pylint: disable=W0612 + df = self.add_record_index( # pylint: disable=W0612 + pl.scan_csv(resource, **reader_options).select( # type: ignore + list(polars_types.keys()) + ) + ) return ddb.sql("SELECT * FROM df") @@ -192,7 +203,9 @@ def __init__( def read_to_relation( # pylint: disable=unused-argument self, resource: URI, entity_name: EntityName, schema: type[BaseModel] ) -> DuckDBPyRelation: - entity: DuckDBPyRelation = super().read_to_relation(resource=resource, entity_name=entity_name, schema=schema) + entity: DuckDBPyRelation = super().read_to_relation( + resource=resource, entity_name=entity_name, schema=schema + ) entity = entity.select(StarExpression(exclude=[RECORD_INDEX_COLUMN_NAME])).distinct() no_records = entity.shape[0] diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/json.py b/src/dve/core_engine/backends/implementations/duckdb/readers/json.py index 5d7df37..8afb5a4 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +++ b/src/dve/core_engine/backends/implementations/duckdb/readers/json.py @@ -9,13 +9,14 @@ from dve.core_engine.backends.base.reader import BaseFileReader, read_function from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import ( + duckdb_record_index, duckdb_write_parquet, get_duckdb_type_from_annotation, ) from dve.core_engine.backends.implementations.duckdb.types import SQLType -from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_record_index from dve.core_engine.type_hints import URI, EntityName + @duckdb_record_index @duckdb_write_parquet class DuckDBJSONReader(BaseFileReader): @@ -48,4 +49,6 @@ def read_to_relation( # pylint: disable=unused-argument for fld in schema.__fields__.values() } - return self.add_record_index(read_json(resource, columns=ddb_schema, format=self._json_format)) # type: ignore + return self.add_record_index( + read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore + ) diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py b/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py index c4d7d7c..a10998c 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +++ b/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py @@ -11,9 +11,14 @@ from dve.core_engine.backends.exceptions import MessageBearingError from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_write_parquet from dve.core_engine.backends.readers.xml import XMLStreamReader -from dve.core_engine.backends.utilities import get_polars_type_from_annotation, polars_record_index, stringify_model +from dve.core_engine.backends.utilities import ( + get_polars_type_from_annotation, + polars_record_index, + stringify_model, +) from dve.core_engine.type_hints import URI + @polars_record_index @duckdb_write_parquet class DuckDBXMLStreamReader(XMLStreamReader): @@ -39,7 +44,9 @@ def read_to_relation(self, resource: URI, entity_name: str, schema: type[BaseMod for fld in stringify_model(schema).__fields__.values() } - _lazy_frame = self.add_record_index(pl.LazyFrame( - data=self.read_to_py_iterator(resource, entity_name, schema), schema=polars_schema - )) + _lazy_frame = self.add_record_index( + pl.LazyFrame( + data=self.read_to_py_iterator(resource, entity_name, schema), schema=polars_schema + ) + ) return 
self.ddb_connection.sql("select * from _lazy_frame") diff --git a/src/dve/core_engine/backends/implementations/duckdb/rules.py b/src/dve/core_engine/backends/implementations/duckdb/rules.py index dd252b1..7ed775c 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/rules.py +++ b/src/dve/core_engine/backends/implementations/duckdb/rules.py @@ -57,6 +57,7 @@ from dve.core_engine.templating import template_object from dve.core_engine.type_hints import Messages + @duckdb_record_index @duckdb_write_parquet @duckdb_read_parquet diff --git a/src/dve/core_engine/backends/implementations/spark/contract.py b/src/dve/core_engine/backends/implementations/spark/contract.py index 330c73e..6152ad7 100644 --- a/src/dve/core_engine/backends/implementations/spark/contract.py +++ b/src/dve/core_engine/backends/implementations/spark/contract.py @@ -17,7 +17,6 @@ dump_processing_errors, get_feedback_errors_uri, ) - from dve.core_engine.backends.base.contract import BaseDataContract, reader_override from dve.core_engine.backends.base.utilities import generate_error_casting_entity_message from dve.core_engine.backends.exceptions import ( @@ -42,6 +41,7 @@ COMPLEX_TYPES: set[type[DataType]] = {StructType, ArrayType, MapType} """Spark types indicating complex types.""" + @spark_record_index @spark_write_parquet @spark_read_parquet @@ -86,6 +86,7 @@ def create_entity_from_py_iterator( schema=get_type_from_annotation(schema), ) + # pylint: disable=R0915 def apply_data_contract( self, working_dir: URI, diff --git a/src/dve/core_engine/backends/implementations/spark/readers/csv.py b/src/dve/core_engine/backends/implementations/spark/readers/csv.py index f3114bc..5b0bb3d 100644 --- a/src/dve/core_engine/backends/implementations/spark/readers/csv.py +++ b/src/dve/core_engine/backends/implementations/spark/readers/csv.py @@ -11,9 +11,9 @@ from dve.core_engine.backends.exceptions import EmptyFileError from dve.core_engine.backends.implementations.spark.spark_helpers import ( get_type_from_annotation, + spark_record_index, spark_write_parquet, ) -from dve.core_engine.backends.implementations.spark.spark_helpers import spark_record_index from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length diff --git a/src/dve/core_engine/backends/implementations/spark/readers/json.py b/src/dve/core_engine/backends/implementations/spark/readers/json.py index b3a5fa6..0b4a09f 100644 --- a/src/dve/core_engine/backends/implementations/spark/readers/json.py +++ b/src/dve/core_engine/backends/implementations/spark/readers/json.py @@ -11,12 +11,13 @@ from dve.core_engine.backends.exceptions import EmptyFileError from dve.core_engine.backends.implementations.spark.spark_helpers import ( get_type_from_annotation, + spark_record_index, spark_write_parquet, ) -from dve.core_engine.backends.implementations.spark.spark_helpers import spark_record_index from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length + @spark_record_index @spark_write_parquet class SparkJSONReader(BaseFileReader): diff --git a/src/dve/core_engine/backends/implementations/spark/readers/xml.py b/src/dve/core_engine/backends/implementations/spark/readers/xml.py index 028a430..39433b3 100644 --- a/src/dve/core_engine/backends/implementations/spark/readers/xml.py +++ b/src/dve/core_engine/backends/implementations/spark/readers/xml.py @@ -17,9 +17,9 @@ from dve.core_engine.backends.implementations.spark.spark_helpers import ( df_is_empty, 
get_type_from_annotation, + spark_record_index, spark_write_parquet, ) -from dve.core_engine.backends.implementations.spark.spark_helpers import spark_record_index from dve.core_engine.backends.readers.xml import BasicXMLFileReader, XMLStreamReader from dve.core_engine.type_hints import URI, EntityName from dve.parser.file_handling import get_content_length @@ -28,6 +28,7 @@ SparkXMLMode = Literal["PERMISSIVE", "FAILFAST", "DROPMALFORMED"] """The mode to use when parsing XML files with Spark.""" + @spark_record_index @spark_write_parquet class SparkXMLStreamReader(XMLStreamReader): @@ -46,10 +47,13 @@ def read_to_dataframe( if not self.spark: self.spark = SparkSession.builder.getOrCreate() # type: ignore spark_schema = get_type_from_annotation(schema) - return self.add_record_index(self.spark.createDataFrame( # type: ignore - list(self.read_to_py_iterator(resource, entity_name, schema)), - schema=spark_schema, - )) + return self.add_record_index( + self.spark.createDataFrame( # type: ignore + list(self.read_to_py_iterator(resource, entity_name, schema)), + schema=spark_schema, + ) + ) + @spark_record_index @spark_write_parquet diff --git a/src/dve/core_engine/backends/implementations/spark/rules.py b/src/dve/core_engine/backends/implementations/spark/rules.py index c970f28..5d1cfe0 100644 --- a/src/dve/core_engine/backends/implementations/spark/rules.py +++ b/src/dve/core_engine/backends/implementations/spark/rules.py @@ -44,12 +44,12 @@ SemiJoin, TableUnion, ) - from dve.core_engine.functions import implementations as functions from dve.core_engine.message import FeedbackMessage from dve.core_engine.templating import template_object from dve.core_engine.type_hints import Messages + @spark_record_index @spark_write_parquet @spark_read_parquet diff --git a/src/dve/core_engine/backends/implementations/spark/spark_helpers.py b/src/dve/core_engine/backends/implementations/spark/spark_helpers.py index 9272e96..4381fdd 100644 --- a/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +++ b/src/dve/core_engine/backends/implementations/spark/spark_helpers.py @@ -17,20 +17,17 @@ from delta.exceptions import ConcurrentAppendException, DeltaConcurrentModificationException from pydantic import BaseModel from pydantic.types import ConstrainedDecimal -from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import DataFrame, Row, SparkSession from pyspark.sql import functions as sf from pyspark.sql import types as st from pyspark.sql.column import Column from pyspark.sql.functions import lit, udf +from pyspark.sql.types import LongType, StructField, StructType from typing_extensions import Annotated, Protocol, TypedDict, get_args, get_origin, get_type_hints from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type -from dve.core_engine.type_hints import URI - -from pyspark.sql import DataFrame, Row -from pyspark.sql.types import LongType, StructField, StructType - from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME +from dve.core_engine.type_hints import URI # It would be really nice if there was a more parameterisable # way of doing this. 
@@ -416,19 +413,29 @@ def _inner(*args, **kwargs): return _wrapper -def _add_spark_record_index(self, entity: DataFrame) -> DataFrame: + +def _add_spark_record_index(self, entity: DataFrame) -> DataFrame: # pylint: disable=W0613 + """Add a record index to spark dataframe""" if RECORD_INDEX_COLUMN_NAME in entity.columns: return entity schema: StructType = entity.schema schema.add(StructField(RECORD_INDEX_COLUMN_NAME, LongType())) - return entity.rdd.zipWithIndex().map(lambda x: Row(**x[0].asDict(True), RECORD_INDEX_COLUMN_NAME=x[1] + 1)).toDF(schema=schema) + return ( + entity.rdd.zipWithIndex() + .map(lambda x: Row(**x[0].asDict(True), RECORD_INDEX_COLUMN_NAME=x[1] + 1)) + .toDF(schema=schema) + ) -def _drop_spark_record_index(self, entity: DataFrame) -> DataFrame: + +def _drop_spark_record_index(self, entity: DataFrame) -> DataFrame: # pylint: disable=W0613 + """Drop record index from spark dataframe""" if not RECORD_INDEX_COLUMN_NAME in entity.columns: return entity return entity.drop(RECORD_INDEX_COLUMN_NAME) + def spark_record_index(cls): + """Class decorator to add record index methods for spark implementations""" setattr(cls, "add_record_index", _add_spark_record_index) setattr(cls, "drop_record_index", _drop_spark_record_index) return cls diff --git a/src/dve/core_engine/backends/readers/csv.py b/src/dve/core_engine/backends/readers/csv.py index b77a89a..edd6bf0 100644 --- a/src/dve/core_engine/backends/readers/csv.py +++ b/src/dve/core_engine/backends/readers/csv.py @@ -15,7 +15,6 @@ FieldCountMismatch, MissingHeaderError, ) - from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import EntityName @@ -206,9 +205,9 @@ def read_to_py_iterator( ) coerce_func = partial(self._coerce, field_names=field_names) - for idx, rw in enumerate(map(coerce_func, reader), start=1): - rw[RECORD_INDEX_COLUMN_NAME] = idx - yield rw + for idx, record in enumerate(map(coerce_func, reader), start=1): + record[RECORD_INDEX_COLUMN_NAME] = idx # type: ignore + yield record def write_parquet( # type: ignore self, @@ -228,7 +227,6 @@ def write_parquet( # type: ignore for fld in stringify_model(schema).__fields__.values() } polars_schema[RECORD_INDEX_COLUMN_NAME] = get_polars_type_from_annotation(int) - pl.LazyFrame(data=entity, schema=polars_schema).sink_parquet( path=target_location, compression="snappy" diff --git a/src/dve/core_engine/backends/readers/utilities.py b/src/dve/core_engine/backends/readers/utilities.py index 2281432..642c0b2 100644 --- a/src/dve/core_engine/backends/readers/utilities.py +++ b/src/dve/core_engine/backends/readers/utilities.py @@ -2,16 +2,12 @@ from typing import Optional -from duckdb import DuckDBPyRelation -import polars as pl from pydantic import BaseModel -from pyspark.sql import DataFrame, Row -from pyspark.sql.types import LongType, StructField, StructType -from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME from dve.core_engine.type_hints import URI from dve.parser.file_handling.service import open_stream + def check_csv_header_expected( resource: URI, expected_schema: type[BaseModel], diff --git a/src/dve/core_engine/backends/readers/xml.py b/src/dve/core_engine/backends/readers/xml.py index fa2835c..4620402 100644 --- a/src/dve/core_engine/backends/readers/xml.py +++ b/src/dve/core_engine/backends/readers/xml.py @@ -12,7 +12,6 @@ from dve.core_engine.backends.base.reader import BaseFileReader from 
dve.core_engine.backends.exceptions import EmptyFileError - from dve.core_engine.backends.readers.xml_linting import run_xmllint from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME @@ -312,10 +311,10 @@ def read_to_py_iterator( raise EmptyFileError(f"File at {resource!r} is empty") with open_stream(resource, "rb") as stream: - for idx, rw in enumerate(self._parse_xml(stream, schema), start=1): - rw[RECORD_INDEX_COLUMN_NAME] = idx - yield rw - + for idx, record in enumerate(self._parse_xml(stream, schema), start=1): + record[RECORD_INDEX_COLUMN_NAME] = idx # type: ignore + yield record + def write_parquet( # type: ignore self, entity: Iterator[dict[str, Any]], diff --git a/src/dve/core_engine/backends/utilities.py b/src/dve/core_engine/backends/utilities.py index 6a2918f..62eb9e2 100644 --- a/src/dve/core_engine/backends/utilities.py +++ b/src/dve/core_engine/backends/utilities.py @@ -177,17 +177,23 @@ def get_polars_type_from_annotation(type_annotation: Any) -> PolarsType: return polars_type raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}") -def _add_polars_record_index(self, entity: pl.LazyFrame) -> pl.LazyFrame: + +def _add_polars_record_index(self, entity: pl.LazyFrame) -> pl.LazyFrame: # pylint: disable=W0613 + """Add a record index to polars dataframe""" if RECORD_INDEX_COLUMN_NAME in entity.columns: return entity return entity.with_row_index(name=RECORD_INDEX_COLUMN_NAME, offset=1) -def _drop_polars_record_index(self, entity: pl.LazyFrame) -> pl.LazyFrame: + +def _drop_polars_record_index(self, entity: pl.LazyFrame) -> pl.LazyFrame: # pylint: disable=W0613 + """Drop record index from polars dataframe""" if not RECORD_INDEX_COLUMN_NAME in entity.columns: return entity return entity.drop(RECORD_INDEX_COLUMN_NAME) + def polars_record_index(cls): + """Class decorator to add record index methods for polars implementations""" setattr(cls, "add_record_index", _add_polars_record_index) setattr(cls, "drop_record_index", _drop_polars_record_index) return cls diff --git a/src/dve/core_engine/message.py b/src/dve/core_engine/message.py index 9e15de4..627ae3a 100644 --- a/src/dve/core_engine/message.py +++ b/src/dve/core_engine/message.py @@ -327,7 +327,7 @@ def to_row( error_message, self.error_code, self.reporting_field_name or reporting_field, - self.record.get(RECORD_INDEX_COLUMN_NAME), + (self.record.get(RECORD_INDEX_COLUMN_NAME) if self.record else None), value, self.category, ) diff --git a/src/dve/core_engine/type_hints.py b/src/dve/core_engine/type_hints.py index afb6d9d..3112e28 100644 --- a/src/dve/core_engine/type_hints.py +++ b/src/dve/core_engine/type_hints.py @@ -135,6 +135,8 @@ """The value that caused the error.""" ErrorCategory = Literal["Blank", "Wrong format", "Bad value", "Bad file"] """A string indicating the category of the error.""" +RecordIndex = Optional[int] +"""The record index that the error relates to (if applicable)""" MessageTuple = tuple[ Optional[EntityName], @@ -146,6 +148,7 @@ ErrorMessage, ErrorCode, ReportingField, + RecordIndex, Optional[FieldValue], Optional[ErrorCategory], ] diff --git a/src/dve/pipeline/pipeline.py b/src/dve/pipeline/pipeline.py index 04ca8ed..b14ada1 100644 --- a/src/dve/pipeline/pipeline.py +++ b/src/dve/pipeline/pipeline.py @@ -434,7 +434,7 @@ def apply_data_contract( entity_locations[fh.get_file_name(path)] = path entities[fh.get_file_name(path)] = self.data_contract.add_record_index( 
self.data_contract.read_parquet(path) - ) + ) key_fields = {model: conf.reporting_fields for model, conf in model_config.items()} @@ -565,9 +565,9 @@ def apply_business_rules( for parquet_uri, _ in fh.iter_prefix(contract): file_name = fh.get_file_name(parquet_uri) - entities[file_name] = self.step_implementations.add_record_index( - self.step_implementations.read_parquet(parquet_uri) # type: ignore - ) + entities[file_name] = self.step_implementations.add_record_index( # type: ignore + self.step_implementations.read_parquet(parquet_uri) # type: ignore + ) entities[f"Original{file_name}"] = self.step_implementations.read_parquet(parquet_uri) # type: ignore sub_info_entity = (