
Commit d770f6c

Merge branch 'datacontract:main' into main

2 parents 05b3c83 + bf47fea

10 files changed

Lines changed: 288 additions & 5 deletions


CHANGELOG.md

Lines changed: 5 additions & 1 deletion
@@ -7,9 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+
+## [0.10.34] - 2025-08-06
+
 ### Added
 
-- `datacontract test` now supports testing HTTP APIs.
+- `datacontract test` now supports HTTP APIs.
+- `datacontract test` now supports Athena.
 
 ### Fixed

README.md

Lines changed: 49 additions & 2 deletions
@@ -161,6 +161,14 @@ if not run.has_passed():
 
 Choose the most appropriate installation method for your needs:
 
+### uv
+
+If you have [uv](https://docs.astral.sh/uv/) installed, you can run datacontract-cli directly without installing:
+
+```
+uv run --with 'datacontract-cli[all]' datacontract --version
+```
+
 ### pip
 Python 3.10, 3.11, and 3.12 are supported. We recommend to use Python 3.11.

@@ -222,6 +230,7 @@ A list of available extras:
 
 | Dependency              | Installation Command                        |
 |-------------------------|---------------------------------------------|
+| Amazon Athena           | `pip install datacontract-cli[athena]`      |
 | Avro Support            | `pip install datacontract-cli[avro]`        |
 | Google BigQuery         | `pip install datacontract-cli[bigquery]`    |
 | Databricks Integration  | `pip install datacontract-cli[databricks]`  |
@@ -366,6 +375,7 @@ Credentials are provided with environment variables.
 Supported server types:
 
 - [s3](#S3)
+- [athena](#athena)
 - [bigquery](#bigquery)
 - [azure](#azure)
 - [sqlserver](#sqlserver)
@@ -436,6 +446,41 @@ servers:
 | `DATACONTRACT_S3_SESSION_TOKEN` | `AQoDYXdzEJr...` | AWS temporary session token (optional) |
 
 
+#### Athena
+
+Data Contract CLI can test data in AWS Athena stored in S3.
+Supports different file formats, such as Iceberg, Parquet, JSON, CSV...
+
+##### Example
+
+datacontract.yaml
+```yaml
+servers:
+  athena:
+    type: athena
+    catalog: awsdatacatalog # awsdatacatalog is the default setting
+    schema: icebergdemodb # in Athena, this is called "database"
+    regionName: eu-central-1
+    stagingDir: s3://my-bucket/athena-results/
+models:
+  my_table: # corresponds to a table or view name
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: string
+        config:
+          physicalType: varchar
+```
+
+##### Environment Variables
+
+| Environment Variable                | Example                         | Description                             |
+|-------------------------------------|---------------------------------|-----------------------------------------|
+| `DATACONTRACT_S3_REGION`            | `eu-central-1`                  | Region of Athena service                |
+| `DATACONTRACT_S3_ACCESS_KEY_ID`     | `AKIAXV5Q5QABCDEFGH`            | AWS Access Key ID                       |
+| `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key                   |
+| `DATACONTRACT_S3_SESSION_TOKEN`     | `AQoDYXdzEJr...`                | AWS temporary session token (optional)  |
+
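Once the server block above is in place, testing it from Python mirrors the library example earlier in the README. A minimal sketch, assuming the `datacontract.yaml` above sits in the working directory and the `DATACONTRACT_S3_*` variables are exported:

```python
from datacontract.data_contract import DataContract

# "athena" is the server key from the servers block in the example above.
data_contract = DataContract(data_contract_file="datacontract.yaml", server="athena")
run = data_contract.test()
if not run.has_passed():
    print("Data quality validation failed")
```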
 
 #### Google Cloud Storage (GCS)

@@ -898,8 +943,10 @@ models:
 │ --engine      TEXT  [engine] The engine used for great       │
 │                     expection run.                           │
 │                     [default: None]                          │
-│ --template    PATH  [custom] The file path of Jinja          │
-│                     template.                                │
+│ --template    PATH  The file path or URL of a template.      │
+│                     For Excel format: path/URL to custom     │
+│                     Excel template. For custom format:       │
+│                     path to Jinja template.                  │
 │                     [default: None]                          │
 │ --help              Show this message and exit.              │
 ╰───────────────────────────────────────────────────────────────╯

datacontract/engines/soda/check_soda_execute.py

Lines changed: 6 additions & 0 deletions
@@ -2,6 +2,8 @@
 import typing
 import uuid
 
+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+
 if typing.TYPE_CHECKING:
     from pyspark.sql import SparkSession

@@ -106,6 +108,10 @@ def check_soda_execute(
         soda_configuration_str = to_trino_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
 
     else:
         run.checks.append(

datacontract/engines/soda/connections/athena.py

Lines changed: 79 additions & 0 deletions

@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.stagingDir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional. Identifies the name of the data source, also referred to as a catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["aws_session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
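A quick sketch of what this helper emits (not part of the commit), using `types.SimpleNamespace` as a stand-in for the real `Server` model and placeholder credentials:

```python
import os
from types import SimpleNamespace

from datacontract.engines.soda.connections.athena import to_athena_soda_configuration

# Placeholder credentials; real values come from your AWS account.
os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "AKIA..."
os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "secret"

# SimpleNamespace stands in for the datacontract Server model.
server = SimpleNamespace(
    type="athena",
    schema_="icebergdemodb",
    stagingDir="s3://my-bucket/athena-results/",
    catalog="awsdatacatalog",
    region_name="eu-central-1",
)

print(to_athena_soda_configuration(server))
# Expected shape (yaml.dump sorts keys alphabetically):
# data_source athena:
#   access_key_id: AKIA...
#   catalog: awsdatacatalog
#   region_name: eu-central-1
#   schema: icebergdemodb
#   secret_access_key: secret
#   staging_dir: s3://my-bucket/athena-results/
#   type: athena
```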

datacontract/export/sql_type_converter.py

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,9 @@
 
 
 def convert_to_sql_type(field: Field, server_type: str) -> str:
+    if field.config and "physicalType" in field.config:
+        return field.config["physicalType"]
+
     if server_type == "snowflake":
         return convert_to_snowflake(field)
     elif server_type == "postgres":
@@ -19,6 +22,7 @@ def convert_to_sql_type(field: Field, server_type: str) -> str:
         return convert_type_to_bigquery(field)
     elif server_type == "trino":
         return convert_type_to_trino(field)
+
     return field.type
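The new guard gives an explicit `physicalType` in a field's `config` precedence over every per-server mapping. A sketch, assuming the `Field` model accepts these keyword arguments:

```python
from datacontract.export.sql_type_converter import convert_to_sql_type
from datacontract.model.data_contract_specification import Field

# The explicit physicalType wins, regardless of server type.
field = Field(type="string", config={"physicalType": "varchar"})
assert convert_to_sql_type(field, "athena") == "varchar"

# Without the override, the server-specific conversion applies as before.
plain = Field(type="string")
print(convert_to_sql_type(plain, "snowflake"))  # whatever convert_to_snowflake maps "string" to
```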

datacontract/imports/odcs_v3_importer.py

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None:
         server.host = odcs_server.host
         server.port = odcs_server.port
         server.catalog = odcs_server.catalog
+        server.stagingDir = odcs_server.stagingDir
         server.topic = getattr(odcs_server, "topic", None)
         server.http_path = getattr(odcs_server, "http_path", None)
         server.token = getattr(odcs_server, "token", None)
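With this one-liner, a `stagingDir` declared on an ODCS server (as in the iceberg fixture below) survives the import. A sketch; the ODCS model class and its import path are assumptions based on the importer's signature:

```python
import yaml

# Assumed import path for the ODCS model used in the importer's type hints.
from open_data_contract_standard.model import OpenDataContractStandard

from datacontract.imports.odcs_v3_importer import import_servers

with open("tests/fixtures/athena-iceberg/iceberg_example.odcs.yaml") as f:
    odcs = OpenDataContractStandard(**yaml.safe_load(f))

servers = import_servers(odcs)
print(servers["athena"].stagingDir)  # s3://entropy-data-demo-athena-results-dfhsiuya/cli
```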

pyproject.toml

Lines changed: 6 additions & 2 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "datacontract-cli"
-version = "0.10.33"
+version = "0.10.34"
 description = "The datacontract CLI is an open source command-line tool for working with Data Contracts. It uses data contract YAML files to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library."
 license = "MIT"
 readme = "README.md"
@@ -92,6 +92,10 @@ sqlserver = [
     "soda-core-sqlserver>=3.3.20,<3.6.0"
 ]
 
+athena = [
+    "soda-core-athena>=3.3.20,<3.6.0"
+]
+
 trino = [
     "soda-core-trino>=3.3.20,<3.6.0"
 ]
@@ -122,7 +126,7 @@ protobuf = [
 ]
 
 all = [
-    "datacontract-cli[kafka,bigquery,csv,excel,snowflake,postgres,databricks,sqlserver,s3,trino,dbt,dbml,iceberg,parquet,rdf,api,protobuf]"
+    "datacontract-cli[kafka,bigquery,csv,excel,snowflake,postgres,databricks,sqlserver,s3,athena,trino,dbt,dbml,iceberg,parquet,rdf,api,protobuf]"
 ]
 
 # for development, we pin all libraries to an exact version

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+Setup:
+
+# Create an S3 bucket for Iceberg data
+
+
+```
+s3://datacontract-iceberg-demo
+```
+
+# Create an S3 bucket for Athena Results
+
+```
+s3://entropy-data-demo-athena-results-dfhsiuya
+```
+
+# Create a Glue database
+
+In Athena run:
+```
+CREATE DATABASE icebergdemodb
+```
+
+# Create an Iceberg table
+In Athena run:
+```
+CREATE TABLE athena_iceberg_table_partitioned (
+  color string,
+  date string,
+  name string,
+  price bigint,
+  product string,
+  ts timestamp)
+PARTITIONED BY (day(ts))
+LOCATION 's3://datacontract-iceberg-demo/ice_warehouse/iceberg_db/athena_iceberg_table/'
+TBLPROPERTIES (
+  'table_type' ='ICEBERG'
+)
+```
+
+# Add some data to the Iceberg table
+
+In Athena run:
+```
+INSERT INTO "icebergdemodb"."athena_iceberg_table_partitioned" VALUES (
+  'red', '222022-07-19T03:47:29', 'PersonNew', 178, 'Tuna', now()
+)
+```
+
+# Add a new IAM user
+No permissions needed
+
+E.g. `datacontract-cli-unittests`
+
+# Create an Access Key for this IAM user
+
+Use type `other`
+Save them in .env file
+```
+DATACONTRACT_S3_ACCESS_KEY_ID=AKIA...
+DATACONTRACT_S3_SECRET_ACCESS_KEY=...
+```
+
+# Give permissions to the IAM user
+
+In Glue ->
+https://eu-central-1.console.aws.amazon.com/glue/home?region=eu-central-1#/v2/iam-permissions/select-users
+
+Select the S3 bucket
+
+Create the standard role `AWSGlueServiceRole`
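An optional sanity check before running the test suite (a sketch, not part of the commit) that the new key pair is usable, via boto3:

```python
import os

import boto3
from dotenv import load_dotenv

load_dotenv()

# Fails fast if the access key pair from the .env file is invalid.
athena = boto3.client(
    "athena",
    region_name="eu-central-1",
    aws_access_key_id=os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"],
)
print([wg["Name"] for wg in athena.list_work_groups()["WorkGroups"]])
```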
tests/fixtures/athena-iceberg/iceberg_example.odcs.yaml

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+apiVersion: v3.0.1
+kind: DataContract
+id: iceberg-example
+name: Iceberg Example
+version: 0.0.1
+status: active
+customProperties:
+  - property: owner
+    value: data--ai
+description: {}
+servers:
+  - server: athena
+    type: athena
+    description: Iceberg files on S3
+    catalog: awsdatacatalog # awsdatacatalog is the default catalog in Athena
+    schema: icebergdemodb # called database in Athena
+    regionName: eu-central-1
+    stagingDir: s3://entropy-data-demo-athena-results-dfhsiuya/cli
+schema:
+  - name: athena_iceberg_table_partitioned
+    logicalType: object
+    properties:
+      - name: color
+        logicalType: string
+        required: true
+        unique: true
+        physicalType: varchar
+      - name: date
+        logicalType: string
+        physicalType: varchar
+      - name: name
+        logicalType: string
+        physicalType: varchar
+      - name: price
+        logicalType: integer
+        physicalType: bigint
+      - name: product
+        logicalType: string
+        physicalType: varchar
+      - name: ts
+        logicalType: date
+        physicalType: timestamp(6)

tests/test_test_athena_iceberg.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+import logging
+import os
+
+import pytest
+from dotenv import load_dotenv
+
+from datacontract.data_contract import DataContract
+
+logging.basicConfig(level=logging.INFO, force=True)
+load_dotenv(override=True)
+datacontract = "fixtures/athena-iceberg/iceberg_example.odcs.yaml"
+
+
+@pytest.mark.skipif(
+    os.environ.get("DATACONTRACT_S3_ACCESS_KEY_ID") is None
+    or os.environ.get("DATACONTRACT_S3_SECRET_ACCESS_KEY") is None,
+    reason="Requires DATACONTRACT_S3_ACCESS_KEY_ID and DATACONTRACT_S3_SECRET_ACCESS_KEY to be set",
+)
+def test_test_athena_iceberg(monkeypatch):
+    data_contract = DataContract(data_contract_file=datacontract)
+
+    run = data_contract.test()
+
+    print(run.pretty())
+    assert run.result == "passed"
+    assert all(check.result == "passed" for check in run.checks)
