44import os
55from pathlib import Path
66
7+ import pytest
78from duckdb import DuckDBPyConnection
89
910from tests .utils import generate_sample_embeddings_for_run , generate_sample_records
1011from timdex_dataset_api import TIMDEXDataset
12+ from timdex_dataset_api .data_source import TIMDEXDataSource
1113from timdex_dataset_api .embeddings import TIMDEXEmbeddings
12- from timdex_dataset_api .metadata import DataTypeMetadataConfig , TIMDEXDatasetMetadata
14+ from timdex_dataset_api .metadata import TIMDEXDatasetMetadata
1315from timdex_dataset_api .records import TIMDEXRecords
1416
1517
@@ -33,46 +35,53 @@ def test_tdm_s3_dataset_structure_properties(timdex_dataset_empty):
3335 assert timdex_dataset_empty .location_scheme == "file"
3436
3537
36- def test_data_type_metadata_config_prejoin_records_default_true ():
37- config = DataTypeMetadataConfig (
38- name = "example" ,
39- metadata_columns = ["timdex_record_id" ],
40- data_path = "data/example" ,
41- )
42- assert config .prejoin_records is True
43-
44-
45- def test_data_source_metadata_configs_are_derived_from_base_class ():
46- assert TIMDEXRecords .METADATA_CONFIG .name == TIMDEXRecords .NAME
47- assert TIMDEXRecords .METADATA_CONFIG .data_path == TIMDEXRecords .DATA_PATH
48- assert TIMDEXRecords .METADATA_CONFIG .prejoin_records is False
38+ def test_data_source_metadata_columns_are_derived_from_base_class ():
4939 assert (
50- TIMDEXRecords .METADATA_CONFIG . metadata_columns
40+ TIMDEXRecords .SOURCE_METADATA_COLUMNS
5141 == TIMDEXDatasetMetadata .BASE_METADATA_COLUMNS
5242 )
43+ assert TIMDEXRecords .METADATA_COLUMNS == TIMDEXDatasetMetadata .BASE_METADATA_COLUMNS
5344
54- assert TIMDEXEmbeddings .METADATA_CONFIG .name == TIMDEXEmbeddings .NAME
55- assert TIMDEXEmbeddings .METADATA_CONFIG .data_path == TIMDEXEmbeddings .DATA_PATH
56- assert TIMDEXEmbeddings .METADATA_CONFIG .prejoin_records is True
57- assert TIMDEXEmbeddings .METADATA_CONFIG .metadata_columns == [
45+ assert TIMDEXEmbeddings .SOURCE_METADATA_COLUMNS == [
5846 "timdex_record_id" ,
5947 "run_id" ,
6048 "run_record_offset" ,
61- * TIMDEXEmbeddings .ADDITIONAL_METADATA_COLUMNS ,
6249 "filename" ,
50+ "embedding_timestamp" ,
51+ "embedding_model" ,
52+ "embedding_strategy" ,
6353 ]
54+ assert [
55+ * TIMDEXDatasetMetadata .BASE_METADATA_COLUMNS ,
56+ "embedding_timestamp" ,
57+ "embedding_model" ,
58+ "embedding_strategy" ,
59+ ] == TIMDEXEmbeddings .METADATA_COLUMNS
60+
61+
62+ def test_data_source_subclass_requires_contract_vars ():
63+ with pytest .raises (
64+ TypeError ,
65+ match = (
66+ "InvalidDataSource must define required class vars: "
67+ "SCHEMA, DATA_COLUMNS, DATA_PATH"
68+ ),
69+ ):
6470
71+ class InvalidDataSource (TIMDEXDataSource ):
72+ NAME = "invalid"
6573
66- def test_dataset_registers_current_view_specs_from_data_sources (tmp_path ):
67- td = TIMDEXDataset (str (tmp_path / "register_current_view_specs" ))
6874
69- expected_view_names = [
70- spec .name
71- for spec in (
72- TIMDEXRecords .CURRENT_VIEW_SPECS + TIMDEXEmbeddings .CURRENT_VIEW_SPECS
73- )
75+ def test_dataset_registers_table_configs_from_data_sources (tmp_path ):
76+ td = TIMDEXDataset (str (tmp_path / "register_table_configs" ))
77+
78+ expected_table_names = [
79+ table_config .name
80+ for table_config in (TIMDEXRecords .TABLES + TIMDEXEmbeddings .TABLES )
7481 ]
75- assert [spec .name for spec in td .current_metadata_view_specs ] == expected_view_names
82+ assert [
83+ table_config .name for table_config in td .table_configs
84+ ] == expected_table_names
7685
7786
7887def test_tdm_create_metadata_database_file_success (
@@ -136,9 +145,7 @@ def test_tdm_views_created_on_init(timdex_metadata):
136145 assert expected_views <= actual_views
137146
138147
139- def test_tdm_current_view_specs_missing_dependencies_are_skipped_generically (
140- caplog , tmp_path
141- ):
148+ def test_tdm_custom_tables_missing_dependencies_are_skipped_generically (caplog , tmp_path ):
142149 dataset_path = str (tmp_path / "current_view_missing_dependencies" )
143150
144151 td = TIMDEXDataset (dataset_path )
@@ -166,25 +173,28 @@ def test_tdm_current_view_specs_missing_dependencies_are_skipped_generically(
166173 """ ).to_df ()
167174 metadata_names = set (metadata_objects .table_name )
168175
169- missing_specs = []
170- for spec in td_with_metadata .current_metadata_view_specs :
176+ missing_tables = []
177+ for table_config in td_with_metadata .table_configs :
178+ if table_config .kind != "custom" :
179+ continue
180+
171181 missing_required_tables = [
172182 table_name
173- for table_name in spec .required_metadata_tables
183+ for table_name in table_config .required_metadata_tables
174184 if table_name not in metadata_names
175185 ]
176186 if not missing_required_tables :
177187 continue
178188
179- missing_specs .append (spec .name )
180- assert spec .name not in metadata_names
189+ missing_tables .append (table_config .name )
190+ assert table_config .name not in metadata_names
181191 assert (
182192 "Skipping metadata."
183- f"{ spec .name } view creation because missing dependencies: "
193+ f"{ table_config .name } view creation because missing dependencies: "
184194 f"{ ', ' .join (missing_required_tables )} "
185195 ) in caplog .text
186196
187- assert missing_specs
197+ assert missing_tables
188198
189199
190200def test_tdm_records_view_structure (timdex_metadata ):
@@ -374,7 +384,7 @@ def test_tdm_merge_append_deltas_static_counts_match_records_count_before_merge(
374384def test_tdm_merge_append_deltas_adds_records_to_static_db (
375385 timdex_metadata_with_deltas , timdex_metadata_merged_deltas
376386):
377- columns = "," .join (TIMDEXRecords .METADATA_CONFIG . metadata_columns )
387+ columns = "," .join (TIMDEXRecords .SOURCE_METADATA_COLUMNS )
378388 append_deltas = timdex_metadata_with_deltas .timdex_dataset .conn .query (f"""
379389 select
380390 { columns }
@@ -396,10 +406,10 @@ def test_tdm_merge_append_deltas_deletes_append_deltas(
396406 timdex_metadata_with_deltas , timdex_metadata_merged_deltas
397407):
398408 records_deltas_path_before = timdex_metadata_with_deltas .append_deltas_path_for (
399- TIMDEXRecords . METADATA_CONFIG
409+ TIMDEXRecords
400410 )
401411 records_deltas_path_after = timdex_metadata_merged_deltas .append_deltas_path_for (
402- TIMDEXRecords . METADATA_CONFIG
412+ TIMDEXRecords
403413 )
404414
405415 assert timdex_metadata_with_deltas .append_deltas_count != 0
@@ -436,14 +446,7 @@ def test_tdm_embeddings_metadata_view_structure(tmp_path):
436446 """select * from metadata.embeddings limit 1;"""
437447 ).to_df ()
438448 assert len (embeddings_df ) == 1
439- # pre-joined view includes native embeddings columns + records columns
440- expected_columns = set (TIMDEXEmbeddings .METADATA_CONFIG .metadata_columns ) | {
441- "source" ,
442- "run_date" ,
443- "run_type" ,
444- "action" ,
445- "run_timestamp" ,
446- }
449+ expected_columns = set (TIMDEXEmbeddings .METADATA_COLUMNS )
447450 assert set (embeddings_df .columns ) == expected_columns
448451
449452
@@ -475,14 +478,7 @@ def test_tdm_current_embeddings_view_structure(tmp_path):
475478 ).to_df ()
476479
477480 assert len (current_embeddings_df ) == 1
478- # pre-joined view includes native embeddings columns + records columns
479- expected_columns = set (TIMDEXEmbeddings .METADATA_CONFIG .metadata_columns ) | {
480- "source" ,
481- "run_date" ,
482- "run_type" ,
483- "action" ,
484- "run_timestamp" ,
485- }
481+ expected_columns = set (TIMDEXEmbeddings .METADATA_COLUMNS )
486482 assert set (current_embeddings_df .columns ) == expected_columns
487483
488484
@@ -590,14 +586,7 @@ def test_tdm_current_run_embeddings_view_structure(tmp_path):
590586 ).to_df ()
591587
592588 assert len (current_run_embeddings_df ) == 1
593- # pre-joined view includes native embeddings columns + records columns
594- expected_columns = set (TIMDEXEmbeddings .METADATA_CONFIG .metadata_columns ) | {
595- "source" ,
596- "run_date" ,
597- "run_type" ,
598- "action" ,
599- "run_timestamp" ,
600- }
589+ expected_columns = set (TIMDEXEmbeddings .METADATA_COLUMNS )
601590 assert set (current_run_embeddings_df .columns ) == expected_columns
602591
603592
@@ -747,11 +736,7 @@ def test_tdm_keyset_paginated_query_on_prejoined_embeddings_view(tmp_path):
747736 # execute and verify results
748737 result_df = td .conn .query (query ).to_df ()
749738 assert len (result_df ) == 10 # noqa: PLR2004
750- expected_cols = set (
751- TIMDEXDatasetMetadata .BASE_METADATA_COLUMNS
752- + TIMDEXEmbeddings .ADDITIONAL_METADATA_COLUMNS
753- + ["run_id_hash" , "filename_hash" ]
754- )
739+ expected_cols = {* TIMDEXEmbeddings .METADATA_COLUMNS , "run_id_hash" , "filename_hash" }
755740 assert set (result_df .columns ) == expected_cols
756741
757742
@@ -783,9 +768,7 @@ def test_tdm_embeddings_write_append_deltas_without_static_embeddings_table(tmp_
783768 """select count(*) from metadata.embeddings_append_deltas;"""
784769 ).fetchone ()[0 ]
785770
786- embeddings_deltas_path = td .metadata .append_deltas_path_for (
787- TIMDEXEmbeddings .METADATA_CONFIG
788- )
771+ embeddings_deltas_path = td .metadata .append_deltas_path_for (TIMDEXEmbeddings )
789772 assert embeddings_count == record_count
790773 assert embeddings_deltas_count == record_count
791774 assert os .listdir (embeddings_deltas_path )
0 commit comments