# rustream - Example Config
# Copy to config.yaml and edit for your environment.
#
# Preview before syncing:
# rustream sync --config config.yaml --dry-run
postgres:
  host: localhost
  port: 5432
  database: mydb
  user: postgres
  password: secret
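# To sanity-check the connection settings above outside of rustream
# (illustrative; assumes psql is installed):
#   psql "postgresql://postgres:secret@localhost:5432/mydb" -c "SELECT 1"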
# Output: local filesystem
output:
  type: local
  path: ./output
# Output: S3 (uncomment to use instead of local)
# AWS credentials come from: env vars, ~/.aws/credentials, or IAM role
# output:
#   type: s3
#   bucket: my-data-lake
#   prefix: raw/postgres
#   region: us-east-1
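# To confirm the bucket and credentials are reachable independently of
# rustream (illustrative, uses the AWS CLI):
#   aws s3 ls s3://my-data-lake/raw/postgres/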
# Output format: parquet (default) or iceberg
# format: parquet
# Iceberg output (uncomment to use instead of standalone Parquet)
# Writes proper Iceberg table metadata so Spark, Trino, and Athena can query it.
# format: iceberg
# warehouse: s3://my-bucket/warehouse # or ./local_warehouse
# catalog:
#   type: filesystem # default, zero setup
#   # type: glue # for Athena (requires --features glue)
#   # glue_database: my_db # required when type=glue
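# Once a table has been written with the glue catalog, it can be queried from
# Athena or Trino with ordinary SQL (illustrative; the database and table
# names follow your settings above):
#   SELECT count(*) FROM my_db.users;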
# Batch size for reading rows from Postgres
batch_size: 10000
# Directory for SQLite state tracking (watermarks)
state_dir: .rustream_state
# Option 1: List specific tables
tables:
  - name: users
    incremental_column: updated_at
    incremental_tiebreaker_column: id
    columns: # optional: pick specific columns
      - id
      - email
      - full_name
      - created_at
      - updated_at
  - name: orders
    incremental_column: updated_at
    incremental_tiebreaker_column: id
    partition_by: date # table/year=2026/month=02/day=10/...
    # partition_by: month # table/year=2026/month=02/...
    # partition_by: year # table/year=2026/...
  - name: products # no incremental_column = full sync every run
  # Example for append-only tables without updated_at:
  # - name: events
  #   incremental_column: id
  #   incremental_column_is_unique: true
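# Conceptually, an incremental sync resumes from the last saved watermark with
# a query along these lines (a sketch of the idea, not the exact SQL rustream
# issues); the tiebreaker column keeps rows sharing the same timestamp from
# being skipped or re-read:
#   SELECT * FROM users
#   WHERE updated_at > :last_updated_at
#      OR (updated_at = :last_updated_at AND id > :last_id)
#   ORDER BY updated_at, id;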
# Option 2: Auto-discover all tables (remove `tables` above and uncomment below)
# schema: public # which schema to discover from (default: public)
# exclude: # skip these tables
#   - schema_migrations
#   - ar_internal_metadata
# ─── Ingest: load Parquet/CSV files into Postgres ───────────────
# Preview before ingesting:
# rustream ingest --config config.yaml --dry-run
#
# ingest:
#   input:
#     type: local
#     path: ./parquet_files
#     pattern: "**/*.parquet"
#
#   # S3 input (uncomment to use instead of local):
#   # input:
#   #   type: s3
#   #   bucket: my-data-lake
#   #   prefix: raw/postgres
#   #   region: us-east-1
#   #   pattern: "**/*.parquet"
#
#   file_format: parquet # "parquet" or "csv"
#   write_mode: insert # "insert" | "upsert" | "truncate_insert"
#   batch_size: 5000
#   target_schema: public # Postgres schema for target tables
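#   # write_mode notes (assumptions about typical behavior, not confirmed
#   # against rustream's implementation): "insert" appends rows as-is,
#   # "upsert" presumably maps to INSERT ... ON CONFLICT (key_columns) DO
#   # UPDATE, and "truncate_insert" empties the target table before loading.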
#
#   tables:
#     - file_pattern: "users/*.parquet"
#       target_table: users
#       key_columns: [id] # required for upsert mode
#       create_if_missing: true # auto-CREATE TABLE from file schema
#
#     - file_pattern: "orders/**/*.parquet"
#       target_table: orders
#       key_columns: [id]
#
#   # If no tables listed, table name is inferred from directory/filename