# rustream - Example Config
# Copy to config.yaml and edit for your environment.
#
# Preview before syncing:
# rustream sync --config config.yaml --dry-run
postgres:
  host: localhost
  port: 5432
  database: mydb
  user: postgres
  password: secret
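# To sanity-check the connection settings above outside of rustream
# (illustrative; assumes psql is installed):
#   psql "postgresql://postgres:secret@localhost:5432/mydb" -c "SELECT 1"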
# Output: local filesystem
output:
  type: local
  path: ./output
# Output: S3 (uncomment to use instead of local)
# AWS credentials come from: env vars, ~/.aws/credentials, or IAM role
# output:
#   type: s3
#   bucket: my-data-lake
#   prefix: raw/postgres
#   region: us-east-1
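# To confirm the bucket and credentials are reachable independently of
# rustream (illustrative, uses the AWS CLI):
#   aws s3 ls s3://my-data-lake/raw/postgres/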
# Output format: parquet (default) or iceberg
# format: parquet
# Iceberg output (uncomment to use instead of standalone Parquet)
# Writes proper Iceberg table metadata so Spark, Trino, and Athena can query it.
# format: iceberg
# warehouse: s3://my-bucket/warehouse # or ./local_warehouse
# catalog:
#   type: filesystem # default, zero setup
#   # type: glue # for Athena (requires --features glue)
#   # glue_database: my_db # required when type=glue
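# Once a table has been written with the glue catalog, it can be queried from
# Athena or Trino with ordinary SQL (illustrative; the database and table
# names follow your settings above):
#   SELECT count(*) FROM my_db.users;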
# Batch size for reading rows from Postgres
batch_size: 10000
# Directory for SQLite state tracking (watermarks)
state_dir: .rustream_state
# Option 1: List specific tables
tables:
  - name: users
    incremental_column: updated_at
    incremental_tiebreaker_column: id
    columns: # optional: pick specific columns
      - id
      - email
      - full_name
      - created_at
      - updated_at
  - name: orders
    incremental_column: updated_at
    incremental_tiebreaker_column: id
    partition_by: date # table/year=2026/month=02/day=10/...
    # partition_by: month # table/year=2026/month=02/...
    # partition_by: year # table/year=2026/...
  - name: products # no incremental_column = full sync every run
  # Example for append-only tables without updated_at:
  # - name: events
  #   incremental_column: id
  #   incremental_column_is_unique: true
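# Conceptually, an incremental sync resumes from the last saved watermark with
# a query along these lines (a sketch of the idea, not the exact SQL rustream
# issues); the tiebreaker column keeps rows sharing the same timestamp from
# being skipped or re-read:
#   SELECT * FROM users
#   WHERE updated_at > :last_updated_at
#      OR (updated_at = :last_updated_at AND id > :last_id)
#   ORDER BY updated_at, id;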
# Option 2: Auto-discover all tables (remove `tables` above and uncomment below)
# schema: public # which schema to discover from (default: public)
# exclude: # skip these tables
#   - schema_migrations
#   - ar_internal_metadata
# ─── Ingest: load Parquet/CSV files into Postgres ───────────────
# Preview before ingesting:
# rustream ingest --config config.yaml --dry-run
#
# ingest:
#   input:
#     type: local
#     path: ./parquet_files
#     pattern: "**/*.parquet"
#
#   # S3 input (uncomment to use instead of local):
#   # input:
#   #   type: s3
#   #   bucket: my-data-lake
#   #   prefix: raw/postgres
#   #   region: us-east-1
#   #   pattern: "**/*.parquet"
#
#   file_format: parquet # "parquet" or "csv"
#   write_mode: insert # "insert" | "upsert" | "truncate_insert"
#   batch_size: 5000
#   target_schema: public # Postgres schema for target tables
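#   # write_mode notes (assumptions about typical behavior, not confirmed
#   # against rustream's implementation): "insert" appends rows as-is,
#   # "upsert" presumably maps to INSERT ... ON CONFLICT (key_columns) DO
#   # UPDATE, and "truncate_insert" empties the target table before loading.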
#
#   tables:
#     - file_pattern: "users/*.parquet"
#       target_table: users
#       key_columns: [id] # required for upsert mode
#       create_if_missing: true # auto-CREATE TABLE from file schema
#
#     - file_pattern: "orders/**/*.parquet"
#       target_table: orders
#       key_columns: [id]
#
#   # If no tables listed, table name is inferred from directory/filename