|
22 | 22 | "metadata": {}, |
23 | 23 | "outputs": [], |
24 | 24 | "source": [ |
25 | | - "%pip install ttd-databricks" |
| 25 | + "%pip install ttd-databricks\n", |
| 26 | + "\n", |
| 28 | + "# Recommended to restart kernel to use updated packages\n", |
| 28 | + "dbutils.library.restartPython()" |
26 | 29 | ] |
27 | 30 | }, |
28 | 31 | { |
|
109 | 112 | "\n", |
110 | 113 | "Use `get_ttd_input_schema()` to see which columns your DataFrame must contain.\n", |
111 | 114 | "\n", |
112 | | - "For more information on the meaning of particular fields, see [openTTD](https://open.thetradedesk.com/advertiser/docsApp/GuidesAdvertiser/data/doc/post-data-advertiser-firstparty)." |
| 115 | + "For more information on the meaning of particular fields and the data types supported per endpoint, refer to the following table:\n", |
| 116 | + "\n", |
| 117 | + "| Endpoint | Context | Data API | Documentation |\n", |
| 118 | + "|---|---|---|---|\n", |
| 119 | + "| Advertiser | `AdvertiserContext` | `POST /data/advertiser` | [OpenTTD](https://open.thetradedesk.com/provider/docsApp/GuidesProvider/audience/doc/post-data-advertiser-external) |\n", |
| 120 | + "| Third Party | `ThirdPartyContext` | `POST /data/thirdparty` | [OpenTTD](https://open.thetradedesk.com/provider/docsApp/GuidesProvider/audience/doc/post-data-thirdparty) |\n", |
| 121 | + "| Offline Conversion | `OfflineConversionContext` | `POST /providerapi/offlineconversion` | [OpenTTD](https://open.thetradedesk.com/advertiser/docsApp/GuidesAdvertiser/data/doc/post-providerapi-offlineconversion) |\n", |
| 122 | + "| Deletion / Opt-Out — Advertiser | `DeletionOptOutAdvertiserContext` | `POST /data/deletion-optout/advertiser` | [OpenTTD](https://open.thetradedesk.com/provider/docsApp/GuidesProvider/audience/doc/post-data-deletion-optout-advertiser-external) |\n", |
| 123 | + "| Deletion / Opt-Out — Third Party | `DeletionOptOutThirdPartyContext` | `POST /data/deletion-optout/thirdparty` | [OpenTTD](https://open.thetradedesk.com/provider/docsApp/GuidesProvider/audience/doc/post-data-deletion-optout-thirdparty) |\n", |
| 124 | + "| Deletion / Opt-Out — Merchant | `DeletionOptOutMerchantContext` | `POST /data/deletion-optout/merchant` | [OpenTTD](https://open.thetradedesk.com/provider/docsApp/GuidesProvider/retail/doc/post-data-deletion-optout-merchant) |" |
113 | 125 | ] |
114 | 126 | }, |
115 | 127 | { |
|
120 | 132 | "source": [ |
121 | 133 | "input_schema = get_ttd_input_schema(TTDEndpoint.ADVERTISER)\n", |
122 | 134 | "print(\"Required input schema:\")\n", |
123 | | - "input_schema.printTreeString()" |
| 135 | + "\n", |
| 136 | + "for field in input_schema.fields:\n", |
| 137 | + " print(f\" {field.name}: {field.dataType.simpleString()} (nullable={field.nullable})\")" |
124 | 138 | ] |
125 | 139 | }, |
126 | 140 | { |
|
142 | 156 | "# Example: create a small sample DataFrame\n", |
143 | 157 | "# In practice, read from a Delta table or other data source\n", |
144 | 158 | "sample_data = [\n", |
145 | | - " (\"tdid-001\", \"seg-001\"),\n", |
146 | | - " (\"tdid-002\", \"seg-002\"),\n", |
147 | | - " (\"tdid-003\", \"seg-001\"),\n", |
| 159 | + " {\"id_type\": \"tdid\", \"id_value\": \"a3f1c2d4-8e7b-4f6a-9c0d-1b2e3f4a5b6c\", \"segment_name\": \"segment_1\"},\n", |
| 160 | + " {\"id_type\": \"daid\", \"id_value\": \"7d9e0f1a-2b3c-4d5e-6f7a-8b9c0d1e2f3a\", \"segment_name\": \"segment_2\"},\n", |
| 161 | + "    # intentionally incorrect format for ramp_id to showcase error entries in output\n", |
| 162 | + " {\"id_type\": \"ramp_id\", \"id_value\": \"c4d5e6f7-a8b9-4c0d-1e2f-3a4b5c6d7e8f\", \"segment_name\": \"segment_3\"},\n", |
| 163 | + " {\"id_type\": \"tdid\", \"id_value\": \"1f2a3b4c-5d6e-4f7a-8b9c-0d1e2f3a4b5c\", \"segment_name\": \"segment_4\"},\n", |
| 164 | + " {\"id_type\": \"daid\", \"id_value\": \"9b0c1d2e-3f4a-4b5c-6d7e-8f9a0b1c2d3e\", \"segment_name\": \"segment_5\"},\n", |
148 | 165 | "]\n", |
149 | | - "input_df = spark.createDataFrame(sample_data, schema=input_schema)\n", |
| 166 | + "\n", |
| 167 | + "input_df = spark.createDataFrame(sample_data)\n", |
150 | 168 | "display(input_df)" |
151 | 169 | ] |
152 | 170 | }, |
|
173 | 191 | " result_df = client.push_data(\n", |
174 | 192 | " df=input_df,\n", |
175 | 193 | " context=context,\n", |
176 | | - " batch_size=1600,\n", |
| 194 | + " batch_size=1600, # Number of rows batched together in a single request to The Trade Desk\n", |
177 | 195 | " )\n", |
178 | 196 | " display(result_df)\n", |
179 | 197 | "except TTDSchemaValidationError as e:\n", |
|
226 | 244 | "metadata": {}, |
227 | 245 | "outputs": [], |
228 | 246 | "source": [ |
229 | | - "# Create tables with default names and managed storage\n", |
| 247 | + "# Creates three managed Delta tables in the active catalog/database.\n", |
| 248 | + "# Default names: ttd_advertiser_input, ttd_advertiser_output, ttd_metadata\n", |
| 249 | + "# Pass table_name= and location= to use custom names or external storage.\n", |
230 | 250 | "input_table = client.setup_input_table(endpoint=TTDEndpoint.ADVERTISER)\n", |
231 | 251 | "output_table = client.setup_output_table(endpoint=TTDEndpoint.ADVERTISER)\n", |
232 | | - "metadata_table = client.setup_metadata_table()\n", |
| 252 | + "metadata_table = client.setup_metadata_table(table_name=\"ttd_advertiser_metadata\")\n", |
233 | 253 | "\n", |
234 | 254 | "print(f\"Input table: {input_table}\")\n", |
235 | 255 | "print(f\"Output table: {output_table}\")\n", |
236 | 256 | "print(f\"Metadata table: {metadata_table}\")" |
237 | 257 | ] |
238 | 258 | }, |
| 259 | + { |
| 260 | + "cell_type": "code", |
| 261 | + "execution_count": null, |
| 262 | + "metadata": {}, |
| 263 | + "outputs": [], |
| 264 | + "source": [ |
| 265 | + "from pyspark.sql import functions as F\n", |
| 266 | + "\n", |
| 267 | + "# This cell simulates your upstream pipeline writing records to the input table\n", |
| 268 | + "# The user is responsible for writing into the input table; the SDK only reads from the table\n", |
| 269 | + "\n", |
| 270 | + "\n", |
| 271 | + "# updated_at is required for incremental processing: batch_process uses it\n", |
| 272 | + "# to filter rows added since the last run when process_new_records_only=True\n", |
| 273 | + "# The user is responsible for setting the updated_at value for entries in the input table\n", |
| 274 | + "\n", |
| 275 | + "(\n", |
| 276 | + " spark.createDataFrame(sample_data)\n", |
| 277 | + " .withColumn(\"updated_at\", F.current_timestamp())\n", |
| 278 | + " .write.format(\"delta\")\n", |
| 279 | + " .mode(\"append\")\n", |
| 280 | + " .saveAsTable(input_table)\n", |
| 281 | + ")\n", |
| 282 | + "\n", |
| 283 | + "display(spark.table(input_table))" |
| 284 | + ] |
| 285 | + }, |
239 | 286 | { |
240 | 287 | "cell_type": "code", |
241 | 288 | "execution_count": null, |
242 | 289 | "metadata": {}, |
243 | 290 | "outputs": [], |
244 | 291 | "source": [ |
245 | 292 | "# Run batch processing (reads from input_table, writes to output_table)\n", |
| 293 | + "\n", |
| 294 | + "# process_new_records_only=True filters to rows where updated_at > last run date\n", |
| 295 | + "# On the first run, metadata_table is empty so all rows are processed\n", |
246 | 296 | "client.batch_process(\n", |
247 | 297 | " context=context,\n", |
248 | 298 | " input_table=input_table,\n", |
249 | 299 | " output_table=output_table,\n", |
250 | 300 | " metadata_table=metadata_table,\n", |
251 | | - " process_new_records_only=True, # incremental: only rows newer than last run\n", |
252 | | - " batch_size=1600,\n", |
253 | | - " parallelism=16,\n", |
| 301 | + " process_new_records_only=True, # Processes rows updated after last run; processes all rows on first run\n", |
| 302 | + " batch_size=1600, # Number of rows grouped together in a single request to The Trade Desk\n", |
| 303 | + "    parallelism=16,  # Number of parallel workers processing the entries from the input table\n", |
254 | 304 | ")" |
255 | 305 | ] |
| 306 | + }, |
| 307 | + { |
| 308 | + "cell_type": "code", |
| 309 | + "execution_count": null, |
| 310 | + "metadata": {}, |
| 311 | + "outputs": [], |
| 312 | + "source": [ |
| 313 | + "# Display the output table\n", |
| 314 | + "display(spark.table(output_table))\n", |
| 315 | + "\n", |
| 316 | + "# Display the metadata table\n", |
| 317 | + "display(spark.table(metadata_table))" |
| 318 | + ] |
256 | 319 | } |
257 | 320 | ], |
258 | 321 | "metadata": { |
|
0 commit comments