diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c5dacfd1..24f5d81d 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,3 +1,4 @@ +--- # To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: diff --git a/.github/workflows/linter.yaml b/.github/workflows/linter.yaml index 5d8c02b0..ed5eb351 100644 --- a/.github/workflows/linter.yaml +++ b/.github/workflows/linter.yaml @@ -1,3 +1,4 @@ +--- name: Linter on: diff --git a/Makefile b/Makefile index 794f7389..515af2cb 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ tf_apply: terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve bigquery_export_deploy: - cd infra/bigquery-export && npm install && npm run buildpack + cd infra/bigquery-export && npm run build #bigquery_export_spark_deploy: # cd infra/bigquery_export_spark && gcloud builds submit --region=global --tag us-docker.pkg.dev/httparchive/bigquery-spark-procedures/firestore_export:latest diff --git a/definitions/output/reports/tech_report_geos.js b/definitions/output/reports/tech_report_geos.js new file mode 100644 index 00000000..e50089a4 --- /dev/null +++ b/definitions/output/reports/tech_report_geos.js @@ -0,0 +1,31 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_geos', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +SELECT + geo, + adoption.mobile AS mobile_origins +FROM ${ctx.ref('reports', 'tech_report_adoption')} +WHERE + date = '${pastMonth}' + AND rank = 'ALL' + AND technology = 'ALL' + AND version = 'ALL' + ${constants.devRankFilter} +`).postOps(ctx => ` + SELECT + reports.run_export_job( + JSON '''{ + "destination": "firestore", + "config": { + "database": "tech-report-api-${constants.environment}", + "collection": "geos", + "type": "dict" + }, + "query": "SELECT * FROM ${ctx.self()}" + }''' + ); + `) diff --git a/definitions/output/reports/tech_report_ranks.js b/definitions/output/reports/tech_report_ranks.js new file mode 100644 index 00000000..4d55cb3b --- /dev/null +++ b/definitions/output/reports/tech_report_ranks.js @@ -0,0 +1,31 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_ranks', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +SELECT + rank, + adoption.mobile AS mobile_origins +FROM ${ctx.ref('reports', 'tech_report_adoption')} +WHERE + date = '${pastMonth}' + AND geo = 'ALL' + AND technology = 'ALL' + AND version = 'ALL' + ${constants.devRankFilter} +`).postOps(ctx => ` + SELECT + reports.run_export_job( + JSON '''{ + "destination": "firestore", + "config": { + "database": "tech-report-api-${constants.environment}", + "collection": "ranks", + "type": "dict" + }, + "query": "SELECT * FROM ${ctx.self()}" + }''' + ); + `) diff --git a/infra/bigquery-export/.dockerignore b/infra/bigquery-export/.dockerignore new file mode 100644 index 00000000..5805cc92 --- /dev/null +++ b/infra/bigquery-export/.dockerignore @@ -0,0 +1,9 @@ +node_modules +npm-debug.log +.git +.gitignore +.env +.nyc_output +coverage +*.md +.DS_Store diff --git a/infra/bigquery-export/Dockerfile b/infra/bigquery-export/Dockerfile index a50c671b..45773da4 100644 --- a/infra/bigquery-export/Dockerfile +++ b/infra/bigquery-export/Dockerfile @@ -3,11 +3,13 @@ FROM node:current-slim WORKDIR /usr/src/app -COPY . . +# Copy package files first for better caching +COPY package*.json ./ -# Clean up the node_modules directory -RUN rm -rf node_modules +# Install dependencies +RUN npm ci --only=production --quiet --no-fund --no-audit -RUN npm ci --only=production +# Copy source code +COPY . . CMD ["node", "index.js"] diff --git a/infra/bigquery-export/cloudbuild.yaml b/infra/bigquery-export/cloudbuild.yaml new file mode 100644 index 00000000..1da2eba0 --- /dev/null +++ b/infra/bigquery-export/cloudbuild.yaml @@ -0,0 +1,7 @@ +--- +steps: + - name: "gcr.io/cloud-builders/docker" + args: + ["build", "-t", "us.gcr.io/httparchive/cloud-run/bigquery-export", "."] +images: + - "us.gcr.io/httparchive/cloud-run/bigquery-export" diff --git a/infra/bigquery-export/firestore.js b/infra/bigquery-export/firestore.js index 18d8f51e..38225983 100644 --- a/infra/bigquery-export/firestore.js +++ b/infra/bigquery-export/firestore.js @@ -3,135 +3,185 @@ import { BigQueryExport } from './bigquery.js' export class FirestoreBatch { constructor () { - this.firestore = new Firestore() + this.firestore = new Firestore({ + gaxOptions: { + grpc: { + max_receive_message_length: 500 * 1024 * 1024, // 500MB + max_send_message_length: 500 * 1024 * 1024, // 500MB + 'grpc.max_connection_idle_ms': 5 * 60 * 1000, // 5 minutes + 'grpc.keepalive_time_ms': 30 * 1000, // 30 seconds + 'grpc.keepalive_timeout_ms': 60 * 1000, // 1 minute + 'grpc.keepalive_permit_without_calls': true + } + } + }) this.bigquery = new BigQueryExport() - this.batchSize = 500 - this.maxConcurrentBatches = 200 + + // Configuration constants + this.config = { + timeout: 10 * 60 * 1000, // 10 minutes + progressReportInterval: 200000, // Report progress every N operations + flushThreshold: 200000 // Flush BulkWriter every N operations + } + + this.reset() + } + + reset () { + this.processedDocs = 0 + this.totalDocs = 0 + this.bulkWriter = null } - queueBatch (operation) { - const batch = this.firestore.batch() - - this.currentBatch.forEach((doc) => { - if (operation === 'delete') { - batch.delete(doc.ref) - } else if (operation === 'set') { - const docRef = this.firestore.collection(this.collectionName).doc() - batch.set(docRef, doc) - } else { - throw new Error('Invalid operation') + createBulkWriter (operation) { + const bulkWriter = this.firestore.bulkWriter() + + // Configure error handling with progress info + bulkWriter.onWriteError((error) => { + const progressInfo = this.totalDocs > 0 ? ` (${this.processedDocs}/${this.totalDocs})` : '' + console.warn(`${operation} operation failed${progressInfo}:`, error.message) + + // Retry on transient errors, fail on permanent ones + const retryableErrors = ['deadline-exceeded', 'unavailable', 'resource-exhausted'] + return retryableErrors.includes(error.code) + }) + + // Track progress on successful writes + bulkWriter.onWriteResult(() => { + this.processedDocs++ + + // Report progress periodically + if (this.processedDocs % this.config.progressReportInterval === 0) { + const progressInfo = this.totalDocs > 0 ? ` (${this.processedDocs}/${this.totalDocs})` : ` (${this.processedDocs} processed)` + console.log(`Progress${progressInfo} - ${operation}ing documents in ${this.collectionName}`) } }) - this.batchPromises.push(batch) - this.currentBatch = [] - } - async commitBatches () { - console.log(`Committing ${this.batchPromises.length} batches to ${this.collectionName}`) - await Promise.all( - this.batchPromises.map(async (batchPromise) => await batchPromise.commit() - .catch((error) => { - console.error('Error committing batch:', error) - throw error - }) - ) - ) - this.batchPromises = [] + return bulkWriter } - async finalFlush (operation) { - if (this.currentBatch.length > 0) { - this.queueBatch(operation) + buildQuery (collectionRef) { + const queryMap = { + report: () => { + console.info(`Deleting documents from ${this.collectionName} for date ${this.date}`) + return collectionRef.where('date', '==', this.date) + }, + dict: () => { + console.info(`Deleting documents from ${this.collectionName}`) + return collectionRef + } + } + + const queryBuilder = queryMap[this.collectionType] + if (!queryBuilder) { + throw new Error(`Invalid collection type: ${this.collectionType}`) } - if (this.batchPromises.length > 0) { - await this.commitBatches() + return queryBuilder() + } + + async getDocumentCount (query) { + try { + const countSnapshot = await query.count().get() + return countSnapshot.data().count + } catch (error) { + console.warn('Could not get document count for progress tracking:', error.message) + return 0 } } async batchDelete () { console.info('Starting batch deletion...') const startTime = Date.now() - this.currentBatch = [] - this.batchPromises = [] + this.reset() - let totalDocsDeleted = 0 const collectionRef = this.firestore.collection(this.collectionName) + const collectionQuery = this.buildQuery(collectionRef) - let collectionQuery - if (this.collectionType === 'report') { - console.info('Deleting documents from ' + this.collectionName + ' for date ' + this.date) - // Query to fetch monthly documents - collectionQuery = collectionRef.where('date', '==', this.date) - } else if (this.collectionType === 'dict') { - console.info('Deleting documents from ' + this.collectionName) - collectionQuery = collectionRef - } else { - throw new Error('Invalid collection type') + // Get total count for progress tracking + this.totalDocs = await this.getDocumentCount(collectionQuery) + if (this.totalDocs > 0) { + console.info(`Total documents to delete: ${this.totalDocs}`) } - while (true) { - const snapshot = await collectionQuery.limit(this.batchSize * this.maxConcurrentBatches).get() - if (snapshot.empty) { - break - } + // Create BulkWriter for delete operations + this.bulkWriter = this.createBulkWriter('delet') - for await (const doc of snapshot.docs) { - this.currentBatch.push(doc) + let deletedCount = 0 + const batchSize = this.config.flushThreshold // Process documents in chunks - if (this.currentBatch.length >= this.batchSize) { - this.queueBatch('delete') - } - if (this.batchPromises.length >= this.maxConcurrentBatches) { - await this.commitBatches() - } - totalDocsDeleted++ - } + while (deletedCount < this.totalDocs || this.totalDocs === 0) { + const snapshot = await collectionQuery.limit(batchSize).get() + if (snapshot.empty) break + + // Add all delete operations to BulkWriter + snapshot.docs.forEach(doc => { + this.bulkWriter.delete(doc.ref) + deletedCount++ + }) + + // Periodically flush to manage memory + // if (deletedCount % this.config.flushThreshold === 0) { + console.log(`Flushing BulkWriter at ${deletedCount} operations...`) + await this.bulkWriter.flush() + // } } - await this.finalFlush('delete') + + // Final flush and close + console.log('Finalizing deletion operations...') + await this.bulkWriter.close() const duration = (Date.now() - startTime) / 1000 - console.info(`Deletion complete. Total docs deleted: ${totalDocsDeleted}. Time: ${duration} seconds`) + console.info(`Deletion complete. Total docs deleted: ${this.processedDocs}. Time: ${duration} seconds`) } - /** - * Streams BigQuery query results into a Firestore collection using batch commits. - * @param {string} query - The BigQuery SQL query. - */ async streamFromBigQuery (rowStream) { console.info('Starting BigQuery to Firestore transfer...') const startTime = Date.now() - let totalRowsProcessed = 0 + this.reset() - this.currentBatch = [] - this.batchPromises = [] + // Create BulkWriter for write operations + this.bulkWriter = this.createBulkWriter('writ') + + let rowCount = 0 + const collectionRef = this.firestore.collection(this.collectionName) for await (const row of rowStream) { - this.currentBatch.push(row) + // Add document to BulkWriter + const docRef = collectionRef.doc() + this.bulkWriter.set(docRef, row) - // Write batch when it reaches specified size - if (this.currentBatch.length >= this.batchSize) { - this.queueBatch('set') - } + rowCount++ + this.totalDocs = rowCount // Update total as we go since we can't predict BigQuery result size - if (this.batchPromises.length >= this.maxConcurrentBatches) { - await this.commitBatches() + // Periodically flush to manage memory + if (rowCount % this.config.flushThreshold === 0) { + console.log(`Flushing BulkWriter at ${rowCount} operations...`) + await this.bulkWriter.flush() } - totalRowsProcessed++ } - await this.finalFlush('set') + + // Final flush and close + console.log('Finalizing write operations...') + await this.bulkWriter.close() const duration = (Date.now() - startTime) / 1000 - console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${totalRowsProcessed}. Time: ${duration} seconds`) + console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${this.processedDocs}. Time: ${duration} seconds`) } async export (query, exportConfig) { + // Configure Firestore settings this.firestore.settings({ - databaseId: exportConfig.database + databaseId: exportConfig.database, + timeout: this.config.timeout + }) + + // Set instance properties + Object.assign(this, { + collectionName: exportConfig.collection, + collectionType: exportConfig.type, + date: exportConfig.date }) - this.collectionName = exportConfig.collection - this.collectionType = exportConfig.type - this.date = exportConfig.date await this.batchDelete() diff --git a/infra/bigquery-export/package.json b/infra/bigquery-export/package.json index 02bf0e92..d7533172 100644 --- a/infra/bigquery-export/package.json +++ b/infra/bigquery-export/package.json @@ -4,7 +4,7 @@ "main": "index.js", "scripts": { "start": "node index.js", - "buildpack": "rm -rf node_modules; gcloud builds submit --pack image=us.gcr.io/httparchive/cloud-run/bigquery-export" + "build": "gcloud builds submit" }, "type": "module", "dependencies": { diff --git a/infra/tf/.terraform.lock.hcl b/infra/tf/.terraform.lock.hcl index 10f078fb..07b73d29 100644 --- a/infra/tf/.terraform.lock.hcl +++ b/infra/tf/.terraform.lock.hcl @@ -22,41 +22,41 @@ provider "registry.terraform.io/hashicorp/archive" { } provider "registry.terraform.io/hashicorp/google" { - version = "6.37.0" + version = "6.38.0" constraints = ">= 6.13.0" hashes = [ - "h1:2x1x5lhrMLOUoQRPQw7fZ/NsJ/aRVJAPr+Tg6F8wUV0=", - "zh:0527880f838690bc32bf3d4bba42b3adefdf81e6614a169b09def759f341e11e", - "zh:39b5bf4ddebb7289db800faa14acd92e3591bcc711082058a3ecfbf868c43fdf", - "zh:3b0fb69d504d01801fa54dc1b5e8fad59f56a6a1866a7a7475a450e95a690fbf", - "zh:6b354bc2d89ee2a0f55fb11a2360ce94d185e7957b21a6b1a5f2cb01aff35e0b", - "zh:8c8783c892f3b20b425885f78dcd7fbb68fb10c4b8825b7f807eb4de950d963c", - "zh:9291034807a9d4799ecd2cbac33bf3d78aa59c6b734147b9579cd7a3d9ea207c", - "zh:9396293aed1fabc476452a2c6d14775f8e03b0d27ad558a18875fee1dc7fa8f8", - "zh:9e95308ce490dcf8efb45cd945ecf46c7a8b74ad9c65e25800b65ffd2125e4e1", - "zh:9fa9bdd07efd4eaeae1fea44e7926b1abb3d065c938c6cd5fcb0f88b12e09b68", - "zh:b684074bc12e46e671aa627849d8f515045983b53fcc56b7d6ded28abcaf4f10", - "zh:e35d5e5d89469324b8baf68b1d9599ccc1cfacb43f2cfa73107d1de7ce7f3aa9", + "h1:KVk7+B+uKgzqM410dsyo69WAXoNL/PHXJUGvwnKj0d8=", + "zh:007d14340f39d90240dd621904f7ff83fb1fc436607e74b85b343deaa060bafb", + "zh:251e457647a7658aac163de81644ff8b5f831938136fe210de48632a8e7e2289", + "zh:5e94c76c288182ac19e150ccd7349457a69901285e67b184ecfd7e6b2c1e4b03", + "zh:5ee76059d6e2989f1798a49feff379d913ba23220fef29c782c30ebbe2839f80", + "zh:78045823eacec861c9d5a3a3cbd87a4b020a6e90c6cfbc7ad7f7f5acb5d339f6", + "zh:863a8e0fa74a4823a2600d00d71577ce1ace5958d287bdff9035e752aab14eb9", + "zh:8fafbcc831650b644c3ac342cd15d5c77a4e1f9a35bfb86cf954a37fe0e63094", + "zh:b1cb7a2fe71d018033151f5ac97120dff204a0ee9e695db40dd1a22657312584", + "zh:cbfa712cc1484ca5c086ba5fdb34df393c9214ddc05f83ca2d9071a41bd215e6", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:fc2aae390011f7256ac88110151b67b6b45b6e856c7729ac4f2aebede5b5231e", + "zh:fe786d8cc9f63065a544a50b15ea09d8e0c9f71d8ebdc60722736fcdcf6b04d7", ] } provider "registry.terraform.io/hashicorp/google-beta" { - version = "6.37.0" + version = "6.38.0" constraints = ">= 6.13.0" hashes = [ - "h1:+AJWUA01ExrN2Ez7uJcq6EpGrIF+Sc5vDjm5M9naezY=", - "zh:087d744055310d4a276f80b5242090ea1d0aff9e64f4134a770e437d2e65b1d3", - "zh:0ede089f946d86810a91e78430bf7245db92adba6875641d344c8ff39f3ad7a5", - "zh:2c31b578bcbaf5591629919a48b980a031053b54b5923ce4a751caf2ef1f58b1", - "zh:598d03c4bc516f1d56a650579e8c165cd5be526bab57aa0f5cc47b607e31f055", - "zh:658f8346a8fff371d87c4a18040f131b7b69d26100e8da6c8998000e638820cb", - "zh:7d8664e3858842a169c0fe618839c386dcfcef82a0de225e096e6a34d96459a9", - "zh:9c31d88dce8776eb172cb31750a9d21c7e3e9284a8143fa3f7f889a15efce969", - "zh:9ec810a3affd04f3d723e786a86029cf5057bfcb7a94d193fb91e6b26e0db0c1", - "zh:9fb6ba5a282afd916470e10076a7f4d39817a7be0ba48abcef1c7ecfd24db6f8", - "zh:b6465dedc0e75557f70b020f80a8e1ab4ba0501a291880e178b84ce689da177c", - "zh:cb5b7e6f673db80e736e7a6710611d87d4727c1ef6250a6b3a70dd3401935f28", + "h1:RjKIj+Y12o34lpfoRcaTvQsNPlct2MJZFky3Vt0wHvo=", + "zh:13b9da4d63e96d4e9487e84fcf78b2e8261a66be821c939bdd5587d5d21bef24", + "zh:2f4c09e4c7e99f6b96e4abd29263cc25f62f2d87fb2efaf7ed97076cd8a2c2d3", + "zh:3bbd8fd0f91553504f229d8ee20908ff2baf4c53b9df4cb5bf42b1b058b946cd", + "zh:568d560696dec1dde68fbc30b1063c95091a609333f50742cafde5647d83f1bb", + "zh:7d407ac54a03e86f0dcb81c24eb859ed18da6d45838a91dfde3b53c1cef3ebc1", + "zh:a4a0f5aaeb9d82e4ef137ce1d2bf3e0d582f116c18dfe2dd8fef3c68630e3290", + "zh:affcbb5baea2e55c4043d5d626bbbefbdc6121d1214f0eec47a7bbebf0db8f95", + "zh:be444c9b54305e7816754f790594e0dc09de22c581677b526159f4f20fee12e2", + "zh:d988e3b0152d64fcee379255fbe5c9649733c0b625f41f5f30d72bcab042e4ca", + "zh:dc2a2d2b91930cd046e70964590d58d4e75da8e065a37f23605c9f0b1b0f6975", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:febf46d09277eeb9dc1323d003d612ce623b4608fac517ac64c8ce56f2352e5c", ] } diff --git a/infra/tf/bigquery_export/main.tf b/infra/tf/bigquery_export/main.tf index 7a492740..76ab75a9 100644 --- a/infra/tf/bigquery_export/main.tf +++ b/infra/tf/bigquery_export/main.tf @@ -35,7 +35,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { value = "" } } - timeout = "3600s" + timeout = "7200s" service_account = var.function_identity max_retries = 1 } diff --git a/infra/tf/data_exchange.tf b/infra/tf/data_exchange.tf index 7adda636..4a234c29 100644 --- a/infra/tf/data_exchange.tf +++ b/infra/tf/data_exchange.tf @@ -1,13 +1,3 @@ -/*import { - id = "projects/${local.project}/locations/${local.location}/dataExchanges/${local.data_exchange_id}" - to = google_bigquery_analytics_hub_data_exchange.default -} - -import { - id = "projects/${local.project}/locations/${local.location}/dataExchanges/${local.data_exchange_id}/listings/${local.listing_id}" - to = google_bigquery_analytics_hub_listing.default -}*/ - resource "google_bigquery_analytics_hub_data_exchange" "default" { data_exchange_id = "httparchive" location = local.location diff --git a/infra/tf/dataform.tf b/infra/tf/dataform.tf index edb73148..60d7c08b 100644 --- a/infra/tf/dataform.tf +++ b/infra/tf/dataform.tf @@ -1,9 +1,3 @@ -/*import { - provider = google-beta - id = "projects/${local.project}/locations/${local.region}/repositories/crawl-data" - to = google_dataform_repository.production -}*/ - # BigQuery IAM roles for Dataform locals { dataform_service_account_email = "service-226352634162@gcp-sa-dataform.iam.gserviceaccount.com" diff --git a/infra/tf/functions.tf b/infra/tf/functions.tf deleted file mode 100644 index 0e92229e..00000000 --- a/infra/tf/functions.tf +++ /dev/null @@ -1,45 +0,0 @@ -locals { - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" -} - -resource "google_project_iam_member" "project" { - for_each = toset(["roles/bigquery.jobUser", "roles/dataform.serviceAgent", "roles/run.invoker", "roles/run.jobsExecutorWithOverrides", "roles/datastore.user", "roles/storage.objectUser"]) - - project = local.project - role = each.value - member = "serviceAccount:${local.function_identity}" -} - -resource "google_bigquery_dataset_iam_member" "cloud_function_dataset_reader_role" { - for_each = toset(local.edit_datasets) - - dataset_id = each.value - role = "roles/bigquery.dataViewer" - member = "serviceAccount:${local.function_identity}" -} - -resource "google_bigquery_connection" "spark-procedures" { - connection_id = "spark-procedures" - location = local.location - spark {} -} - -resource "google_bigquery_connection" "remote-functions" { - connection_id = "remote-functions" - location = local.location - cloud_resource {} -} - -resource "google_project_iam_member" "bigquery-remote-functions-connector" { - project = local.project - role = "roles/run.invoker" - member = "serviceAccount:${google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id}" -} - -resource "google_project_iam_member" "spark-procedures-connector" { - for_each = toset(["roles/datastore.user", "roles/artifactregistry.reader", "roles/bigquery.user"]) - - project = local.project - role = each.value - member = "serviceAccount:${google_bigquery_connection.spark-procedures.spark[0].service_account_id}" -} diff --git a/infra/tf/functions/main.tf b/infra/tf/functions/main.tf new file mode 100644 index 00000000..ed24bbbf --- /dev/null +++ b/infra/tf/functions/main.tf @@ -0,0 +1,40 @@ +terraform { + required_version = ">= 1.9.7" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 6.13.0" + } + } +} + +resource "google_project_iam_member" "function_identity" { + for_each = toset(["roles/bigquery.jobUser", "roles/dataform.serviceAgent", "roles/run.invoker", "roles/run.jobsExecutorWithOverrides", "roles/datastore.user", "roles/storage.objectUser"]) + + project = var.project + role = each.value + member = "serviceAccount:${var.function_identity}" +} + +resource "google_bigquery_dataset_iam_member" "cloud_function_dataset_reader_role" { + for_each = toset(var.edit_datasets) + + dataset_id = each.value + role = "roles/bigquery.dataViewer" + member = "serviceAccount:${var.function_identity}" +} + +resource "google_bigquery_connection" "remote-functions" { + connection_id = "remote-functions" + location = var.location + cloud_resource {} +} + +resource "google_project_iam_member" "bigquery-connection-remote-functions" { + for_each = toset(["roles/run.invoker"]) + + project = var.project + role = each.value + member = "serviceAccount:${google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id}" +} diff --git a/infra/tf/functions/output.tf b/infra/tf/functions/output.tf new file mode 100644 index 00000000..b263ac5c --- /dev/null +++ b/infra/tf/functions/output.tf @@ -0,0 +1,9 @@ +output "google_bigquery_connection-remote_functions-id" { + description = "The connection ID for the remote functions BigQuery connection." + value = google_bigquery_connection.remote-functions.id +} + +output "remote_functions_connection_service_account_id" { + description = "The service account ID associated with the remote functions BigQuery connection." + value = google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id +} diff --git a/infra/tf/functions/variables.tf b/infra/tf/functions/variables.tf new file mode 100644 index 00000000..aa401e9c --- /dev/null +++ b/infra/tf/functions/variables.tf @@ -0,0 +1,16 @@ +variable "project" { + type = string +} + +variable "function_identity" { + type = string +} + +variable "location" { + type = string +} + +variable "edit_datasets" { + type = list(string) + default = [] +} diff --git a/infra/tf/main.tf b/infra/tf/main.tf index c95ff40e..3aabee67 100644 --- a/infra/tf/main.tf +++ b/infra/tf/main.tf @@ -24,14 +24,8 @@ provider "google" { billing_project = local.project } -module "dataform_export" { - source = "./dataform_export" - - project_number = local.project_number - region = local.region - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" - function_name = "dataform-export" - remote_functions_connection = google_bigquery_connection.remote-functions.id +locals { + function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" } module "dataform_trigger" { @@ -40,7 +34,7 @@ module "dataform_trigger" { project = local.project project_number = local.project_number region = local.region - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" + function_identity = local.function_identity function_name = "dataform-trigger" } @@ -50,7 +44,7 @@ module "bigquery_export" { project = local.project region = local.region location = local.location - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" + function_identity = local.function_identity function_name = "bigquery-export" } @@ -62,3 +56,22 @@ module "masthead" { # project_id = local.project # project_number = local.project_number } + +module "functions" { + source = "./functions" + project = local.project + location = local.location + function_identity = local.function_identity + edit_datasets = local.edit_datasets +} + +module "dataform_export" { + source = "./dataform_export" + + project_number = local.project_number + region = local.region + function_identity = local.function_identity + function_name = "dataform-export" + remote_functions_connection = module.functions.google_bigquery_connection-remote_functions-id + depends_on = [module.functions] +} diff --git a/workflow_settings.yaml b/workflow_settings.yaml index 6d3e2f6b..28e759c7 100644 --- a/workflow_settings.yaml +++ b/workflow_settings.yaml @@ -3,4 +3,4 @@ defaultLocation: US defaultAssertionDataset: dataform_assertions vars: - environment: prod # MUST be equal 'prod' in main branch, enables processing sampled data + environment: prod # MUST be 'prod' in main branch, enables sampling