From 5bb213809e2c6297714f500885eb7b2c3dcbf51b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 28 Apr 2025 00:53:37 +0200 Subject: [PATCH 01/11] new api draft --- infra/tech-report-api/.gitignore | 0 infra/tech-report-api/Dockerfile | 0 infra/tech-report-api/README.md | 0 .../tech-report-api/__tests__/routes.test.js | 0 infra/tech-report-api/jest.config.js | 0 infra/tech-report-api/package.json | 0 .../src/controllers/adoptionController.js | 0 .../src/controllers/categoriesController.js | 0 .../src/controllers/cwvtechController.js | 0 .../src/controllers/geosController.js | 0 .../src/controllers/lighthouseController.js | 0 .../src/controllers/pageWeightController.js | 82 ++++++++++++++++++ .../src/controllers/ranksController.js | 0 .../src/controllers/technologiesController.js | 0 infra/tech-report-api/src/index.js | 0 infra/tech-report-api/src/routes/adoption.js | 0 .../tech-report-api/src/routes/categories.js | 0 infra/tech-report-api/src/routes/cwvtech.js | 0 infra/tech-report-api/src/routes/geos.js | 0 .../tech-report-api/src/routes/lighthouse.js | 0 .../tech-report-api/src/routes/pageWeight.js | 0 infra/tech-report-api/src/routes/ranks.js | 0 .../src/routes/technologies.js | 0 infra/tech-report-api/src/utils/db.js | 0 infra/tech-report-api/src/utils/helpers.js | 0 infra/tech-report-api/test-api.sh | 0 infra/tf/api-gateway/main.tf | 50 +++++++++++ infra/tf/api-gateway/networking.tf | 86 +++++++++++++++++++ infra/tf/api-gateway/variables.tf | 34 ++++++++ 29 files changed, 252 insertions(+) create mode 100644 infra/tech-report-api/.gitignore create mode 100644 infra/tech-report-api/Dockerfile create mode 100644 infra/tech-report-api/README.md create mode 100644 infra/tech-report-api/__tests__/routes.test.js create mode 100644 infra/tech-report-api/jest.config.js create mode 100644 infra/tech-report-api/package.json create mode 100644 infra/tech-report-api/src/controllers/adoptionController.js create mode 100644 infra/tech-report-api/src/controllers/categoriesController.js create mode 100644 infra/tech-report-api/src/controllers/cwvtechController.js create mode 100644 infra/tech-report-api/src/controllers/geosController.js create mode 100644 infra/tech-report-api/src/controllers/lighthouseController.js create mode 100644 infra/tech-report-api/src/controllers/pageWeightController.js create mode 100644 infra/tech-report-api/src/controllers/ranksController.js create mode 100644 infra/tech-report-api/src/controllers/technologiesController.js create mode 100644 infra/tech-report-api/src/index.js create mode 100644 infra/tech-report-api/src/routes/adoption.js create mode 100644 infra/tech-report-api/src/routes/categories.js create mode 100644 infra/tech-report-api/src/routes/cwvtech.js create mode 100644 infra/tech-report-api/src/routes/geos.js create mode 100644 infra/tech-report-api/src/routes/lighthouse.js create mode 100644 infra/tech-report-api/src/routes/pageWeight.js create mode 100644 infra/tech-report-api/src/routes/ranks.js create mode 100644 infra/tech-report-api/src/routes/technologies.js create mode 100644 infra/tech-report-api/src/utils/db.js create mode 100644 infra/tech-report-api/src/utils/helpers.js create mode 100755 infra/tech-report-api/test-api.sh create mode 100644 infra/tf/api-gateway/main.tf create mode 100644 infra/tf/api-gateway/networking.tf create mode 100644 infra/tf/api-gateway/variables.tf diff --git a/infra/tech-report-api/.gitignore b/infra/tech-report-api/.gitignore new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/Dockerfile b/infra/tech-report-api/Dockerfile new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/README.md b/infra/tech-report-api/README.md new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/__tests__/routes.test.js b/infra/tech-report-api/__tests__/routes.test.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/jest.config.js b/infra/tech-report-api/jest.config.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/package.json b/infra/tech-report-api/package.json new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/adoptionController.js b/infra/tech-report-api/src/controllers/adoptionController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/categoriesController.js b/infra/tech-report-api/src/controllers/categoriesController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/cwvtechController.js b/infra/tech-report-api/src/controllers/cwvtechController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/geosController.js b/infra/tech-report-api/src/controllers/geosController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/lighthouseController.js b/infra/tech-report-api/src/controllers/lighthouseController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/pageWeightController.js b/infra/tech-report-api/src/controllers/pageWeightController.js new file mode 100644 index 00000000..fa996de2 --- /dev/null +++ b/infra/tech-report-api/src/controllers/pageWeightController.js @@ -0,0 +1,82 @@ +const firestore = require('../utils/db'); +const { convertToArray, createSuccessResponse, createErrorResponse } = require('../utils/helpers'); + +const TABLE = 'page_weight'; + +/** + * Get the latest date in the collection + */ +const getLatestDate = async () => { + const query = firestore.collection(TABLE).orderBy('date', 'desc').limit(1); + const snapshot = await query.get(); + if (!snapshot.empty) { + return snapshot.docs[0].data().date; + } + return null; +}; + +/** + * List Page Weight data with filtering + */ +const listPageWeightData = async (req, res) => { + try { + const params = req.query; + const data = []; + + // Required parameters check + if (!params.technology) { + return res.status(400).send(createErrorResponse([ + ['technology', 'missing technology parameter'] + ])); + } + + // Convert technology parameter to array + const techArray = convertToArray(params.technology); + + // Handle 'latest' special value for start parameter + if (params.start && params.start === 'latest') { + params.start = await getLatestDate(); + } + + // Query for each technology + for (const technology of techArray) { + let query = firestore.collection(TABLE); + + // Apply filters + if (params.start) { + query = query.where('date', '>=', params.start); + } + + if (params.end) { + query = query.where('date', '<=', params.end); + } + + if (params.geo) { + query = query.where('geo', '==', params.geo); + } + + if (params.rank) { + query = query.where('rank', '==', params.rank); + } + + // Always filter by technology + query = query.where('technology', '==', technology); + + // Execute query + const snapshot = await query.get(); + snapshot.forEach(doc => { + data.push(doc.data()); + }); + } + + // Send response + res.status(200).send(createSuccessResponse(data)); + } catch (error) { + console.error('Error fetching Page Weight data:', error); + res.status(400).send(createErrorResponse([['query', error.message]])); + } +}; + +module.exports = { + listPageWeightData +}; diff --git a/infra/tech-report-api/src/controllers/ranksController.js b/infra/tech-report-api/src/controllers/ranksController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/controllers/technologiesController.js b/infra/tech-report-api/src/controllers/technologiesController.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/index.js b/infra/tech-report-api/src/index.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/adoption.js b/infra/tech-report-api/src/routes/adoption.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/categories.js b/infra/tech-report-api/src/routes/categories.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/cwvtech.js b/infra/tech-report-api/src/routes/cwvtech.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/geos.js b/infra/tech-report-api/src/routes/geos.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/lighthouse.js b/infra/tech-report-api/src/routes/lighthouse.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/pageWeight.js b/infra/tech-report-api/src/routes/pageWeight.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/ranks.js b/infra/tech-report-api/src/routes/ranks.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/routes/technologies.js b/infra/tech-report-api/src/routes/technologies.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/utils/db.js b/infra/tech-report-api/src/utils/db.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/src/utils/helpers.js b/infra/tech-report-api/src/utils/helpers.js new file mode 100644 index 00000000..e69de29b diff --git a/infra/tech-report-api/test-api.sh b/infra/tech-report-api/test-api.sh new file mode 100755 index 00000000..e69de29b diff --git a/infra/tf/api-gateway/main.tf b/infra/tf/api-gateway/main.tf new file mode 100644 index 00000000..e2bd9da3 --- /dev/null +++ b/infra/tf/api-gateway/main.tf @@ -0,0 +1,50 @@ +###################################### +# API Gateway +###################################### +# Used to expose Internal resources to external sources, such as web applications +# See https://cloud.google.com/api-gateway/docs for more information +# The API used by the Gateway +resource "google_api_gateway_api" "api" { + provider = google-beta # API Gateway is still in beta + api_id = "api-gw-${var.environment}" + display_name = "The ${var.environment} API Gateway" + project = var.project +} +# A Configuration, consisting of an OpenAPI specification +resource "google_api_gateway_api_config" "api_config" { + provider = google-beta # API Gateway is still in beta + api = google_api_gateway_api.api.api_id + api_config_id_prefix = "api" + project = var.project + display_name = "The ${var.environment} Config" + openapi_documents { + document { + path = "spec.yaml" # File name is simply sugar to show on GCP + contents = filebase64("spec.yaml") # This is based on *who* is call the module! + } + } + gateway_config { + backend_config { + google_service_account = var.service_account_email + } + } +} +# The actual API Gateway +resource "google_api_gateway_gateway" "gateway" { + provider = google-beta + project = var.project + region = var.region + api_config = google_api_gateway_api_config.api_config.id + gateway_id = "${var.environment}-gw" + display_name = "${var.environment} Api Gateway" + labels = { + owner = "tech_report_api" + environment = var.environment + } + depends_on = [google_api_gateway_api_config.api_config] + lifecycle { + replace_triggered_by = [ + google_api_gateway_api_config.api_config + ] + } +} diff --git a/infra/tf/api-gateway/networking.tf b/infra/tf/api-gateway/networking.tf new file mode 100644 index 00000000..9fb3462a --- /dev/null +++ b/infra/tf/api-gateway/networking.tf @@ -0,0 +1,86 @@ +resource "google_compute_global_address" "default" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + project = var.project + name = "httparchive-api-gateway-address" + address_type = "EXTERNAL" + ip_version = "IPV4" +} + +resource "google_compute_global_forwarding_rule" "https" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + provider = google-beta + project = var.project + name = "httparchive-api-gateway-https" + target = google_compute_target_https_proxy.default[count.index].self_link + ip_address = google_compute_global_address.default[count.index].id + port_range = "443" + load_balancing_scheme = "EXTERNAL_MANAGED" +} + +resource "google_compute_managed_ssl_certificate" "default" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + name = "httparchive-api-gateway-ssl" + managed { + domains = ["api.httparchive.org"] + } + +} + +resource "google_compute_target_https_proxy" "default" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + provider = google-beta + project = var.project + name = "httparchive-api-gateway-https-proxy" + url_map = google_compute_url_map.default[count.index].id + ssl_certificates = [google_compute_managed_ssl_certificate.default[count.index].id] +} + +resource "google_compute_region_network_endpoint_group" "function_neg" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + provider = google-beta + name = "httparchive-api-gateway-function-neg" + network_endpoint_type = "SERVERLESS" + project = var.project + region = var.region + + serverless_deployment { + platform = "apigateway.googleapis.com" + resource = google_api_gateway_gateway.gateway.gateway_id + } + +} + +resource "google_compute_backend_service" "backend_neg" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + provider = google-beta + name = "httparchive-api-gateway-backend-neg" + project = var.project + load_balancing_scheme = "EXTERNAL_MANAGED" + protocol = "HTTP" + backend { + group = google_compute_region_network_endpoint_group.function_neg[count.index].self_link + } + +} + +resource "google_compute_url_map" "default" { + #count = var.environment == "prod" ? 1 : 0 + count = 0 + + provider = google-beta + project = var.project + name = "httparchive-api-gateway-url-map" + default_service = google_compute_backend_service.backend_neg[count.index].self_link +} \ No newline at end of file diff --git a/infra/tf/api-gateway/variables.tf b/infra/tf/api-gateway/variables.tf new file mode 100644 index 00000000..066f618d --- /dev/null +++ b/infra/tf/api-gateway/variables.tf @@ -0,0 +1,34 @@ +variable "environment" { + description = "The 'Environment' that is being created/deployed. Applied as a suffix to many resources." + type = string +} +variable "project" { + description = "The ID of the project in which the resource belongs. If it is not provided, the provider project is used." + type = string +} +variable "region" { + description = "The Region of this resource" + type = string +} +variable "service_account_email" { + description = "Email of the service account associated with and to run the API Gateway" + type = string +} + + + + + + + + + + + + + + + + + + From f0d7fb02deb605ab1c6f75c8099674f5997a0727 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 28 Apr 2025 00:58:58 +0200 Subject: [PATCH 02/11] ranks and geos --- .../output/reports/tech_report_geos.js | 31 +++++++++++++++++++ .../output/reports/tech_report_ranks.js | 31 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 definitions/output/reports/tech_report_geos.js create mode 100644 definitions/output/reports/tech_report_ranks.js diff --git a/definitions/output/reports/tech_report_geos.js b/definitions/output/reports/tech_report_geos.js new file mode 100644 index 00000000..acbf9ebb --- /dev/null +++ b/definitions/output/reports/tech_report_geos.js @@ -0,0 +1,31 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_geos', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +SELECT + geo, + adoption AS origins +FROM ${ctx.ref('reports', 'tech_report_adoption')} +WHERE + date = '${pastMonth}' + AND rank = 'ALL' + AND technology = 'ALL' + AND version = 'ALL' + ${constants.devRankFilter} +`).postOps(ctx => ` + SELECT + reports.run_export_job( + JSON '''{ + "destination": "firestore", + "config": { + "database": "tech-report-api-${constants.environment}", + "collection": "geos", + "type": "dict" + }, + "query": "SELECT * FROM ${ctx.self()}" + }''' + ); + `) diff --git a/definitions/output/reports/tech_report_ranks.js b/definitions/output/reports/tech_report_ranks.js new file mode 100644 index 00000000..d8e68545 --- /dev/null +++ b/definitions/output/reports/tech_report_ranks.js @@ -0,0 +1,31 @@ +const pastMonth = constants.fnPastMonth(constants.currentMonth) + +publish('tech_report_ranks', { + schema: 'reports', + type: 'table', + tags: ['tech_report'] +}).query(ctx => ` +SELECT + rank, + adoption AS origins +FROM ${ctx.ref('reports', 'tech_report_adoption')} +WHERE + date = '${pastMonth}' + AND geo = 'ALL' + AND technology = 'ALL' + AND version = 'ALL' + ${constants.devRankFilter} +`).postOps(ctx => ` + SELECT + reports.run_export_job( + JSON '''{ + "destination": "firestore", + "config": { + "database": "tech-report-api-${constants.environment}", + "collection": "ranks", + "type": "dict" + }, + "query": "SELECT * FROM ${ctx.self()}" + }''' + ); + `) From d70441eef1f1c11021842d857f9a0bf1e19a75b1 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 12 May 2025 00:05:28 +0200 Subject: [PATCH 03/11] cleanup --- infra/tech-report-api/.gitignore | 0 infra/tech-report-api/Dockerfile | 0 infra/tech-report-api/README.md | 0 .../tech-report-api/__tests__/routes.test.js | 0 infra/tech-report-api/jest.config.js | 0 infra/tech-report-api/package.json | 0 .../src/controllers/adoptionController.js | 0 .../src/controllers/categoriesController.js | 0 .../src/controllers/cwvtechController.js | 0 .../src/controllers/geosController.js | 0 .../src/controllers/lighthouseController.js | 0 .../src/controllers/pageWeightController.js | 82 ------------------ .../src/controllers/ranksController.js | 0 .../src/controllers/technologiesController.js | 0 infra/tech-report-api/src/index.js | 0 infra/tech-report-api/src/routes/adoption.js | 0 .../tech-report-api/src/routes/categories.js | 0 infra/tech-report-api/src/routes/cwvtech.js | 0 infra/tech-report-api/src/routes/geos.js | 0 .../tech-report-api/src/routes/lighthouse.js | 0 .../tech-report-api/src/routes/pageWeight.js | 0 infra/tech-report-api/src/routes/ranks.js | 0 .../src/routes/technologies.js | 0 infra/tech-report-api/src/utils/db.js | 0 infra/tech-report-api/src/utils/helpers.js | 0 infra/tech-report-api/test-api.sh | 0 infra/tf/api-gateway/main.tf | 50 ----------- infra/tf/api-gateway/networking.tf | 86 ------------------- infra/tf/api-gateway/variables.tf | 34 -------- infra/tf/functions.tf | 45 ---------- infra/tf/functions/main.tf | 31 +++++++ infra/tf/functions/variables.tf | 20 +++++ infra/tf/main.tf | 19 +++- 33 files changed, 67 insertions(+), 300 deletions(-) delete mode 100644 infra/tech-report-api/.gitignore delete mode 100644 infra/tech-report-api/Dockerfile delete mode 100644 infra/tech-report-api/README.md delete mode 100644 infra/tech-report-api/__tests__/routes.test.js delete mode 100644 infra/tech-report-api/jest.config.js delete mode 100644 infra/tech-report-api/package.json delete mode 100644 infra/tech-report-api/src/controllers/adoptionController.js delete mode 100644 infra/tech-report-api/src/controllers/categoriesController.js delete mode 100644 infra/tech-report-api/src/controllers/cwvtechController.js delete mode 100644 infra/tech-report-api/src/controllers/geosController.js delete mode 100644 infra/tech-report-api/src/controllers/lighthouseController.js delete mode 100644 infra/tech-report-api/src/controllers/pageWeightController.js delete mode 100644 infra/tech-report-api/src/controllers/ranksController.js delete mode 100644 infra/tech-report-api/src/controllers/technologiesController.js delete mode 100644 infra/tech-report-api/src/index.js delete mode 100644 infra/tech-report-api/src/routes/adoption.js delete mode 100644 infra/tech-report-api/src/routes/categories.js delete mode 100644 infra/tech-report-api/src/routes/cwvtech.js delete mode 100644 infra/tech-report-api/src/routes/geos.js delete mode 100644 infra/tech-report-api/src/routes/lighthouse.js delete mode 100644 infra/tech-report-api/src/routes/pageWeight.js delete mode 100644 infra/tech-report-api/src/routes/ranks.js delete mode 100644 infra/tech-report-api/src/routes/technologies.js delete mode 100644 infra/tech-report-api/src/utils/db.js delete mode 100644 infra/tech-report-api/src/utils/helpers.js delete mode 100755 infra/tech-report-api/test-api.sh delete mode 100644 infra/tf/api-gateway/main.tf delete mode 100644 infra/tf/api-gateway/networking.tf delete mode 100644 infra/tf/api-gateway/variables.tf delete mode 100644 infra/tf/functions.tf create mode 100644 infra/tf/functions/main.tf create mode 100644 infra/tf/functions/variables.tf diff --git a/infra/tech-report-api/.gitignore b/infra/tech-report-api/.gitignore deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/Dockerfile b/infra/tech-report-api/Dockerfile deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/README.md b/infra/tech-report-api/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/__tests__/routes.test.js b/infra/tech-report-api/__tests__/routes.test.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/jest.config.js b/infra/tech-report-api/jest.config.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/package.json b/infra/tech-report-api/package.json deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/adoptionController.js b/infra/tech-report-api/src/controllers/adoptionController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/categoriesController.js b/infra/tech-report-api/src/controllers/categoriesController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/cwvtechController.js b/infra/tech-report-api/src/controllers/cwvtechController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/geosController.js b/infra/tech-report-api/src/controllers/geosController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/lighthouseController.js b/infra/tech-report-api/src/controllers/lighthouseController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/pageWeightController.js b/infra/tech-report-api/src/controllers/pageWeightController.js deleted file mode 100644 index fa996de2..00000000 --- a/infra/tech-report-api/src/controllers/pageWeightController.js +++ /dev/null @@ -1,82 +0,0 @@ -const firestore = require('../utils/db'); -const { convertToArray, createSuccessResponse, createErrorResponse } = require('../utils/helpers'); - -const TABLE = 'page_weight'; - -/** - * Get the latest date in the collection - */ -const getLatestDate = async () => { - const query = firestore.collection(TABLE).orderBy('date', 'desc').limit(1); - const snapshot = await query.get(); - if (!snapshot.empty) { - return snapshot.docs[0].data().date; - } - return null; -}; - -/** - * List Page Weight data with filtering - */ -const listPageWeightData = async (req, res) => { - try { - const params = req.query; - const data = []; - - // Required parameters check - if (!params.technology) { - return res.status(400).send(createErrorResponse([ - ['technology', 'missing technology parameter'] - ])); - } - - // Convert technology parameter to array - const techArray = convertToArray(params.technology); - - // Handle 'latest' special value for start parameter - if (params.start && params.start === 'latest') { - params.start = await getLatestDate(); - } - - // Query for each technology - for (const technology of techArray) { - let query = firestore.collection(TABLE); - - // Apply filters - if (params.start) { - query = query.where('date', '>=', params.start); - } - - if (params.end) { - query = query.where('date', '<=', params.end); - } - - if (params.geo) { - query = query.where('geo', '==', params.geo); - } - - if (params.rank) { - query = query.where('rank', '==', params.rank); - } - - // Always filter by technology - query = query.where('technology', '==', technology); - - // Execute query - const snapshot = await query.get(); - snapshot.forEach(doc => { - data.push(doc.data()); - }); - } - - // Send response - res.status(200).send(createSuccessResponse(data)); - } catch (error) { - console.error('Error fetching Page Weight data:', error); - res.status(400).send(createErrorResponse([['query', error.message]])); - } -}; - -module.exports = { - listPageWeightData -}; diff --git a/infra/tech-report-api/src/controllers/ranksController.js b/infra/tech-report-api/src/controllers/ranksController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/controllers/technologiesController.js b/infra/tech-report-api/src/controllers/technologiesController.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/index.js b/infra/tech-report-api/src/index.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/adoption.js b/infra/tech-report-api/src/routes/adoption.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/categories.js b/infra/tech-report-api/src/routes/categories.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/cwvtech.js b/infra/tech-report-api/src/routes/cwvtech.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/geos.js b/infra/tech-report-api/src/routes/geos.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/lighthouse.js b/infra/tech-report-api/src/routes/lighthouse.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/pageWeight.js b/infra/tech-report-api/src/routes/pageWeight.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/ranks.js b/infra/tech-report-api/src/routes/ranks.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/routes/technologies.js b/infra/tech-report-api/src/routes/technologies.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/utils/db.js b/infra/tech-report-api/src/utils/db.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/src/utils/helpers.js b/infra/tech-report-api/src/utils/helpers.js deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/tech-report-api/test-api.sh b/infra/tech-report-api/test-api.sh deleted file mode 100755 index e69de29b..00000000 diff --git a/infra/tf/api-gateway/main.tf b/infra/tf/api-gateway/main.tf deleted file mode 100644 index e2bd9da3..00000000 --- a/infra/tf/api-gateway/main.tf +++ /dev/null @@ -1,50 +0,0 @@ -###################################### -# API Gateway -###################################### -# Used to expose Internal resources to external sources, such as web applications -# See https://cloud.google.com/api-gateway/docs for more information -# The API used by the Gateway -resource "google_api_gateway_api" "api" { - provider = google-beta # API Gateway is still in beta - api_id = "api-gw-${var.environment}" - display_name = "The ${var.environment} API Gateway" - project = var.project -} -# A Configuration, consisting of an OpenAPI specification -resource "google_api_gateway_api_config" "api_config" { - provider = google-beta # API Gateway is still in beta - api = google_api_gateway_api.api.api_id - api_config_id_prefix = "api" - project = var.project - display_name = "The ${var.environment} Config" - openapi_documents { - document { - path = "spec.yaml" # File name is simply sugar to show on GCP - contents = filebase64("spec.yaml") # This is based on *who* is call the module! - } - } - gateway_config { - backend_config { - google_service_account = var.service_account_email - } - } -} -# The actual API Gateway -resource "google_api_gateway_gateway" "gateway" { - provider = google-beta - project = var.project - region = var.region - api_config = google_api_gateway_api_config.api_config.id - gateway_id = "${var.environment}-gw" - display_name = "${var.environment} Api Gateway" - labels = { - owner = "tech_report_api" - environment = var.environment - } - depends_on = [google_api_gateway_api_config.api_config] - lifecycle { - replace_triggered_by = [ - google_api_gateway_api_config.api_config - ] - } -} diff --git a/infra/tf/api-gateway/networking.tf b/infra/tf/api-gateway/networking.tf deleted file mode 100644 index 9fb3462a..00000000 --- a/infra/tf/api-gateway/networking.tf +++ /dev/null @@ -1,86 +0,0 @@ -resource "google_compute_global_address" "default" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - project = var.project - name = "httparchive-api-gateway-address" - address_type = "EXTERNAL" - ip_version = "IPV4" -} - -resource "google_compute_global_forwarding_rule" "https" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - provider = google-beta - project = var.project - name = "httparchive-api-gateway-https" - target = google_compute_target_https_proxy.default[count.index].self_link - ip_address = google_compute_global_address.default[count.index].id - port_range = "443" - load_balancing_scheme = "EXTERNAL_MANAGED" -} - -resource "google_compute_managed_ssl_certificate" "default" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - name = "httparchive-api-gateway-ssl" - managed { - domains = ["api.httparchive.org"] - } - -} - -resource "google_compute_target_https_proxy" "default" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - provider = google-beta - project = var.project - name = "httparchive-api-gateway-https-proxy" - url_map = google_compute_url_map.default[count.index].id - ssl_certificates = [google_compute_managed_ssl_certificate.default[count.index].id] -} - -resource "google_compute_region_network_endpoint_group" "function_neg" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - provider = google-beta - name = "httparchive-api-gateway-function-neg" - network_endpoint_type = "SERVERLESS" - project = var.project - region = var.region - - serverless_deployment { - platform = "apigateway.googleapis.com" - resource = google_api_gateway_gateway.gateway.gateway_id - } - -} - -resource "google_compute_backend_service" "backend_neg" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - provider = google-beta - name = "httparchive-api-gateway-backend-neg" - project = var.project - load_balancing_scheme = "EXTERNAL_MANAGED" - protocol = "HTTP" - backend { - group = google_compute_region_network_endpoint_group.function_neg[count.index].self_link - } - -} - -resource "google_compute_url_map" "default" { - #count = var.environment == "prod" ? 1 : 0 - count = 0 - - provider = google-beta - project = var.project - name = "httparchive-api-gateway-url-map" - default_service = google_compute_backend_service.backend_neg[count.index].self_link -} \ No newline at end of file diff --git a/infra/tf/api-gateway/variables.tf b/infra/tf/api-gateway/variables.tf deleted file mode 100644 index 066f618d..00000000 --- a/infra/tf/api-gateway/variables.tf +++ /dev/null @@ -1,34 +0,0 @@ -variable "environment" { - description = "The 'Environment' that is being created/deployed. Applied as a suffix to many resources." - type = string -} -variable "project" { - description = "The ID of the project in which the resource belongs. If it is not provided, the provider project is used." - type = string -} -variable "region" { - description = "The Region of this resource" - type = string -} -variable "service_account_email" { - description = "Email of the service account associated with and to run the API Gateway" - type = string -} - - - - - - - - - - - - - - - - - - diff --git a/infra/tf/functions.tf b/infra/tf/functions.tf deleted file mode 100644 index 0e92229e..00000000 --- a/infra/tf/functions.tf +++ /dev/null @@ -1,45 +0,0 @@ -locals { - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" -} - -resource "google_project_iam_member" "project" { - for_each = toset(["roles/bigquery.jobUser", "roles/dataform.serviceAgent", "roles/run.invoker", "roles/run.jobsExecutorWithOverrides", "roles/datastore.user", "roles/storage.objectUser"]) - - project = local.project - role = each.value - member = "serviceAccount:${local.function_identity}" -} - -resource "google_bigquery_dataset_iam_member" "cloud_function_dataset_reader_role" { - for_each = toset(local.edit_datasets) - - dataset_id = each.value - role = "roles/bigquery.dataViewer" - member = "serviceAccount:${local.function_identity}" -} - -resource "google_bigquery_connection" "spark-procedures" { - connection_id = "spark-procedures" - location = local.location - spark {} -} - -resource "google_bigquery_connection" "remote-functions" { - connection_id = "remote-functions" - location = local.location - cloud_resource {} -} - -resource "google_project_iam_member" "bigquery-remote-functions-connector" { - project = local.project - role = "roles/run.invoker" - member = "serviceAccount:${google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id}" -} - -resource "google_project_iam_member" "spark-procedures-connector" { - for_each = toset(["roles/datastore.user", "roles/artifactregistry.reader", "roles/bigquery.user"]) - - project = local.project - role = each.value - member = "serviceAccount:${google_bigquery_connection.spark-procedures.spark[0].service_account_id}" -} diff --git a/infra/tf/functions/main.tf b/infra/tf/functions/main.tf new file mode 100644 index 00000000..6b073398 --- /dev/null +++ b/infra/tf/functions/main.tf @@ -0,0 +1,31 @@ +resource "google_project_iam_member" "project" { + for_each = toset(["roles/bigquery.jobUser", "roles/dataform.serviceAgent", "roles/run.invoker", "roles/run.jobsExecutorWithOverrides", "roles/datastore.user", "roles/storage.objectUser"]) + + project = var.project + role = each.value + member = "serviceAccount:${var.function_identity}" +} + +resource "google_bigquery_dataset_iam_member" "cloud_function_dataset_reader_role" { + for_each = toset(var.edit_datasets) + + dataset_id = each.value + role = "roles/bigquery.dataViewer" + member = "serviceAccount:${var.function_identity}" +} + +resource "google_bigquery_connection" "remote-functions" { + connection_id = "remote-functions" + location = var.location + cloud_resource {} +} + +resource "google_project_iam_member" "bigquery-remote-functions-connector" { + project = var.project + role = "roles/run.invoker" + member = "serviceAccount:${google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id}" +} + +data "google_project" "project" { + project_id = var.project +} diff --git a/infra/tf/functions/variables.tf b/infra/tf/functions/variables.tf new file mode 100644 index 00000000..2f5bd083 --- /dev/null +++ b/infra/tf/functions/variables.tf @@ -0,0 +1,20 @@ +variable "project" { + type = string +} + +variable "function_identity" { + type = string +} + +variable "location" { + type = string +} + +variable "region" { + type = string +} + +variable "edit_datasets" { + type = list(string) + default = [] +} diff --git a/infra/tf/main.tf b/infra/tf/main.tf index c95ff40e..0281f5c5 100644 --- a/infra/tf/main.tf +++ b/infra/tf/main.tf @@ -24,12 +24,16 @@ provider "google" { billing_project = local.project } +locals { + function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" +} + module "dataform_export" { source = "./dataform_export" project_number = local.project_number region = local.region - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" + function_identity = local.function_identity function_name = "dataform-export" remote_functions_connection = google_bigquery_connection.remote-functions.id } @@ -40,7 +44,7 @@ module "dataform_trigger" { project = local.project project_number = local.project_number region = local.region - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" + function_identity = local.function_identity function_name = "dataform-trigger" } @@ -50,7 +54,7 @@ module "bigquery_export" { project = local.project region = local.region location = local.location - function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" + function_identity = local.function_identity function_name = "bigquery-export" } @@ -62,3 +66,12 @@ module "masthead" { # project_id = local.project # project_number = local.project_number } + +module "functions" { + source = "./functions" + + project = local.project + location = local.location + function_identity = local.function_identity + edit_datasets = local.edit_datasets +} From c4a35fafd74d854c4cc077122e31dcd9da3b828b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 21 May 2025 10:44:10 +0200 Subject: [PATCH 04/11] lint --- infra/tf/.terraform.lock.hcl | 52 ++++++++++++++++----------------- infra/tf/data_exchange.tf | 10 ------- infra/tf/dataform.tf | 6 ---- infra/tf/functions/main.tf | 15 +++++++--- infra/tf/functions/variables.tf | 6 +--- infra/tf/main.tf | 1 - 6 files changed, 38 insertions(+), 52 deletions(-) diff --git a/infra/tf/.terraform.lock.hcl b/infra/tf/.terraform.lock.hcl index 48e15e0b..a2a72d51 100644 --- a/infra/tf/.terraform.lock.hcl +++ b/infra/tf/.terraform.lock.hcl @@ -22,41 +22,41 @@ provider "registry.terraform.io/hashicorp/archive" { } provider "registry.terraform.io/hashicorp/google" { - version = "6.35.0" + version = "6.36.0" constraints = ">= 6.13.0" hashes = [ - "h1:cwsqYGV0m2nME4R39OiG8YbDyayea5l2aahm9y/YHB0=", - "zh:0f7b8da0050470b074c2c5124359a407a15298dc3374cb2fbf5d243f7a390712", - "zh:27a8774e750a8b6ff296240d74ba322c3ac626ea847e150d745c014440c66927", - "zh:3cf134f088f31811f35c5996282b316308ad7a02b801cc84712696d536ef57e3", - "zh:3e08b3b451b33101f5d65fddc0d4f00e53260fcc8702ddd3d2ebab93b3553df1", - "zh:5fe57182514244b410c33d1b7bde8b1a78442323d97b13d0b08ad0fac884ad55", - "zh:6db9431b1bb9ac198eb5e0e6706d2c12ad5d47961e09d731be91cfd97f17c18b", - "zh:76309d66d6806f75639126a49db996b51496fa64d2313466f46f476284802ab3", - "zh:897c2cb1411dfed658419de9be24a2b82f64e236449f45f76d9a13c244c872de", - "zh:a5b4e8771fe4547979a767c60cf44b590f52f296d75f161bfa032c6fe6364bc2", - "zh:bb3297ad9bdf20b9d460fe0801ae3d555460bad3bfbf0c062094afa27125afc1", - "zh:cb7ed92b9af88f29f8faef4de844d57d3f4aeccce92edf3b4a33e8b4c6049b38", + "h1:lxi5P8TBMfiKk88FI4xM2Lnwlk2+Q3r9oa4yrg83x1w=", + "zh:0a67432c04d4c74829632cc5669b8c5988f837259333307e07c2915a0529b3fb", + "zh:4559afe21bb59e8fb9e3e7414ea65b6be233e8f217e8683dd0e9c347ecadf910", + "zh:58a34fe3b28271deba9a44db1e704c2844f2e30e252ded5f200a8f9af170d52d", + "zh:6b07f388b4fb2189d9fe3a058831e5c755092f7bd5f7388a1c0d9583f8c43ef0", + "zh:6fae25f93bf4a6fd59ce0f9d05e8551b65b4be7084f5e6e5f528ab011dbbef6a", + "zh:73be19906c569f1d46b8f88d3e846bd8dabd6ed65d8ba9a91f67da5365b534dc", + "zh:7b047330342f600e92c02e248f72eefae1a2e01c16ef45cc533942eb73c49c06", + "zh:9fecf5cdf1a16b9b9c1e83dc6129a40bfbc9e640252d2afddd34dc61de213330", + "zh:c557ea7357880615af290452ff06bc23bd3821132783240bd7cd75aadf16a04a", + "zh:dc11ad9a0b595b70490326563ea5f77de9c69aba0ee959971392e46b10a3c246", + "zh:ed8dc5d5ce3d8e3c1648c3fc6f48a19da9ed38d0157c33019c74b7790480c29f", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", ] } provider "registry.terraform.io/hashicorp/google-beta" { - version = "6.35.0" + version = "6.36.0" constraints = ">= 6.13.0" hashes = [ - "h1:EBLF+Zlt2eksFWh1QicX50zy7pj+QGqQWywtUB/lOMY=", - "zh:119b4d11dbaff065dc9e91d11d0bfde3076ee21313a61eeee459928ae20ca366", - "zh:137fda0ed8d2d4588021c8f18e61ee0d5219d4f5d95821ee65a1cda3d3574d2c", - "zh:280a5ea88053511443c1b6b2270c415b46df6a7b2ab4e87889d64668c0840440", - "zh:65bd3715520885330cb55289e62baf5354f75de039137657c3b1cf1a5f9761d4", - "zh:7099ca906f54888131da1ceb0a25444b4a9e5a533341cba283d874ef144c51ac", - "zh:b3b1f093f177cf5166b0e38257e8bc0e039cdf27f28693004304c3de387df165", - "zh:b52eefd0a02a87ce1ff5e87026adfe7c693c7e6fa8bfad0e35f70b556486e103", - "zh:bbec0ab75da4d3a972fe2f0bc28a0be8916a8b167d7d889ff6f4611b4536b4a9", - "zh:c7638b10aecfe1659c9c829de52c6b83b58368c1d55c49ebc8945ef289d8f2cf", - "zh:c92260e7b06f5394f101471d0c19759af620e38d145fa54b104d6ed3a68bf437", - "zh:f4bd6cd39368d7a49ffedbbe03b49750316285cd48987a945d3bdc9a4dfb8cce", + "h1:fwzLITL6eMQsPWNw1vgCm+H4rZnYwf//Vt1B8wJiGCI=", + "zh:2dec781899c11f6439442969cb531f56e2b0f9703ca837beee4fc4056f6287e8", + "zh:34ef667719f0a73944aa98eb26883df343af74aa831efc331512fc93a50a20df", + "zh:4877b60c56fdb5fe268ecb7b6328ea8da6cac7b4a6bd0ec09aa49ad3ab7b2d33", + "zh:6de42ed7eaae3e963bc67d9e432fb7a5663d9d832a0ba53edddf05cfabf59ee3", + "zh:762bb0bf31584f7784959102b38c231c0360c96738f1048fadf0f3e711b1bb70", + "zh:82e728aae46c7501a964e9963d42414c94219e228d3b6afbbac463b8507f79b2", + "zh:a07ed04849dc6b4e0bf943719cd5cb227ceef0cc27bc004396bdbd2baa809bff", + "zh:b91f8a67d7edd012bf25d85e6cbb2479d35c1dbc6e5d6d102008688427ff10d2", + "zh:bf5ad40aac28399da461b9e03df83aa4183a299ccfdfb1047b0f5dc1b46f0503", + "zh:c103cb3b828cc78a9fad3c3bea078d5adc4dd3ab20e7801b424b4c3151c7f739", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:ffad3c15795107705cac7faa2a5deb5d00f2aaf95163f0f2cddbf08677d55a8c", ] } diff --git a/infra/tf/data_exchange.tf b/infra/tf/data_exchange.tf index 7adda636..4a234c29 100644 --- a/infra/tf/data_exchange.tf +++ b/infra/tf/data_exchange.tf @@ -1,13 +1,3 @@ -/*import { - id = "projects/${local.project}/locations/${local.location}/dataExchanges/${local.data_exchange_id}" - to = google_bigquery_analytics_hub_data_exchange.default -} - -import { - id = "projects/${local.project}/locations/${local.location}/dataExchanges/${local.data_exchange_id}/listings/${local.listing_id}" - to = google_bigquery_analytics_hub_listing.default -}*/ - resource "google_bigquery_analytics_hub_data_exchange" "default" { data_exchange_id = "httparchive" location = local.location diff --git a/infra/tf/dataform.tf b/infra/tf/dataform.tf index edb73148..60d7c08b 100644 --- a/infra/tf/dataform.tf +++ b/infra/tf/dataform.tf @@ -1,9 +1,3 @@ -/*import { - provider = google-beta - id = "projects/${local.project}/locations/${local.region}/repositories/crawl-data" - to = google_dataform_repository.production -}*/ - # BigQuery IAM roles for Dataform locals { dataform_service_account_email = "service-226352634162@gcp-sa-dataform.iam.gserviceaccount.com" diff --git a/infra/tf/functions/main.tf b/infra/tf/functions/main.tf index 6b073398..66c4b7e5 100644 --- a/infra/tf/functions/main.tf +++ b/infra/tf/functions/main.tf @@ -1,3 +1,14 @@ +terraform { + required_version = ">= 1.9.7" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 6.13.0" + } + } +} + resource "google_project_iam_member" "project" { for_each = toset(["roles/bigquery.jobUser", "roles/dataform.serviceAgent", "roles/run.invoker", "roles/run.jobsExecutorWithOverrides", "roles/datastore.user", "roles/storage.objectUser"]) @@ -25,7 +36,3 @@ resource "google_project_iam_member" "bigquery-remote-functions-connector" { role = "roles/run.invoker" member = "serviceAccount:${google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id}" } - -data "google_project" "project" { - project_id = var.project -} diff --git a/infra/tf/functions/variables.tf b/infra/tf/functions/variables.tf index 2f5bd083..aa401e9c 100644 --- a/infra/tf/functions/variables.tf +++ b/infra/tf/functions/variables.tf @@ -10,11 +10,7 @@ variable "location" { type = string } -variable "region" { - type = string -} - variable "edit_datasets" { - type = list(string) + type = list(string) default = [] } diff --git a/infra/tf/main.tf b/infra/tf/main.tf index 0281f5c5..3615372a 100644 --- a/infra/tf/main.tf +++ b/infra/tf/main.tf @@ -69,7 +69,6 @@ module "masthead" { module "functions" { source = "./functions" - project = local.project location = local.location function_identity = local.function_identity From 15fb58185ece7ee4bec1d8104f9711185520f87e Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 25 May 2025 19:20:36 +0200 Subject: [PATCH 05/11] flat mobile origins --- definitions/output/reports/tech_report_geos.js | 2 +- definitions/output/reports/tech_report_ranks.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/reports/tech_report_geos.js b/definitions/output/reports/tech_report_geos.js index acbf9ebb..e50089a4 100644 --- a/definitions/output/reports/tech_report_geos.js +++ b/definitions/output/reports/tech_report_geos.js @@ -7,7 +7,7 @@ publish('tech_report_geos', { }).query(ctx => ` SELECT geo, - adoption AS origins + adoption.mobile AS mobile_origins FROM ${ctx.ref('reports', 'tech_report_adoption')} WHERE date = '${pastMonth}' diff --git a/definitions/output/reports/tech_report_ranks.js b/definitions/output/reports/tech_report_ranks.js index d8e68545..4d55cb3b 100644 --- a/definitions/output/reports/tech_report_ranks.js +++ b/definitions/output/reports/tech_report_ranks.js @@ -7,7 +7,7 @@ publish('tech_report_ranks', { }).query(ctx => ` SELECT rank, - adoption AS origins + adoption.mobile AS mobile_origins FROM ${ctx.ref('reports', 'tech_report_adoption')} WHERE date = '${pastMonth}' From bb8ca1ca72818bc3b3a377e96a97940e9e47fcc0 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 25 May 2025 22:31:41 +0200 Subject: [PATCH 06/11] lint --- definitions/output/f1/requests_latest.js | 1 - infra/tf/main.tf | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/definitions/output/f1/requests_latest.js b/definitions/output/f1/requests_latest.js index 6ac5555a..bf3d9a32 100644 --- a/definitions/output/f1/requests_latest.js +++ b/definitions/output/f1/requests_latest.js @@ -52,4 +52,3 @@ FROM ${ctx.ref('crawl', 'requests')} WHERE date = '${constants.currentMonth}' AND client = 'desktop' `) - diff --git a/infra/tf/main.tf b/infra/tf/main.tf index 3615372a..054b17ef 100644 --- a/infra/tf/main.tf +++ b/infra/tf/main.tf @@ -68,7 +68,7 @@ module "masthead" { } module "functions" { - source = "./functions" + source = "./functions" project = local.project location = local.location function_identity = local.function_identity From e02145881e427f9d4b76404ddea3810194fb3c18 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 29 May 2025 23:57:23 +0200 Subject: [PATCH 07/11] firestore timeout --- infra/bigquery-export/firestore.js | 56 ++++++++++++++++++++++++------ infra/tf/.terraform.lock.hcl | 53 ++++++++++++++-------------- infra/tf/functions/main.tf | 8 +++-- infra/tf/functions/output.tf | 9 +++++ infra/tf/main.tf | 21 +++++------ 5 files changed, 96 insertions(+), 51 deletions(-) create mode 100644 infra/tf/functions/output.tf diff --git a/infra/bigquery-export/firestore.js b/infra/bigquery-export/firestore.js index 18d8f51e..41426960 100644 --- a/infra/bigquery-export/firestore.js +++ b/infra/bigquery-export/firestore.js @@ -3,10 +3,23 @@ import { BigQueryExport } from './bigquery.js' export class FirestoreBatch { constructor () { - this.firestore = new Firestore() + this.firestore = new Firestore({ + // Increase timeout to 10 minutes for large batch operations + gaxOptions: { + grpc: { + max_receive_message_length: 100 * 1024 * 1024, // 100MB + max_send_message_length: 100 * 1024 * 1024, // 100MB + 'grpc.max_connection_idle_ms': 5 * 60 * 1000, // 5 minutes + 'grpc.keepalive_time_ms': 30 * 1000, // 30 seconds + 'grpc.keepalive_timeout_ms': 60 * 1000, // 1 minute + 'grpc.keepalive_permit_without_calls': true + } + } + }) this.bigquery = new BigQueryExport() - this.batchSize = 500 - this.maxConcurrentBatches = 200 + this.batchSizeDelete = 500 + this.batchSizeWrite = 400 // Reduced batch size for better performance + this.maxConcurrentBatches = 100 // Reduced concurrent batches to avoid overwhelming } queueBatch (operation) { @@ -28,14 +41,34 @@ export class FirestoreBatch { async commitBatches () { console.log(`Committing ${this.batchPromises.length} batches to ${this.collectionName}`) + await Promise.all( - this.batchPromises.map(async (batchPromise) => await batchPromise.commit() - .catch((error) => { - console.error('Error committing batch:', error) - throw error - }) - ) + this.batchPromises.map(async (batchPromise, index) => { + const retryCount = 3 + let lastError + + for (let attempt = 1; attempt <= retryCount; attempt++) { + try { + await batchPromise.commit() + return + } catch (error) { + lastError = error + console.warn(`Batch ${index} attempt ${attempt} failed:`, error.message) + + if (attempt < retryCount) { + // Exponential backoff: 2^attempt seconds + const delayMs = Math.pow(2, attempt) * 1000 + console.log(`Retrying batch ${index} in ${delayMs}ms...`) + await new Promise(resolve => setTimeout(resolve, delayMs)) + } + } + } + + console.error(`Batch ${index} failed after ${retryCount} attempts:`, lastError) + throw lastError + }) ) + this.batchPromises = [] } @@ -71,7 +104,7 @@ export class FirestoreBatch { } while (true) { - const snapshot = await collectionQuery.limit(this.batchSize * this.maxConcurrentBatches).get() + const snapshot = await collectionQuery.limit(this.batchSizeDelete * this.maxConcurrentBatches).get() if (snapshot.empty) { break } @@ -127,7 +160,8 @@ export class FirestoreBatch { async export (query, exportConfig) { this.firestore.settings({ - databaseId: exportConfig.database + databaseId: exportConfig.database, + timeout: 10 * 60 * 1000 // 10 minutes timeout }) this.collectionName = exportConfig.collection this.collectionType = exportConfig.type diff --git a/infra/tf/.terraform.lock.hcl b/infra/tf/.terraform.lock.hcl index 7fb63e11..10f078fb 100644 --- a/infra/tf/.terraform.lock.hcl +++ b/infra/tf/.terraform.lock.hcl @@ -22,42 +22,41 @@ provider "registry.terraform.io/hashicorp/archive" { } provider "registry.terraform.io/hashicorp/google" { - version = "6.36.1" + version = "6.37.0" constraints = ">= 6.13.0" hashes = [ - "h1:sWQkaDmXK1VJx80HDJKaKNM2DGXoJu9cH74QOQnu/3I=", - "zh:161d054bda352a286a997f80f3d59366d072377abc6777bfe8abacc2e10ac2bf", - "zh:314c38d8050036176031f691b533184c3578036085483fed877b3b139f887047", - "zh:503c9807312feb9766f0da207d5ac149aa6f5a669144c60da38a33d072ebe2c2", - "zh:897abe484c44c625b301e828a4967c9f781fbf33b5bb50afe226410ebbfd0137", - "zh:97bcdc879b48f5f8bee98316234a1eed3c2e440e356631066d07b5d85ccfe288", - "zh:b273e940da85c5d673cbb3cafe36fb70dd8536e2d8812e617191196c312dac3b", - "zh:ba294699ca7082d394498ce62ecb55501c6a141fd1c43db00a6abdaebc92fa2e", - "zh:c50be96b5c8df5124f18ccdef473eed0bc3a2452e674c19f58fbf7ee14195af4", - "zh:d5a8fda3abda57d912be020e22bab3568facc729756e5339af9a3e56ca277cbc", - "zh:dddbfed09dff01fe538830714a13e71d7581cf2ccfba15acc0a6bb7266147bed", - "zh:e65e8c2e97d1e23c9380516b1d37e111cc700b538d8fca6b9e28ebb89e305ee0", + "h1:2x1x5lhrMLOUoQRPQw7fZ/NsJ/aRVJAPr+Tg6F8wUV0=", + "zh:0527880f838690bc32bf3d4bba42b3adefdf81e6614a169b09def759f341e11e", + "zh:39b5bf4ddebb7289db800faa14acd92e3591bcc711082058a3ecfbf868c43fdf", + "zh:3b0fb69d504d01801fa54dc1b5e8fad59f56a6a1866a7a7475a450e95a690fbf", + "zh:6b354bc2d89ee2a0f55fb11a2360ce94d185e7957b21a6b1a5f2cb01aff35e0b", + "zh:8c8783c892f3b20b425885f78dcd7fbb68fb10c4b8825b7f807eb4de950d963c", + "zh:9291034807a9d4799ecd2cbac33bf3d78aa59c6b734147b9579cd7a3d9ea207c", + "zh:9396293aed1fabc476452a2c6d14775f8e03b0d27ad558a18875fee1dc7fa8f8", + "zh:9e95308ce490dcf8efb45cd945ecf46c7a8b74ad9c65e25800b65ffd2125e4e1", + "zh:9fa9bdd07efd4eaeae1fea44e7926b1abb3d065c938c6cd5fcb0f88b12e09b68", + "zh:b684074bc12e46e671aa627849d8f515045983b53fcc56b7d6ded28abcaf4f10", + "zh:e35d5e5d89469324b8baf68b1d9599ccc1cfacb43f2cfa73107d1de7ce7f3aa9", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", ] } provider "registry.terraform.io/hashicorp/google-beta" { - version = "6.36.1" + version = "6.37.0" constraints = ">= 6.13.0" hashes = [ - "h1:IfrOYuQK84Itc5lIEKDJjAEffX6bwadDpAt6qvEoH3E=", - "zh:36122fe31b8ccf41b71f1fb2e2527aba9925954aeb602d4ec071c38891035bb2", - "zh:3f60a033b87449d7b7596f1e07cf414641c53c5430aa5a754f85af8b83cc601d", - "zh:40e04d77aff40f88525db40f4a73cd5b4107b26817d43746c83974de9b3ac52d", - "zh:51e2e09754f3189549b0e9a7bb5ead644884570fada5bea790a3b4551950beb2", - "zh:67cc007ab4919a48637491c98fd2e2741e95098b7266219d17a42fdc8098af49", - "zh:717f28e514a58789963eba720c76fe48c9b4e8b99af714e841be3cc50f63f5e6", - "zh:7d636e59975fd4d8cec67b4cdfa736bc9d73c43d5cf2d71077d07a36eace66b3", - "zh:b43a96e0924c6ea22f94eb368cce6e07dacaff8f551789a4accfb54ee4dff782", - "zh:bfdec36fb3e821e55c5a936a9ad87a2af885f57809d89f67e43b3e82a886044c", - "zh:db3d0845f0e46fcfe8768f02791a4cf89ff81d41f16d3865c93a0efa4bae55f5", - "zh:f2258cd364aa2d1b80b0404a88fb5acb8e1dc005ef8018a03835e46ce6f25fae", + "h1:+AJWUA01ExrN2Ez7uJcq6EpGrIF+Sc5vDjm5M9naezY=", + "zh:087d744055310d4a276f80b5242090ea1d0aff9e64f4134a770e437d2e65b1d3", + "zh:0ede089f946d86810a91e78430bf7245db92adba6875641d344c8ff39f3ad7a5", + "zh:2c31b578bcbaf5591629919a48b980a031053b54b5923ce4a751caf2ef1f58b1", + "zh:598d03c4bc516f1d56a650579e8c165cd5be526bab57aa0f5cc47b607e31f055", + "zh:658f8346a8fff371d87c4a18040f131b7b69d26100e8da6c8998000e638820cb", + "zh:7d8664e3858842a169c0fe618839c386dcfcef82a0de225e096e6a34d96459a9", + "zh:9c31d88dce8776eb172cb31750a9d21c7e3e9284a8143fa3f7f889a15efce969", + "zh:9ec810a3affd04f3d723e786a86029cf5057bfcb7a94d193fb91e6b26e0db0c1", + "zh:9fb6ba5a282afd916470e10076a7f4d39817a7be0ba48abcef1c7ecfd24db6f8", + "zh:b6465dedc0e75557f70b020f80a8e1ab4ba0501a291880e178b84ce689da177c", + "zh:cb5b7e6f673db80e736e7a6710611d87d4727c1ef6250a6b3a70dd3401935f28", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", - "zh:ffad3c15795107705cac7faa2a5deb5d00f2aaf95163f0f2cddbf08677d55a8c", ] } diff --git a/infra/tf/functions/main.tf b/infra/tf/functions/main.tf index 66c4b7e5..ed24bbbf 100644 --- a/infra/tf/functions/main.tf +++ b/infra/tf/functions/main.tf @@ -9,7 +9,7 @@ terraform { } } -resource "google_project_iam_member" "project" { +resource "google_project_iam_member" "function_identity" { for_each = toset(["roles/bigquery.jobUser", "roles/dataform.serviceAgent", "roles/run.invoker", "roles/run.jobsExecutorWithOverrides", "roles/datastore.user", "roles/storage.objectUser"]) project = var.project @@ -31,8 +31,10 @@ resource "google_bigquery_connection" "remote-functions" { cloud_resource {} } -resource "google_project_iam_member" "bigquery-remote-functions-connector" { +resource "google_project_iam_member" "bigquery-connection-remote-functions" { + for_each = toset(["roles/run.invoker"]) + project = var.project - role = "roles/run.invoker" + role = each.value member = "serviceAccount:${google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id}" } diff --git a/infra/tf/functions/output.tf b/infra/tf/functions/output.tf new file mode 100644 index 00000000..b263ac5c --- /dev/null +++ b/infra/tf/functions/output.tf @@ -0,0 +1,9 @@ +output "google_bigquery_connection-remote_functions-id" { + description = "The connection ID for the remote functions BigQuery connection." + value = google_bigquery_connection.remote-functions.id +} + +output "remote_functions_connection_service_account_id" { + description = "The service account ID associated with the remote functions BigQuery connection." + value = google_bigquery_connection.remote-functions.cloud_resource[0].service_account_id +} diff --git a/infra/tf/main.tf b/infra/tf/main.tf index 054b17ef..3aabee67 100644 --- a/infra/tf/main.tf +++ b/infra/tf/main.tf @@ -28,16 +28,6 @@ locals { function_identity = "cloud-function@httparchive.iam.gserviceaccount.com" } -module "dataform_export" { - source = "./dataform_export" - - project_number = local.project_number - region = local.region - function_identity = local.function_identity - function_name = "dataform-export" - remote_functions_connection = google_bigquery_connection.remote-functions.id -} - module "dataform_trigger" { source = "./dataform_trigger" @@ -74,3 +64,14 @@ module "functions" { function_identity = local.function_identity edit_datasets = local.edit_datasets } + +module "dataform_export" { + source = "./dataform_export" + + project_number = local.project_number + region = local.region + function_identity = local.function_identity + function_name = "dataform-export" + remote_functions_connection = module.functions.google_bigquery_connection-remote_functions-id + depends_on = [module.functions] +} From 8c192174c09d32e95e283450c58d503c72d47b98 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 4 Jun 2025 13:36:38 +0200 Subject: [PATCH 08/11] perf settings refactored --- infra/bigquery-export/firestore.js | 191 ++++++++++++++++------------- infra/tf/.terraform.lock.hcl | 52 ++++---- infra/tf/bigquery_export/main.tf | 2 +- 3 files changed, 136 insertions(+), 109 deletions(-) diff --git a/infra/bigquery-export/firestore.js b/infra/bigquery-export/firestore.js index 41426960..5919a71c 100644 --- a/infra/bigquery-export/firestore.js +++ b/infra/bigquery-export/firestore.js @@ -4,11 +4,10 @@ import { BigQueryExport } from './bigquery.js' export class FirestoreBatch { constructor () { this.firestore = new Firestore({ - // Increase timeout to 10 minutes for large batch operations gaxOptions: { grpc: { - max_receive_message_length: 100 * 1024 * 1024, // 100MB - max_send_message_length: 100 * 1024 * 1024, // 100MB + max_receive_message_length: 500 * 1024 * 1024, // 500MB + max_send_message_length: 500 * 1024 * 1024, // 500MB 'grpc.max_connection_idle_ms': 5 * 60 * 1000, // 5 minutes 'grpc.keepalive_time_ms': 30 * 1000, // 30 seconds 'grpc.keepalive_timeout_ms': 60 * 1000, // 1 minute @@ -17,12 +16,54 @@ export class FirestoreBatch { } }) this.bigquery = new BigQueryExport() - this.batchSizeDelete = 500 - this.batchSizeWrite = 400 // Reduced batch size for better performance - this.maxConcurrentBatches = 100 // Reduced concurrent batches to avoid overwhelming + + // Configuration constants + this.config = { + batchSize: { + delete: 500, + write: 400 + }, + maxConcurrentBatches: 200, + retryCount: 5, + timeout: 10 * 60 * 1000 // 10 minutes + } + + this.reset() } - queueBatch (operation) { + reset () { + this.currentBatch = [] + this.batchPromises = [] + } + + getCurrentBatchSize (operation) { + return this.config.batchSize[operation === 'delete' ? 'delete' : 'write'] + } + + async commitWithRetry (batch, index) { + let lastError + + for (let attempt = 1; attempt <= this.config.retryCount; attempt++) { + try { + await batch.commit() + return + } catch (error) { + lastError = error + console.warn(`Batch ${index} attempt ${attempt} failed:`, error.message) + + if (attempt < this.config.retryCount) { + const delayMs = Math.pow(2, attempt) * 500 + console.log(`Retrying batch ${index} in ${delayMs}ms...`) + await new Promise(resolve => setTimeout(resolve, delayMs)) + } + } + } + + console.error(`Batch ${index} failed after ${this.config.retryCount} attempts:`, lastError) + throw lastError + } + + createBatch (operation) { const batch = this.firestore.batch() this.currentBatch.forEach((doc) => { @@ -32,140 +73,126 @@ export class FirestoreBatch { const docRef = this.firestore.collection(this.collectionName).doc() batch.set(docRef, doc) } else { - throw new Error('Invalid operation') + throw new Error(`Invalid operation: ${operation}`) } }) + + return batch + } + + queueBatch (operation) { + const batch = this.createBatch(operation) this.batchPromises.push(batch) this.currentBatch = [] } async commitBatches () { + if (this.batchPromises.length === 0) return + console.log(`Committing ${this.batchPromises.length} batches to ${this.collectionName}`) await Promise.all( - this.batchPromises.map(async (batchPromise, index) => { - const retryCount = 3 - let lastError - - for (let attempt = 1; attempt <= retryCount; attempt++) { - try { - await batchPromise.commit() - return - } catch (error) { - lastError = error - console.warn(`Batch ${index} attempt ${attempt} failed:`, error.message) - - if (attempt < retryCount) { - // Exponential backoff: 2^attempt seconds - const delayMs = Math.pow(2, attempt) * 1000 - console.log(`Retrying batch ${index} in ${delayMs}ms...`) - await new Promise(resolve => setTimeout(resolve, delayMs)) - } - } - } - - console.error(`Batch ${index} failed after ${retryCount} attempts:`, lastError) - throw lastError - }) + this.batchPromises.map((batch, index) => + this.commitWithRetry(batch, index) + ) ) this.batchPromises = [] } - async finalFlush (operation) { - if (this.currentBatch.length > 0) { + async processInBatches (operation, shouldFlush = false) { + const batchSize = this.getCurrentBatchSize(operation) + + if (this.currentBatch.length >= batchSize || shouldFlush) { this.queueBatch(operation) } - if (this.batchPromises.length > 0) { + if (this.batchPromises.length >= this.config.maxConcurrentBatches || shouldFlush) { await this.commitBatches() } } + buildQuery (collectionRef) { + const queryMap = { + report: () => { + console.info(`Deleting documents from ${this.collectionName} for date ${this.date}`) + return collectionRef.where('date', '==', this.date) + }, + dict: () => { + console.info(`Deleting documents from ${this.collectionName}`) + return collectionRef + } + } + + const queryBuilder = queryMap[this.collectionType] + if (!queryBuilder) { + throw new Error(`Invalid collection type: ${this.collectionType}`) + } + + return queryBuilder() + } + async batchDelete () { console.info('Starting batch deletion...') const startTime = Date.now() - this.currentBatch = [] - this.batchPromises = [] + this.reset() let totalDocsDeleted = 0 const collectionRef = this.firestore.collection(this.collectionName) - - let collectionQuery - if (this.collectionType === 'report') { - console.info('Deleting documents from ' + this.collectionName + ' for date ' + this.date) - // Query to fetch monthly documents - collectionQuery = collectionRef.where('date', '==', this.date) - } else if (this.collectionType === 'dict') { - console.info('Deleting documents from ' + this.collectionName) - collectionQuery = collectionRef - } else { - throw new Error('Invalid collection type') - } + const collectionQuery = this.buildQuery(collectionRef) + const batchSize = this.getCurrentBatchSize('delete') while (true) { - const snapshot = await collectionQuery.limit(this.batchSizeDelete * this.maxConcurrentBatches).get() - if (snapshot.empty) { - break - } + const snapshot = await collectionQuery.limit(batchSize * this.config.maxConcurrentBatches).get() + if (snapshot.empty) break - for await (const doc of snapshot.docs) { + for (const doc of snapshot.docs) { this.currentBatch.push(doc) - - if (this.currentBatch.length >= this.batchSize) { - this.queueBatch('delete') - } - if (this.batchPromises.length >= this.maxConcurrentBatches) { - await this.commitBatches() - } + await this.processInBatches('delete') totalDocsDeleted++ } } - await this.finalFlush('delete') + + // Final flush + await this.processInBatches('delete', true) const duration = (Date.now() - startTime) / 1000 console.info(`Deletion complete. Total docs deleted: ${totalDocsDeleted}. Time: ${duration} seconds`) } - /** - * Streams BigQuery query results into a Firestore collection using batch commits. - * @param {string} query - The BigQuery SQL query. - */ async streamFromBigQuery (rowStream) { console.info('Starting BigQuery to Firestore transfer...') const startTime = Date.now() let totalRowsProcessed = 0 - this.currentBatch = [] - this.batchPromises = [] + this.reset() for await (const row of rowStream) { this.currentBatch.push(row) - - // Write batch when it reaches specified size - if (this.currentBatch.length >= this.batchSize) { - this.queueBatch('set') - } - - if (this.batchPromises.length >= this.maxConcurrentBatches) { - await this.commitBatches() - } + await this.processInBatches('set') totalRowsProcessed++ } - await this.finalFlush('set') + + // Final flush + await this.processInBatches('set', true) const duration = (Date.now() - startTime) / 1000 console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${totalRowsProcessed}. Time: ${duration} seconds`) } async export (query, exportConfig) { + // Configure Firestore settings this.firestore.settings({ databaseId: exportConfig.database, - timeout: 10 * 60 * 1000 // 10 minutes timeout + timeout: this.config.timeout + }) + + // Set instance properties + Object.assign(this, { + collectionName: exportConfig.collection, + collectionType: exportConfig.type, + date: exportConfig.date }) - this.collectionName = exportConfig.collection - this.collectionType = exportConfig.type - this.date = exportConfig.date await this.batchDelete() diff --git a/infra/tf/.terraform.lock.hcl b/infra/tf/.terraform.lock.hcl index 10f078fb..07b73d29 100644 --- a/infra/tf/.terraform.lock.hcl +++ b/infra/tf/.terraform.lock.hcl @@ -22,41 +22,41 @@ provider "registry.terraform.io/hashicorp/archive" { } provider "registry.terraform.io/hashicorp/google" { - version = "6.37.0" + version = "6.38.0" constraints = ">= 6.13.0" hashes = [ - "h1:2x1x5lhrMLOUoQRPQw7fZ/NsJ/aRVJAPr+Tg6F8wUV0=", - "zh:0527880f838690bc32bf3d4bba42b3adefdf81e6614a169b09def759f341e11e", - "zh:39b5bf4ddebb7289db800faa14acd92e3591bcc711082058a3ecfbf868c43fdf", - "zh:3b0fb69d504d01801fa54dc1b5e8fad59f56a6a1866a7a7475a450e95a690fbf", - "zh:6b354bc2d89ee2a0f55fb11a2360ce94d185e7957b21a6b1a5f2cb01aff35e0b", - "zh:8c8783c892f3b20b425885f78dcd7fbb68fb10c4b8825b7f807eb4de950d963c", - "zh:9291034807a9d4799ecd2cbac33bf3d78aa59c6b734147b9579cd7a3d9ea207c", - "zh:9396293aed1fabc476452a2c6d14775f8e03b0d27ad558a18875fee1dc7fa8f8", - "zh:9e95308ce490dcf8efb45cd945ecf46c7a8b74ad9c65e25800b65ffd2125e4e1", - "zh:9fa9bdd07efd4eaeae1fea44e7926b1abb3d065c938c6cd5fcb0f88b12e09b68", - "zh:b684074bc12e46e671aa627849d8f515045983b53fcc56b7d6ded28abcaf4f10", - "zh:e35d5e5d89469324b8baf68b1d9599ccc1cfacb43f2cfa73107d1de7ce7f3aa9", + "h1:KVk7+B+uKgzqM410dsyo69WAXoNL/PHXJUGvwnKj0d8=", + "zh:007d14340f39d90240dd621904f7ff83fb1fc436607e74b85b343deaa060bafb", + "zh:251e457647a7658aac163de81644ff8b5f831938136fe210de48632a8e7e2289", + "zh:5e94c76c288182ac19e150ccd7349457a69901285e67b184ecfd7e6b2c1e4b03", + "zh:5ee76059d6e2989f1798a49feff379d913ba23220fef29c782c30ebbe2839f80", + "zh:78045823eacec861c9d5a3a3cbd87a4b020a6e90c6cfbc7ad7f7f5acb5d339f6", + "zh:863a8e0fa74a4823a2600d00d71577ce1ace5958d287bdff9035e752aab14eb9", + "zh:8fafbcc831650b644c3ac342cd15d5c77a4e1f9a35bfb86cf954a37fe0e63094", + "zh:b1cb7a2fe71d018033151f5ac97120dff204a0ee9e695db40dd1a22657312584", + "zh:cbfa712cc1484ca5c086ba5fdb34df393c9214ddc05f83ca2d9071a41bd215e6", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:fc2aae390011f7256ac88110151b67b6b45b6e856c7729ac4f2aebede5b5231e", + "zh:fe786d8cc9f63065a544a50b15ea09d8e0c9f71d8ebdc60722736fcdcf6b04d7", ] } provider "registry.terraform.io/hashicorp/google-beta" { - version = "6.37.0" + version = "6.38.0" constraints = ">= 6.13.0" hashes = [ - "h1:+AJWUA01ExrN2Ez7uJcq6EpGrIF+Sc5vDjm5M9naezY=", - "zh:087d744055310d4a276f80b5242090ea1d0aff9e64f4134a770e437d2e65b1d3", - "zh:0ede089f946d86810a91e78430bf7245db92adba6875641d344c8ff39f3ad7a5", - "zh:2c31b578bcbaf5591629919a48b980a031053b54b5923ce4a751caf2ef1f58b1", - "zh:598d03c4bc516f1d56a650579e8c165cd5be526bab57aa0f5cc47b607e31f055", - "zh:658f8346a8fff371d87c4a18040f131b7b69d26100e8da6c8998000e638820cb", - "zh:7d8664e3858842a169c0fe618839c386dcfcef82a0de225e096e6a34d96459a9", - "zh:9c31d88dce8776eb172cb31750a9d21c7e3e9284a8143fa3f7f889a15efce969", - "zh:9ec810a3affd04f3d723e786a86029cf5057bfcb7a94d193fb91e6b26e0db0c1", - "zh:9fb6ba5a282afd916470e10076a7f4d39817a7be0ba48abcef1c7ecfd24db6f8", - "zh:b6465dedc0e75557f70b020f80a8e1ab4ba0501a291880e178b84ce689da177c", - "zh:cb5b7e6f673db80e736e7a6710611d87d4727c1ef6250a6b3a70dd3401935f28", + "h1:RjKIj+Y12o34lpfoRcaTvQsNPlct2MJZFky3Vt0wHvo=", + "zh:13b9da4d63e96d4e9487e84fcf78b2e8261a66be821c939bdd5587d5d21bef24", + "zh:2f4c09e4c7e99f6b96e4abd29263cc25f62f2d87fb2efaf7ed97076cd8a2c2d3", + "zh:3bbd8fd0f91553504f229d8ee20908ff2baf4c53b9df4cb5bf42b1b058b946cd", + "zh:568d560696dec1dde68fbc30b1063c95091a609333f50742cafde5647d83f1bb", + "zh:7d407ac54a03e86f0dcb81c24eb859ed18da6d45838a91dfde3b53c1cef3ebc1", + "zh:a4a0f5aaeb9d82e4ef137ce1d2bf3e0d582f116c18dfe2dd8fef3c68630e3290", + "zh:affcbb5baea2e55c4043d5d626bbbefbdc6121d1214f0eec47a7bbebf0db8f95", + "zh:be444c9b54305e7816754f790594e0dc09de22c581677b526159f4f20fee12e2", + "zh:d988e3b0152d64fcee379255fbe5c9649733c0b625f41f5f30d72bcab042e4ca", + "zh:dc2a2d2b91930cd046e70964590d58d4e75da8e065a37f23605c9f0b1b0f6975", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:febf46d09277eeb9dc1323d003d612ce623b4608fac517ac64c8ce56f2352e5c", ] } diff --git a/infra/tf/bigquery_export/main.tf b/infra/tf/bigquery_export/main.tf index 7a492740..76ab75a9 100644 --- a/infra/tf/bigquery_export/main.tf +++ b/infra/tf/bigquery_export/main.tf @@ -35,7 +35,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" { value = "" } } - timeout = "3600s" + timeout = "7200s" service_account = var.function_identity max_retries = 1 } From 386f75880d766fb84ff81c32484cbc70acb65b2d Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:24:17 +0200 Subject: [PATCH 09/11] bulkWriter --- Makefile | 2 +- infra/bigquery-export/.dockerignore | 9 ++ infra/bigquery-export/Dockerfile | 10 +- infra/bigquery-export/cloudbuild.yaml | 9 ++ infra/bigquery-export/firestore.js | 183 ++++++++++++-------------- infra/bigquery-export/package.json | 2 +- 6 files changed, 112 insertions(+), 103 deletions(-) create mode 100644 infra/bigquery-export/.dockerignore create mode 100644 infra/bigquery-export/cloudbuild.yaml diff --git a/Makefile b/Makefile index 794f7389..515af2cb 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ tf_apply: terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve bigquery_export_deploy: - cd infra/bigquery-export && npm install && npm run buildpack + cd infra/bigquery-export && npm run build #bigquery_export_spark_deploy: # cd infra/bigquery_export_spark && gcloud builds submit --region=global --tag us-docker.pkg.dev/httparchive/bigquery-spark-procedures/firestore_export:latest diff --git a/infra/bigquery-export/.dockerignore b/infra/bigquery-export/.dockerignore new file mode 100644 index 00000000..5805cc92 --- /dev/null +++ b/infra/bigquery-export/.dockerignore @@ -0,0 +1,9 @@ +node_modules +npm-debug.log +.git +.gitignore +.env +.nyc_output +coverage +*.md +.DS_Store diff --git a/infra/bigquery-export/Dockerfile b/infra/bigquery-export/Dockerfile index a50c671b..45773da4 100644 --- a/infra/bigquery-export/Dockerfile +++ b/infra/bigquery-export/Dockerfile @@ -3,11 +3,13 @@ FROM node:current-slim WORKDIR /usr/src/app -COPY . . +# Copy package files first for better caching +COPY package*.json ./ -# Clean up the node_modules directory -RUN rm -rf node_modules +# Install dependencies +RUN npm ci --only=production --quiet --no-fund --no-audit -RUN npm ci --only=production +# Copy source code +COPY . . CMD ["node", "index.js"] diff --git a/infra/bigquery-export/cloudbuild.yaml b/infra/bigquery-export/cloudbuild.yaml new file mode 100644 index 00000000..6c93d860 --- /dev/null +++ b/infra/bigquery-export/cloudbuild.yaml @@ -0,0 +1,9 @@ +steps: + - name: 'gcr.io/cloud-builders/docker' + args: [ + 'build', + '-t', 'us.gcr.io/httparchive/cloud-run/bigquery-export', + '.' + ] +images: + - 'us.gcr.io/httparchive/cloud-run/bigquery-export' diff --git a/infra/bigquery-export/firestore.js b/infra/bigquery-export/firestore.js index 5919a71c..38225983 100644 --- a/infra/bigquery-export/firestore.js +++ b/infra/bigquery-export/firestore.js @@ -19,97 +19,45 @@ export class FirestoreBatch { // Configuration constants this.config = { - batchSize: { - delete: 500, - write: 400 - }, - maxConcurrentBatches: 200, - retryCount: 5, - timeout: 10 * 60 * 1000 // 10 minutes + timeout: 10 * 60 * 1000, // 10 minutes + progressReportInterval: 200000, // Report progress every N operations + flushThreshold: 200000 // Flush BulkWriter every N operations } this.reset() } reset () { - this.currentBatch = [] - this.batchPromises = [] + this.processedDocs = 0 + this.totalDocs = 0 + this.bulkWriter = null } - getCurrentBatchSize (operation) { - return this.config.batchSize[operation === 'delete' ? 'delete' : 'write'] - } + createBulkWriter (operation) { + const bulkWriter = this.firestore.bulkWriter() - async commitWithRetry (batch, index) { - let lastError - - for (let attempt = 1; attempt <= this.config.retryCount; attempt++) { - try { - await batch.commit() - return - } catch (error) { - lastError = error - console.warn(`Batch ${index} attempt ${attempt} failed:`, error.message) - - if (attempt < this.config.retryCount) { - const delayMs = Math.pow(2, attempt) * 500 - console.log(`Retrying batch ${index} in ${delayMs}ms...`) - await new Promise(resolve => setTimeout(resolve, delayMs)) - } - } - } + // Configure error handling with progress info + bulkWriter.onWriteError((error) => { + const progressInfo = this.totalDocs > 0 ? ` (${this.processedDocs}/${this.totalDocs})` : '' + console.warn(`${operation} operation failed${progressInfo}:`, error.message) - console.error(`Batch ${index} failed after ${this.config.retryCount} attempts:`, lastError) - throw lastError - } - - createBatch (operation) { - const batch = this.firestore.batch() - - this.currentBatch.forEach((doc) => { - if (operation === 'delete') { - batch.delete(doc.ref) - } else if (operation === 'set') { - const docRef = this.firestore.collection(this.collectionName).doc() - batch.set(docRef, doc) - } else { - throw new Error(`Invalid operation: ${operation}`) - } + // Retry on transient errors, fail on permanent ones + const retryableErrors = ['deadline-exceeded', 'unavailable', 'resource-exhausted'] + return retryableErrors.includes(error.code) }) - return batch - } - - queueBatch (operation) { - const batch = this.createBatch(operation) - this.batchPromises.push(batch) - this.currentBatch = [] - } - - async commitBatches () { - if (this.batchPromises.length === 0) return - - console.log(`Committing ${this.batchPromises.length} batches to ${this.collectionName}`) - - await Promise.all( - this.batchPromises.map((batch, index) => - this.commitWithRetry(batch, index) - ) - ) + // Track progress on successful writes + bulkWriter.onWriteResult(() => { + this.processedDocs++ - this.batchPromises = [] - } - - async processInBatches (operation, shouldFlush = false) { - const batchSize = this.getCurrentBatchSize(operation) - - if (this.currentBatch.length >= batchSize || shouldFlush) { - this.queueBatch(operation) - } + // Report progress periodically + if (this.processedDocs % this.config.progressReportInterval === 0) { + const progressInfo = this.totalDocs > 0 ? ` (${this.processedDocs}/${this.totalDocs})` : ` (${this.processedDocs} processed)` + console.log(`Progress${progressInfo} - ${operation}ing documents in ${this.collectionName}`) + } + }) - if (this.batchPromises.length >= this.config.maxConcurrentBatches || shouldFlush) { - await this.commitBatches() - } + return bulkWriter } buildQuery (collectionRef) { @@ -132,52 +80,93 @@ export class FirestoreBatch { return queryBuilder() } + async getDocumentCount (query) { + try { + const countSnapshot = await query.count().get() + return countSnapshot.data().count + } catch (error) { + console.warn('Could not get document count for progress tracking:', error.message) + return 0 + } + } + async batchDelete () { console.info('Starting batch deletion...') const startTime = Date.now() this.reset() - let totalDocsDeleted = 0 const collectionRef = this.firestore.collection(this.collectionName) const collectionQuery = this.buildQuery(collectionRef) - const batchSize = this.getCurrentBatchSize('delete') - while (true) { - const snapshot = await collectionQuery.limit(batchSize * this.config.maxConcurrentBatches).get() + // Get total count for progress tracking + this.totalDocs = await this.getDocumentCount(collectionQuery) + if (this.totalDocs > 0) { + console.info(`Total documents to delete: ${this.totalDocs}`) + } + + // Create BulkWriter for delete operations + this.bulkWriter = this.createBulkWriter('delet') + + let deletedCount = 0 + const batchSize = this.config.flushThreshold // Process documents in chunks + + while (deletedCount < this.totalDocs || this.totalDocs === 0) { + const snapshot = await collectionQuery.limit(batchSize).get() if (snapshot.empty) break - for (const doc of snapshot.docs) { - this.currentBatch.push(doc) - await this.processInBatches('delete') - totalDocsDeleted++ - } + // Add all delete operations to BulkWriter + snapshot.docs.forEach(doc => { + this.bulkWriter.delete(doc.ref) + deletedCount++ + }) + + // Periodically flush to manage memory + // if (deletedCount % this.config.flushThreshold === 0) { + console.log(`Flushing BulkWriter at ${deletedCount} operations...`) + await this.bulkWriter.flush() + // } } - // Final flush - await this.processInBatches('delete', true) + // Final flush and close + console.log('Finalizing deletion operations...') + await this.bulkWriter.close() const duration = (Date.now() - startTime) / 1000 - console.info(`Deletion complete. Total docs deleted: ${totalDocsDeleted}. Time: ${duration} seconds`) + console.info(`Deletion complete. Total docs deleted: ${this.processedDocs}. Time: ${duration} seconds`) } async streamFromBigQuery (rowStream) { console.info('Starting BigQuery to Firestore transfer...') const startTime = Date.now() - let totalRowsProcessed = 0 - this.reset() + // Create BulkWriter for write operations + this.bulkWriter = this.createBulkWriter('writ') + + let rowCount = 0 + const collectionRef = this.firestore.collection(this.collectionName) + for await (const row of rowStream) { - this.currentBatch.push(row) - await this.processInBatches('set') - totalRowsProcessed++ + // Add document to BulkWriter + const docRef = collectionRef.doc() + this.bulkWriter.set(docRef, row) + + rowCount++ + this.totalDocs = rowCount // Update total as we go since we can't predict BigQuery result size + + // Periodically flush to manage memory + if (rowCount % this.config.flushThreshold === 0) { + console.log(`Flushing BulkWriter at ${rowCount} operations...`) + await this.bulkWriter.flush() + } } - // Final flush - await this.processInBatches('set', true) + // Final flush and close + console.log('Finalizing write operations...') + await this.bulkWriter.close() const duration = (Date.now() - startTime) / 1000 - console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${totalRowsProcessed}. Time: ${duration} seconds`) + console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${this.processedDocs}. Time: ${duration} seconds`) } async export (query, exportConfig) { diff --git a/infra/bigquery-export/package.json b/infra/bigquery-export/package.json index 02bf0e92..d7533172 100644 --- a/infra/bigquery-export/package.json +++ b/infra/bigquery-export/package.json @@ -4,7 +4,7 @@ "main": "index.js", "scripts": { "start": "node index.js", - "buildpack": "rm -rf node_modules; gcloud builds submit --pack image=us.gcr.io/httparchive/cloud-run/bigquery-export" + "build": "gcloud builds submit" }, "type": "module", "dependencies": { From 9ce595f68b6895ff5293ee95e52e33f3c945d71e Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:30:26 +0200 Subject: [PATCH 10/11] lint --- .github/dependabot.yml | 1 + .github/workflows/linter.yaml | 1 + infra/bigquery-export/cloudbuild.yaml | 13 ++++++------- workflow_settings.yaml | 2 ++ 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c5dacfd1..24f5d81d 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,3 +1,4 @@ +--- # To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: diff --git a/.github/workflows/linter.yaml b/.github/workflows/linter.yaml index 5d8c02b0..ed5eb351 100644 --- a/.github/workflows/linter.yaml +++ b/.github/workflows/linter.yaml @@ -1,3 +1,4 @@ +--- name: Linter on: diff --git a/infra/bigquery-export/cloudbuild.yaml b/infra/bigquery-export/cloudbuild.yaml index 6c93d860..cd902c48 100644 --- a/infra/bigquery-export/cloudbuild.yaml +++ b/infra/bigquery-export/cloudbuild.yaml @@ -1,9 +1,8 @@ +--- + steps: - - name: 'gcr.io/cloud-builders/docker' - args: [ - 'build', - '-t', 'us.gcr.io/httparchive/cloud-run/bigquery-export', - '.' - ] + - name: "gcr.io/cloud-builders/docker" + args: + ["build", "-t", "us.gcr.io/httparchive/cloud-run/bigquery-export", "."] images: - - 'us.gcr.io/httparchive/cloud-run/bigquery-export' + - "us.gcr.io/httparchive/cloud-run/bigquery-export" diff --git a/workflow_settings.yaml b/workflow_settings.yaml index 6d3e2f6b..be02c093 100644 --- a/workflow_settings.yaml +++ b/workflow_settings.yaml @@ -1,3 +1,5 @@ +--- + defaultProject: httparchive defaultLocation: US defaultAssertionDataset: dataform_assertions From 2ed18e430b87b08aaf13c1e11215442bf4463889 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:35:32 +0200 Subject: [PATCH 11/11] lint --- infra/bigquery-export/cloudbuild.yaml | 1 - workflow_settings.yaml | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/infra/bigquery-export/cloudbuild.yaml b/infra/bigquery-export/cloudbuild.yaml index cd902c48..1da2eba0 100644 --- a/infra/bigquery-export/cloudbuild.yaml +++ b/infra/bigquery-export/cloudbuild.yaml @@ -1,5 +1,4 @@ --- - steps: - name: "gcr.io/cloud-builders/docker" args: diff --git a/workflow_settings.yaml b/workflow_settings.yaml index be02c093..28e759c7 100644 --- a/workflow_settings.yaml +++ b/workflow_settings.yaml @@ -1,8 +1,6 @@ ---- - defaultProject: httparchive defaultLocation: US defaultAssertionDataset: dataform_assertions vars: - environment: prod # MUST be equal 'prod' in main branch, enables processing sampled data + environment: prod # MUST be 'prod' in main branch, enables sampling