From 17a1bdbcbd078625d6ed11380027a5c50b8fb293 Mon Sep 17 00:00:00 2001
From: rootflo-hardik
Date: Fri, 3 Apr 2026 18:24:04 +0530
Subject: [PATCH 1/4] added rag_ingestion action, script and dockerfile

---
Review notes (text between "---" and the first "diff --git" is ignored by
git am and is not part of the commit):

- Workflow: step outputs are written to "$GITHUB_OUTPUT" instead of the
  deprecated `::set-output` workflow command, "$GITHUB_ENV" is quoted, and
  the third-party actions are moved to their current major versions. The
  inputs used by each action are unchanged.
- Workflow: the "Cache Docker layers" step is kept as authored, but nothing
  in this workflow reads or writes /tmp/.buildx-cache (the plain
  `docker build` below passes no cache flags). Either wire it up or drop it.
- Dockerfile: base image moved from Debian 10 "buster" (end-of-life) to
  "bookworm". No apt packages are installed, so nothing else changes.
- Dockerfile: NLTK 3.9.x loads `punkt_tab` and
  `averaged_perceptron_tagger_eng`; the old `punkt` and
  `averaged_perceptron_tagger` downloads are kept alongside them. Confirm
  against the code that actually calls NLTK.
- Startup script: `exec` replaces the shell with the Python process so that
  container stop signals reach the application directly.
- Both new non-YAML files now end with a trailing newline.
- The blob ids on the `index` lines and the commit id on the `From` line
  are those of the original commit; regenerate the patch with
  `git format-patch` after applying if exact ids matter.

 .../build-rag-ingestion-develop.yaml          | 107 ++++++++++++++++++
 .../server/docker/rag_ingestion.Dockerfile    |  32 ++++++
 .../rag_ingestion/startup-rag-ingestion.sh    |   6 +
 3 files changed, 145 insertions(+)
 create mode 100644 .github/workflows/build-rag-ingestion-develop.yaml
 create mode 100644 wavefront/server/docker/rag_ingestion.Dockerfile
 create mode 100644 wavefront/server/scripts/rag_ingestion/startup-rag-ingestion.sh

diff --git a/.github/workflows/build-rag-ingestion-develop.yaml b/.github/workflows/build-rag-ingestion-develop.yaml
new file mode 100644
index 00000000..3946caf5
--- /dev/null
+++ b/.github/workflows/build-rag-ingestion-develop.yaml
@@ -0,0 +1,107 @@
+name: (Develop) Build and Push RAG Ingestion to AWS, GCP and Azure
+
+on:
+  workflow_dispatch:
+
+env:
+  PROJECT_ID: aesy-330511
+  GCP_REGION: asia-south1
+  GAR_LOCATION: asia-south1-docker.pkg.dev/aesy-330511/root-hub
+  IMAGE_NAME: auraflo-rag-ingestion
+
+  AWS_REGION: ap-south-1
+  ECR_REGISTRY: 025066241490.dkr.ecr.ap-south-1.amazonaws.com
+  ECR_REPOSITORY: rootflo/auraflo-rag-ingestion
+
+  ACR_REGISTRY_NAME: rootflo
+  ACR_REGISTRY: rootflo.azurecr.io
+  ACR_REPOSITORY: auraflo-rag-ingestion
+
+jobs:
+  build-push-artifact:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: "Checkout"
+        uses: "actions/checkout@v4"
+
+      - name: Get commit hash
+        id: get-commit-hash
+        run: echo "commit-hash=$(git rev-parse --short HEAD)" >> "$GITHUB_OUTPUT"
+
+      - name: Get timestamp
+        id: get-timestamp
+        run: echo "timestamp=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_OUTPUT"
+
+      - name: Cache Docker layers
+        id: cache-docker-layers
+        uses: actions/cache@v4
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-docker-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-docker-
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker Image
+        id: build-image
+        run: |
+          docker build -f wavefront/server/docker/rag_ingestion.Dockerfile -t rootflo:${{ steps.get-commit-hash.outputs.commit-hash }}-${{ steps.get-timestamp.outputs.timestamp }} .
+          echo "IMAGE_TAG=${{ steps.get-commit-hash.outputs.commit-hash }}-${{ steps.get-timestamp.outputs.timestamp }}" >> "$GITHUB_ENV"
+
+      - id: "Auth-to-GCP"
+        uses: "google-github-actions/auth@v2"
+        with:
+          credentials_json: "${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}"
+
+      - name: "Set up Cloud SDK"
+        uses: "google-github-actions/setup-gcloud@v2"
+
+      - name: "Docker auth for GCP"
+        run: |-
+          gcloud auth configure-docker ${{ env.GCP_REGION }}-docker.pkg.dev --quiet
+
+      - name: Tag and push image to GCP Artifact Registry
+        run: |
+          docker tag rootflo:${{ env.IMAGE_TAG }} ${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}
+          docker push ${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}
+
+      # Configure AWS credentials and push to ECR
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Tag and push image to Amazon ECR
+        run: |
+          docker tag rootflo:${{ env.IMAGE_TAG }} ${{ env.ECR_REGISTRY }}/${{ env.ECR_REPOSITORY }}:${{ env.IMAGE_TAG }}
+          docker push ${{ env.ECR_REGISTRY }}/${{ env.ECR_REPOSITORY }}:${{ env.IMAGE_TAG }}
+
+      # Configure Azure credentials and push to ACR
+      - name: Login to Azure
+        uses: azure/login@v2
+        with:
+          creds: ${{ secrets.AZURE_CREDENTIALS }}
+
+      - name: Docker auth for Azure ACR
+        run: az acr login --name ${{ env.ACR_REGISTRY_NAME }}
+
+      - name: Tag and push image to Azure Container Registry
+        run: |
+          docker tag rootflo:${{ env.IMAGE_TAG }} ${{ env.ACR_REGISTRY }}/${{ env.ACR_REPOSITORY }}:${{ env.IMAGE_TAG }}
+          docker push ${{ env.ACR_REGISTRY }}/${{ env.ACR_REPOSITORY }}:${{ env.IMAGE_TAG }}
+
+      - name: Cleanup Docker images
+        run: |
+          docker rmi rootflo:${{ env.IMAGE_TAG }} || true
+          docker rmi ${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} || true
+          docker rmi ${{ env.ECR_REGISTRY }}/${{ env.ECR_REPOSITORY }}:${{ env.IMAGE_TAG }} || true
+          docker rmi ${{ env.ACR_REGISTRY }}/${{ env.ACR_REPOSITORY }}:${{ env.IMAGE_TAG }} || true
diff --git a/wavefront/server/docker/rag_ingestion.Dockerfile b/wavefront/server/docker/rag_ingestion.Dockerfile
new file mode 100644
index 00000000..3242be2d
--- /dev/null
+++ b/wavefront/server/docker/rag_ingestion.Dockerfile
@@ -0,0 +1,32 @@
+FROM python:3.11-slim-bookworm
+
+# Copy UV from official image
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Set working directory
+WORKDIR /app
+
+# Copy project files
+COPY wavefront/server/pyproject.toml wavefront/server/uv.lock ./
+COPY wavefront/server/background_jobs/rag_ingestion ./background_jobs/rag_ingestion/
+COPY wavefront/server/packages/flo_cloud ./packages/flo_cloud/
+COPY wavefront/server/packages/flo_utils ./packages/flo_utils/
+COPY wavefront/server/modules/db_repo_module ./modules/db_repo_module/
+COPY wavefront/server/modules/common_module ./modules/common_module/
+COPY wavefront/server/scripts/rag_ingestion/startup-rag-ingestion.sh ./background_jobs/rag_ingestion/
+
+# Install dependencies
+RUN uv sync --package rag-ingestion --frozen --no-dev
+
+# Download the tiktoken encoding file and NLTK data
+RUN mkdir -p /root/.cache/tiktoken
+RUN uv run python3 -c "import tiktoken; enc = tiktoken.encoding_for_model('gpt-4')"
+RUN uv run python3 -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger'); nltk.download('averaged_perceptron_tagger_eng')"
+
+WORKDIR /app/background_jobs/rag_ingestion
+
+# Make startup script executable
+RUN chmod +x startup-rag-ingestion.sh
+
+# Default command: run the startup script
+CMD ["./startup-rag-ingestion.sh"]
diff --git a/wavefront/server/scripts/rag_ingestion/startup-rag-ingestion.sh b/wavefront/server/scripts/rag_ingestion/startup-rag-ingestion.sh
new file mode 100644
index 00000000..e0d9ad13
--- /dev/null
+++ b/wavefront/server/scripts/rag_ingestion/startup-rag-ingestion.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+source /app/.venv/bin/activate
+
+# Run the main application for RAG Ingestion (exec so signals reach Python)
+exec python rag_ingestion/main.py

From 82e2b309059fd5dc0af534e04d4a36ce0409744b Mon Sep 17 00:00:00 2001
From: rootflo-hardik
Date: Sat, 4 Apr 2026 11:28:03 +0530
Subject: [PATCH 2/4] added missing nltk package in rag_ingestion

---
 wavefront/server/background_jobs/rag_ingestion/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wavefront/server/background_jobs/rag_ingestion/pyproject.toml b/wavefront/server/background_jobs/rag_ingestion/pyproject.toml
index 43048ada..c8239724 100644
--- a/wavefront/server/background_jobs/rag_ingestion/pyproject.toml
+++ b/wavefront/server/background_jobs/rag_ingestion/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "flo-utils",
     "db-repo-module",
     "tiktoken>=0.9.0",
+    "nltk>=3.9.0",
     "textract>=1.6.5",
     "transformers>=4.45.0",
     "asyncpg>=0.30.0",

From d05f4b0dc80486a81c4f682ff32a0313fb047b56 Mon Sep 17 00:00:00 2001
From: rootflo-hardik
Date: Sat, 4 Apr 2026 12:29:17 +0530
Subject: [PATCH 3/4] pushed uv.lock as well

---
 wavefront/server/uv.lock | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/wavefront/server/uv.lock b/wavefront/server/uv.lock
index f7bf7aa7..b6ff7f3f 100644
--- a/wavefront/server/uv.lock
+++ b/wavefront/server/uv.lock
@@ -4791,6 +4791,7 @@ dependencies = [
     { name = "flo-cloud" },
     { name = "flo-utils" },
     { name = "httpx" },
+    { name = "nltk" },
     { name = "pyjwt", extra = ["crypto"] },
     { name = "python-dotenv" },
     { name = "textract" },
@@ -4807,6 +4808,7 @@ requires-dist = [
     { name = "flo-cloud", editable = "packages/flo_cloud" },
     { name = "flo-utils", editable = "packages/flo_utils" },
     { name = "httpx", specifier = ">=0.28.1" },
+    { name = "nltk", specifier = ">=3.9.0" },
     { name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
     { name = "python-dotenv", specifier = ">=1.1.0,<2.0.0" },
     { name = "textract", specifier = ">=1.6.5" },

From e8d0b8a2199ca09174fd23d55febb652f836e5f3 Mon Sep 17 00:00:00 2001
From: rootflo-hardik
Date: Sat, 4 Apr 2026 12:41:18 +0530
Subject: [PATCH 4/4] upgraded nltk to latest version

---
 .../server/background_jobs/rag_ingestion/pyproject.toml | 2 +-
 wavefront/server/uv.lock                                | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/wavefront/server/background_jobs/rag_ingestion/pyproject.toml b/wavefront/server/background_jobs/rag_ingestion/pyproject.toml
index c8239724..e41993bd 100644
--- a/wavefront/server/background_jobs/rag_ingestion/pyproject.toml
+++ b/wavefront/server/background_jobs/rag_ingestion/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "flo-utils",
     "db-repo-module",
     "tiktoken>=0.9.0",
-    "nltk>=3.9.0",
+    "nltk>=3.9.3",
     "textract>=1.6.5",
     "transformers>=4.45.0",
     "asyncpg>=0.30.0",
diff --git a/wavefront/server/uv.lock b/wavefront/server/uv.lock
index b6ff7f3f..e90c433e 100644
--- a/wavefront/server/uv.lock
+++ b/wavefront/server/uv.lock
@@ -3291,7 +3291,7 @@ wheels = [
 
 [[package]]
 name = "nltk"
-version = "3.9.2"
+version = "3.9.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
@@ -3299,9 +3299,9 @@ dependencies = [
     { name = "regex" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/76/3a5e4312c19a028770f86fd7c058cf9f4ec4321c6cf7526bab998a5b683c/nltk-3.9.2.tar.gz", hash = "sha256:0f409e9b069ca4177c1903c3e843eef90c7e92992fa4931ae607da6de49e1419", size = 2887629, upload-time = "2025-10-01T07:19:23.764Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/74/a1/b3b4adf15585a5bc4c357adde150c01ebeeb642173ded4d871e89468767c/nltk-3.9.4.tar.gz", hash = "sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0", size = 2946864, upload-time = "2026-03-24T06:13:40.641Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl", hash = "sha256:1e209d2b3009110635ed9709a67a1a3e33a10f799490fa71cf4bec218c11c88a", size = 1513404, upload-time = "2025-10-01T07:19:21.648Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/91/04e965f8e717ba0ab4bdca5c112deeab11c9e750d94c4d4602f050295d39/nltk-3.9.4-py3-none-any.whl", hash = "sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f", size = 1552087, upload-time = "2026-03-24T06:13:38.47Z" },
 ]
 
 [[package]]
@@ -4808,7 +4808,7 @@ requires-dist = [
     { name = "flo-cloud", editable = "packages/flo_cloud" },
     { name = "flo-utils", editable = "packages/flo_utils" },
     { name = "httpx", specifier = ">=0.28.1" },
-    { name = "nltk", specifier = ">=3.9.0" },
+    { name = "nltk", specifier = ">=3.9.3" },
     { name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" },
     { name = "python-dotenv", specifier = ">=1.1.0,<2.0.0" },
     { name = "textract", specifier = ">=1.6.5" },