NVIDIA · ko3n1g · Jan 28, 2025 · Jan 28, 2025 · Jan 28, 2025 · Jan 28, 2025
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -44,12 +44,14 @@ jobs:
       test_to_run: ${{ steps.test_to_run.outputs.main }}
       all: ${{ steps.all.outputs.main }}
       run_ci: ${{ steps.evaluate.outputs.run_ci }}
+      build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
     steps:
       - name: Parse test_to_run
         id: test_to_run
         run: |
           parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")')
           echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
+
       - name: Parse all
         id: all
         run: |
@@ -89,6 +91,36 @@ jobs:
           # Run CI only (on main or if label is attached) and if it's not only docs
           echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT"
 
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+
+      - name: Parse manifest.json
+        id: manifest
+        run: |
+          cd ${{ github.run_id }}
+
+          # One jq pass; a missing key aborts the step instead of emitting "null" as a build arg.
+          BUILD_ARGS=$(jq -r '
+            def req(p): getpath(p) // error("setup/manifest.json: missing \(p | join("."))");
+            "BASE_IMAGE=\(req(["ngc-pytorch"]))",
+            "NEMO_REPO=\(req(["vcs-dependencies", "nemo", "repo"]))",
+            "NEMO_TAG=\(req(["vcs-dependencies", "nemo", "ref"]))",
+            "MLM_REPO=\(req(["vcs-dependencies", "megatron", "repo"]))",
+            "MLM_TAG=\(req(["vcs-dependencies", "megatron", "ref"]))",
+            "TE_REPO=\(req(["vcs-dependencies", "te", "repo"]))",
+            "TE_TAG=\(req(["vcs-dependencies", "te", "ref"]))",
+            "TRTLLM_REPO=\(req(["vcs-dependencies", "trtllm", "repo"]))",
+            "TRTLLM_TAG=\(req(["vcs-dependencies", "trtllm", "ref"]))",
+            "PROTOBUF_VERSION=\(req(["pypi-dependencies", "protobuf"]))",
+            "PYTRITON_VERSION=\(req(["pypi-dependencies", "pytriton"]))",
+            "PYNVML_VERSION=\(req(["pypi-dependencies", "pynvml"]))"
+          ' setup/manifest.json)
+
+          # Multi-line step outputs must use the NAME<<DELIMITER ... DELIMITER form.
+          { echo "BUILD_ARGS<<EOF"; echo "$BUILD_ARGS"; echo "EOF"; } >> "$GITHUB_OUTPUT"
+
   build-container:
     if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
     needs: [pre-flight]
@@ -100,7 +132,8 @@ jobs:
       build-args: |
         MAX_JOBS=32
         ALIGNER_COMMIT=${{ github.sha }}
-
+        ${{ needs.pre-flight.outputs.build_args }}
+
   Unit_Tests:
     name: ${{ matrix.test_case }}
     needs: [build-container, pre-flight]

diff --git a/Dockerfile b/Dockerfile
@@ -11,14 +11,21 @@
 # if you get errors building TE or Apex, decrease this to 4
 ARG MAX_JOBS=8
 # Git refs for dependencies
-ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG PYTRITON_VERSION=0.5.10
+
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+ARG NEMO_REPO=https://github.com/NVIDIA/NeMo
 ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634  # On: main
+ARG MLM_REPO=https://github.com/NVIDIA/Megatron-LM
 ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3   # On: main
+ARG ALIGNER_REPO=https://github.com/NVIDIA/NeMo-Aligner
 ARG ALIGNER_COMMIT=main
+ARG TE_REPO=https://github.com/NVIDIA/TransformerEngine
+ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
+ARG TRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
 ARG TRTLLM_VERSION=v0.13.0
 ARG PROTOBUF_VERSION=4.24.4
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+ARG PYTRITON_VERSION=0.5.10
+ARG PYNVML_VERSION=11.5.3
 
 FROM ${BASE_IMAGE} AS aligner-bump
 ARG ALIGNER_COMMIT
@@ -38,118 +45,86 @@ git pull --rebase || true
 pip install --no-cache-dir --no-deps -e .
 EOF
 
+# Builds TensorRT-LLM via reinstall.sh. NOTE(review): CI passes TRTLLM_TAG but this stage declares TRTLLM_VERSION; confirm which one reinstall.sh reads.
+FROM ${BASE_IMAGE} AS trtllm-wheel
+ARG TRTLLM_REPO
+ARG TRTLLM_VERSION
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+RUN cd /opt/NeMo-Aligner && bash reinstall.sh --library trtllm --mode build && ls -al /opt/TensorRT-LLM
+
+# Builds TransformerEngine via reinstall.sh (presumably reads TE_REPO/TE_TAG; confirm); the final stage copies /opt/TransformerEngine from here.
+FROM ${BASE_IMAGE} AS te-wheel
+ARG MAX_JOBS
+ARG TE_REPO
+ARG TE_TAG
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+RUN cd /opt/NeMo-Aligner && bash reinstall.sh --library te --mode build && ls -al /opt/TransformerEngine
+
+# Builds Apex via reinstall.sh; the final stage copies /opt/Apex from here.
+FROM ${BASE_IMAGE} AS apex-wheel
+ARG APEX_TAG
+ARG MAX_JOBS
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+RUN cd /opt/NeMo-Aligner && bash reinstall.sh --library apex --mode build && \
+    ls -al /opt/Apex
+
 FROM ${BASE_IMAGE} AS final
 LABEL "nemo.library"="nemo-aligner"
 WORKDIR /opt
 # needed in case git complains that it can't detect a valid email, this email is fake but works
 RUN git config --global user.email "worker@nvidia.com"
-# install latest apex
-ARG APEX_TAG
-RUN pip uninstall -y apex && \
-    git clone https://github.com/NVIDIA/apex && \
-    cd apex && \
-    if [ ! -z $APEX_TAG ]; then \
-        git fetch origin $APEX_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
-
-# Git LFS
-RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
-    apt-get install git-lfs && \
-    git lfs install && \
-    apt-get clean
+
+# Copy the installer script; every install step in this stage is driven by reinstall.sh
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+
+# Apex: bring in /opt/Apex from the apex-wheel stage, then run the installer in install mode
+COPY --from=apex-wheel /opt/Apex /tmp/apex
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library apex
 
 # TRTLLM
-ARG TRTLLM_VERSION
-RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
-    cd TensorRT-LLM && \
-    git checkout ${TRTLLM_VERSION} && \
-    . docker/common/install_tensorrt.sh && \
-    python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt  --python_bindings --benchmarks && \
-    pip install -e .
+ARG PYNVML_VERSION
+COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/
 
-# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a
-#   breaking change. The last known working verison is 11.5.3
-RUN pip install pynvml==11.5.3
-
-# install TransformerEngine
-ARG MAX_JOBS
-ARG TE_TAG
-RUN pip uninstall -y transformer-engine && \
-    git clone https://github.com/NVIDIA/TransformerEngine.git && \
-    cd TransformerEngine && \
-    if [ ! -z $TE_TAG ]; then \
-        git fetch origin $TE_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    git submodule init && git submodule update && \
-    NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
-
-# place any util pkgs here
-ARG PYTRITON_VERSION
-RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
-ARG PROTOBUF_VERSION
-RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
-RUN pip install --upgrade-strategy only-if-needed jsonlines
+# TransformerEngine: bring in /opt/TransformerEngine from the te-wheel stage, then run the installer in install mode
+COPY --from=te-wheel /opt/TransformerEngine /tmp/te
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library te
 
-# NeMo
+ARG NEMO_REPO
 ARG NEMO_TAG
-RUN git clone https://github.com/NVIDIA/NeMo.git && \
-    cd NeMo && \
-    git pull && \
-    if [ ! -z $NEMO_TAG ]; then \
-        git fetch origin $NEMO_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip uninstall -y nemo_toolkit sacrebleu && \
-    pip install -e ".[nlp]" && \
-    cd nemo/collections/nlp/data/language_modeling/megatron && make
-
-# TODO: While we are on Pytorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change
-#   This un-pinned requirement comes from mamba-ssm, and this pin can be removed once Pytorch base image is
-#   updated.
-RUN pip install triton==3.1.0
-
-# MLM
-ARG MLM_TAG
-RUN pip uninstall -y megatron-core && \
-    git clone https://github.com/NVIDIA/Megatron-LM.git && \
-    cd Megatron-LM && \
-    git pull && \
-    if [ ! -z $MLM_TAG ]; then \
-        git fetch origin $MLM_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip install -e .
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library nemo
 
 COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
-RUN cd /opt/NeMo-Aligner && \
-    pip install --no-deps -e .
+ARG ALIGNER_COMMIT
+ARG MLM_REPO
+ARG MLM_TAG
+ARG PROTOBUF_VERSION
+ARG PYTRITON_VERSION
+# MLM_* redeclared so RUN can see them (presumably read by reinstall.sh; confirm). TODO: setup/trtllm.patch is not applied anymore (was: cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch).
+RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install
 
-RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
 
 # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
-RUN <<"EOF" bash -exu
-cd NeMo
-# Ensures we don't cherry-pick "future" origin/main commits
-git fetch -a
-# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
-# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
-# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
-# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
-for pr_and_commit in \
-  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
-  "10652 60e677423667c029dd05875da72bf0719774f844" \
-  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
-; do
-  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
-  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
-  git fetch origin $head_pr_commit:PR-${pr}
-  # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
-  # Tag cherry-picks to help
-  git tag cherry-pick-PR-${pr}
-done
-EOF
+# RUN <<"EOF" bash -exu
+# cd NeMo
+# # Ensures we don't cherry-pick "future" origin/main commits
+# git fetch -a
+# # 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
+# # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
+# # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
+# # (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
+# for pr_and_commit in \
+#   "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
+#   "10652 60e677423667c029dd05875da72bf0719774f844" \
+#   "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
+# ; do
+#   pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
+#   head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
+#   git fetch origin $head_pr_commit:PR-${pr}
+#   # cherry-picks all commits between main and the top of the PR
+#   git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+#   # Tag cherry-picks to help
+#   git tag cherry-pick-PR-${pr}
+# done
+# EOF