Skip to content
This repository was archived by the owner on Nov 19, 2025. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,14 @@ jobs:
test_to_run: ${{ steps.test_to_run.outputs.main }}
all: ${{ steps.all.outputs.main }}
run_ci: ${{ steps.evaluate.outputs.run_ci }}
build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
steps:
- name: Parse test_to_run
id: test_to_run
run: |
parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")')
echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

- name: Parse all
id: all
run: |
Expand Down Expand Up @@ -89,6 +91,36 @@ jobs:
# Run CI only (on main or if label is attached) and if it's not only docs
echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT"

- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}

- name: Parse manifest.json
id: manifest
run: |
cd ${{ github.run_id }}

BUILD_ARGS=$(cat << EOF
BASE_IMAGE=$(cat setup/manifest.json | jq -r '."ngc-pytorch"')
NEMO_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.repo')
NEMO_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.ref')
MLM_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".megatron.repo')
MLM_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".megatron.ref')
TE_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".te.repo')
TE_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".te.ref')
TRTLLM_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.repo')
TRTLLM_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.ref')
PROTOBUF_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".protobuf')
PYTRITON_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pytriton')
PYNVML_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pynvml')
EOF
)

echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT

build-container:
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
needs: [pre-flight]
Expand All @@ -100,7 +132,8 @@ jobs:
build-args: |
MAX_JOBS=32
ALIGNER_COMMIT=${{ github.sha }}

${{ needs.pre-flight.outputs.BUILD_ARGS }}

Unit_Tests:
name: ${{ matrix.test_case }}
needs: [build-container, pre-flight]
Expand Down
179 changes: 77 additions & 102 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,21 @@
# if you get errors building TE or Apex, decrease this to 4
ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
ARG NEMO_REPO=https://github.com/NVIDIA/NeMo
ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
ARG MLM_REPO=https://github.com/NVIDIA/Megatron-LM
ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
ARG ALIGNER_REPO=https://github.com/NVIDIA/NeMo-Aligner
ARG ALIGNER_COMMIT=main
ARG TE_REPO=https://github.com/NVIDIA/TransformerEngine
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG TRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
ARG PYTRITON_VERSION=0.5.10
ARG PYNVML_VERSION=11.5.3

FROM ${BASE_IMAGE} AS aligner-bump
ARG ALIGNER_COMMIT
Expand All @@ -38,118 +45,86 @@ git pull --rebase || true
pip install --no-cache-dir --no-deps -e .
EOF

FROM ${BASE_IMAGE} as trtllm-wheel
ARG TRTLLM_VERSION
COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
RUN cd /opt/NeMo-Aligner && \
bash reinstall.sh --library trtllm --mode build && \
ls -al /opt/TensorRT-LLM

FROM ${BASE_IMAGE} as te-wheel
ARG MAX_JOBS
ARG TE_TAG
COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
RUN cd /opt/NeMo-Aligner && \
bash reinstall.sh --library te --mode build && \
ls -al /opt/TransformerEngine

FROM ${BASE_IMAGE} as apex-wheel
ARG APEX_TAG
ARG MAX_JOBS
COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
RUN cd /opt/NeMo-Aligner && \
bash reinstall.sh --library apex --mode build && \
ls -al /opt/Apex

FROM ${BASE_IMAGE} AS final
LABEL "nemo.library"="nemo-aligner"
WORKDIR /opt
# Needed in case git complains that it can't detect a valid email; this email is fake but works.
RUN git config --global user.email "worker@nvidia.com"
# install latest apex
ARG APEX_TAG
RUN pip uninstall -y apex && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
if [ ! -z $APEX_TAG ]; then \
git fetch origin $APEX_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Git LFS
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
apt-get install git-lfs && \
git lfs install && \
apt-get clean

# Copy installer script
COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh

# Apex
COPY --from=apex-wheel /opt/Apex /tmp/apex
RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library apex

# TRTLLM
ARG TRTLLM_VERSION
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
cd TensorRT-LLM && \
git checkout ${TRTLLM_VERSION} && \
. docker/common/install_tensorrt.sh && \
python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
pip install -e .
ARG PYNVML_VERSION
COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm
RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a
# breaking change. The last known working version is 11.5.3
RUN pip install pynvml==11.5.3

# install TransformerEngine
ARG MAX_JOBS
ARG TE_TAG
RUN pip uninstall -y transformer-engine && \
git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
if [ ! -z $TE_TAG ]; then \
git fetch origin $TE_TAG && \
git checkout FETCH_HEAD; \
fi && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

# place any util pkgs here
ARG PYTRITON_VERSION
RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
ARG PROTOBUF_VERSION
RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
RUN pip install --upgrade-strategy only-if-needed jsonlines
# TransformerEngine
COPY --from=te-wheel /opt/TransformerEngine /tmp/te
RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library te

# NeMo
ARG NEMO_REPO
ARG NEMO_TAG
RUN git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
git pull && \
if [ ! -z $NEMO_TAG ]; then \
git fetch origin $NEMO_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip uninstall -y nemo_toolkit sacrebleu && \
pip install -e ".[nlp]" && \
cd nemo/collections/nlp/data/language_modeling/megatron && make

# TODO: While we are on Pytorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change
# This un-pinned requirement comes from mamba-ssm, and this pin can be removed once Pytorch base image is
# updated.
RUN pip install triton==3.1.0

# MLM
ARG MLM_TAG
RUN pip uninstall -y megatron-core && \
git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git pull && \
if [ ! -z $MLM_TAG ]; then \
git fetch origin $MLM_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip install -e .
RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library nemo

COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
RUN cd /opt/NeMo-Aligner && \
pip install --no-deps -e .
ARG ALIGNER_COMMIT
ARG PROTOBUF_VERSION
ARG PYTRITON_VERSION
ARG PYNVML_VERSION
RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install
# && \
# cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
RUN <<"EOF" bash -exu
cd NeMo
# Ensures we don't cherry-pick "future" origin/main commits
git fetch -a
# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
for pr_and_commit in \
"10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
"10652 60e677423667c029dd05875da72bf0719774f844" \
"10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
; do
pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
git fetch origin $head_pr_commit:PR-${pr}
# cherry-picks all commits between main and the top of the PR
git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
# Tag cherry-picks to help
git tag cherry-pick-PR-${pr}
done
EOF
# RUN <<"EOF" bash -exu
# cd NeMo
# # Ensures we don't cherry-pick "future" origin/main commits
# git fetch -a
# # 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
# # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# # (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
# for pr_and_commit in \
# "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
# "10652 60e677423667c029dd05875da72bf0719774f844" \
# "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
# ; do
# pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
# head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
# git fetch origin $head_pr_commit:PR-${pr}
# # cherry-picks all commits between main and the top of the PR
# git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
# # Tag cherry-picks to help
# git tag cherry-pick-PR-${pr}
# done
# EOF
Loading