# Workflow file: .github/workflows/npu-test.yml
#
# Builds the torch_npu wheel with ci/build.sh inside the builder container,
# installs it, checks that torch_npu imports, runs test/npu/test_device.py,
# uploads the build/test logs and the wheel as artifacts, and writes a
# result table to the job summary.
name: NPU Build and Test

# Runs when this workflow file itself changes (push or pull request), or when
# triggered manually.
on:
  push:
    paths:
      - '.github/workflows/npu-test.yml'
  pull_request:
    paths:
      - '.github/workflows/npu-test.yml'
  workflow_dispatch:
    inputs:
      # NOTE(review): the description says an empty value uses the latest
      # nightly, but the install step falls back to a pinned nightly
      # (2.12.0.dev20260217). Confirm which behavior is intended.
      torch_nightly_date:
        description: 'PyTorch nightly 日期 (格式: YYYYMMDD,留空使用最新版)'
        required: false
        default: ''

jobs:
  build-and-test:
    name: Build and Test torch_npu
    runs-on: linux-aarch64-a3-2
    container:
      image: swr.cn-north-4.myhuaweicloud.com/frameworkptadapter/pytorch_2.11.0_a2_aarch64_builder:20260331
      options: --user root
    defaults:
      run:
        # The scripts below use bash-only features (PIPESTATUS, source), so
        # pin the shell instead of relying on the container default.
        # "-e" without "-o pipefail" is deliberate: the build and test steps
        # pipe through tee and read PIPESTATUS themselves so they can record
        # the status before exiting.
        shell: bash -e {0}
    env:
      # Quoted so the version stays a string.
      PYTHON_VERSION: '3.11'
      # Copy of container.image above, used only for the job summary.
      DOCKER_IMAGE: swr.cn-north-4.myhuaweicloud.com/frameworkptadapter/pytorch_2.11.0_a2_aarch64_builder:20260331
      AUDITWHEEL_PLAT: 'skip'

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Setup cache directories
        run: |
          mkdir -p /github/home/.cache/pip
          mkdir -p /github/home/.cache/ccache
          chmod -R 777 /github/home/.cache

      - name: Install ccache
        run: |
          yum install -y ccache
          ccache --version

      - name: Cache pip
        uses: actions/cache@v4
        with:
          path: /github/home/.cache/pip
          key: pip-arm-py${{ env.PYTHON_VERSION }}-build-test
          restore-keys: |
            pip-arm-py${{ env.PYTHON_VERSION }}-

      - name: Uninstall pre-installed packages
        run: |
          pip${{ env.PYTHON_VERSION }} uninstall -y torch torchvision pyyaml setuptools auditwheel || true
          echo "Pre-installed packages uninstalled"

      - name: Install PyTorch nightly
        id: install_torch
        env:
          # Pass the user-supplied input through the environment instead of
          # expanding it into the script text, so it cannot inject shell code.
          TORCH_NIGHTLY_DATE: ${{ github.event.inputs.torch_nightly_date }}
        run: |
          PIP=pip${{ env.PYTHON_VERSION }}
          PYTHON=python${{ env.PYTHON_VERSION }}

          export PIP_CACHE_DIR=/github/home/.cache/pip
          $PIP install --upgrade pip

          # Install base dependencies.
          $PIP install pyyaml setuptools auditwheel

          if [ -n "${TORCH_NIGHTLY_DATE}" ]; then
            # Reject anything that is not exactly eight digits (YYYYMMDD).
            case "${TORCH_NIGHTLY_DATE}" in
              [0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]) ;;
              *)
                echo "torch_nightly_date must be YYYYMMDD, got: ${TORCH_NIGHTLY_DATE}" >&2
                exit 1
                ;;
            esac
            $PIP install --pre "torch==2.12.0.dev${TORCH_NIGHTLY_DATE}" --index-url https://download.pytorch.org/whl/nightly/cpu
          else
            # No date given: install the pinned version (per requirements.txt).
            $PIP install --pre "torch==2.12.0.dev20260217" --extra-index-url https://download.pytorch.org/whl/nightly/cpu
          fi

          # Expose the installed version to later steps (ccache key, summary).
          TORCH_VER=$($PYTHON -c "import torch; print(torch.__version__)")
          echo "version=${TORCH_VER}" >> "$GITHUB_OUTPUT"
          echo "PyTorch nightly version: ${TORCH_VER}"

      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: /github/home/.cache/ccache
          key: ccache-arm-py${{ env.PYTHON_VERSION }}-torch${{ steps.install_torch.outputs.version }}-${{ github.sha }}
          restore-keys: |
            ccache-arm-py${{ env.PYTHON_VERSION }}-torch${{ steps.install_torch.outputs.version }}-

      - name: Build torch_npu wheel
        id: build
        run: |
          PYTHON=python${{ env.PYTHON_VERSION }}

          # Configure ccache.
          if command -v ccache >/dev/null 2>&1; then
            echo "ccache found, enabling ccache"
            # Export the cache location first so the ccache commands below
            # act on the directory that the "Cache ccache" step saves.
            export CCACHE_DIR=/github/home/.cache/ccache
            export CCACHE_COMPRESS=1
            export CCACHE_MAXSIZE=10G
            export CCACHE_BASEDIR="${PWD}"
            ccache -M 10G
            ccache -z || true
            export CC="ccache gcc"
            export CXX="ccache g++"
            USE_CCACHE=1
          else
            echo "ccache not found, building without cache"
            USE_CCACHE=0
          fi

          # Build parameters.
          echo "nproc value: $(nproc)"
          echo "MAX_JOBS: 40"
          export MAX_JOBS=40
          export DISABLE_INSTALL_TORCHAIR=FALSE
          export BUILD_WITHOUT_SHA=1

          # Build with the ci/build.sh script. The pipeline's own exit status
          # is tee's, so take the build's status from PIPESTATUS.
          bash ci/build.sh --python=${{ env.PYTHON_VERSION }} 2>&1 | tee /tmp/build.log
          BUILD_STATUS=${PIPESTATUS[0]}

          # ccache statistics (the grep covers both ccache 3.x and 4.x output).
          if [ "${USE_CCACHE}" = "1" ]; then
            CCACHE_STATS=$(ccache -s | grep -E "cache hit|cache miss|cache size|hit rate" | tr '\n' ' ')
            echo "ccache_stats=${CCACHE_STATS}" >> "$GITHUB_OUTPUT"
            ccache -s
          fi

          # Record the status before exiting so later steps can read it even
          # when the build failed.
          echo "status=${BUILD_STATUS}" >> "$GITHUB_OUTPUT"
          if [ "${BUILD_STATUS}" -eq 0 ]; then
            WHL=$(ls dist/*.whl 2>/dev/null | head -1)
            echo "wheel=${WHL}" >> "$GITHUB_OUTPUT"
            echo "Build succeeded: ${WHL}"
          fi
          exit "${BUILD_STATUS}"

      - name: Install torch_npu wheel
        run: |
          pip${{ env.PYTHON_VERSION }} install dist/torch_npu*.whl
          echo "torch_npu wheel installed"

      - name: Check Ascend paths
        run: |
          echo "=== Checking Ascend paths ==="
          ls -la /usr/local/Ascend/ 2>&1 || echo "/usr/local/Ascend not found"
          ls -la /usr/local/Ascend/cann/ 2>&1 || echo "/usr/local/Ascend/cann not found"
          ls -la /usr/local/Ascend/nnal/ 2>&1 || echo "/usr/local/Ascend/nnal not found"

      - name: Verify NPU availability
        run: |
          # Load the CANN environment variables (best effort).
          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true

          PYTHON=python${{ env.PYTHON_VERSION }}
          # Move to the parent of the project root so torch_npu is not
          # imported from the source tree.
          cd ..
          echo "=== Testing torch_npu import ==="
          $PYTHON -c "import torch; print(f'torch: {torch.__version__}'); import torch_npu; print(f'torch_npu: {torch_npu.__version__}'); print(f'NPU available: {torch.npu.is_available()}'); print(f'NPU count: {torch.npu.device_count()}'); print(f'NPU name: {torch.npu.get_device_name(0) if torch.npu.is_available() else \"N/A\"}')"

      - name: Run test_device.py
        id: run_tests
        run: |
          # Load the CANN environment variables (best effort).
          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true

          PYTHON=python${{ env.PYTHON_VERSION }}

          # Each step starts in the project root again; enter test/ to run
          # the tests.
          cd test
          $PYTHON -m pytest npu/test_device.py -v 2>&1 | tee /tmp/test.log
          # "$?" here would be tee's exit status; take pytest's status from
          # PIPESTATUS, as the build step does.
          TEST_STATUS=${PIPESTATUS[0]}

          if [ "${TEST_STATUS}" -eq 0 ]; then
            echo "status=0" >> "$GITHUB_OUTPUT"
            echo "test_device.py: PASSED"
          else
            echo "status=1" >> "$GITHUB_OUTPUT"
            echo "test_device.py: FAILED"
          fi
          # Fail the step (and the job) when the tests fail.
          exit "${TEST_STATUS}"

      - name: Upload build log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: build-log-${{ github.run_number }}
          path: /tmp/build.log
          if-no-files-found: warn

      - name: Upload test log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-log-${{ github.run_number }}
          path: /tmp/test.log
          if-no-files-found: warn

      - name: Upload wheel artifact
        # always() keeps the upload running when a later step (such as the
        # tests) failed, as long as the build itself succeeded.
        if: always() && steps.build.outputs.status == '0'
        uses: actions/upload-artifact@v4
        with:
          name: torch_npu-wheel-${{ github.run_number }}
          path: dist/*.whl
          if-no-files-found: warn

      - name: Build and Test summary
        if: always()
        run: |
          BUILD_STATUS="${{ steps.build.outputs.status }}"
          TEST_STATUS="${{ steps.run_tests.outputs.status }}"

          if [ "${BUILD_STATUS}" = "0" ]; then
            BUILD_ICON="✅ SUCCESS"
          else
            BUILD_ICON="❌ FAILED"
          fi

          # An empty status means the test step never ran (an earlier step
          # failed); report that separately from a test failure.
          if [ "${TEST_STATUS}" = "0" ]; then
            TEST_ICON="✅ PASSED"
          elif [ -z "${TEST_STATUS}" ]; then
            TEST_ICON="⏭️ SKIPPED"
          else
            TEST_ICON="❌ FAILED"
          fi

          cat >> "$GITHUB_STEP_SUMMARY" << EOF
          ## NPU Build and Test

          | 项目 | 详情 |
          |------|------|
          | 执行时间 | $(date -u '+%Y-%m-%d %H:%M UTC') |
          | Docker 镜像 | \`${{ env.DOCKER_IMAGE }}\` |
          | PyTorch Nightly | \`${{ steps.install_torch.outputs.version }}\` |
          | 仓库 Commit | \`${{ github.sha }}\` |
          | ccache 统计 | ${{ steps.build.outputs.ccache_stats || 'N/A' }} |
          | 构建结果 | ${BUILD_ICON} |
          | 测试结果 | ${TEST_ICON} |

          $( [ "${BUILD_STATUS}" = "0" ] && echo "> Wheel: \`${{ steps.build.outputs.wheel }}\`" || echo "> 查看 build-log artifact 获取详细错误信息" )
          EOF