NVIDIA
diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
Lines changed: 2 additions & 1 deletion
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
Lines changed: 1 addition & 1 deletion
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
Lines changed: 46 additions & 46 deletions
@@ -14,7 +14,7 @@ inputs:
   cuda-components:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,libnvfatbin"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false
 
@@ -42,6 +42,6 @@ jobs:
         with:
           args: "check --select S --ignore ${{ steps.ignore-codes.outputs.codes }} --output-format sarif --output-file results.sarif"
       - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@v4.32.4
+        uses: github/codeql-action/upload-sarif@v4.32.5
         with:
           sarif_file: results.sarif
@@ -369,7 +369,7 @@ jobs:
 
           OLD_BRANCH=$(yq '.backport_branch' ci/versions.yml)
           OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
+          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
           if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then
             echo "LATEST_PRIOR_RUN_ID not found!"
             exit 1
 
@@ -31,13 +31,13 @@ jobs:
       uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
 
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@0ec47d036c68ae0cf94c629009b1029407111281  # v3.31.8
+      uses: github/codeql-action/init@40f0fa95c41fede7b43f035cb47aac899ee0ba0a  # v3.31.8
       with:
         languages: ${{ matrix.language }}
         build-mode: ${{ matrix.build-mode }}
         queries: security-extended
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@0ec47d036c68ae0cf94c629009b1029407111281  # v3.31.8
+      uses: github/codeql-action/analyze@40f0fa95c41fede7b43f035cb47aac899ee0ba0a  # v3.31.8
       with:
         category: "/language:${{matrix.language}}"
@@ -98,7 +98,8 @@ jobs:
 
       - name: Build cuda-pathfinder
         run: |
-          .venv/bin/pip install -v ./cuda_pathfinder --group test
+          cd cuda_pathfinder
+          ../.venv/bin/pip install -v . --group test
 
       - name: Build cuda-bindings
         run: |
 
@@ -151,7 +151,7 @@ jobs:
 
           OLD_BRANCH=${{ needs.compute-matrix.outputs.OLD_BRANCH }}
           OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
+          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
           if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then
             echo "LATEST_PRIOR_RUN_ID not found!"
             exit 1
 
@@ -137,7 +137,7 @@ jobs:
         run: |
           $OLD_BRANCH = yq '.backport_branch' ci/versions.yml
           $OLD_BASENAME = "cuda-bindings-python${env:PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json
+          $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json
           if (-not $runData -or $runData.Length -eq 0 -or -not $runData[0].databaseId -or [string]::IsNullOrEmpty($runData[0].databaseId)) {
               Write-Host "LATEST_PRIOR_RUN_ID not found!"
               exit 1
 
@@ -5,7 +5,7 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 
@@ -50,8 +50,8 @@
 }
 """
 
-NUM_BLOCKS = 64
-NUM_THREADS = 256
+num_blocks = 64
+num_threads = 256
 
 
 def elems_to_bytes(nelems, dt):
@@ -64,52 +64,52 @@ def main():
     if platform.machine() == "armv7l":
         pytest.skip("clock_nvrtc is not supported on ARMv7")
 
-    timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
-    hinput = np.empty(NUM_THREADS * 2, dtype="float32")
+    timer = np.empty(num_blocks * 2, dtype="int64")
+    hinput = np.empty(num_threads * 2, dtype="float32")
 
-    for i in range(NUM_THREADS * 2):
+    for i in range(num_threads * 2):
         hinput[i] = i
 
-    devID = findCudaDevice()
-    with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
-        kernel_addr = kernelHelper.getFunction(b"timedReduction")
-
-        dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
-        doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
-        dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
-        checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
-
-        args = ((dinput, doutput, dtimer), (None, None, None))
-        shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
-
-        grid_dims = (NUM_BLOCKS, 1, 1)
-        block_dims = (NUM_THREADS, 1, 1)
-
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                kernel_addr,
-                *grid_dims,  # grid dim
-                *block_dims,  # block dim
-                shared_memory_nbytes,
-                0,  # shared mem, stream
-                args,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cuda.cuCtxSynchronize())
-        checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
-        checkCudaErrors(cuda.cuMemFree(dinput))
-        checkCudaErrors(cuda.cuMemFree(doutput))
-        checkCudaErrors(cuda.cuMemFree(dtimer))
-
-    avgElapsedClocks = 0.0
-
-    for i in range(NUM_BLOCKS):
-        avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i]
-
-    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS
-    print(f"Average clocks/block = {avgElapsedClocks}")
+    dev_id = find_cuda_device()
+    kernel_helper = common.KernelHelper(clock_nvrtc, dev_id)
+    kernel_addr = kernel_helper.get_function(b"timedReduction")
+
+    dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes))
+    doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32)))
+    dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes))
+    check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
+
+    args = ((dinput, doutput, dtimer), (None, None, None))
+    shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32)
+
+    grid_dims = (num_blocks, 1, 1)
+    block_dims = (num_threads, 1, 1)
+
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            kernel_addr,
+            *grid_dims,  # grid dim
+            *block_dims,  # block dim
+            shared_memory_nbytes,
+            0,  # shared mem, stream
+            args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cuda.cuCtxSynchronize())
+    check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
+    check_cuda_errors(cuda.cuMemFree(dinput))
+    check_cuda_errors(cuda.cuMemFree(doutput))
+    check_cuda_errors(cuda.cuMemFree(dtimer))
+
+    avg_elapsed_clocks = 0.0
+
+    for i in range(num_blocks):
+        avg_elapsed_clocks += timer[i + num_blocks] - timer[i]
+
+    avg_elapsed_clocks = avg_elapsed_clocks / num_blocks
+    print(f"Average clocks/block = {avg_elapsed_clocks}")
 
 
 if __name__ == "__main__":