diff --git a/.claude/skills/cudaq-guide/SKILL.md b/.claude/skills/cudaq-guide/SKILL.md index 9dda4831f1d..c7b210e928f 100644 --- a/.claude/skills/cudaq-guide/SKILL.md +++ b/.claude/skills/cudaq-guide/SKILL.md @@ -3,13 +3,13 @@ name: "cudaq-guide" title: "Cuda Quantum" description: "CUDA-Q onboarding guide for installation, test programs, GPU simulation, QPU hardware, and quantum applications." version: "1.0.0" -author: "Sachin Pisal " +author: "CUDA-Q" tags: [cuda-quantum, quantum-computing, onboarding, getting-started, nvidia] tools: [Read, Glob, Grep, Bash] license: "Apache License 2.0" compatibility: "Python 3.10+, C++ 20" metadata: - author: "Sachin Pisal " + author: "CUDA-Q" tags: - cuda-quantum - quantum-computing diff --git a/.github/nv-slack-bot.yaml b/.github/nv-slack-bot.yaml new file mode 100644 index 00000000000..27bc9e65782 --- /dev/null +++ b/.github/nv-slack-bot.yaml @@ -0,0 +1,100 @@ +$schema: https://public.gha-runners.nvidia.com/nv-slack-bot/schemas/config-v1.json +enabled: true +notifications: + - name: "Publishing workflow failed" + event: workflow_run + slack: + nvidia: + channels: + - id: C0AT93CK1B9 # nvqpp-cudaq-notifications + match: workflow_run.name = "Publishing" and workflow_run.conclusion = "failure" + message: + body: | + <{{url}}|Publishing workflow> failed on `{{branch}}` (commit <{{commitUrl}}|{{sha}}>) + vars: + url: workflow_run.html_url + branch: workflow_run.head_branch + sha: $substring(workflow_run.head_sha, 0, 7) + commitUrl: workflow_run.head_repository.html_url & "/commit/" & workflow_run.head_sha + show_webhook_payload_on_error: false + + - name: "Stable publishing workflow failed" + event: workflow_run + slack: + nvidia: + channels: + - id: C0AT93CK1B9 # nvqpp-cudaq-notifications + match: workflow_run.name = "Push stable release" and workflow_run.conclusion = "failure" + message: + body: | + <{{url}}|Push stable release workflow> failed on `{{branch}}` (commit <{{commitUrl}}|{{sha}}>) + vars: + url: workflow_run.html_url 
+ branch: workflow_run.head_branch + sha: $substring(workflow_run.head_sha, 0, 7) + commitUrl: workflow_run.head_repository.html_url & "/commit/" & workflow_run.head_sha + show_webhook_payload_on_error: false + + - name: "Deployments workflow failed" + event: workflow_run + slack: + nvidia: + channels: + - id: C0AT93CK1B9 # nvqpp-cudaq-notifications + match: workflow_run.name = "Deployments" and workflow_run.conclusion = "failure" + message: + body: | + <{{url}}|Deployments workflow> failed on `{{branch}}` (commit <{{commitUrl}}|{{sha}}>) + vars: + url: workflow_run.html_url + branch: workflow_run.head_branch + sha: $substring(workflow_run.head_sha, 0, 7) + commitUrl: workflow_run.head_repository.html_url & "/commit/" & workflow_run.head_sha + show_webhook_payload_on_error: false + + - name: "Merge queue CI failed" + event: workflow_run + slack: + nvidia: + channels: + - id: C0AT93CK1B9 # nvqpp-cudaq-notifications + match: workflow_run.name = "CI" and workflow_run.event = "merge_group" and workflow_run.conclusion = "failure" + message: + body: | + <{{url}}|Merge queue CI> failed on `{{branch}}` (commit <{{commitUrl}}|{{sha}}>) + vars: + url: workflow_run.html_url + branch: workflow_run.head_branch + sha: $substring(workflow_run.head_sha, 0, 7) + commitUrl: workflow_run.head_repository.html_url & "/commit/" & workflow_run.head_sha + show_webhook_payload_on_error: false + + - name: "Issue opened" + event: issues + slack: + nvidia: + channels: + - id: C0AT93CK1B9 # nvqpp-cudaq-notifications + match: action = "opened" + message: + body: | + New issue opened: <{{url}}|{{title}}> + vars: + url: issue.html_url + title: issue.title + show_webhook_payload_on_error: false + + - name: "Stale issue (no update in over a week)" + event: repository_dispatch + slack: + nvidia: + channels: + - id: C0AT93CK1B9 # nvqpp-cudaq-notifications + match: action = "stale-issue" + message: + body: | + Issue has had no activity in over a week: <{{url}}|{{title}}> + vars: + url: 
client_payload.url + title: client_payload.title + show_webhook_payload_on_error: false diff --git a/.github/pre-commit/spelling_allowlist.txt b/.github/pre-commit/spelling_allowlist.txt index 64c9c045bed..e4dea1a5777 100644 --- a/.github/pre-commit/spelling_allowlist.txt +++ b/.github/pre-commit/spelling_allowlist.txt @@ -108,6 +108,8 @@ Photonics PyPI Pygments QAOA +QASM +QBRAID QCI QCaaS QEC @@ -128,6 +130,7 @@ QuTiP Quake Quantinuum RDMA +REPL RHEL RPC RSA @@ -168,6 +171,7 @@ amongst ancilla ansatz ansatzes +api archiver arity auxillary @@ -300,6 +304,7 @@ lossy lvalue macOS makefiles +measurementCounts merchantability mps multinomial @@ -307,10 +312,12 @@ multithreaded mutex namespace namespaces +nanobind natively normalization nullary nvcc +nvq observables optimizer optimizers @@ -332,21 +339,26 @@ preprocessor probability programmatically pybind +qBraid qaoa +qbraid qed qio +qrn quantize quantized qubit qubits qudit qudits +queryable qumode qumodes reStructuredText realtime reconfigurable reproducibility +resultData reusability runtime runtimes diff --git a/.github/workflows/clean_up.yml b/.github/workflows/clean_up.yml index ebca2c0aaa2..a5bedab7c05 100644 --- a/.github/workflows/clean_up.yml +++ b/.github/workflows/clean_up.yml @@ -144,6 +144,8 @@ jobs: # Since we use the same workflows during CI, a default environment that defines # the necessary variables is used instead. Unfortunately, this automatically # also creates an (unwanted) deployment, which we delete with this job. + # The ghcr-ci environment similarly produces unwanted deployment entries + # from the dev_environment workflow during CI runs on pull requests. 
# See also https://github.com/actions/runner/issues/2120 deployments: name: Deployments @@ -155,26 +157,28 @@ jobs: - uses: actions/github-script@v7 with: script: | - const deployments = await github.rest.repos.listDeployments({ - owner: context.repo.owner, - repo: context.repo.repo, - environment: 'default' - }); - await Promise.all( - deployments.data.map(async (deployment) => { - await github.rest.repos.createDeploymentStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - deployment_id: deployment.id, - state: 'inactive' - }); - return github.rest.repos.deleteDeployment({ + for (const environment of ['default', 'ghcr-ci']) { + const deployments = await github.rest.repos.listDeployments({ owner: context.repo.owner, repo: context.repo.repo, - deployment_id: deployment.id - }); - }) - ); + environment: environment + }); + await Promise.all( + deployments.data.map(async (deployment) => { + await github.rest.repos.createDeploymentStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + deployment_id: deployment.id, + state: 'inactive' + }); + return github.rest.repos.deleteDeployment({ + owner: context.repo.owner, + repo: context.repo.repo, + deployment_id: deployment.id + }); + }) + ); + } pr_cleanup: name: Clean up documentation previews diff --git a/.github/workflows/config/gitlab_commits.txt b/.github/workflows/config/gitlab_commits.txt index c912a492963..e3e19dbe3f3 100644 --- a/.github/workflows/config/gitlab_commits.txt +++ b/.github/workflows/config/gitlab_commits.txt @@ -1,2 +1,2 @@ nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git -nvidia-mgpu-commit: 52dbd7d31cf3c88c8e5a1de9bac6635a5b0c8309 +nvidia-mgpu-commit: 16b82e64ab6f1d14ed7162a8d2580b632271a89f diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 47cce0eb1e3..7297e2faeba 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -28,6 +28,7 @@ on: - quantinuum - scaleway - tii + - qbraid 
single_test_name: type: string required: false @@ -191,7 +192,7 @@ jobs: run: | # Determine which providers to test based on inputs and event type if [[ "${{ github.event_name }}" == "schedule" || "${{ inputs.target }}" == "nightly" ]]; then - providers='["anyon", "fermioniq", "infleqtion", "ionq", "iqm", "oqc", "orca", "pasqal", "qci", "quantinuum", "scaleway", "tii"]' + providers='["anyon", "fermioniq", "infleqtion", "ionq", "iqm", "oqc", "orca", "pasqal", "qbraid", "qci", "quantinuum", "scaleway", "tii"]' else # Just run the specified target provider providers="[\"${{ inputs.target }}\"]" @@ -261,6 +262,9 @@ jobs: pasqal) filelist="docs/sphinx/targets/cpp/pasqal.cpp docs/sphinx/targets/python/pasqal.py" ;; + qbraid) + filelist="targettests/qbraid/*.cpp docs/sphinx/targets/cpp/qbraid.cpp docs/sphinx/targets/python/qbraid.py" + ;; qci) filelist="targettests/qci/*.cpp" ;; @@ -380,6 +384,11 @@ jobs: echo "PASQAL_PROJECT_ID=${{ secrets.PASQAL_PROJECT_ID }}" >> $GITHUB_ENV echo "PASQAL_MACHINE_TARGET=EMU_FREE" >> $GITHUB_ENV ;; + qbraid) + echo "### Setting up qBraid account" >> $GITHUB_STEP_SUMMARY + echo "::add-mask::${{ secrets.QBRAID_API_KEY }}" + echo "QBRAID_API_KEY=${{ secrets.QBRAID_API_KEY }}" >> $GITHUB_ENV + ;; qci) echo "### Setting up QCI account" >> $GITHUB_STEP_SUMMARY echo "::add-mask::${{ secrets.QCI_AUTH_TOKEN }}" @@ -671,6 +680,39 @@ jobs: fi ;; + qbraid) + if [[ "$filename" == *.cpp ]]; then + nvq++ -v $filename --target qbraid --qbraid-machine qbraid:qbraid:sim:qir-sv + test_status=$? + if [ $test_status -eq 0 ]; then + ./a.out + test_status=$? 
+ if [ $test_status -eq 0 ]; then + echo ":white_check_mark: Successfully ran test: $filename" >> $GITHUB_STEP_SUMMARY + else + echo ":x: Test failed (failed to execute): $filename" >> $GITHUB_STEP_SUMMARY + test_err_sum=$((test_err_sum+1)) + fi + else + echo ":x: Test failed (failed to compile): $filename" >> $GITHUB_STEP_SUMMARY + test_err_sum=$((test_err_sum+1)) + fi + elif [[ "$filename" == *.py ]]; then + python3 $filename 1> /dev/null + test_status=$? + if [ $test_status -eq 0 ]; then + echo ":white_check_mark: Successfully ran test: $filename" >> $GITHUB_STEP_SUMMARY + else + echo ":x: Test failed (failed to execute): $filename" >> $GITHUB_STEP_SUMMARY + test_err_sum=$((test_err_sum+1)) + fi + else + echo "::warning::Unsupported file type: $filename" + echo ":warning: Test skipped (unsupported file type): $filename" >> $GITHUB_STEP_SUMMARY + test_skip_sum=$((test_skip_sum+1)) + fi + ;; + qci) nvq++ -v $filename --target qci test_status=$? diff --git a/.github/workflows/stale_issue_notifier.yml b/.github/workflows/stale_issue_notifier.yml new file mode 100644 index 00000000000..fefdacba09f --- /dev/null +++ b/.github/workflows/stale_issue_notifier.yml @@ -0,0 +1,107 @@ +name: Stale issue notifier + +on: + schedule: + - cron: '0 14 * * *' + workflow_dispatch: + issues: + types: [edited, reopened] + issue_comment: + types: [created] + +permissions: + contents: write + issues: write + +jobs: + notify: + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - name: Dispatch stale-issue events + uses: actions/github-script@v7 + with: + script: | + const WEEK_MS = 7 * 24 * 60 * 60 * 1000; + const LABEL = 'stale-notified'; + const now = Date.now(); + + try { + await github.rest.issues.getLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: LABEL, + }); + } catch (e) { + if (e.status !== 404) throw e; + await github.rest.issues.createLabel({ + owner: context.repo.owner, + repo: 
context.repo.repo, + name: LABEL, + color: 'ededed', + description: 'Stale notification has already fired for this issue', + }); + } + + const issues = await github.paginate(github.rest.issues.listForRepo, { + owner: context.repo.owner, + repo: context.repo.repo, + state: 'open', + sort: 'updated', + direction: 'asc', + per_page: 100, + }); + + let dispatched = 0; + for (const issue of issues) { + if (issue.pull_request) continue; + const age = now - new Date(issue.updated_at).getTime(); + const alreadyNotified = issue.labels.some(l => (l.name || l) === LABEL); + if (age < WEEK_MS || alreadyNotified) continue; + + await github.rest.repos.createDispatchEvent({ + owner: context.repo.owner, + repo: context.repo.repo, + event_type: 'stale-issue', + client_payload: { + number: issue.number, + url: issue.html_url, + title: issue.title, + }, + }); + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issue.number, + labels: [LABEL], + }); + core.info(`Dispatched stale-issue for #${issue.number}`); + dispatched++; + } + core.info(`Total dispatched: ${dispatched}`); + + refresh: + if: | + github.event_name == 'issues' || + (github.event_name == 'issue_comment' && github.event.issue.pull_request == null) + runs-on: ubuntu-latest + steps: + - name: Remove stale-notified label on activity + uses: actions/github-script@v7 + with: + script: | + const LABEL = 'stale-notified'; + const issue = context.payload.issue; + const hasLabel = (issue.labels || []).some(l => l.name === LABEL); + if (!hasLabel) return; + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issue.number, + name: LABEL, + }); + core.info(`Removed ${LABEL} from #${issue.number}`); + } catch (e) { + if (e.status !== 404) throw e; + } diff --git a/.gitmodules b/.gitmodules index 622993890c6..644ab8cc24f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -48,3 +48,6 @@ [submodule "tpls/Stim"] path 
= tpls/Stim url = https://github.com/quantumlib/Stim +[submodule "tpls/nanobind"] + path = tpls/nanobind + url = https://github.com/wjakob/nanobind.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 548bd30adcf..0baf807653e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,11 @@ if (NOT DEFINED CUDAQ_ENABLE_SCALEWAY_BACKEND) set(CUDAQ_ENABLE_SCALEWAY_BACKEND ON CACHE BOOL "Enable building the Scaleway target.") endif() +# Enable qBraid target by default. +if (NOT DEFINED CUDAQ_ENABLE_QBRAID_BACKEND) + set(CUDAQ_ENABLE_QBRAID_BACKEND ON CACHE BOOL "Enable building the qBraid target.") +endif() + # Generate a CompilationDatabase (compile_commands.json file) for our build, # for use by clang_complete, YouCompleteMe, etc. set(CMAKE_EXPORT_COMPILE_COMMANDS 1) @@ -713,6 +718,12 @@ if (CUDAQ_ENABLE_PYTHON) # Python bindings generated as part of the CUDA-Q build and bindings generated for # third party CUDA-Q libraries; see also https://github.com/pybind/pybind11/issues/1262 add_subdirectory(tpls/pybind11) + + # nanobind is used for all CUDA-Q Python bindings. pybind11 is retained only + # for upstream MLIR Python extensions (e.g., _mlirAsyncPasses) which use + # mlir/Bindings/Python/PybindAdaptors.h. + add_subdirectory(tpls/nanobind) + add_subdirectory(python) endif() diff --git a/docker/build/devcontainer.Dockerfile b/docker/build/devcontainer.Dockerfile index 51fa1c72ea5..4100c90e340 100644 --- a/docker/build/devcontainer.Dockerfile +++ b/docker/build/devcontainer.Dockerfile @@ -183,7 +183,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ else \ cupy_version=13.4.1; \ fi && \ - python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==${cupy_version} cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==26.1.0 && \ + python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==${cupy_version} cuquantum-cu$(echo $CUDA_VERSION | cut -d . 
-f1)==26.3.1 && \ if [ "$(python3 --version | grep -o [0-9\.]* | cut -d . -f -2)" != "3.12" ]; then \ echo "expecting Python version 3.12"; \ fi diff --git a/docs/sphinx/api/languages/python_api.rst b/docs/sphinx/api/languages/python_api.rst index 29644506d2d..a973997b7ce 100644 --- a/docs/sphinx/api/languages/python_api.rst +++ b/docs/sphinx/api/languages/python_api.rst @@ -76,6 +76,7 @@ Kernel Execution Backend Configuration ============================= +.. autofunction:: cudaq::parse_args .. autofunction:: cudaq::has_target .. autofunction:: cudaq::get_target .. autofunction:: cudaq::get_targets @@ -494,4 +495,3 @@ Trajectory and Selection Types .. autoclass:: cudaq.ptsbe.KrausSelection :members: - diff --git a/docs/sphinx/applications/python/divisive_clustering_coresets.ipynb b/docs/sphinx/applications/python/divisive_clustering_coresets.ipynb index b0b1fae4c5a..4ede8d321b7 100644 --- a/docs/sphinx/applications/python/divisive_clustering_coresets.ipynb +++ b/docs/sphinx/applications/python/divisive_clustering_coresets.ipynb @@ -651,7 +651,7 @@ "source": [ "threshold_height = 1\n", "clusters = dendo.get_clusters_using_height(threshold_height)\n", - "colors = [\"red\", \"blue\", \"green\", \"black\", \"purple\", \"orange\", \"yellow\"]\n", + "colors = [\"red\", \"blue\", \"green\", \"black\", \"purple\", \"orange\", \"yellow\", \"cyan\", \"magenta\", \"brown\"]\n", "dendo.plot_dendrogram(\n", " plot_title=\"Dendrogram of Coreset using VQE\",\n", " colors=colors,\n", diff --git a/docs/sphinx/applications/python/krylov.ipynb b/docs/sphinx/applications/python/krylov.ipynb index 711eb9d2c28..66b95331e95 100644 --- a/docs/sphinx/applications/python/krylov.ipynb +++ b/docs/sphinx/applications/python/krylov.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "ba61665c-dc3b-4e43-b1cf-340855ea68fb", "metadata": {}, "outputs": [], @@ -100,7 +100,7 @@ "[pyscf] Total number of orbitals = 2\n", "[pyscf] Total number of 
electrons = 2\n", "[pyscf] HF energy = -1.116325564486115\n", - "[pyscf] Total R-CCSD energy = -1.1371758844013342\n", + "[pyscf] Total R-CCSD energy = -1.1371758844013327\n", "Ground state energy (classical simulation)= (-1.1371757102406845+0j) , index= 3\n" ] } @@ -167,17 +167,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "[(-0.10647701149300526+0j), (0.17028010135220517+0j), (0.17028010135220514+0j), (-0.22004130022421745+0j), (-0.22004130022421745+0j), (0.1683359862516207+0j), (0.12020049071260122+0j), (0.1656068235817425+0j), (0.1656068235817425+0j), (0.12020049071260122+0j), (0.17407289249680213+0j), (-0.04540633286914128+0j), (0.04540633286914128+0j), (0.04540633286914128+0j), (-0.04540633286914128+0j)]\n", - "['IIII', 'ZIII', 'IZII', 'IIZI', 'IIIZ', 'ZZII', 'ZIZI', 'ZIIZ', 'IZZI', 'IZIZ', 'IIZZ', 'XXYY', 'XYYX', 'YXXY', 'YYXX']\n" + "[(0.17028010135220506+0j), (0.17028010135220503+0j), (-0.2200413002242175+0j), (-0.2200413002242175+0j), (0.1683359862516207+0j), (0.12020049071260122+0j), (0.1656068235817425+0j), (0.1656068235817425+0j), (0.12020049071260122+0j), (0.17407289249680213+0j), (-0.04540633286914128+0j), (0.04540633286914128+0j), (0.04540633286914128+0j), (-0.04540633286914128+0j)]\n", + "['ZIII', 'IZII', 'IIZI', 'IIIZ', 'ZZII', 'ZIZI', 'ZIIZ', 'IZZI', 'IZIZ', 'IIZZ', 'XXYY', 'XYYX', 'YXXY', 'YYXX']\n" ] } ], "source": [ - "\n", - "# Collect coefficients from a spin operator so we can pass them to a kernel\n", + "# Collect coefficients from a spin operator so we can pass them to a kernel.\n", + "# The identity term is excluded. 
Its contribution is added back to the \n", + "# Hamiltonian matrix classically below.\n", "def term_coefficients(ham: cudaq.SpinOperator) -> list[complex]:\n", " result = []\n", " for term in ham:\n", + " if term.is_identity():\n", + " continue\n", " result.append(term.evaluate_coefficient())\n", " return result\n", "\n", @@ -185,9 +188,10 @@ "def term_words(ham: cudaq.SpinOperator) -> list[str]:\n", " # Our kernel uses these words to apply exp_pauli to the entire state.\n", " # we hence ensure that each pauli word covers the entire space.\n", - " \n", " result = []\n", " for term in ham:\n", + " if term.is_identity():\n", + " continue\n", " result.append(term.get_pauli_word(qubits_num))\n", " return result\n", "\n", @@ -195,6 +199,13 @@ "coefficient = term_coefficients(hamiltonian)\n", "pauli_string = term_words(hamiltonian)\n", "\n", + "# Sum of identity-term coefficients\n", + "# The identity contributes `identity_coef * S` to the Hamiltonian matrix.\n", + "identity_coef = sum(\n", + " term.evaluate_coefficient().real\n", + " for term in hamiltonian\n", + " if term.is_identity())\n", + "\n", "print(coefficient)\n", "print(pauli_string)" ] @@ -365,7 +376,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# Create the identity operator\n", "identity_op = cudaq.SpinOperator.from_word('I' * qubits_num)\n", "# Get the Pauli word and convert it to a list of integers\n", @@ -423,7 +433,7 @@ " # 2 entry array that stores real and imaginary part of matrix element\n", " tot_e = np.zeros(2)\n", "\n", - " # Loops over the terms in the Hamiltonian, computing expectation values\n", + " # Loops over the (non-identity) terms in the Hamiltonian, computing expectation values\n", " for coef, word in zip(coefficient, pauli_string):\n", " pauli_list = pauli_str(word, qubits_num)\n", " \n", @@ -441,8 +451,8 @@ " tot_e[0] += temp[0]\n", " tot_e[1] += temp[1]\n", "\n", - " # Sums real and imaginary totals to specify Hamiltonian entry\n", - " ham_matrx[m, n] = tot_e[0] + tot_e[1] 
* 1j\n", + " # Adds back the identity-term contribution.\n", + " ham_matrx[m, n] = tot_e[0] + tot_e[1] * 1j + identity_coef * wf_overlap[m, n]\n", " if n != m:\n", " ham_matrx[n, m] = np.conj(ham_matrx[m, n])" ] @@ -512,7 +522,7 @@ "output_type": "stream", "text": [ "Energy from QFD:\n", - "(-1.137176660753775-1.6945689273261445e-07j)\n" + "(-1.1359686811350462-4.497484607599205e-09j)\n" ] } ], diff --git a/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp b/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp index 36c548b6b5a..ce987a92c39 100644 --- a/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp +++ b/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp @@ -6,7 +6,7 @@ #include struct kernel { - bool operator()() __qpu__ { + auto operator()() __qpu__ { cudaq::qarray<3> q; // Initial state preparation x(q[0]); diff --git a/docs/sphinx/examples/cpp/measuring_kernels.cpp b/docs/sphinx/examples/cpp/measuring_kernels.cpp index 53ee6361326..0c4664c042b 100644 --- a/docs/sphinx/examples/cpp/measuring_kernels.cpp +++ b/docs/sphinx/examples/cpp/measuring_kernels.cpp @@ -27,7 +27,7 @@ __qpu__ void kernel1() { // [End Sample2] // [Begin Run0] -__qpu__ std::vector kernel2() { +__qpu__ auto kernel2() { cudaq::qvector q(2); h(q[0]); auto b0 = mz(q[0]); @@ -37,7 +37,7 @@ __qpu__ std::vector kernel2() { if (b0) { h(q[1]); } - return cudaq::to_bool_vector(mz(q)); + return mz(q); } int main() { diff --git a/docs/sphinx/examples/cpp/sample_to_run_migration.cpp b/docs/sphinx/examples/cpp/sample_to_run_migration.cpp index fe557aa25c9..7ce52822b91 100644 --- a/docs/sphinx/examples/cpp/sample_to_run_migration.cpp +++ b/docs/sphinx/examples/cpp/sample_to_run_migration.cpp @@ -31,7 +31,7 @@ __qpu__ void reset_pattern() { // [Begin Example1] struct simple_conditional { - bool operator()() __qpu__ { + auto operator()() __qpu__ { cudaq::qvector q(2); h(q[0]); auto r = mz(q[0]); diff --git a/docs/sphinx/targets/cpp/qbraid.cpp 
b/docs/sphinx/targets/cpp/qbraid.cpp new file mode 100644 index 00000000000..f7a15a0906e --- /dev/null +++ b/docs/sphinx/targets/cpp/qbraid.cpp @@ -0,0 +1,48 @@ +// Compile and run with: +// ``` +// nvq++ --target qbraid qbraid.cpp -o out.x && ./out.x +// ``` +// This will submit the job to the qBraid ideal simulator target (default). + +#include +#include + +// Define a simple quantum kernel to execute on qBraid. +struct ghz { + // Maximally entangled state between 5 qubits. + auto operator()() __qpu__ { + cudaq::qvector q(5); + h(q[0]); + for (int i = 0; i < 4; i++) { + x(q[i], q[i + 1]); + } + auto result = mz(q); + } +}; + +int main() { + // Submit to qBraid asynchronously (e.g., continue executing + // code in the file until the job has been returned). + auto future = cudaq::sample_async(ghz{}); + // ... classical code to execute in the meantime ... + + // Can write the future to file: + { + std::ofstream out("saveMe.json"); + out << future; + } + + // Then come back and read it in later. + cudaq::async_result readIn; + std::ifstream in("saveMe.json"); + in >> readIn; + + // Get the results of the read in future. + auto async_counts = readIn.get(); + async_counts.dump(); + + // OR: Submit to qBraid synchronously (e.g., wait for the job + // result to be returned before proceeding). + auto counts = cudaq::sample(ghz{}); + counts.dump(); +} diff --git a/docs/sphinx/targets/python/qbraid.py b/docs/sphinx/targets/python/qbraid.py new file mode 100644 index 00000000000..dc61d605709 --- /dev/null +++ b/docs/sphinx/targets/python/qbraid.py @@ -0,0 +1,51 @@ +import cudaq + +# You only have to set the target once! No need to redefine it +# for every execution call on your kernel. +# To use different targets in the same file, you must update +# it via another call to `cudaq.set_target()` +cudaq.set_target("qbraid") + + +# Create the kernel we'd like to execute on qBraid. 
+@cudaq.kernel +def kernel(): + qvector = cudaq.qvector(2) + h(qvector[0]) + x.ctrl(qvector[0], qvector[1]) + + +# Execute on qBraid and print out the results. + +# Option A: +# By using the asynchronous `cudaq.sample_async`, the remaining +# classical code will be executed while the job is being handled +# by qBraid. This is ideal when submitting via a queue over +# the cloud. +async_results = cudaq.sample_async(kernel) +# ... more classical code to run ... + +# We can either retrieve the results later in the program with +# ``` +# async_counts = async_results.get() +# ``` +# or we can also write the job reference (`async_results`) to +# a file and load it later or from a different process. +file = open("future.txt", "w") +file.write(str(async_results)) +file.close() + +# We can later read the file content and retrieve the job +# information and results. +same_file = open("future.txt", "r") +retrieved_async_results = cudaq.AsyncSampleResult(str(same_file.read())) + +counts = retrieved_async_results.get() +print(counts) + +# Option B: +# By using the synchronous `cudaq.sample`, the execution of +# any remaining classical code in the file will occur only +# after the job has been returned from qBraid. +counts = cudaq.sample(kernel) +print(counts) diff --git a/docs/sphinx/using/backends/cloud.rst b/docs/sphinx/using/backends/cloud.rst index 8c03a4398cc..ebd02e033e8 100644 --- a/docs/sphinx/using/backends/cloud.rst +++ b/docs/sphinx/using/backends/cloud.rst @@ -5,6 +5,7 @@ CUDA-Q provides a number of options to access hardware resources (GPUs and QPUs) .. toctree:: :maxdepth: 1 - + Amazon Braket (braket) Scaleway QaaS (scaleway) + qBraid diff --git a/docs/sphinx/using/backends/cloud/qbraid.rst b/docs/sphinx/using/backends/cloud/qbraid.rst new file mode 100644 index 00000000000..dfa72e53913 --- /dev/null +++ b/docs/sphinx/using/backends/cloud/qbraid.rst @@ -0,0 +1,101 @@ +qBraid +++++++ + +.. 
_qbraid-backend: + +`qBraid `__ is a cloud platform that brokers access to +quantum simulators and hardware from multiple vendors through a single API. +CUDA-Q can submit OpenQASM 2 jobs to any device exposed by the qBraid service. +See the `qBraid device catalog `__ for the +set of simulators and QPUs currently available. + +Setting Credentials +``````````````````` + +Generate an API key from your `qBraid account `__ +and export it as an environment variable: + +.. code:: bash + + export QBRAID_API_KEY="qbraid_generated_api_key" + +Alternatively, the API key can be passed directly to ``cudaq.set_target`` via +the ``api_key`` argument (see below). + +Submitting +`````````` + +.. tab:: Python + + The target to which quantum kernels are submitted can be controlled with + the ``cudaq.set_target()`` function. + + .. code:: python + + cudaq.set_target("qbraid") + + By default, jobs are submitted to the qBraid state vector simulator + (``qbraid:qbraid:sim:qir-sv``). + + To specify a different qBraid device, set the ``machine`` parameter to its + qBraid device ID. + + .. code:: python + + cudaq.set_target("qbraid", machine="qbraid:qbraid:sim:qir-sv") + + The API key can also be supplied inline instead of through the + ``QBRAID_API_KEY`` environment variable. + + .. code:: python + + cudaq.set_target("qbraid", api_key="qbraid_generated_api_key") + + qBraid devices are cloud-hosted, so local emulation via the ``emulate`` + flag is not supported — all jobs are executed on the qBraid service. + To run without submitting to real hardware, select one of the qBraid + simulator devices (for example, ``qbraid:qbraid:sim:qir-sv``) via the + ``machine`` argument. + + The number of shots for a kernel execution can be set through the + ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. The + default is 1000. + + .. code:: python + + cudaq.sample(kernel, shots_count=10000) + +.. 
tab:: C++ + + To target quantum kernel code for execution on qBraid, pass the flag + ``--target qbraid`` to the ``nvq++`` compiler. By default jobs are + submitted to the qBraid state vector simulator + (``qbraid:qbraid:sim:qir-sv``). + + .. code:: bash + + nvq++ --target qbraid src.cpp + + To execute kernels on a different device, pass ``--qbraid-machine`` with + the qBraid device ID: + + .. code:: bash + + nvq++ --target qbraid --qbraid-machine "qbraid:qbraid:sim:qir-sv" src.cpp + + The API key can be passed explicitly with ``--qbraid-api_key`` instead of + being read from ``QBRAID_API_KEY``: + + .. code:: bash + + nvq++ --target qbraid --qbraid-api_key "qbraid_generated_api_key" src.cpp + + qBraid devices are cloud-hosted, so the ``--emulate`` flag is not + supported for this target — all jobs are executed on the qBraid + service. To run without submitting to real hardware, pass + ``--qbraid-machine`` with a qBraid simulator device ID (for example, + ``qbraid:qbraid:sim:qir-sv``). + +To see a complete example for using qBraid's backends, take a look at our +:doc:`Python examples <../../examples/examples>` and +:doc:`C++ examples <../../examples/examples>`. diff --git a/docs/sphinx/using/basics/run_kernel.rst b/docs/sphinx/using/basics/run_kernel.rst index 3999fb4e673..371fdd4685d 100644 --- a/docs/sphinx/using/basics/run_kernel.rst +++ b/docs/sphinx/using/basics/run_kernel.rst @@ -218,7 +218,7 @@ The observe function allows us to calculate expectation values for a defined qua The :func:`cudaq.observe` method takes a kernel and its arguments as inputs, along with a :class:`cudaq.operators.spin.SpinOperator`. Using the `cudaq.spin` module, operators may be defined as a linear combination of Pauli strings. 
Functions, such - as :func:`cudaq.spin.i`, :func:`cudaq.spin.x`, :func:`cudaq.spin.y`, :func:`cudaq.spin.z` may be used to construct more + as `cudaq.spin.i`, `cudaq.spin.x`, `cudaq.spin.y`, `cudaq.spin.z` may be used to construct more complex spin Hamiltonians on multiple qubits. .. tab:: C++ diff --git a/docs/sphinx/using/basics/troubleshooting.rst b/docs/sphinx/using/basics/troubleshooting.rst index 608e7086f60..5ca7bbcf2a2 100644 --- a/docs/sphinx/using/basics/troubleshooting.rst +++ b/docs/sphinx/using/basics/troubleshooting.rst @@ -38,4 +38,27 @@ variable. For any CUDA-Q executable, just prepend as follows: CUDAQ_DUMP_JIT_IR=1 ./a.out # or - CUDAQ_DUMP_JIT_IR= ./a.out \ No newline at end of file + CUDAQ_DUMP_JIT_IR= ./a.out + +Python Stack-Traces +++++++++++++++++++++++++ + +When CUDA-Q parses Python command-line options via :func:`cudaq.parse_args`, +Python stack-traces are suppressed by default to keep runtime errors concise. +To show the full stack-trace for debugging, pass +:code:`--cudaq-full-stack-trace` when invoking your script. + +.. code-block:: bash + + python3 program.py --cudaq-full-stack-trace + +This flag can be combined with other CUDA-Q Python runtime options such as +:code:`--target`, :code:`--target-option`, and :code:`--emulate`. + +.. code-block:: bash + + python3 program.py --target nvidia --target-option fp64 --cudaq-full-stack-trace + +If your application parses CUDA-Q command-line arguments explicitly, call +:func:`cudaq.parse_args` before running the rest of the program so the flag is +recognized. 
diff --git a/include/cudaq/Optimizer/CodeGen/Passes.td b/include/cudaq/Optimizer/CodeGen/Passes.td index 0424599a46a..598e9d2c7c0 100644 --- a/include/cudaq/Optimizer/CodeGen/Passes.td +++ b/include/cudaq/Optimizer/CodeGen/Passes.td @@ -278,5 +278,33 @@ def ReturnToOutputLog : Pass<"return-to-output-log", "mlir::ModuleOp"> { ]; } +def EliminateDeadHeapCopy + : Pass<"eliminate-dead-heap-copy", "mlir::func::FuncOp"> { + let summary = "Eliminate dead heap copies from return value logging."; + let description = [{ + When a kernel returns a vector (e.g., measurement results), the frontend + wraps the return value with `__nvqpp_vectorCopyCtor`, which performs a + malloc+memcpy to copy the data from the callee's stack to the heap. This + is necessary because the returned data must outlive the callee's stack + frame when one kernel calls another. After AggressiveInlining, this + intrinsic is expanded into raw malloc and memcpy operations in the caller. + + After ReturnToOutputLog converts return values to QIR output logging + calls (e.g., `__quantum__rt__*_record_output`), it reads from the + cc.stdvec_init's buffer (the malloc'd pointer) and creates new load ops + from it, leaving the cc.stdvec_init with no users. The malloc+memcpy + are then only needed to populate the heap buffer that the output logging + reads from. This pass redirects those reads to the memcpy source (the + original stack data), making the malloc+memcpy dead, and erases them + along with the now-unused cc.stdvec_init. + + Note: this pass is only needed on code paths that do not run LLVM's + optimization passes (e.g., when emitting MLIR rather than LLVM IR for + a remote backend). When the full LLVM opt pipeline runs, it would eliminate + these dead allocations on its own. 
+ }]; + let dependentDialects = ["cudaq::cc::CCDialect", "mlir::func::FuncDialect"]; +} + #endif // CUDAQ_OPT_OPTIMIZER_CODEGEN_PASSES diff --git a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h index b471ceebb23..56d6006b407 100644 --- a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h +++ b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h @@ -81,10 +81,6 @@ static constexpr const char QIRArrayConcatArray[] = "__quantum__rt__array_concatenate"; static constexpr const char QIRArrayCreateArray[] = "__quantum__rt__array_create_1d"; -static constexpr const char QIRResultArrayCreate[] = - "__quantum__rt__result_array_create_1d"; -static constexpr const char QIRResultArrayGetElementPtr1d[] = - "__quantum__rt__result_array_get_element_ptr_1d"; /// Dynamic qubit management helper functions. These are currently only used by /// the NVQIR simulator. diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td index 4edc20ca348..a1f1ec77d5c 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td @@ -261,21 +261,21 @@ def quake_ExtractRefOp : QuakeOp<"extract_ref", [Pure]> { } def quake_RelaxSizeOp : QuakeOp<"relax_size", [Pure]> { - let summary = "Relax the constant size on a sized type to be unknown."; + let summary = "Relax the constant size on a !veq to be unknown."; let description = [{ - Demotes a sized `!quake.veq` to `!quake.veq`, or a sized - `!quake.measurements` to `!quake.measurements`. Required to preserve - strongly-typed IR at function call/return boundaries. + At times, the IR needs to forget the length of an SSA-value of type + `!quake.veq` and demote it to type `!quake.veq` where the size is + said to be unknown. This demotion is required to preserve a valid, + strongly-typed IR. 
- Examples: + Example: ```mlir %uqv = quake.relax_size %qv : (!quake.veq<4>) -> !quake.veq - %ums = quake.relax_size %ms : (!quake.measurements<4>) -> !quake.measurements ``` }]; - let arguments = (ins AnyTypeOf<[VeqType, MeasurementsType]>:$inputVec); - let results = (outs AnyTypeOf<[VeqType, MeasurementsType]>); + let arguments = (ins VeqType:$inputVec); + let results = (outs VeqType); let assemblyFormat = [{ $inputVec `:` functional-type(operands, results) attr-dict @@ -377,34 +377,6 @@ def quake_VeqSizeOp : QuakeOp<"veq_size", [Pure]> { let hasCanonicalizer = 1; } -def quake_MeasurementsSizeOp : QuakeOp<"measurements_size", [Pure]> { - let summary = "Return the size of a measurements collection."; - let description = [{ - Returns the number of individual measurements in a `!quake.measurements` - collection. If the collection has a static size, the static size is returned - (effectively as a constant). If the size is dynamic, the value will be an - SSA-value. - - Examples: - ```mlir - %ms = quake.mz %qubits : (!quake.veq<4>) -> !quake.measurements<4> - %n = quake.measurements_size %ms : (!quake.measurements<4>) -> i64 - - %ms2 = quake.mz %dyn_veq : (!quake.veq) -> !quake.measurements - %n2 = quake.measurements_size %ms2 : (!quake.measurements) -> i64 - ``` - }]; - - let arguments = (ins MeasurementsType:$measurements); - let results = (outs AnySignlessIntegerOrIndex:$size); - - let assemblyFormat = [{ - $measurements `:` functional-type(operands, results) attr-dict - }]; - - let hasCanonicalizer = 1; -} - //===----------------------------------------------------------------------===// // Application, ComputeAction(Uncompute) //===----------------------------------------------------------------------===// @@ -1094,7 +1066,7 @@ class Measurement : QuakeOp:$registerName ); let results = (outs - AnyTypeOf<[MeasureType, MeasurementsType]>:$measOut, + AnyTypeOf<[MeasureType, StdvecOf<[MeasureType]>]>:$measOut, Variadic:$wires ); @@ -1111,15 +1083,13 @@ class 
Measurement : QuakeOp { let summary = "Measurement along the x-axis"; let description = [{ - The `mx` operation measures the state of qubits along the x-axis. For a - single qubit the result is `!quake.measure`; for multiple qubits the result - is `!quake.measurements` (or `!quake.measurements` when unsized). + The `mx` operation measures the state of qubits into classical bits + represented by an `i1` (or a vector of `i1`), along the x-axis. The state of the qubits is collapsed into one of the computational basis states, i.e., either |0> or |1>. A `reset` operation can guarantee that the @@ -1132,9 +1102,8 @@ def MxOp : Measurement<"mx"> { def MyOp : Measurement<"my"> { let summary = "Measurement along the y-axis"; let description = [{ - The `my` operation measures the state of qubits along the y-axis. For a - single qubit the result is `!quake.measure`; for multiple qubits the result - is `!quake.measurements` (or `!quake.measurements` when unsized). + The `my` operation measures the state of qubits into classical bits + represented by an `i1` (or a vector of `i1`), along the y-axis. The state of the qubit is collapsed into one of the computational basis states, i.e., either |0> or |1>. A `reset` operation can guarantee that the @@ -1147,10 +1116,9 @@ def MyOp : Measurement<"my"> { def MzOp : Measurement<"mz"> { let summary = "Measurement along the z-axis"; let description = [{ - The `mz` operation measures the state of qubits along the z-axis---the - so-called computational basis. For a single qubit the result is - `!quake.measure`; for multiple qubits the result is - `!quake.measurements` (or `!quake.measurements` when unsized). + The `mz` operation measures the state of qubits into classical bits + represented by an `i1` (or a vector of `i1`), along the z-axis---the + so-called computational basis. The state of the qubit is collapsed into one of the computational basis states, i.e., either |0> or |1>.
A `reset` operation can guarantee that the @@ -1175,7 +1143,7 @@ def quake_DiscriminateOp : QuakeOp<"discriminate", [Pure]> { }]; let arguments = (ins - AnyTypeOf<[MeasureType, MeasurementsType]>:$measurement + AnyTypeOf<[MeasureType, StdvecOf<[MeasureType]>]>:$measurement ); let results = (outs AnyTypeOf<[AnySignlessInteger, StdvecOf<[AnySignlessInteger]>]> @@ -1188,64 +1156,6 @@ def quake_DiscriminateOp : QuakeOp<"discriminate", [Pure]> { let hasVerifier = 1; } -//===----------------------------------------------------------------------===// -// GetMeasureOp -//===----------------------------------------------------------------------===// - -def quake_GetMeasureOp : QuakeOp<"get_measure", [Pure]> { - let summary = - "Extract a single measurement from a measurements collection."; - let description = [{ - Extracts a single `!quake.measure` value from a `!quake.measurements` - collection by index. This is analogous to `quake.extract_ref` for qubits. - - Example: - ```mlir - %m = quake.get_measure %ms[0] : (!quake.measurements<4>) -> !quake.measure - ``` - }]; - - let arguments = (ins - MeasurementsType:$measurements, - Optional:$index, - I64Attr:$rawIndex - ); - let results = (outs MeasureType:$measure); - - let builders = [ - OpBuilder<(ins "mlir::Value":$measurements, "mlir::Value":$index, - "mlir::IntegerAttr":$rawIndex), [{ - return build($_builder, $_state, $_builder.getType(), - measurements, index, rawIndex); - }]>, - OpBuilder<(ins "mlir::Value":$measurements, "mlir::Value":$index), [{ - return build($_builder, $_state, $_builder.getType(), - measurements, index, GetMeasureOp::kDynamicIndex); - }]>, - OpBuilder<(ins "mlir::Value":$measurements, "std::size_t":$rawIndex), [{ - auto i64Ty = $_builder.getI64Type(); - return build($_builder, $_state, $_builder.getType(), - measurements, mlir::Value{}, - mlir::IntegerAttr::get(i64Ty, rawIndex)); - }]> - ]; - - let assemblyFormat = [{ - $measurements `[` custom($index, $rawIndex) `]` `:` - functional-type(operands, 
results) attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - static constexpr std::size_t kDynamicIndex = - std::numeric_limits::max(); - - bool hasConstantIndex() { return !getIndex(); } - std::size_t getConstantIndex() { return getRawIndex(); } - }]; -} - //===----------------------------------------------------------------------===// // Quantum gates //===----------------------------------------------------------------------===// diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h index 3bc91c21479..6c0d3ff51ed 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h @@ -31,8 +31,7 @@ inline bool isQuantumType(mlir::Type ty) { /// \returns true if \p `ty` is a Quake type. inline bool isQuakeType(mlir::Type ty) { // This should correspond to the registered types in QuakeTypes.cpp. - return isQuantumType(ty) || - mlir::isa(ty); + return isQuantumType(ty) || mlir::isa(ty); } /// \returns true if \p ty is a quantum reference type, excluding `struq`. diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td index 542f4861069..2f4f98cf0fd 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td @@ -251,41 +251,6 @@ def MeasureType : QuakeType<"Measure", "measure"> { let genStorageClass = 0; } -//===----------------------------------------------------------------------===// -// MeasurementsType: classical data type for a collection of measurements -//===----------------------------------------------------------------------===// - -def MeasurementsType : QuakeType<"Measurements", "measurements"> { - let summary = "a sequence of measurement results"; - let description = [{ - A value of type `measurements` is a collection of values of type `measure`. 
- This is the natural result type of measuring multiple qubits. Like `veq` is - to `ref`, `measurements` is to `measure`. - - ```mlir - %ms = quake.mz %qubits : (!quake.veq<4>) -> !quake.measurements<4> - %m0 = quake.get_measure %ms[0] : (!quake.measurements<4>) -> !quake.measure - ``` - }]; - - let parameters = (ins "std::size_t":$size); - - let hasCustomAssemblyFormat = 1; - - let extraClassDeclaration = [{ - static constexpr std::size_t kDynamicSize = - std::numeric_limits::max(); - - bool hasSpecifiedSize() const { return getSize() != kDynamicSize; } - bool hasNonZeroSpecifiedSize() const { - return hasSpecifiedSize() && getSize(); - } - static MeasurementsType getUnsized(mlir::MLIRContext *ctx) { - return MeasurementsType::get(ctx, kDynamicSize); - } - }]; -} - //===----------------------------------------------------------------------===// // StateType //===----------------------------------------------------------------------===// diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 98a3a0ec3ff..df9890e47b3 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -219,7 +219,7 @@ def CombineMeasurements : %1 = ... : !quake.veq<4> %2 = quake.subveq %1, %c2, %c3 : (!quake.veq<4>, i32, i32) -> !quake.veq<2> - %measOut = quake.mz %2 : (!quake.veq<2>) -> !quake.measurements<2> + %measOut = quake.mz %2 : (!quake.veq<2>) -> !cc.stdvec } ``` with: @@ -227,7 +227,7 @@ def CombineMeasurements : func.func @kernel() attributes {"cudaq-entrypoint", ["output_names", "[[[0,[1,\22q0\22]],[1,[2,\22q1\22]]]]"]} { %1 = ... 
: !quake.veq<4> - %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec } ``` }]; @@ -478,30 +478,15 @@ def ExpandControlVeqs: Pass<"expand-control-veqs", "mlir::func::FuncOp"> { } def ExpandMeasurements : Pass<"expand-measurements"> { - let summary = "Expand multi-qubit measurements to individual qubit ops."; + let summary = "Expand multi-ref measurements to series on single refs."; let description = [{ - The `mx`, `my`, `mz` ops can take a list of qubits and/or veq arguments - and return a `!quake.measurements` collection. The target may only - support measuring a single qubit however. This pass expands these - multi-qubit measurements into individual single-qubit measurements in two - steps. - - Step 1: Any `quake.discriminate` on a sized `!quake.measurements` - value is expanded into N individual `quake.get_measure` + - `quake.discriminate` operations on single `!quake.measure` values, with - results collected into a `!cc.stdvec`. - - Step 2: Multi-qubit `mx`, `my`, `mz` ops are replaced with individual - per-qubit measurements. For `veq` targets with known size, the qubits - are extracted via `quake.extract_ref` and measured individually. For - `veq` targets with dynamically-sized targets, a loop is generated - using `quake.veq_size` to compute the iteration count and individual - qubits are extracted via `quake.extract_ref` within the loop body. - - Multi-qubit measurements without local discriminate users are left intact. + The `mx`, `my`, `mz` ops can take a list of qubits and/or veq arguments. + The target may only support measuring a single qubit however. This pass + expands these ops in list format into a series of measurements (including + loops) on individual qubits and into a single `std::vector` result. - The `reset` op can also take a veq argument and this pass will expand that - to a loop of `reset` operations on individual qubits. 
+ The `reset` op can also take a veq argument and this pass will also expand + that to a series of `reset` operations on single qubits. }]; let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; diff --git a/lib/Frontend/nvqpp/ASTBridge.cpp b/lib/Frontend/nvqpp/ASTBridge.cpp index c3ac466c2e9..724b13e16e7 100644 --- a/lib/Frontend/nvqpp/ASTBridge.cpp +++ b/lib/Frontend/nvqpp/ASTBridge.cpp @@ -62,8 +62,8 @@ static bool isQubitType(Type ty) { } // Check the builtin type FunctionType to see if it has any references to Quake -// types (including measurement) in its arguments and/or results. -static bool hasAnyQuakeTypes(FunctionType funcTy) { +// qubit types in its arguments and/or results. +static bool hasAnyQubitTypes(FunctionType funcTy) { for (auto ty : funcTy.getInputs()) if (isQubitType(ty)) return true; @@ -639,7 +639,7 @@ void ASTBridgeAction::ASTBridgeConsumer::HandleTranslationUnit( auto unitAttr = UnitAttr::get(ctx); // Flag func as a quantum kernel. func->setAttr(kernelAttrName, unitAttr); - if ((!hasAnyQuakeTypes(func.getFunctionType())) && + if ((!hasAnyQubitTypes(func.getFunctionType())) && (!cudaq::ASTBridgeAction::ASTBridgeConsumer::isCustomOpGenerator( fdPair.second))) { // Flag func as an entry point to a quantum kernel. 
diff --git a/lib/Frontend/nvqpp/ConvertDecl.cpp b/lib/Frontend/nvqpp/ConvertDecl.cpp index 26ab9af5a25..fd01e4b5ec4 100644 --- a/lib/Frontend/nvqpp/ConvertDecl.cpp +++ b/lib/Frontend/nvqpp/ConvertDecl.cpp @@ -93,9 +93,8 @@ void QuakeBridgeVisitor::addArgumentSymbols( auto parmTy = entryBlock->getArgument(index).getType(); if (isa(parmTy)) { + quake::ControlType, quake::RefType, quake::StruqType, + quake::VeqType, quake::WireType>(parmTy)) { symbolTable.insert(name, entryBlock->getArgument(index)); } else { auto stackSlot = builder.create(loc, parmTy); @@ -170,9 +169,6 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) { auto fnTy = cast(popType()); return pushType(cc::IndirectCallableType::get(fnTy)); } - // Measurement result type. - if (name == "measure_result") - return pushType(quake::MeasureType::get(ctx)); if (!isInNamespace(x, "solvers") && !isInNamespace(x, "qec")) { auto loc = toLocation(x); TODO_loc(loc, "unhandled type, " + name + ", in cudaq namespace"); @@ -192,10 +188,6 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) { "std::vector element type is not supported"); return false; } - // TODO: std::vector will be replaced by - // cudaq::measure_vector, recognized directly by class name (see spec). - if (isa(ty)) - return pushType(quake::MeasurementsType::getUnsized(ctx)); return pushType(cc::StdvecType::get(ctx, ty)); } // std::vector => cc.stdvec @@ -740,14 +732,7 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { return true; } - if (isa(type)) { - assert(x->getInit() && "`measure_result` has no default constructor"); - auto initVal = popValue(); - symbolTable.insert(x->getName(), initVal); - if (auto meas = initVal.getDefiningOp()) - meas.setRegisterName(builder.getStringAttr(x->getName())); - return true; - } + // Here we maybe have something like auto var = mz(qreg) if (auto vecType = dyn_cast(type)) { // Variable is of !cc.stdvec type. 
if (x->getInit()) { @@ -759,11 +744,6 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { // and if so, find the mz and tag it with the variable name auto elementType = vecType.getElementType(); - if (auto meas = initVec.getDefiningOp()) { - meas.setRegisterName(builder.getStringAttr(x->getName())); - return true; - } - // Drop out if this is not an i1 if (!elementType.isIntOrFloat() || elementType.getIntOrFloatBitWidth() != 1) @@ -801,11 +781,6 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { auto firstGepUser = *gepOp->getResult(0).getUsers().begin(); if (auto storeOp = dyn_cast(firstGepUser)) { auto result = storeOp->getOperand(0); - if (auto measureOp = - result.getDefiningOp()) { - measureOp.setRegisterName(builder.getStringAttr(x->getName())); - break; - } if (auto discr = result.getDefiningOp()) if (auto mzOp = discr.getMeasurement().getDefiningOp()) { @@ -842,8 +817,9 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { // If this was an auto var = mz(q), then we want to know the // var name, as it will serve as the classical bit register name - if (auto meas = initValue.getDefiningOp()) - meas.setRegisterName(builder.getStringAttr(x->getName())); + if (auto discr = initValue.getDefiningOp()) + if (auto mz = discr.getMeasurement().getDefiningOp()) + mz.setRegisterName(builder.getStringAttr(x->getName())); assert(initValue && "initializer value must be lowered"); if (isa(initValue.getType()) && isa(type)) { diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp index a2ee180f508..70aaf25f990 100644 --- a/lib/Frontend/nvqpp/ConvertExpr.cpp +++ b/lib/Frontend/nvqpp/ConvertExpr.cpp @@ -555,13 +555,6 @@ SmallVector QuakeBridgeVisitor::convertKernelArgs( continue; } } - if (auto vMeasTy = dyn_cast(vTy)) - if (auto kMeasTy = dyn_cast(kTy)) - if (vMeasTy.hasSpecifiedSize() && !kMeasTy.hasSpecifiedSize()) { - auto relax = builder.create(loc, kMeasTy, v); - result.push_back(relax); - continue; - } 
LLVM_DEBUG(llvm::dbgs() << "convert: " << v << "\nto:" << kTy << '\n'); TODO_loc(loc, "argument type conversion"); @@ -665,7 +658,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { } case clang::CastKind::CK_UserDefinedConversion: { auto sub = popValue(); - // castToTy is the conversion function signature. + // castToTy is the conversion function signature. castToTy = popType(); if (isa(castToTy) && isa(sub.getType())) { auto locSub = toLocation(x->getSubExpr()); @@ -673,29 +666,6 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { assert(result && "integer conversion failed"); return result; } - auto i1Type = builder.getI1Type(); - // Handle conversion of `measure_result` - auto measTy = quake::MeasureType::get(builder.getContext()); - if (sub.getType() == measTy) { - auto i1Val = builder.create(loc, i1Type, sub); - // Convert to `int` - if (isa(castToTy)) - return pushValue(builder.create( - loc, castToTy, i1Val, cudaq::cc::CastOpMode::Unsigned)); - // Convert to `float` - if (isa(castToTy)) - return pushValue(builder.create( - loc, castToTy, i1Val, cudaq::cc::CastOpMode::Unsigned)); - // Otherwise, just return the `i1` value - return pushValue(i1Val); - } - - // Handle conversion of measurement collection to std::vector. - // TODO: will become measure_vector::operator std::vector(). - if (isa(sub.getType())) - return pushValue(builder.create( - loc, cc::StdvecType::get(i1Type), sub)); - TODO_loc(loc, "unhandled user-defined implicit conversion"); } case clang::CastKind::CK_ConstructorConversion: { @@ -1045,7 +1015,7 @@ bool QuakeBridgeVisitor::VisitMaterializeTemporaryExpr( // In those cases, there is nothing to materialize, so we can just pass the // Value on the top of the stack.
if (isa(ty)) + quake::StateType>(ty)) return true; // If not one of the above special cases, then materialize the value to a @@ -1311,14 +1281,6 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto svec = popValue(); if (isa(svec.getType())) svec = builder.create(loc, svec); - if (isa(svec.getType()) && funcName == "size") - if (auto memberCall = dyn_cast(x)) - if (memberCall->getImplicitObjectArgument()) { - [[maybe_unused]] auto calleeTy = popType(); - assert(isa(calleeTy)); - return pushValue(builder.create( - loc, builder.getI64Type(), svec)); - } auto ext = builder.create(loc, builder.getI64Type(), svec); if (funcName == "size") @@ -1558,38 +1520,10 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto funcArity = func->getNumParams(); SmallVector args = lastValues(funcArity); if (isa(func)) { - auto thisPtrValue = popValue(); - - // For `measure_result`, the implicit "this" value is the `!quake.measure` - // SSA value; forward it unchanged - if (isa(func) && - isInClassInNamespace(func, "measure_result", "cudaq")) - return pushValue(thisPtrValue); + [[maybe_unused]] auto thisPtrValue = popValue(); } auto calleeOp = popValue(); - // Handle operator== and operator!= for measure_result (friend functions) - if (func->isOverloadedOperator() && isInNamespace(func, "cudaq")) { - auto opKind = func->getOverloadedOperator(); - if ((opKind == clang::OO_EqualEqual || opKind == clang::OO_ExclaimEqual) && - args.size() == 2) { - auto lhs = args[0]; - auto rhs = args[1]; - auto measTy = quake::MeasureType::get(builder.getContext()); - if (lhs.getType() == measTy || rhs.getType() == measTy) { - auto i1Type = builder.getI1Type(); - if (lhs.getType() == measTy) - lhs = builder.create(loc, i1Type, lhs); - if (rhs.getType() == measTy) - rhs = builder.create(loc, i1Type, rhs); - // Choose predicate based on operator - auto pred = (opKind == clang::OO_EqualEqual) ? 
arith::CmpIPredicate::eq - : arith::CmpIPredicate::ne; - return pushValue(builder.create(loc, pred, lhs, rhs)); - } - } - } - if (isInNamespace(func, "cudaq")) { // Check and see if this quantum operation is adjoint bool isAdjoint = false; @@ -1712,33 +1646,25 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { } if (funcName == "mx" || funcName == "my" || funcName == "mz") { - bool useMeasurements = + // Measurements always return a bool or a std::vector. + bool useStdvec = (args.size() > 1) || (args.size() == 1 && isa(args[0].getType())); auto measure = [&]() -> Value { Type measTy = quake::MeasureType::get(builder.getContext()); - if (useMeasurements) { - std::size_t totalSize = 0; - bool allKnown = true; - for (auto a : args) { - if (quake::isConstantQuantumRefType(a.getType())) - totalSize += quake::getAllocationSize(a.getType()); - else - allKnown = false; - } - if (allKnown && totalSize > 0) - measTy = - quake::MeasurementsType::get(builder.getContext(), totalSize); - else - measTy = quake::MeasurementsType::getUnsized(builder.getContext()); - } + if (useStdvec) + measTy = cc::StdvecType::get(measTy); if (funcName == "mx") return builder.create(loc, measTy, args).getMeasOut(); if (funcName == "my") return builder.create(loc, measTy, args).getMeasOut(); return builder.create(loc, measTy, args).getMeasOut(); }(); - return pushValue(measure); + Type resTy = builder.getI1Type(); + if (useStdvec) + resTy = cc::StdvecType::get(resTy); + return pushValue( + builder.create(loc, resTy, measure)); } // Handle the quantum gate set. @@ -2198,39 +2124,16 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { return true; } - // TODO: will be replaced by measure_vector::operator std::int64_t(). 
if (funcName == "toInteger" || funcName == "to_integer") { - auto arg = args[0]; - auto i1Ty = builder.getI1Type(); - auto boolVecTy = cc::StdvecType::get(i1Ty); - if (isa(arg.getType())) - arg = builder.create(loc, boolVecTy, arg); - else if (arg.getType() != boolVecTy) - reportClangError(x, mangler, - "`to_integer` requires measurements or " - "std::vector argument"); IRBuilder irBuilder(builder.getContext()); if (failed(irBuilder.loadIntrinsic(module, cudaqConvertToInteger))) { reportClangError(x, mangler, "cannot load cudaqConvertToInteger"); return false; } auto i64Ty = builder.getI64Type(); - return pushValue(builder - .create(loc, i64Ty, - cudaqConvertToInteger, - ValueRange{arg}) - .getResult(0)); - } - - // TODO: will be replaced by measure_vector::operator std::vector(). - if (funcName == "to_bool_vector") { - auto arg = args[0]; - assert(isa(arg.getType()) && - "to_bool_vector requires measurements type argument"); - auto i1Ty = builder.getI1Type(); - arg = builder.create( - loc, cc::StdvecType::get(i1Ty), arg); - return pushValue(arg); + return pushValue( + builder.create(loc, i64Ty, cudaqConvertToInteger, args) + .getResult(0)); } if (funcName == "slice_vector") { @@ -2627,10 +2530,6 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( auto svec = popValue(); if (isa(svec.getType())) svec = builder.create(loc, svec); - if (isa(svec.getType())) { - auto getMeas = builder.create(loc, svec, indexVar); - return replaceTOSValue(getMeas); - } if (!isa(svec.getType())) { TODO_x(loc, x, mangler, "vector dereference"); return false; @@ -3352,20 +3251,6 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { return pushValue(builder.create(loc, copyObj)); } - // For `measure_result`, the implicit "this" value is the `!quake.measure` - // SSA value; forward it unchanged. - // Note: Copy support is a temporary concession while - // `std::vector` exists (its `operator[]` returns by - // reference, forcing copies). 
Once replaced, it becomes move-only. - if ((ctor->isCopyConstructor() || ctor->isMoveConstructor()) && - isInClassInNamespace(ctor, "measure_result", "cudaq")) { - assert(x->getNumArgs() == 1); - auto src = popValue(); - assert(isa(src.getType()) && - "`measure_result` copy/move source must be `!quake.measure`"); - return pushValue(src); - } - // TODO: remove this when we can handle ctors more generally. if (!ctor->isDefaultConstructor()) { LLVM_DEBUG(llvm::dbgs() << ctorName << " - unhandled ctor:\n"; x->dump()); diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp index 23ee12901f5..54bd9ca50ec 100644 --- a/lib/Frontend/nvqpp/ConvertStmt.cpp +++ b/lib/Frontend/nvqpp/ConvertStmt.cpp @@ -263,46 +263,6 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, auto idxIters = builder.create( loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); - } else if (auto measTy = - dyn_cast(buffer.getType())) { - Value iters; - if (measTy.hasSpecifiedSize()) { - iters = - builder.create(loc, measTy.getSize(), i64Ty); - } else if (auto measIface = dyn_cast_or_null( - buffer.getDefiningOp())) { - // Derive the iteration count from the measurement op's qubit targets. - for (auto target : measIface.getTargets()) { - Value count; - if (auto veqTy = dyn_cast(target.getType())) { - if (veqTy.hasSpecifiedSize()) - count = builder.create(loc, veqTy.getSize(), - i64Ty); - else - count = builder.create(loc, i64Ty, target); - } else { - count = builder.create(loc, 1, i64Ty); - } - iters = - iters ? 
builder.create(loc, iters, count).getResult() - : count; - } - } else { - iters = builder.create(loc, i64Ty, buffer); - } - auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region ®ion, - Block &block) { - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&block); - Value index = block.getArgument(0); - Value measure = builder.create(loc, buffer, index); - symbolTable.insert(loopVar->getName(), measure); - if (!TraverseStmt(static_cast(body))) - result = false; - }; - auto idxIters = builder.create( - loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); - opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); } else { TODO_x(toLocation(x), x, mangler, "ranged for statement"); } @@ -376,22 +336,6 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) { result = builder.create(loc, i1Ty, result); } } - // Relax sized measurements to unsized when the function expects unsized. - if (auto measTy = dyn_cast(result.getType())) { - auto *parentOp = builder.getBlock()->getParentOp(); - auto funcOp = dyn_cast(parentOp); - if (!funcOp) - funcOp = parentOp->getParentOfType(); - if (funcOp) { - auto fnTy = funcOp.getFunctionType(); - if (fnTy.getNumResults() == 1) - if (auto fnResMeasTy = - dyn_cast(fnTy.getResult(0))) - if (measTy != fnResMeasTy) - result = - builder.create(loc, fnResMeasTy, result); - } - } if (auto vecTy = dyn_cast(resTy)) { // Returning vector data that was allocated on the stack is not valid. // Allocate space on the heap and make a copy of the vector instead. 
It diff --git a/lib/Frontend/nvqpp/ConvertType.cpp b/lib/Frontend/nvqpp/ConvertType.cpp index e151331aafd..c21ef8d56a9 100644 --- a/lib/Frontend/nvqpp/ConvertType.cpp +++ b/lib/Frontend/nvqpp/ConvertType.cpp @@ -124,21 +124,13 @@ static bool isFunctionCallable(Type t) { return false; } -static bool isMeasureType(Type t) { - if (isa(t)) - return true; - if (auto vec = dyn_cast(t)) - return isMeasureType(vec.getElementType()); - return false; -} - /// Return true if and only if \p t is a (simple) arithmetic type, an arithmetic /// sequence type (possibly dynamic in length), or a static product type of /// arithmetic types. Note that this means a product type with a dynamic /// sequence of arithmetic types is \em disallowed. static bool isKernelResultType(Type t) { return isArithmeticType(t) || isArithmeticSequenceType(t) || - isStaticArithmeticProductType(t) || isMeasureType(t); + isStaticArithmeticProductType(t); } /// Return true if and only if \p t is a (simple) arithmetic type, an possibly @@ -147,7 +139,7 @@ static bool isKernelResultType(Type t) { static bool isKernelArgumentType(Type t) { return isArithmeticType(t) || isComposedArithmeticType(t) || quake::isQuantumReferenceType(t) || isKernelCallable(t) || - isFunctionCallable(t) || isMeasureType(t) || + isFunctionCallable(t) || // TODO: move from pointers to a builtin string type. cudaq::isCharPointerType(t); } @@ -457,8 +449,7 @@ bool QuakeBridgeVisitor::VisitLValueReferenceType( return pushType(cc::PointerType::get(builder.getContext())); auto eleTy = popType(); if (isa(eleTy)) + quake::VeqType, quake::RefType, quake::StruqType>(eleTy)) return pushType(eleTy); return pushType(cc::PointerType::get(eleTy)); } @@ -471,8 +462,7 @@ bool QuakeBridgeVisitor::VisitRValueReferenceType( // FIXME: LLVMStructType is promoted as a temporary workaround. 
if (isa(eleTy)) + quake::StruqType, LLVM::LLVMStructType>(eleTy)) return pushType(eleTy); return pushType(cc::PointerType::get(eleTy)); } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 968035e37c0..c611b15a1f5 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -551,12 +551,10 @@ static constexpr IntrinsicCode intrinsicTable[] = { func.func private @__quantum__rt__qubit_release(!qir_qubit) func.func private @__quantum__rt__array_create_1d(i32, i64) -> !qir_array - func.func private @__quantum__rt__result_array_create_1d(i64) -> !qir_array func.func private @__quantum__rt__array_concatenate(!qir_array, !qir_array) -> !qir_array func.func private @__quantum__rt__array_get_size_1d(!qir_array) -> i64 func.func private @__quantum__rt__array_slice(!qir_array, i32, i64, i64, i64) -> !qir_array func.func private @__quantum__rt__array_get_element_ptr_1d(!qir_array, i64) -> !cc.ptr - func.func private @__quantum__rt__result_array_get_element_ptr_1d(!qir_array, i64) -> !cc.ptr func.func private @__quantum__qis__h__ctl(!qir_array, !qir_qubit) func.func private @__quantum__qis__x__ctl(!qir_array, !qir_qubit) diff --git a/lib/Optimizer/Builder/Marshal.cpp b/lib/Optimizer/Builder/Marshal.cpp index 03633b8d496..7c272eb3f12 100644 --- a/lib/Optimizer/Builder/Marshal.cpp +++ b/lib/Optimizer/Builder/Marshal.cpp @@ -761,10 +761,10 @@ void cudaq::opt::marshal::populateCallbackBuffer( bool cudaq::opt::marshal::hasLegalType(FunctionType funTy) { for (auto ty : funTy.getInputs()) - if (quake::isQuakeType(ty)) + if (quake::isQuantumType(ty)) return false; for (auto ty : funTy.getResults()) - if (quake::isQuakeType(ty)) + if (quake::isQuantumType(ty)) return false; return true; } @@ -790,10 +790,6 @@ std::pair cudaq::opt::marshal::lookupHostEntryPointFunc( // No host entry point needed. 
return {false, func::FuncOp{}}; } - // Device-only kernels (those with quantum types or `measure_result` in their - // signature) have no host-side entry point, so skip them. - if (!funcOp->hasAttr(cudaq::entryPointAttrName)) - return {false, func::FuncOp{}}; if (auto *decl = module.lookupSymbol(mangledEntryPointName)) if (auto func = dyn_cast(decl)) { func.eraseBody(); diff --git a/lib/Optimizer/CodeGen/CMakeLists.txt b/lib/Optimizer/CodeGen/CMakeLists.txt index d64b2b32fd9..d6036b56e60 100644 --- a/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/lib/Optimizer/CodeGen/CMakeLists.txt @@ -21,6 +21,7 @@ add_cudaq_library(OptCodeGen ConvertToQIR.cpp ConvertToQIRAPI.cpp DelayMeasurements.cpp + EliminateDeadHeapCopy.cpp OptUtils.cpp Passes.cpp Pipelines.cpp diff --git a/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp b/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp index 78585b13502..9cb7869cd66 100644 --- a/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp +++ b/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp @@ -49,8 +49,7 @@ struct QuakeTypeConverter : public TypeConverter { return cudaq::cc::StructType::get(ty.getContext(), mems); }); addConversion([](quake::MeasureType ty) { - auto i64Ty = IntegerType::get(ty.getContext(), 64); - return cudaq::cc::StructType::get(ty.getContext(), {i64Ty, i64Ty}); + return IntegerType::get(ty.getContext(), 64); }); } }; diff --git a/lib/Optimizer/CodeGen/ConvertToQIR.cpp b/lib/Optimizer/CodeGen/ConvertToQIR.cpp index a9770100005..686eb82d806 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIR.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIR.cpp @@ -206,10 +206,8 @@ void cudaq::opt::initializeTypeConversions(LLVMTypeConverter &typeConverter) { return LLVM::LLVMStructType::getLiteral(type.getContext(), mems, /*packed=*/false); }); - typeConverter.addConversion( - [](quake::MeasureType type) { return getResultType(type.getContext()); }); - typeConverter.addConversion([](quake::MeasurementsType type) { - return getArrayType(type.getContext()); + 
typeConverter.addConversion([](quake::MeasureType type) { + return IntegerType::get(type.getContext(), 1); }); cudaq::opt::populateCCTypeConversions(&typeConverter); } diff --git a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp index 8467cacc73c..0d08c1416e3 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp @@ -7,7 +7,6 @@ ******************************************************************************/ #include "CodeGenOps.h" -#include "cudaq/Optimizer/Builder/Factory.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/Builder/Runtime.h" #include "cudaq/Optimizer/CodeGen/CodeGenDialect.h" @@ -149,31 +148,7 @@ struct QIRAPITypeConverter : public TypeConverter { [&](quake::CableType ty) { return getArrayType(ty.getContext()); }); addConversion( [&](quake::MeasureType ty) { return getResultType(ty.getContext()); }); - addConversion([&](quake::MeasurementsType ty) { - return getArrayType(ty.getContext()); - }); addConversion([&](quake::StruqType ty) { return convertStruqType(ty); }); - addConversion([&](cudaq::cc::StdvecType ty) { - return cudaq::cc::StdvecType::get(ty.getContext(), - convertType(ty.getElementType())); - }); - addConversion([&](cudaq::cc::ArrayType ty) { - auto newEleTy = convertType(ty.getElementType()); - auto size = ty.getSize(); - if (size) - return cudaq::cc::ArrayType::get(ty.getContext(), newEleTy, size); - return cudaq::cc::ArrayType::get(newEleTy); - }); - addConversion([&](cudaq::cc::StructType ty) -> Type { - if (ty.getOpaque()) - return ty; - SmallVector members; - for (auto memTy : ty.getMembers()) - members.push_back(convertType(memTy)); - return cudaq::cc::StructType::get(ty.getContext(), ty.getName(), members, - /*opaque=*/false, ty.getPacked(), - ty.getBitSize(), ty.getAlignment()); - }); } Type convertFunctionType(FunctionType ty) { @@ -741,44 +716,6 @@ struct DeallocLikeErase : public OpConversionPattern { using 
DeallocOpErase = DeallocLikeErase; using SinkOpErase = DeallocLikeErase; - -// Lower `quake.get_measure` to `result_array_get_element_ptr_1d`. -struct GetMeasureOpRewrite : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::GetMeasureOp getMeas, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = getMeas.getLoc(); - auto i64Ty = rewriter.getI64Type(); - Value index; - if (!adaptor.getIndex()) { - index = - rewriter.create(loc, getMeas.getRawIndex(), 64); - } else { - index = adaptor.getIndex(); - if (isa(index.getType())) { - index = rewriter.create(loc, i64Ty, index); - } else if (isa(index.getType())) { - auto width = cast(index.getType()).getWidth(); - if (width < 64) - index = rewriter.create( - loc, i64Ty, index, cudaq::cc::CastOpMode::Unsigned); - else if (width > 64) - index = rewriter.create(loc, i64Ty, index); - } - } - auto resultTy = - getTypeConverter()->convertType(getMeas.getMeasure().getType()); - auto ptrResultTy = cudaq::cc::PointerType::get(resultTy); - auto call = rewriter.create( - loc, TypeRange{ptrResultTy}, cudaq::opt::QIRResultArrayGetElementPtr1d, - ArrayRef{adaptor.getMeasurements(), index}); - rewriter.replaceOpWithNewOp(getMeas, call.getResult(0)); - return success(); - } -}; - struct DiscriminateOpRewrite : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -788,86 +725,9 @@ struct DiscriminateOpRewrite ConversionPatternRewriter &rewriter) const override { auto loc = disc.getLoc(); Value m = adaptor.getMeasurement(); - - // If the result is a stdvec (indicating a `MeasurementsType` input), loop - // over the result array and read each result. NB: we check the result type - // rather than the operand type because the type converter has already - // remapped the operand from MeasurementsType to Array*. 
- if (isa(disc.getResult().getType())) { - auto i1Ty = rewriter.getI1Type(); - auto i1PtrTy = cudaq::cc::PointerType::get(i1Ty); - auto i64Ty = rewriter.getI64Type(); - auto resultTy = cudaq::cg::getResultType(rewriter.getContext()); - auto ptrResultTy = cudaq::cc::PointerType::get(resultTy); - - auto stdvecResTy = cast( - getTypeConverter()->convertType(disc.getResult().getType())); - auto elemTy = stdvecResTy.getElementType(); - unsigned elemWidth = cast(elemTy).getWidth(); - Type bufElemTy = - elemWidth > 8 ? elemTy : static_cast(rewriter.getI8Type()); - - Value arraySize = - rewriter - .create(loc, i64Ty, cudaq::opt::QIRArrayGetSize, - ValueRange{m}) - .getResult(0); - Value buff = - rewriter.create(loc, bufElemTy, arraySize); - - cudaq::opt::factory::createInvariantLoop( - rewriter, loc, arraySize, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value iv = block.getArgument(0); - Value elemPtr = builder - .create( - loc, ptrResultTy, - cudaq::opt::QIRResultArrayGetElementPtr1d, - ValueRange{m, iv}) - .getResult(0); - Value resultVal = builder.create(loc, elemPtr); - Value bitPtr = - builder.create(loc, i1PtrTy, resultVal); - Value bit = builder.create(loc, bitPtr); - Value addr = builder.create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, iv); - Value stored = (i1Ty != bufElemTy) - ? 
builder - .create( - loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : static_cast(bit); - builder.create(loc, stored, addr); - }); - - auto ptrArrElemTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(elemTy)); - auto buffCast = - rewriter.create(loc, ptrArrElemTy, buff); - rewriter.replaceOpWithNewOp(disc, stdvecResTy, - buffCast, arraySize); - return success(); - } - - auto i1Ty = rewriter.getI1Type(); - auto i1PtrTy = cudaq::cc::PointerType::get(i1Ty); - auto origResTy = disc.getResult().getType(); - Value loaded; - if (auto intTy = dyn_cast(origResTy); - intTy && intTy.getWidth() > 1) { - // For wider-than-i1 types: use byte-addressable i8* load, then cc.cast - // to truncate to the target width. - auto i8Ty = rewriter.getI8Type(); - auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); - auto bytePtr = rewriter.create(loc, i8PtrTy, m); - Value byteVal = rewriter.create(loc, bytePtr); - loaded = rewriter.create(loc, origResTy, byteVal); - } else { - auto ptrCast = rewriter.create(loc, i1PtrTy, m); - loaded = rewriter.create(loc, ptrCast); - } - rewriter.replaceOp(disc, loaded); + auto i1PtrTy = cudaq::cc::PointerType::get(rewriter.getI1Type()); + auto cast = rewriter.create(loc, i1PtrTy, m); + rewriter.replaceOpWithNewOp(disc, cast); return success(); } }; @@ -883,45 +743,26 @@ struct DiscriminateOpToCallRewrite LogicalResult matchAndRewrite(quake::DiscriminateOp disc, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - // This pattern handles single-qubit MeasureType only. - if (isa(disc.getResult().getType())) - return failure(); - auto loc = disc.getLoc(); - auto i1Ty = rewriter.getI1Type(); - Value loaded; if constexpr (M::discriminateToClassical) { - StringRef readFn = M::qirVersion == QirVersion::version_1_0 - ? 
cudaq::opt::qir1_0::ReadResult - : cudaq::opt::qir0_1::ReadResultBody; - auto call = rewriter.create(loc, i1Ty, readFn, - adaptor.getOperands()); - loaded = call.getResult(0); + if constexpr (M::qirVersion == QirVersion::version_1_0) { + rewriter.replaceOpWithNewOp( + disc, rewriter.getI1Type(), cudaq::opt::qir1_0::ReadResult, + adaptor.getOperands()); + } else { + rewriter.replaceOpWithNewOp( + disc, rewriter.getI1Type(), cudaq::opt::qir0_1::ReadResultBody, + adaptor.getOperands()); + } } else { + auto loc = disc.getLoc(); // NB: the double cast here is to avoid folding the pointer casts. auto i64Ty = rewriter.getI64Type(); auto unu = rewriter.create(loc, i64Ty, adaptor.getOperands()); - auto origResTy = disc.getResult().getType(); - if (auto intTy = dyn_cast(origResTy); - intTy && intTy.getWidth() > 1) { - auto i8Ty = rewriter.getI8Type(); - auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); - auto du = rewriter.create(loc, i8PtrTy, unu); - Value byteVal = rewriter.create(loc, du); - loaded = rewriter.create(loc, origResTy, byteVal); - } else { - auto ptrI1Ty = cudaq::cc::PointerType::get(i1Ty); - auto du = rewriter.create(loc, ptrI1Ty, unu); - loaded = rewriter.create(loc, du); - } + auto ptrI1Ty = cudaq::cc::PointerType::get(rewriter.getI1Type()); + auto du = rewriter.create(loc, ptrI1Ty, unu); + rewriter.replaceOpWithNewOp(disc, du); } - auto origResTy = disc.getResult().getType(); - if constexpr (M::discriminateToClassical) { - if (auto intTy = dyn_cast(origResTy); - intTy && intTy.getWidth() > 1) - loaded = rewriter.create(loc, origResTy, loaded); - } - rewriter.replaceOp(disc, loaded); return success(); } @@ -950,7 +791,7 @@ struct ExtractRefOpRewrite : public OpConversionPattern { loc, extract.getConstantIndex(), 64); } else { index = adaptor.getIndex(); - if (isa(index.getType())) { + if (index.getType().isIntOrFloat()) { if (cast(index.getType()).getWidth() < 64) index = rewriter.create( loc, i64Ty, index, cudaq::cc::CastOpMode::Unsigned); @@ -1005,20 
+846,6 @@ struct VeqSizeOpRewrite : public OpConversionPattern { } }; -struct MeasurementsSizeOpRewrite - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::MeasurementsSizeOp msize, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(msize, TypeRange{msize.getType()}, - cudaq::opt::QIRArrayGetSize, - adaptor.getOperands()); - return success(); - } -}; - struct MakeStruqOpRewrite : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -1473,12 +1300,9 @@ struct MeasurementOpPattern : public OpConversionPattern { SmallVector args{adaptor.getTargets().begin(), adaptor.getTargets().end()}; auto functionName = M::getQIRMeasure(); - bool isMultiQubit = isa(mz.getMeasOut().getType()); + // Are we using the measurement that returns a result? if constexpr (M::mzReturnsResultType) { - if (isMultiQubit) - return rewriteMultiQubitMeasurement(mz, adaptor, rewriter, loc, - regNameAttr); // Yes, the measurement results the result, so we can use a // straightforward codegen pattern. Use either the mz or the // mz_to_register call (with the name as an extra argument) and forward @@ -1542,113 +1366,6 @@ struct MeasurementOpPattern : public OpConversionPattern { } return success(); } - -private: - LogicalResult - rewriteMultiQubitMeasurement(quake::MzOp mz, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter, - Location loc, StringAttr regNameAttr) const { - auto *ctx = rewriter.getContext(); - auto i64Ty = rewriter.getI64Type(); - auto resultTy = M::getResultType(ctx); - auto arrayTy = M::getArrayType(ctx); - auto qubitTy = M::getQubitType(ctx); - auto ptrQubitTy = cudaq::cc::PointerType::get(qubitTy); - auto ptrResultTy = cudaq::cc::PointerType::get(resultTy); - - // Compute total number of qubits across all targets, caching veq sizes. 
- SmallVector veqSizes; - Value totalQubits = rewriter.create(loc, 0, 64); - for (auto [origTarget, convTarget] : - llvm::zip(mz.getTargets(), adaptor.getTargets())) { - if (isa(origTarget.getType())) { - Value one = rewriter.create(loc, 1, 64); - totalQubits = rewriter.create(loc, totalQubits, one); - veqSizes.push_back(Value{}); - } else { - Value sz = - rewriter - .create(loc, i64Ty, cudaq::opt::QIRArrayGetSize, - ValueRange{convTarget}) - .getResult(0); - totalQubits = rewriter.create(loc, totalQubits, sz); - veqSizes.push_back(sz); - } - } - - // Allocate the result array. - Value resultArray = rewriter - .create( - loc, arrayTy, cudaq::opt::QIRResultArrayCreate, - ValueRange{totalQubits}) - .getResult(0); - - auto functionName = M::getQIRMeasure(); - Value cstringGlobal; - if (mz->getAttr(cudaq::opt::MzAssignedNameAttrName)) { - functionName = cudaq::opt::QIRMeasureToRegister; - cstringGlobal = - createGlobalCString(mz, loc, rewriter, regNameAttr.getValue()); - } - - auto getResultSlot = [&](OpBuilder &builder, Location loc, Value array, - Value index) -> Value { - return builder - .create(loc, ptrResultTy, - cudaq::opt::QIRResultArrayGetElementPtr1d, - ValueRange{array, index}) - .getResult(0); - }; - - // Iterate over targets, measure each qubit, store Result* in the array. 
- Value offset = rewriter.create(loc, 0, 64); - Value one = rewriter.create(loc, 1, 64); - unsigned sizeIdx = 0; - for (auto [origTarget, convTarget] : - llvm::zip(mz.getTargets(), adaptor.getTargets())) { - if (isa(origTarget.getType())) { - SmallVector mzArgs{convTarget}; - if (cstringGlobal) - mzArgs.push_back(cstringGlobal); - Value result = - rewriter.create(loc, resultTy, functionName, mzArgs) - .getResult(0); - Value slot = getResultSlot(rewriter, loc, resultArray, offset); - rewriter.create(loc, result, slot); - offset = rewriter.create(loc, offset, one); - ++sizeIdx; - } else { - Value veqSize = veqSizes[sizeIdx++]; - auto savedOffset = offset; - cudaq::opt::factory::createInvariantLoop( - rewriter, loc, veqSize, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value iv = block.getArgument(0); - Value qubitPtr = - builder - .create(loc, ptrQubitTy, - cudaq::opt::QIRArrayGetElementPtr1d, - ValueRange{convTarget, iv}) - .getResult(0); - Value qubit = builder.create(loc, qubitPtr); - SmallVector mzArgs{qubit}; - if (cstringGlobal) - mzArgs.push_back(cstringGlobal); - Value result = - builder - .create(loc, resultTy, functionName, mzArgs) - .getResult(0); - Value idx = builder.create(loc, savedOffset, iv); - Value slot = getResultSlot(builder, loc, resultArray, idx); - builder.create(loc, result, slot); - }); - offset = rewriter.create(loc, offset, veqSize); - } - } - - rewriter.replaceOp(mz, resultArray); - return success(); - } }; template @@ -2154,23 +1871,20 @@ struct InstantiateCallablePattern } }; -template -struct ZeroResultOpPattern : public OpConversionPattern { - using Base = OpConversionPattern; +struct StoreOpPattern : public OpConversionPattern { + using Base = OpConversionPattern; using Base::Base; + using Base::getTypeConverter; LogicalResult - matchAndRewrite(OP op, typename Base::OpAdaptor adaptor, + matchAndRewrite(cudaq::cc::StoreOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - 
rewriter.replaceOpWithNewOp(op, TypeRange{}, adaptor.getOperands(), - op->getAttrs()); + rewriter.replaceOpWithNewOp( + op, TypeRange{}, adaptor.getOperands(), op->getAttrs()); return success(); } }; -using StoreOpPattern = ZeroResultOpPattern; -using LogOutputOpPattern = ZeroResultOpPattern; - template struct CallOpInterfacePattern : public OpConversionPattern { using Base = OpConversionPattern; @@ -2245,24 +1959,22 @@ struct CallableClosurePattern static void commonClassicalHandlingPatterns(RewritePatternSet &patterns, TypeConverter &typeConverter, MLIRContext *ctx) { - patterns - .insert( - typeConverter, ctx); + patterns.insert( + typeConverter, ctx); } static void commonQuakeHandlingPatterns(RewritePatternSet &patterns, TypeConverter &typeConverter, MLIRContext *ctx) { - patterns.insert(typeConverter, ctx); } @@ -2530,8 +2242,7 @@ struct QuakeToQIRAPIPass cudaq::cc::NoInlineCallOp, cudaq::cc::VarargCallOp, cudaq::cc::CallCallableOp, cudaq::cc::CallIndirectCallableOp, cudaq::cc::CastOp, cudaq::cc::FuncToPtrOp, cudaq::cc::StoreOp, - cudaq::cc::LoadOp, cudaq::cc::ComputePtrOp, cudaq::cc::StdvecInitOp, - cudaq::cc::StdvecDataOp, cudaq::cc::LogOutputOp>([&](Operation *op) { + cudaq::cc::LoadOp>([&](Operation *op) { for (auto opnd : op->getOperands()) if (hasQuakeType(opnd.getType())) return false; @@ -2549,16 +2260,6 @@ struct QuakeToQIRAPIPass static bool hasQuakeType(Type ty) { if (auto pty = dyn_cast(ty)) return hasQuakeType(pty.getElementType()); - if (auto aty = dyn_cast(ty)) - return hasQuakeType(aty.getElementType()); - if (auto sty = dyn_cast(ty)) - return hasQuakeType(sty.getElementType()); - if (auto sty = dyn_cast(ty)) { - for (auto memTy : sty.getMembers()) - if (hasQuakeType(memTy)) - return true; - return false; - } if (auto cty = dyn_cast(ty)) return hasQuakeType(cty.getSignature()); if (auto cty = dyn_cast(ty)) diff --git a/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp b/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp new file mode 100644 index 
00000000000..981a45b0f48 --- /dev/null +++ b/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp @@ -0,0 +1,109 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "PassDetails.h" +#include "cudaq/Optimizer/CodeGen/Passes.h" +#include "cudaq/Optimizer/Dialect/CC/CCOps.h" + +#define DEBUG_TYPE "eliminate-dead-heap-copy" + +namespace cudaq::opt { +#define GEN_PASS_DEF_ELIMINATEDEADHEAPCOPY +#include "cudaq/Optimizer/CodeGen/Passes.h.inc" +} // namespace cudaq::opt + +using namespace mlir; + +namespace { + +/// When a kernel returns a vector, the frontend copies the stack data to the +/// heap via malloc+memcpy (from __nvqpp_vectorCopyCtor) so the data outlives +/// the callee's stack frame. After inlining and ReturnToOutputLog, the output +/// logging reads from the heap buffer through cc.cast ops, and the +/// cc.stdvec_init that wrapped the malloc becomes dead. This pass redirects +/// those cc.cast reads to the memcpy source (the original stack buffer) and +/// erases the now-dead malloc, memcpy, and cc.stdvec_init. +struct EliminateDeadHeapCopyPass + : public cudaq::opt::impl::EliminateDeadHeapCopyBase< + EliminateDeadHeapCopyPass> { + using EliminateDeadHeapCopyBase::EliminateDeadHeapCopyBase; + + void runOnOperation() override { + auto func = getOperation(); + SmallVector mallocCalls; + func.walk([&](func::CallOp callOp) { + if (callOp.getCallee() == "malloc") + mallocCalls.push_back(callOp); + }); + + for (auto mallocCall : mallocCalls) { + // malloc should return exactly one result (the allocated pointer). 
+ if (mallocCall->getNumResults() != 1) + continue; + Value mallocResult = mallocCall.getResult(0); + + // Classify users of the malloc result. + func::CallOp memcpyCall; + SmallVector deadVecInits; + SmallVector castUsers; + bool hasUnsafeUser = false; + + for (auto *user : mallocResult.getUsers()) { + if (auto userCall = dyn_cast(user)) { + if (userCall.getCallee().starts_with("llvm.memcpy") && + userCall.getOperand(0) == mallocResult) { + if (memcpyCall) { + // Multiple memcpys to the same malloc dest — bail out. + hasUnsafeUser = true; + break; + } + memcpyCall = userCall; + continue; + } + } + // A dead stdvec_init (no remaining users) can be safely erased. + // One with live users is treated as unsafe. + if (auto vecInit = dyn_cast(user)) { + if (vecInit->use_empty()) { + deadVecInits.push_back(vecInit); + continue; + } + } + // A cc.cast is safe to redirect: since the memcpy copies from + // source to the malloc buffer, reading through either pointer + // yields the same data. + if (auto castOp = dyn_cast(user)) { + castUsers.push_back(castOp); + continue; + } + // Any other user prevents elimination. + hasUnsafeUser = true; + break; + } + + if (!memcpyCall || hasUnsafeUser) + continue; + + Value memcpySrc = memcpyCall.getOperand(1); + + // Redirect cc.cast users from the malloc result to the memcpy source. + for (auto castOp : castUsers) + castOp->replaceUsesOfWith(mallocResult, memcpySrc); + + // Erase dead stdvec_inits. + for (auto vecInit : deadVecInits) + vecInit->erase(); + + // Erase memcpy and malloc. 
+ memcpyCall->erase(); + mallocCall->erase(); + } + } +}; + +} // namespace diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp index 43b26f15f2b..377b52b7797 100644 --- a/lib/Optimizer/CodeGen/Pipelines.cpp +++ b/lib/Optimizer/CodeGen/Pipelines.cpp @@ -171,7 +171,6 @@ void cudaq::opt::createPipelineTransformsForPythonToOpenQASM( } void cudaq::opt::addPipelineTranslateToOpenQASM(PassManager &pm) { - createCommonTargetCodegenPipeline(pm, {}); pm.addNestedPass(createClassicalMemToReg()); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createDeadStoreRemoval()); diff --git a/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp b/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp index 35f4380c4e0..8dbeac4659c 100644 --- a/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp +++ b/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp @@ -450,25 +450,6 @@ class VeqSizeOpRewrite : public OpConversionPattern { } }; -class MeasurementsSizeOpRewrite - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::MeasurementsSizeOp msize, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = msize->getLoc(); - auto i64Ty = rewriter.getI64Type(); - auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); - auto sizeptr = rewriter.create( - loc, ptrI64Ty, adaptor.getMeasurements(), - ArrayRef{1}); - rewriter.replaceOpWithNewOp(msize, sizeptr); - return success(); - } -}; - } // namespace void cudaq::opt::populateQuakeToCCPatterns(TypeConverter &converter, @@ -476,9 +457,8 @@ void cudaq::opt::populateQuakeToCCPatterns(TypeConverter &converter, auto *context = patterns.getContext(); patterns.insert, - GenericRewrite, + MzOpRewrite, ResetRewrite, SubveqOpRewrite, + GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, diff --git a/lib/Optimizer/CodeGen/QuakeToLLVM.cpp b/lib/Optimizer/CodeGen/QuakeToLLVM.cpp 
index f6dbd0206c7..32d845d2b6a 100644 --- a/lib/Optimizer/CodeGen/QuakeToLLVM.cpp +++ b/lib/Optimizer/CodeGen/QuakeToLLVM.cpp @@ -1144,7 +1144,11 @@ class MeasureRewrite : public ConvertOpToLLVMPattern { loc, cudaq::opt::getResultType(context), symbolRef, ValueRange{args}); if (regName) callOp->setAttr("registerName", regName); - rewriter.replaceOp(measure, callOp.getResult()); + auto i1Ty = rewriter.getI1Type(); + auto i1PtrTy = LLVM::LLVMPointerType::get(i1Ty); + auto cast = + rewriter.create(loc, i1PtrTy, callOp.getResult()); + rewriter.replaceOpWithNewOp(measure, i1Ty, cast); return success(); } @@ -1174,28 +1178,6 @@ class GetVeqSizeOpRewrite : public OpConversionPattern { } }; -class GetMeasurementsSizeOpRewrite - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::MeasurementsSizeOp msize, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto parentModule = msize->getParentOfType(); - auto context = parentModule->getContext(); - auto qFunctionName = cudaq::opt::QIRArrayGetSize; - - auto symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( - qFunctionName, rewriter.getI64Type(), - {cudaq::opt::getArrayType(context)}, parentModule); - - rewriter.replaceOpWithNewOp(msize, rewriter.getI64Type(), - symbolRef, adaptor.getOperands()); - return success(); - } -}; - //===----------------------------------------------------------------------===// // Other conversion patterns. 
//===----------------------------------------------------------------------===// @@ -1429,8 +1411,9 @@ void cudaq::opt::populateQuakeToLLVMPatterns(LLVMTypeConverter &typeConverter, unsigned &measureCounter) { auto *context = patterns.getContext(); cudaq::opt::populateQuakeToCCPrepPatterns(patterns); - patterns.insert(context); + patterns + .insert( + context); patterns .insert std::optional { - // Size of `measure_result` {value, unique_id} = 16 bytes - return {16}; - }) .Default({}); if (rawSize) diff --git a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc index e6d4bddb291..a2d45bc0d21 100644 --- a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc +++ b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc @@ -100,28 +100,6 @@ struct ForwardConstantVeqSizePattern } }; -// %4 = quake.measurements_size %3 : (!quake.measurements<10>) -> i64 -// ───────────────────────────────────────────────────────────────── -// %4 = constant 10 : i64 -struct ForwardConstantMeasurementsSizePattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(quake::MeasurementsSizeOp msSize, - PatternRewriter &rewriter) const override { - auto msTy = - dyn_cast(msSize.getMeasurements().getType()); - if (!msTy) - return failure(); - if (!msTy.hasSpecifiedSize()) - return failure(); - auto resTy = msSize.getType(); - rewriter.replaceOpWithNewOp(msSize, msTy.getSize(), - resTy); - return success(); - } -}; - // %2 = constant 10 : i32 // %3 = quake.alloca !quake.veq[%2 : i32] // ───────────────────────────────────────── @@ -737,98 +715,6 @@ struct MergeRotationPattern : public OpRewritePattern { } }; -// %0 = quake.alloca !quake.veq<2> -// %1 = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements -// ──────────────────────────────────────────────────────────── -// %0 = quake.alloca !quake.veq<2> -// %1 = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements<2> -template -struct 
FuseSizeToMeasurementPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(MeasOp measOp, - PatternRewriter &rewriter) const override { - auto measTy = - dyn_cast(measOp.getMeasOut().getType()); - if (!measTy || measTy.hasSpecifiedSize()) - return failure(); - - std::size_t totalSize = 0; - for (auto target : measOp.getTargets()) { - if (quake::isConstantQuantumRefType(target.getType())) { - totalSize += quake::getAllocationSize(target.getType()); - continue; - } - return failure(); - } - if (totalSize == 0) - return failure(); - - auto newMeasTy = - quake::MeasurementsType::get(rewriter.getContext(), totalSize); - - // If any user expects unsized measurements (return-like ops where the - // function returns unsized, or call ops where the callee parameter is - // unsized), insert a quake.relax_size to bridge the sized-to-unsized gap. - auto needsCastForUser = [&](OpOperand &use) -> bool { - auto *user = use.getOwner(); - if (isa(user)) { - if (auto funcOp = user->getParentOfType()) - if (funcOp.getFunctionType().getNumResults() == 1) - if (auto fnResMeasTy = dyn_cast( - funcOp.getFunctionType().getResult(0))) - return !fnResMeasTy.hasSpecifiedSize(); - return false; - } - auto checkCalleeArgType = [&](mlir::FunctionType calleeType, - unsigned argIdx) -> bool { - if (argIdx < calleeType.getNumInputs()) - if (auto paramMeasTy = dyn_cast( - calleeType.getInput(argIdx))) - return !paramMeasTy.hasSpecifiedSize(); - return false; - }; - if (auto callOp = dyn_cast(user)) - return checkCalleeArgType(callOp.getCalleeType(), - use.getOperandNumber()); - if (auto callOp = dyn_cast(user)) { - unsigned opIdx = use.getOperandNumber(); - if (opIdx == 0) - return false; // operand 0 is the callee value itself - auto calleeFnTy = cast( - callOp.getCallee().getType()); - return checkCalleeArgType(calleeFnTy, opIdx - 1); - } - return false; - }; - - SmallVector resultTypes; - resultTypes.push_back(newMeasTy); - for (unsigned 
i = 1; i < measOp->getNumResults(); ++i) - resultTypes.push_back(measOp->getResult(i).getType()); - - auto oldAttrs = measOp->getAttrs(); - auto newOp = rewriter.replaceOpWithNewOp(measOp, - TypeRange{resultTypes}, - measOp.getTargets(), - measOp.getRegisterNameAttr()); - for (auto &attr : oldAttrs) - if (!newOp->getAttr(attr.getName())) - newOp->setAttr(attr.getName(), attr.getValue()); - - for (auto &use : - llvm::make_early_inc_range(newOp.getMeasOut().getUses())) { - if (needsCastForUser(use)) { - rewriter.setInsertionPoint(use.getOwner()); - auto relax = rewriter.create( - use.getOwner()->getLoc(), measTy, newOp.getMeasOut()); - use.set(relax); - } - } - return success(); - } -}; - // Forward the argument to a relax_size to the users for all users that are // quake operations. All quake ops that take a sized veq argument are // polymorphic on all veq types. If the op is not a quake op, then maintain diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp index e8c8228f6fa..ac459be3e4e 100644 --- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp +++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp @@ -625,30 +625,6 @@ void quake::GetMemberOp::getCanonicalizationPatterns( patterns.add(context); } -//===----------------------------------------------------------------------===// -// GetMeasureOp -//===----------------------------------------------------------------------===// - -LogicalResult quake::GetMeasureOp::verify() { - if (getIndex()) { - if (getRawIndex() != kDynamicIndex) - return emitOpError( - "must not have both a constant index and an index argument."); - } else { - if (getRawIndex() == kDynamicIndex) { - return emitOpError("invalid constant index value"); - } else { - auto msSize = getMeasurements().getType().getSize(); - if (getMeasurements().getType().hasSpecifiedSize() && - getRawIndex() >= msSize) - return emitOpError("invalid index [" + std::to_string(getRawIndex()) + - "] because >= size [" + std::to_string(msSize) + 
- "]"); - } - } - return success(); -} - //===----------------------------------------------------------------------===// // InitializeStateOp //===----------------------------------------------------------------------===// @@ -702,19 +678,8 @@ LogicalResult quake::MakeStruqOp::verify() { //===----------------------------------------------------------------------===// LogicalResult quake::RelaxSizeOp::verify() { - auto inTy = getInputVec().getType(); - auto resTy = getType(); - if (auto veqTy = dyn_cast(resTy)) { - if (veqTy.hasSpecifiedSize()) - return emitOpError("result veq type must not specify a size"); - if (!isa(inTy)) - return emitOpError("input and result must both be veq types"); - } else if (auto measTy = dyn_cast(resTy)) { - if (measTy.hasSpecifiedSize()) - return emitOpError("result measurements type must not specify a size"); - if (!isa(inTy)) - return emitOpError("input and result must both be measurements types"); - } + if (cast(getType()).hasSpecifiedSize()) + emitOpError("return veq type must not specify a size"); return success(); } @@ -767,15 +732,6 @@ void quake::VeqSizeOp::getCanonicalizationPatterns(RewritePatternSet &patterns, context); } -//===----------------------------------------------------------------------===// -// MeasurementsSizeOp -//===----------------------------------------------------------------------===// - -void quake::MeasurementsSizeOp::getCanonicalizationPatterns( - RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); -} - //===----------------------------------------------------------------------===// // WrapOp //===----------------------------------------------------------------------===// @@ -868,12 +824,12 @@ LogicalResult verifyMeasurements(MEAS op, TypeRange targetsType, const Type bitsType) { if (failed(verifyWireResultsAreLinear(op))) return failure(); - bool mustBeCollection = + bool mustBeStdvec = targetsType.size() > 1 || (targetsType.size() == 1 && isa(targetsType[0])); - if 
(mustBeCollection) { - if (!isa(op.getMeasOut().getType())) - return op.emitOpError("must return `!quake.measurements`, when " + if (mustBeStdvec) { + if (!isa(op.getMeasOut().getType())) + return op.emitOpError("must return `!cc.stdvec`, when " "measuring a qreg, a series of qubits, or both"); } else { if (!isa(op.getMeasOut().getType())) @@ -901,34 +857,19 @@ LogicalResult quake::MzOp::verify() { getMeasOut().getType()); } -void quake::MxOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add>(context); -} - -void quake::MyOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add>(context); -} - -void quake::MzOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add>(context); -} - //===----------------------------------------------------------------------===// // Discriminate //===----------------------------------------------------------------------===// LogicalResult quake::DiscriminateOp::verify() { - if (isa(getMeasurement().getType())) { + if (isa(getMeasurement().getType())) { auto stdvecTy = dyn_cast(getResult().getType()); if (!stdvecTy || !isa(stdvecTy.getElementType())) return emitOpError("must return a !cc.stdvec type, when " - "discriminating a measurements collection"); + "discriminating a qreg, a series of qubits, or both"); } else { - if (!isa(getMeasurement().getType()) || - !isa(getResult().getType())) + auto measTy = isa(getMeasurement().getType()); + if (!measTy || !isa(getResult().getType())) return emitOpError( "must return integral type when discriminating exactly one qubit"); } diff --git a/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp b/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp index d61381860ee..b127ea60e88 100644 --- a/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp +++ b/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp @@ -49,34 +49,6 @@ Type quake::VeqType::parse(AsmParser &parser) { return get(parser.getContext(), 
size); } -//===----------------------------------------------------------------------===// -// Measurements' custom parser and pretty printing. -// -// measurements `<` (`?` | int) `>` -//===----------------------------------------------------------------------===// - -void quake::MeasurementsType::print(AsmPrinter &os) const { - os << '<'; - if (hasSpecifiedSize()) - os << getSize(); - else - os << '?'; - os << '>'; -} - -Type quake::MeasurementsType::parse(AsmParser &parser) { - if (parser.parseLess()) - return {}; - std::size_t size = kDynamicSize; - if (succeeded(parser.parseOptionalQuestion())) - size = kDynamicSize; - else if (parser.parseInteger(size)) - return {}; - if (parser.parseGreater()) - return {}; - return get(parser.getContext(), size); -} - //===----------------------------------------------------------------------===// Type quake::StruqType::parse(AsmParser &parser) { @@ -185,6 +157,6 @@ std::size_t quake::getAllocationSize(Type ty) { //===----------------------------------------------------------------------===// void quake::QuakeDialect::registerTypes() { - addTypes(); + addTypes(); } diff --git a/lib/Optimizer/Transforms/AddMeasurements.cpp b/lib/Optimizer/Transforms/AddMeasurements.cpp index b3776062286..1b71702ae1a 100644 --- a/lib/Optimizer/Transforms/AddMeasurements.cpp +++ b/lib/Optimizer/Transforms/AddMeasurements.cpp @@ -92,14 +92,10 @@ addMeasurements(func::FuncOp funcOp, SmallVector &allocations, builder.setInsertionPointToEnd(newBlock); auto measTy = quake::MeasureType::get(builder.getContext()); for (auto &[index, alloca] : llvm::enumerate(allocations)) { - if (auto veqTy = dyn_cast(alloca->getResult(0).getType())) { - Type measurementsTy = [&]() { - auto *ctx = builder.getContext(); - if (veqTy.hasSpecifiedSize()) - return quake::MeasurementsType::get(ctx, veqTy.getSize()); - return quake::MeasurementsType::getUnsized(ctx); - }(); - builder.create(loc, measurementsTy, alloca->getResult(0)); + if 
(isa(alloca->getResult(0).getType())) { + auto stdvecTy = cudaq::cc::StdvecType::get(measTy); + builder.create(loc, stdvecTy, + ValueRange{alloca->getResult(0)}); } else { builder.create(loc, measTy, alloca->getResult(0)); } diff --git a/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp b/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp index ad45ca10c05..c428bfdec2a 100644 --- a/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp +++ b/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp @@ -548,7 +548,13 @@ class ApplySpecializationPass << "cannot make adjoint of kernel: unstructured control flow\n"); return failure(); } - if (cudaq::opt::hasCallOp(func)) { + // quake::ApplyOp implements CallOpInterface but can be handled below by + // toggling isAdj. Reject any other call-like op that we cannot invert. + if (cudaq::opt::internal::hasCharacteristic( + [](Operation &op) { + return isa(op) && !isa(op); + }, + *func.getOperation())) { LLVM_DEBUG(llvm::dbgs() << "cannot make adjoint of kernel with calls\n"); return failure(); } @@ -583,7 +589,7 @@ class ApplySpecializationPass static SmallVector getOpsToInvert(Block &block) { SmallVector ops; for (auto &op : block) - if (cudaq::opt::hasQuantum(op)) + if (cudaq::opt::hasQuantum(op) || isa(op)) ops.push_back(&op); return ops; } @@ -775,6 +781,19 @@ class ApplySpecializationPass continue; } + if (auto applyOp = dyn_cast(op)) { + LLVM_DEBUG(llvm::dbgs() << "moving apply op: " << *op << ".\n"); + // Adjoint of an ApplyOp: toggles the isAdj flag. + mlir::UnitAttr newIsAdj = + applyOp.getIsAdj() ? 
mlir::UnitAttr{} + : mlir::UnitAttr::get(builder.getContext()); + builder.create( + applyOp.getLoc(), applyOp.getResultTypes(), applyOp.getCalleeAttr(), + newIsAdj, applyOp.getControls(), applyOp.getActuals()); + applyOp->erase(); + continue; + } + bool opWasNegated = false; IRMapping mapper; LLVM_DEBUG(llvm::dbgs() << "moving quantum op: " << *op << ".\n"); diff --git a/lib/Optimizer/Transforms/CombineMeasurements.cpp b/lib/Optimizer/Transforms/CombineMeasurements.cpp index f3d422e65d7..5065c8aa6b6 100644 --- a/lib/Optimizer/Transforms/CombineMeasurements.cpp +++ b/lib/Optimizer/Transforms/CombineMeasurements.cpp @@ -109,7 +109,7 @@ class ExtendQubitMeasurePattern : public OpRewritePattern { // with: // ``` // %1 = ... : !quake.veq<4> - // %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + // %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec // ``` // And collect output names information: `"[[[0,[1,"q0"]],[1,[2,"q1"]]]]"` LogicalResult matchAndRewrite(quake::MzOp measure, @@ -132,12 +132,7 @@ class ExtendQubitMeasurePattern : public OpRewritePattern { analysis.resultQubitVals[offset] = std::make_pair(idx, std::to_string(idx)); - Type resultType; - if (quake::isConstantQuantumRefType(veq.getType())) - resultType = quake::MeasurementsType::get( - measure->getContext(), quake::getAllocationSize(veq.getType())); - else - resultType = quake::MeasurementsType::getUnsized(measure->getContext()); + auto resultType = cudaq::cc::StdvecType::get(measure.getType(0)); if (measure == analysis.lastMeasurement) { rewriter.replaceOpWithNewOp(measure, TypeRange{resultType}, ValueRange{veq}, @@ -170,12 +165,12 @@ class ExtendVeqMeasurePattern : public OpRewritePattern { // %1 = ... : !quake.veq<4> // %2 = quake.subveq %1, %c1, %c2 : (!quake.veq<4>, i32, i32) -> // !quake.veq<2> - // %measOut = quake.mz %2 : (!quake.veq<2>) -> !quake.measurements<2> + // %measOut = quake.mz %2 : (!quake.veq<2>) -> !cc.stdvec // ``` // with: // ``` // %1 = ... 
: !quake.veq<4> - // %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + // %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec // ``` // And collect output names information: `"[[[0,[1,"q0"]],[1,[2,"q1"]]]]"` LogicalResult matchAndRewrite(quake::MzOp measure, @@ -208,21 +203,12 @@ class ExtendVeqMeasurePattern : public OpRewritePattern { analysis.resultQubitVals[offset] = std::make_pair(i, std::to_string(i)); } - if (measure == analysis.lastMeasurement) { - auto veq = subveq.getVeq(); - Type resultType; - if (quake::isConstantQuantumRefType(veq.getType())) - resultType = quake::MeasurementsType::get( - measure->getContext(), quake::getAllocationSize(veq.getType())); - else - resultType = - quake::MeasurementsType::getUnsized(measure->getContext()); - rewriter.replaceOpWithNewOp(measure, TypeRange{resultType}, - ValueRange{veq}, - measure.getRegisterNameAttr()); - } else if (measure.use_empty()) { + if (measure == analysis.lastMeasurement) + rewriter.replaceOpWithNewOp( + measure, measure.getResultTypes(), ValueRange{subveq.getVeq()}, + measure.getRegisterNameAttr()); + else if (measure.use_empty()) rewriter.eraseOp(measure); - } return success(); } diff --git a/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp b/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp index fda09bc8db2..9fac90636bf 100644 --- a/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp @@ -63,6 +63,21 @@ struct OperatorInfo { return name == other.name && numControls == other.numControls && isAdj == other.isAdj; } + + bool isUnbounded() const { + return numControls == std::numeric_limits::max(); + } + + /// Check if this gate matches another, treating unbounded (n) control + /// count as a wildcard that matches any concrete count. 
+ bool matches(const OperatorInfo &other) const { + if (name != other.name || isAdj != other.isAdj) + return false; + constexpr auto unbounded = std::numeric_limits::max(); + if (numControls == unbounded || other.numControls == unbounded) + return true; + return numControls == other.numControls; + } }; struct BasisTarget : public ConversionTarget { @@ -175,14 +190,15 @@ class DecompositionGraph { } /// Return all patterns that have the given gate as one of their targets. - /// - /// @param gate The gate to find incoming patterns for - /// @return A vector of pattern names (StringRef) whose targets include the - /// given gate - llvm::ArrayRef incomingPatterns(const OperatorInfo &gate) const { - static const llvm::SmallVector empty; - auto it = targetToPatterns.find(gate); - return it == targetToPatterns.end() ? empty : it->second; + /// Uses OperatorInfo::matches() to handle unbounded (n) control counts. + llvm::SmallVector + incomingPatterns(const OperatorInfo &gate) const { + llvm::SmallVector result; + for (const auto &[key, patterns] : targetToPatterns) { + if (key.matches(gate)) + result.append(patterns.begin(), patterns.end()); + } + return result; } /// Select subset of patterns relevant to decomposing to the given basis @@ -207,7 +223,12 @@ class DecompositionGraph { for (const auto &patternName : patternSelectionCache[hashVal]) { const auto &pattern = getPatternType(patternName); - patterns.add(pattern->create(patterns.getContext())); + // Patterns with unbounded (n) control counts get lower benefit so + // that specific patterns (e.g., CR1ToCX for r1(1)) are preferred + // when both match the same op. + OperatorInfo sourceInfo(pattern->getSourceOp()); + PatternBenefit benefit = sourceInfo.isUnbounded() ? 1 : 2; + patterns.add(pattern->create(patterns.getContext(), benefit)); } } @@ -260,18 +281,29 @@ class DecompositionGraph { gatesToVisit.push({gate, 0, std::nullopt}); } + /// Find the distance for a gate, handling unbounded (n) control counts. 
+ /// Exact hash lookup first for the common case, then a scan when the + /// query or any visited entry uses unbounded controls. + auto findGateDist = [&](const OperatorInfo &gate) -> std::size_t { + auto it = visitedGates.find(gate); + if (it != visitedGates.end()) + return it->second; + // Scan for wildcard matches (either side could be unbounded). + std::size_t best = std::numeric_limits::max(); + for (const auto &[visited, dist] : visitedGates) { + if (visited.matches(gate)) + best = std::min(best, dist); + } + return best; + }; + /// Compute the maximum distance from a pattern's targets to the basis /// gates. auto getPatternDist = [&](const auto &pattern) { auto targetGates = pattern->getTargetOps(); std::vector targetDistances; - for (const auto &targetGate : targetGates) { - if (visitedGates.count(targetGate)) { - targetDistances.push_back(visitedGates.at(targetGate)); - } else { - targetDistances.push_back(std::numeric_limits::max()); - } - } + for (const auto &targetGate : targetGates) + targetDistances.push_back(findGateDist(targetGate)); return *std::max_element(targetDistances.begin(), targetDistances.end()); }; diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.cpp b/lib/Optimizer/Transforms/DecompositionPatterns.cpp index 1add53a2f85..755ab75af38 100644 --- a/lib/Optimizer/Transforms/DecompositionPatterns.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatterns.cpp @@ -334,10 +334,9 @@ LogicalResult checkAndExtractControls(quake::OperatorInterface op, }; \ CUDAQ_REGISTER_TYPE(cudaq::DecompositionPatternType, PATTERN##Type, PATTERN) -// TODO: The decomposition patterns "SToR1", "TToR1", "R1ToU3", "U3ToRotations" -// can handle arbitrary number of controls, but currently metadata cannot -// capture this. The pattern types therefore only advertise them for a fixed -// number of controls (1 for "SToR1" and "TToR1", 0 for the rest). 
+// NOTE: The patterns SToR1, TToR1, R1ToU3, and U3ToRotations handle arbitrary +// control counts and are registered with (n) metadata. R1ToRz explicitly +// rejects controlled ops and uses bare metadata. //===----------------------------------------------------------------------===// // HOp decompositions @@ -608,7 +607,7 @@ struct R1ToU3 : public cudaq::DecompositionPattern { return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(R1ToU3, "r1", "u3"); +REGISTER_DECOMPOSITION_PATTERN(R1ToU3, "r1(n)", "u3(n)"); // quake.r1 (θ) target // ───────────────────────────────── @@ -800,7 +799,7 @@ struct SToR1 : public cudaq::DecompositionPattern { return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(SToR1, "s(1)", "r1(1)"); +REGISTER_DECOMPOSITION_PATTERN(SToR1, "s(n)", "r1(n)"); //===----------------------------------------------------------------------===// // TOp decompositions @@ -881,7 +880,7 @@ struct TToR1 : public cudaq::DecompositionPattern { return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(TToR1, "t(1)", "r1(1)"); +REGISTER_DECOMPOSITION_PATTERN(TToR1, "t(n)", "r1(n)"); //===----------------------------------------------------------------------===// // XOp decompositions @@ -1818,7 +1817,7 @@ struct U3ToRotations return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(U3ToRotations, "u3", "rz", "rx"); +REGISTER_DECOMPOSITION_PATTERN(U3ToRotations, "u3(n)", "rz(n)", "rx(n)"); } // namespace diff --git a/lib/Optimizer/Transforms/ExpandMeasurements.cpp b/lib/Optimizer/Transforms/ExpandMeasurements.cpp index e0f4fc299f7..1527608dca0 100644 --- a/lib/Optimizer/Transforms/ExpandMeasurements.cpp +++ b/lib/Optimizer/Transforms/ExpandMeasurements.cpp @@ -11,113 +11,68 @@ #include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" +#include "cudaq/Todo.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" -#include 
"mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" using namespace mlir; -namespace { // Only an individual qubit measurement returns a bool. template bool usesIndividualQubit(A x) { return x.getType() == quake::MeasureType::get(x.getContext()); } -// Pattern for expanding a multi-qubit measurement on unsized veq targets -// into a dynamic loop of individual measurements. +// Generalized pattern for expanding a multiple qubit measurement (whether it is +// mx, my, or mz) to a series of individual measurements. template -class ExpandUnsizedMeasurePattern : public OpRewritePattern { +class ExpandRewritePattern : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(A measureOp, PatternRewriter &rewriter) const override { - if (usesIndividualQubit(measureOp.getMeasOut())) - return failure(); - - // Only handle the unsized case here. - bool hasUnsizedTarget = false; - for (auto v : measureOp.getTargets()) - if (auto veqTy = dyn_cast(v.getType())) - if (!veqTy.hasSpecifiedSize()) - hasUnsizedTarget = true; - if (!hasUnsizedTarget) - return failure(); - - // Only expand if every user of the measurement result is a DiscriminateOp. - for (auto *user : measureOp.getMeasOut().getUsers()) - if (!isa(user)) - return failure(); - - // Even without discriminate users we must expand, because downstream QIR - // lowering cannot handle mz on veq. When discriminate users exist we - // additionally allocate a buffer to collect per-qubit results and build the - // stdvec that replaces each discriminate. - bool hasDiscriminateUsers = !measureOp.getMeasOut().use_empty(); auto loc = measureOp.getLoc(); - auto i64Ty = rewriter.getI64Type(); - auto measTy = quake::MeasureType::get(rewriter.getContext()); - // 1. Determine the total number of qubits we need to measure. This // determines the size of the buffer of bools to create to store the results // in. 
- Value buff, totalToRead, buffOff, one; - Type elemTy, bufElemTy; - if (hasDiscriminateUsers) { - auto firstDisc = cast( - *measureOp.getMeasOut().getUsers().begin()); - auto stdvecTy = - cast(firstDisc.getResult().getType()); - elemTy = stdvecTy.getElementType(); - unsigned elemWidth = cast(elemTy).getWidth(); - bufElemTy = - elemWidth > 8 ? elemTy : static_cast(rewriter.getI8Type()); - - unsigned numQubits = 0u; - for (auto v : measureOp.getTargets()) - if (v.getType().template isa()) - ++numQubits; - totalToRead = - rewriter.template create(loc, numQubits, 64); - for (auto v : measureOp.getTargets()) - if (v.getType().template isa()) { - Value vecSz = - rewriter.template create(loc, i64Ty, v); - totalToRead = - rewriter.template create(loc, totalToRead, vecSz); - } + unsigned numQubits = 0u; + for (auto v : measureOp.getTargets()) + if (v.getType().template isa()) + ++numQubits; + Value totalToRead = + rewriter.template create(loc, numQubits, 64); + auto i64Ty = rewriter.getI64Type(); + for (auto v : measureOp.getTargets()) + if (v.getType().template isa()) { + Value vecSz = rewriter.template create(loc, i64Ty, v); + totalToRead = + rewriter.template create(loc, totalToRead, vecSz); + } - // 2. Create the buffer. - buff = rewriter.template create(loc, bufElemTy, - totalToRead); - buffOff = rewriter.template create(loc, 0, 64); - one = rewriter.template create(loc, 1, 64); - } + // 2. Create the buffer. + auto i1Ty = rewriter.getI1Type(); + auto i8Ty = rewriter.getI8Type(); + Value buff = + rewriter.template create(loc, i8Ty, totalToRead); // 3. Measure each individual qubit and insert the result, in order, into // the buffer. For registers/vectors, loop over the entire set of qubits. 
+ Value buffOff = rewriter.template create(loc, 0, 64); + Value one = rewriter.template create(loc, 1, 64); + auto measTy = quake::MeasureType::get(rewriter.getContext()); for (auto v : measureOp.getTargets()) { if (isa(v.getType())) { - auto meas = rewriter.template create(loc, measTy, v); - if (auto registerName = measureOp.getRegisterNameAttr()) - meas.setRegisterName(registerName); - if (hasDiscriminateUsers) { - auto bit = rewriter.template create( - loc, elemTy, meas.getMeasOut()); - Value addr = rewriter.template create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, buffOff); - Value stored = (elemTy != bufElemTy) - ? rewriter - .template create( - loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : static_cast(bit); - rewriter.template create(loc, stored, addr); - buffOff = rewriter.template create(loc, buffOff, one); - } + auto meas = rewriter.template create(loc, measTy, v).getMeasOut(); + auto bit = + rewriter.template create(loc, i1Ty, meas); + Value addr = rewriter.template create( + loc, cudaq::cc::PointerType::get(i8Ty), buff, buffOff); + auto bitByte = rewriter.template create( + loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); + rewriter.template create(loc, bitByte, addr); + buffOff = rewriter.template create(loc, buffOff, one); } else { assert(isa(v.getType())); Value vecSz = rewriter.template create(loc, i64Ty, v); @@ -128,138 +83,41 @@ class ExpandUnsizedMeasurePattern : public OpRewritePattern { Value qv = builder.template create(loc, v, iv); auto meas = builder.template create(loc, measTy, qv); + auto bit = builder.template create( + loc, i1Ty, meas.getMeasOut()); if (auto registerName = measureOp.getRegisterNameAttr()) meas.setRegisterName(registerName); - if (hasDiscriminateUsers) { - auto bit = builder.template create( - loc, elemTy, meas.getMeasOut()); - Value offset = - builder.template create(loc, iv, buffOff); - auto addr = builder.template create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, 
offset); - Value stored = (elemTy != bufElemTy) - ? builder - .template create( - loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : static_cast(bit); - builder.template create(loc, stored, addr); - } + Value offset = + builder.template create(loc, iv, buffOff); + auto addr = builder.template create( + loc, cudaq::cc::PointerType::get(i8Ty), buff, offset); + auto bitByte = rewriter.template create( + loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); + builder.template create(loc, bitByte, addr); }); - if (hasDiscriminateUsers) - buffOff = - rewriter.template create(loc, buffOff, vecSz); + buffOff = rewriter.template create(loc, buffOff, vecSz); } } // 4. Use the buffer as an initialization expression and create the // std::vec value. - if (hasDiscriminateUsers) { - auto stdvecTy = cudaq::cc::StdvecType::get(rewriter.getContext(), elemTy); - SmallVector discs; - for (auto *out : measureOp.getMeasOut().getUsers()) - if (auto disc = dyn_cast_if_present(out)) - discs.push_back(disc); - for (auto disc : discs) { - auto ptrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(elemTy)); + auto stdvecTy = cudaq::cc::StdvecType::get(rewriter.getContext(), i1Ty); + for (auto *out : measureOp.getMeasOut().getUsers()) + if (auto disc = dyn_cast_if_present(out)) { + auto ptrArrI1Ty = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i1Ty)); auto buffCast = - rewriter.template create(loc, ptrArrTy, buff); + rewriter.template create(loc, ptrArrI1Ty, buff); rewriter.template replaceOpWithNewOp( disc, stdvecTy, buffCast, totalToRead); } - } rewriter.eraseOp(measureOp); return success(); } }; -using MxUnsizedRewrite = ExpandUnsizedMeasurePattern; -using MyUnsizedRewrite = ExpandUnsizedMeasurePattern; -using MzUnsizedRewrite = ExpandUnsizedMeasurePattern; - -// Generalized pattern for expanding a multiple qubit measurement (whether it is -// mx, my, or mz) to a series of individual measurements. 
-template -class ExpandRewritePattern : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(A measureOp, - PatternRewriter &rewriter) const override { - if (usesIndividualQubit(measureOp.getMeasOut())) - return failure(); - - // Collect all the `get_measure` ops for this measurement operation. - SmallVector getMeasureOps; - for (auto *user : measureOp.getMeasOut().getUsers()) - if (auto gm = dyn_cast(user)) - getMeasureOps.push_back(gm); - - // Can only replace `get_measure %m[i]` with per-qubit measurements, else - // bail out. - if (getMeasureOps.empty() && !measureOp.getMeasOut().use_empty()) - return failure(); - - // Validate that all `get_measure` ops have constant indices and all the veq - // targets have known sizes. - for (auto gm : getMeasureOps) - if (!gm.hasConstantIndex()) - return failure(); - std::size_t totalMeasures = 0; - for (auto v : measureOp.getTargets()) { - if (isa(v.getType())) { - ++totalMeasures; - } else { - auto veqTy = cast(v.getType()); - if (!veqTy.hasSpecifiedSize()) - return failure(); - totalMeasures += veqTy.getSize(); - } - } - // Bounds check - for (auto gm : getMeasureOps) - if (gm.getConstantIndex() >= totalMeasures) - return failure(); - - auto loc = measureOp.getLoc(); - auto measTy = quake::MeasureType::get(rewriter.getContext()); - - // Create individual per-qubit measurements for each target. 
- SmallVector individualMeasures; - for (auto v : measureOp.getTargets()) { - if (isa(v.getType())) { - auto meas = rewriter.template create(loc, measTy, v); - if (auto registerName = measureOp.getRegisterNameAttr()) - meas.setRegisterName(registerName); - individualMeasures.push_back(meas.getMeasOut()); - } else { - auto veqTy = cast(v.getType()); - for (std::size_t i = 0; i < veqTy.getSize(); ++i) { - Value idx = - rewriter.template create(loc, i, 64); - Value qv = rewriter.template create(loc, v, idx); - auto meas = rewriter.template create(loc, measTy, qv); - if (auto registerName = measureOp.getRegisterNameAttr()) - meas.setRegisterName(registerName); - individualMeasures.push_back(meas.getMeasOut()); - } - } - } - - // Replace each get_measure op with the corresponding individual result. - for (auto gm : getMeasureOps) - rewriter.replaceOp(gm, individualMeasures[gm.getConstantIndex()]); - - if (measureOp.getMeasOut().use_empty()) - rewriter.eraseOp(measureOp); - - return success(); - } -}; - +namespace { using MxRewrite = ExpandRewritePattern; using MyRewrite = ExpandRewritePattern; using MzRewrite = ExpandRewritePattern; @@ -272,10 +130,8 @@ class ResetRewrite : public OpRewritePattern { LogicalResult matchAndRewrite(quake::ResetOp resetOp, PatternRewriter &rewriter) const override { - auto veqArg = resetOp.getTargets(); - if (!isa(veqArg.getType())) - return failure(); auto loc = resetOp.getLoc(); + auto veqArg = resetOp.getTargets(); auto i64Ty = rewriter.getI64Type(); Value vecSz = rewriter.create(loc, i64Ty, veqArg); cudaq::opt::factory::createInvariantLoop( @@ -290,106 +146,29 @@ class ResetRewrite : public OpRewritePattern { } }; -// Pattern for expanding a `quake.discriminate` op on a `quake.measurements` -// with a known size into a series of `quake.discriminate` ops on individual -// `quake.measure` results via `quake.get_measure`. 
-class ExpandDiscriminatePattern - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(quake::DiscriminateOp discOp, - PatternRewriter &rewriter) const override { - auto measVal = discOp.getMeasurement(); - auto measTy = dyn_cast(measVal.getType()); - if (!measTy) - return failure(); - if (!measTy.hasSpecifiedSize()) - return failure(); - - auto loc = discOp.getLoc(); - auto stdvecResTy = - cast(discOp.getResult().getType()); - auto elemTy = stdvecResTy.getElementType(); - unsigned elemWidth = cast(elemTy).getWidth(); - Type bufElemTy = elemWidth > 8 ? elemTy : rewriter.getI8Type(); - - Value totalToRead = - rewriter.create(loc, measTy.getSize(), 64); - Value buff = - rewriter.create(loc, bufElemTy, totalToRead); - - // TODO: For large N, consider emitting a loop to avoid IR bloat. - std::size_t n = measTy.getSize(); - for (std::size_t i = 0; i < n; ++i) { - Value getMeas = rewriter.create(loc, measVal, i); - Value bit = rewriter.create(loc, elemTy, getMeas); - Value idx = rewriter.create(loc, i, 64); - Value addr = rewriter.create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, idx); - Value stored = - (elemTy != bufElemTy) - ? rewriter - .create(loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : bit; - rewriter.create(loc, stored, addr); - } - - auto ptrArrElemTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(elemTy)); - auto buffCast = rewriter.create(loc, ptrArrElemTy, buff); - rewriter.replaceOpWithNewOp(discOp, stdvecResTy, - buffCast, totalToRead); - return success(); - } -}; - class ExpandMeasurementsPass : public cudaq::opt::ExpandMeasurementsBase { public: void runOnOperation() override { auto *op = getOperation(); auto *ctx = &getContext(); - - // Step 1: Expand discriminate(measurements) into individual - // get_measure + discriminate ops. This must run first so that step 2's - // ExpandRewritePattern can see the resulting get_measure users. 
- { - RewritePatternSet patterns(ctx); - patterns.insert(ctx); - ConversionTarget target(*ctx); - target.addLegalDialect(); - target.addDynamicallyLegalOp( - [](quake::DiscriminateOp d) { - auto measTy = - dyn_cast(d.getMeasurement().getType()); - if (!measTy) - return true; - return !measTy.hasSpecifiedSize(); - }); - if (failed(applyPartialConversion(op, target, std::move(patterns)))) { - op->emitOpError("could not expand discriminate ops"); - signalPassFailure(); - return; - } - } - - // Step 2: Expand multi-qubit m[xyz] and reset ops. - // ExpandRewritePattern handles sized targets (veq) via unrolling. - // ExpandUnsizedMeasurePattern handles unsized targets (veq) via - // dynamic loops using VeqSizeOp + createInvariantLoop. - { - RewritePatternSet patterns(ctx); - patterns.insert(ctx); - patterns.insert( - ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) { - op->emitOpError("could not expand measurements"); - signalPassFailure(); - } + RewritePatternSet patterns(ctx); + patterns.insert(ctx); + ConversionTarget target(*ctx); + target.addLegalDialect(); + target.addDynamicallyLegalOp( + [](quake::MxOp x) { return usesIndividualQubit(x.getMeasOut()); }); + target.addDynamicallyLegalOp( + [](quake::MyOp x) { return usesIndividualQubit(x.getMeasOut()); }); + target.addDynamicallyLegalOp( + [](quake::MzOp x) { return usesIndividualQubit(x.getMeasOut()); }); + target.addDynamicallyLegalOp([](quake::ResetOp r) { + return !isa(r.getTargets().getType()); + }); + if (failed(applyPartialConversion(op, target, std::move(patterns)))) { + op->emitOpError("could not expand measurements"); + signalPassFailure(); } } }; diff --git a/lib/Optimizer/Transforms/Mapping.cpp b/lib/Optimizer/Transforms/Mapping.cpp index f6a09f1bf16..c7b1f33d910 100644 --- a/lib/Optimizer/Transforms/Mapping.cpp +++ b/lib/Optimizer/Transforms/Mapping.cpp @@ -823,8 +823,6 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { auto measureOp = builder.create( 
finalQubitWire[i].getLoc(), TypeRange{measTy, wireTy}, finalQubitWire[i]); - /// NOTE: Eagerly discriminate here since these are terminal - /// measurements and would need classical readout. builder.create(finalQubitWire[i].getLoc(), resTy, measureOp.getMeasOut()); diff --git a/lib/Optimizer/Transforms/ResourceCount.cpp b/lib/Optimizer/Transforms/ResourceCount.cpp index 918667f906a..ed6ce573c67 100644 --- a/lib/Optimizer/Transforms/ResourceCount.cpp +++ b/lib/Optimizer/Transforms/ResourceCount.cpp @@ -16,6 +16,25 @@ using namespace mlir; mlir::FailureOr cudaq::opt::countResourcesFromIR(ModuleOp module) { + // Check upfront whether all qubit allocations have statically known sizes. + // If any veq has a dynamic size we cannot count qubits statically, so bail + // out before running the gate-erasing pass manager. + std::size_t allocated = 0; + bool unresolvedVeq = false; + module.walk([&](quake::AllocaOp alloc) { + if (isa(alloc.getType())) { + allocated++; + } else if (auto size = quake::getVeqSize(alloc.getResult())) { + allocated += *size; + } else { + unresolvedVeq = true; + } + }); + if (unresolvedVeq) + return failure(); + + // All qubit sizes are statically known — proceed to count gates and erase + // them from the IR so the subsequent JIT compiles a near-empty module. cudaq::Resources counts; auto countGate = [&counts](std::string gate, std::vector controls, @@ -40,21 +59,6 @@ cudaq::opt::countResourcesFromIR(ModuleOp module) { if (failed(pmResult)) return failure(); - // Count allocated qubits from the IR. 
- std::size_t allocated = 0; - bool unresolvedVeq = false; - module.walk([&](quake::AllocaOp alloc) { - if (isa(alloc.getType())) { - allocated++; - } else if (auto size = quake::getVeqSize(alloc.getResult())) { - allocated += *size; - } else { - unresolvedVeq = true; - } - }); - if (unresolvedVeq) - return failure(); counts.setNumQubits(allocated); - return counts; } diff --git a/pyproject.toml.cu12 b/pyproject.toml.cu12 index b86d7743d64..b07ec657757 100644 --- a/pyproject.toml.cu12 +++ b/pyproject.toml.cu12 @@ -19,9 +19,9 @@ license = "Apache-2.0" license-files = ["LICENSE", "NOTICE", "CITATION.cff"] dependencies = [ 'astpretty ~= 3.0', - 'custatevec-cu12 ~= 1.12', - 'cutensornet-cu12 ~= 2.11', - 'cudensitymat-cu12 ~= 0.4', + 'custatevec-cu12 ~= 1.13.1', + 'cutensornet-cu12 ~= 2.12.1', + 'cudensitymat-cu12 ~= 0.5.1', 'numpy >= 1.24', 'scipy >= 1.10.1', 'requests >= 2.32.4', diff --git a/pyproject.toml.cu13 b/pyproject.toml.cu13 index bae9dbbb929..7e9a6dd6926 100644 --- a/pyproject.toml.cu13 +++ b/pyproject.toml.cu13 @@ -23,9 +23,9 @@ dependencies = [ 'scipy >= 1.10.1', 'requests >= 2.32.4', # CUDA dependencies - excluded on macOS (CPU-only support) - 'custatevec-cu13 ~= 1.12; sys_platform != "darwin"', - 'cutensornet-cu13 ~= 2.11; sys_platform != "darwin"', - 'cudensitymat-cu13 ~= 0.4; sys_platform != "darwin"', + 'custatevec-cu13 ~= 1.13.1; sys_platform != "darwin"', + 'cutensornet-cu13 ~= 2.12.1; sys_platform != "darwin"', + 'cudensitymat-cu13 ~= 0.5.1; sys_platform != "darwin"', 'nvidia-cublas ~= 13.0; sys_platform != "darwin"', 'nvidia-curand ~= 10.4; sys_platform != "darwin"', 'nvidia-cusparse ~= 12.6; sys_platform != "darwin"', diff --git a/python/cudaq/dynamics/integrator.py b/python/cudaq/dynamics/integrator.py index 019beee38b1..134e40b18e2 100644 --- a/python/cudaq/dynamics/integrator.py +++ b/python/cudaq/dynamics/integrator.py @@ -35,7 +35,14 @@ def __init__(self, **kwargs): self.dimensions = None self.schedule = None self.hamiltonian = None + # The 
actual stepper used for integration. + # This may be set in the constructor with a user-provided stepper, + # or it may be auto-created by the integrator when `integrate()` is called based on the system dynamics. self.stepper = None + # User-provided stepper. + # This will be used for integration if provided. + # Note: it's user's responsibility to ensure that the provided stepper is compatible with the system dynamics. + self._user_provided_stepper = None self.collapse_operators = None self.super_op = None self.__post_init__() @@ -69,7 +76,10 @@ def set_system(self, self.hamiltonian = hamiltonian self.collapse_operators = collapse_operators - self.stepper = None + # Restore the user-provided stepper if one was given at construction, + # otherwise reset to None so `integrate()` builds a fresh stepper from + # the new system dynamics. + self.stepper = self._user_provided_stepper @abstractmethod def integrate(self, t): diff --git a/python/cudaq/dynamics/integrators/cuda_torchdiffeq_integrator.py b/python/cudaq/dynamics/integrators/cuda_torchdiffeq_integrator.py index 50e4dd90a0b..52863a2cf48 100644 --- a/python/cudaq/dynamics/integrators/cuda_torchdiffeq_integrator.py +++ b/python/cudaq/dynamics/integrators/cuda_torchdiffeq_integrator.py @@ -9,6 +9,7 @@ from ..integrator import BaseTimeStepper, BaseIntegrator from .builtin_integrators import cuDensityMatTimeStepper, cuDensityMatSuperOpTimeStepper from ...mlir._mlir_libs._quakeDialects import cudaq_runtime +from typing import Optional import math has_cupy = True @@ -71,7 +72,7 @@ class CUDATorchDiffEqIntegrator(BaseIntegrator[cudaq_runtime.State]): rtol = 1e-7 def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State], + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, solver: str = 'rk4', **kwargs): if not has_dynamics: @@ -96,7 +97,8 @@ def __init__(self, 'CuPy is required to use Torch-based integrators.') super().__init__(**kwargs) - self.stepper = stepper + # Store the user-provided stepper 
so it survives `set_system()` calls. + self._user_provided_stepper = stepper self.solver = solver self.dm_shape = None self.n_steps = 10 @@ -105,6 +107,7 @@ def __init__(self, self.batchSize = None self._dimensions_list = None self._solver_instance = None + self._use_compute_inplace = None def compute_rhs(self, t, vec): if torch.is_tensor(t): @@ -121,21 +124,25 @@ def compute_rhs(self, t, vec): device_ptr = vec.data_ptr() size = vec.numel() - if self._dimensions_list is None: - self._dimensions_list = list(self.dimensions) - # Wrap the device pointer as a `cudaq::state` (no copy) temp_state = bindings.initializeState(device_ptr, size, self._dimensions_list, self.batchSize) - # Pre-allocate output tensor (torch tensor) - result_vec = torch.zeros_like(vec) - # Wrap the output tensor device pointer as a `cudaq::state` (no copy) - result_state = bindings.initializeState(result_vec.data_ptr(), size, - self._dimensions_list, - self.batchSize) - # Compute the RHS into the output state - self.stepper.compute_inplace(temp_state, t_scalar, result_state) + if self._use_compute_inplace: + # If `compute_inplace` is available, use it to avoid extra data conversion (`dlpack` conversion between `torch` and `cupy`). + # Pre-allocate output tensor (torch tensor) + result_vec = torch.zeros_like(vec) + # Wrap the output tensor device pointer as a `cudaq::state` (no copy) + result_state = bindings.initializeState(result_vec.data_ptr(), size, + self._dimensions_list, + self.batchSize) + self.stepper.compute_inplace(temp_state, t_scalar, result_state) + else: + # Stepper only provides compute(); call it and convert the returned + # state back to a torch tensor via `dlpack` (no extra copy). 
+ result_state_obj = self.stepper.compute(temp_state, t_scalar) + result_cupy = to_cupy_array(result_state_obj) + result_vec = torch.from_dlpack(result_cupy) return result_vec def _create_wrapped_rhs_func(self): @@ -176,6 +183,11 @@ def _get_solver_class(self): return solver_map.get(self.solver) def integrate(self, t): + if self.is_density_state is None: + self.is_density_state = ( + (math.prod(self.dimensions)**2 * + self.batchSize) == self.state.getTensor().get_num_elements()) + if self.stepper is None: if self.dimensions is None: raise ValueError( @@ -188,10 +200,6 @@ def integrate(self, t): ) self.schedule_ = bindings.Schedule(self.schedule._steps, list(self.schedule._parameters)) - if self.is_density_state is None: - self.is_density_state = ( - (math.prod(self.dimensions)**2 * self.batchSize - ) == self.state.getTensor().get_num_elements()) if self.super_op is None: # Create a stepper based on the provided Hamiltonian and collapse operators @@ -205,6 +213,11 @@ def integrate(self, t): self.stepper = cuDensityMatSuperOpTimeStepper( self.super_op, self.schedule_, list(self.dimensions)) + # Cache whether the stepper provides `compute_inplace` to dispatch proper call in `compute_rhs`. 
+ self._use_compute_inplace = hasattr(self.stepper, 'compute_inplace') + if self._dimensions_list is None: + self._dimensions_list = list(self.dimensions) + if t <= self.t: raise ValueError( "Integration time must be greater than current time") @@ -256,9 +269,6 @@ def integrate(self, t): # convert the solution back to CuPy array y_t_cupy = cp.from_dlpack(y_t) - if self._dimensions_list is None: - self._dimensions_list = list(self.dimensions) - # Keep results in GPU memory self.state = cudaq_runtime.State.from_data(y_t_cupy) self.state = bindings.initializeState(self.state, self._dimensions_list, @@ -275,7 +285,7 @@ def set_state(self, state: cudaq_runtime.State, t: float = 0.0): class CUDATorchDiffEqRK4Integrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='rk4', **kwargs) @@ -283,7 +293,7 @@ def __init__(self, class CUDATorchDiffEqEulerIntegrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='euler', **kwargs) @@ -291,7 +301,7 @@ def __init__(self, class CUDATorchDiffEqMidpointIntegrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='midpoint', **kwargs) @@ -299,7 +309,7 @@ def __init__(self, class CUDATorchDiffEqDopri5Integrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='dopri5', **kwargs) @@ -307,7 +317,7 @@ def __init__(self, class CUDATorchDiffEqDopri8Integrator(CUDATorchDiffEqIntegrator): def 
__init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='dopri8', **kwargs) @@ -315,7 +325,7 @@ def __init__(self, class CUDATorchDiffEqBosh3Integrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='bosh3', **kwargs) @@ -323,7 +333,7 @@ def __init__(self, class CUDATorchDiffEqAdaptiveHeunIntegrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='adaptive_heun', **kwargs) @@ -331,7 +341,7 @@ def __init__(self, class CUDATorchDiffEqExplicitAdamsIntegrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='explicit_adams', **kwargs) @@ -339,7 +349,7 @@ def __init__(self, class CUDATorchDiffEqFehlberg2Integrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='fehlberg2', **kwargs) @@ -347,7 +357,7 @@ def __init__(self, class CUDATorchDiffEqHeun3Integrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='heun3', **kwargs) @@ -355,7 +365,7 @@ def __init__(self, class CUDATorchDiffEqImplicitAdamsIntegrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: 
Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='implicit_adams', **kwargs) @@ -363,6 +373,6 @@ def __init__(self, class CUDATorchDiffEqFixedAdamsIntegrator(CUDATorchDiffEqIntegrator): def __init__(self, - stepper: BaseTimeStepper[cudaq_runtime.State] = None, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, **kwargs): super().__init__(stepper, solver='fixed_adams', **kwargs) diff --git a/python/cudaq/dynamics/integrators/scipy_integrators.py b/python/cudaq/dynamics/integrators/scipy_integrators.py index 3cc434ff320..48799fa4fa3 100644 --- a/python/cudaq/dynamics/integrators/scipy_integrators.py +++ b/python/cudaq/dynamics/integrators/scipy_integrators.py @@ -9,6 +9,7 @@ from ..integrator import BaseTimeStepper, BaseIntegrator from .builtin_integrators import cuDensityMatTimeStepper, cuDensityMatSuperOpTimeStepper from ...mlir._mlir_libs._quakeDialects import cudaq_runtime +from typing import Optional import numpy, math has_dynamics = True @@ -31,7 +32,9 @@ class ScipyZvodeIntegrator(BaseIntegrator[cudaq_runtime.State]): rtol = 1e-6 order = 12 - def __init__(self, stepper: BaseTimeStepper[cudaq_runtime.State], **kwargs): + def __init__(self, + stepper: Optional[BaseTimeStepper[cudaq_runtime.State]] = None, + **kwargs): if not has_dynamics: raise ImportError( 'CUDA-Q is missing dynamics support. Please check your installation' @@ -39,15 +42,11 @@ def __init__(self, stepper: BaseTimeStepper[cudaq_runtime.State], **kwargs): if not has_scipy: raise ImportError("scipy is required to use this integrator.") super().__init__(**kwargs) - self.stepper = stepper + # Store the user-provided stepper so it survives `set_system()` calls. 
+ self._user_provided_stepper = stepper self.is_density_state = None self.batchSize = None - def __init__(self, **kwargs): - if not has_scipy: - raise ImportError("scipy is required to use this integrator.") - super().__init__(**kwargs) - def compute_rhs(self, t, vec): state = cudaq_runtime.State.from_data(vec) state = bindings.initializeState(state, list(self.dimensions), diff --git a/python/cudaq/kernel/analysis.py b/python/cudaq/kernel/analysis.py index 09e0c962b88..172cf27f681 100644 --- a/python/cudaq/kernel/analysis.py +++ b/python/cudaq/kernel/analysis.py @@ -11,6 +11,8 @@ import textwrap from typing import Optional, Type +from .utils import get_function_source_or_raise + class FunctionDefVisitor(ast.NodeVisitor): """ @@ -108,7 +110,8 @@ def _getChildFuncNames(func_obj: object, if name is None: name = func_obj.__name__ - tree = ast.parse(textwrap.dedent(inspect.getsource(func_obj))) + src, _ = get_function_source_or_raise(func_obj) + tree = ast.parse(src) vis = FindDepFuncsVisitor() visit_set.add(name) vis.visit(tree) @@ -141,7 +144,9 @@ def fetch(func_obj: object): else: this_func_obj = FetchDepFuncsSourceCode._getFuncObj( funcName, callingFrame) - src = textwrap.dedent(inspect.getsource(this_func_obj)) + if this_func_obj is None: + continue + src, _ = get_function_source_or_raise(this_func_obj) code += src + '\n' diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index efb7bb78033..f568867381a 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -2672,11 +2672,33 @@ def checkModule(obj, moduleNames): node.func.value.id) and node.func.attr == 'kernel': return + def isExactCudaqDbgAstCall(func_node: ast.AST) -> bool: + """Return True iff `func_node` is the exact AST shape for + ``.dbg.ast.``. + + Runtime attribute lookup follows lazy aliases (e.g. ``cudaq.ast`` + resolves to ``cudaq.dbg.ast`` via ``_LAZY_SUBMODULES``), so + `devKey` is not a sufficient check. 
Walk the literal node + structure instead.""" + if not isinstance(func_node, ast.Attribute): + return False + if not isinstance( + func_node.value, + ast.Attribute) or func_node.value.attr != 'ast': + return False + if not isinstance( + func_node.value.value, + ast.Attribute) or func_node.value.value.attr != 'dbg': + return False + root = func_node.value.value.value + return isinstance(root, ast.Name) and self.isCudaqName(root.id) + devKey, name = resolveQualifiedName(node.func) if devKey: # Handle debug functions - if devKey == 'cudaq.dbg.ast': + if devKey == 'cudaq.dbg.ast' and isExactCudaqDbgAstCall( + node.func): # Handle a debug print statement arg = self.__groupValues(node.args, [1]) self.__insertDbgStmt(arg, name) @@ -2955,17 +2977,7 @@ def bodyBuilder(iterVar): measTy = quake.MeasureType.get() resTy = self.getIntegerType(1) else: - total_size = 0 - all_known = True - for q in qubits: - if quake.isConstantQuantumRefType(q.type): - total_size += quake.getAllocationSize(q.type) - else: - all_known = False - if all_known and total_size > 0: - measTy = quake.MeasurementsType.get(total_size) - else: - measTy = quake.MeasurementsType.get() + measTy = cc.StdvecType.get(quake.MeasureType.get()) resTy = cc.StdvecType.get(self.getIntegerType(1)) measureResult = processQuantumOperation( node.func.id.title(), [], @@ -3807,13 +3819,17 @@ def visit_ListComp(self, node): the MLIR. By simple, we mean expressions like `[expr(iter) for iter in iterable]` - or `myList = [exprThatReturns(iter) for iter in iterable]`. + or `myList = [exprThatReturns(iter) for iter in iterable]`, optionally + with `if` filter clause. 
""" if len(node.generators) > 1: self.emitFatalError( "CUDA-Q only supports single generators for list comprehension.", node) + if_clauses = node.generators[0].ifs + hasFilter = len(if_clauses) > 0 + self.visit(node.generators[0].iter) iterable = self.popValue() orig_iterable_type = iterable.type @@ -3852,6 +3868,15 @@ def process_void_list(): # `visit_For`, but that would be premature optimization. self.visit_For(forNode) + def evalFilter(): + cond = None + for if_node in if_clauses: + self.visit(if_node) + this_cond = self.__arithmetic_to_bool(self.popValue()) + cond = this_cond if cond is None else arith.AndIOp( + cond, this_cond).result + return cond + target_types = {} def get_target_type(target, targetType): @@ -4050,10 +4075,11 @@ def get_item_type(pyval): return if quake.RefType.isinstance(listElemTy): - if quake.VeqType.isinstance(orig_iterable_type): + if quake.VeqType.isinstance(orig_iterable_type) and not hasFilter: self.pushValue(iterable) return - if cc.StdvecType.isinstance(orig_iterable_type): + if (cc.StdvecType.isinstance(orig_iterable_type) or + quake.VeqType.isinstance(orig_iterable_type)): i64Ty = self.getIntegerType() veqTy = self.getVeqType() c0 = self.getConstantInt(0) @@ -4066,18 +4092,39 @@ def get_item_type(pyval): def bodyBuilder(args): i, curr_veq = args[0], args[1] - elem_addr = cc.ComputePtrOp( - cc.PointerType.get(iterTy), iterable, [i], - DenseI32ArrayAttr.get([kDynamicPtrIndex], - context=self.ctx)) - idx_val = cc.LoadOp(elem_addr).result + if quake.VeqType.isinstance(iterable.type): + idx_val = quake.ExtractRefOp(iterTy, + iterable, + -1, + index=i).result + else: + elem_addr = cc.ComputePtrOp( + cc.PointerType.get(iterTy), iterable, [i], + DenseI32ArrayAttr.get([kDynamicPtrIndex], + context=self.ctx)) + idx_val = cc.LoadOp(elem_addr).result self.symbolTable.beginBlock() self.__deconstructAssignment(node.generators[0].target, idx_val) - self.visit(node.elt) - ref = self.popValue() + if hasFilter: + cond = evalFilter() + ifOp = 
cc.IfOp([veqTy], cond, []) + thenBlock = Block.create_at_start(ifOp.thenRegion, []) + with InsertionPoint(thenBlock): + self.visit(node.elt) + ref = self.popValue() + appended = quake.ConcatOp(veqTy, + [curr_veq, ref]).result + cc.ContinueOp([appended]) + elseBlock = Block.create_at_start(ifOp.elseRegion, []) + with InsertionPoint(elseBlock): + cc.ContinueOp([curr_veq]) + new_veq = ifOp.result + else: + self.visit(node.elt) + ref = self.popValue() + new_veq = quake.ConcatOp(veqTy, [curr_veq, ref]).result self.symbolTable.endBlock() - new_veq = quake.ConcatOp(veqTy, [curr_veq, ref]).result cc.ContinueOp([i, new_veq]) loop = self.createForLoop( @@ -4099,47 +4146,72 @@ def bodyBuilder(args): TypeAttr.get(listElemTy), seqSize=iterableSize).result - # General case of - # `listVar = [expr(i) for i in iterable]` - # Need to think of this as - # `listVar = stdvec(iterable.size)` - # `for i, r in enumerate(listVar):` - # ` listVar[i] = expr(r)` - def bodyBuilder(iterVar): - self.symbolTable.beginBlock() + def extractIterVal(iterVar): if quake.VeqType.isinstance(iterable.type): - iterVal = quake.ExtractRefOp(iterTy, - iterable, - -1, - index=iterVar).result - else: - eleAddr = cc.ComputePtrOp( - cc.PointerType.get(iterTy), iterable, [iterVar], - DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) - iterVal = cc.LoadOp(eleAddr).result - - # We don't do support anything within list comprehensions that would - # require being careful about assigning references, so simply - # adding them to the symbol table is enough for list comprehension. 
- self.__deconstructAssignment(node.generators[0].target, iterVal) + return quake.ExtractRefOp(iterTy, iterable, -1, + index=iterVar).result + eleAddr = cc.ComputePtrOp( + cc.PointerType.get(iterTy), iterable, [iterVar], + DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) + return cc.LoadOp(eleAddr).result + + def storeElementAt(storeIdx): self.visit(node.elt) element = self.popValue() - # We do need to be careful, however, about validating the list - # elements. + # We do need to be careful about validating the list elements. self.__validate_container_entry(element, node.elt) - listValueAddr = cc.ComputePtrOp( - cc.PointerType.get(listElemTy), listValue, [iterVar], + cc.PointerType.get(listElemTy), listValue, [storeIdx], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) element = self.changeOperandToType(listElemTy, element, allowDemotion=False) cc.StoreOp(element, listValueAddr) - self.symbolTable.endBlock() - self.createInvariantForLoop(bodyBuilder, iterableSize) - res = cc.StdvecInitOp(resultVecTy, listValue, - length=iterableSize).result + if not hasFilter: + + def bodyBuilder(iterVar): + self.symbolTable.beginBlock() + iterVal = extractIterVal(iterVar) + self.__deconstructAssignment(node.generators[0].target, iterVal) + storeElementAt(iterVar) + self.symbolTable.endBlock() + + self.createInvariantForLoop(bodyBuilder, iterableSize) + res = cc.StdvecInitOp(resultVecTy, listValue, + length=iterableSize).result + self.pushValue(res) + return + + i64Ty = self.getIntegerType() + c0 = self.getConstantInt(0) + c1 = self.getConstantInt(1) + + def filteredBodyBuilder(args): + i, count = args[0], args[1] + self.symbolTable.beginBlock() + iterVal = extractIterVal(i) + self.__deconstructAssignment(node.generators[0].target, iterVal) + cond = evalFilter() + ifOp = cc.IfOp([i64Ty], cond, []) + thenBlock = Block.create_at_start(ifOp.thenRegion, []) + with InsertionPoint(thenBlock): + storeElementAt(count) + cc.ContinueOp([arith.AddIOp(count, 
c1).result]) + elseBlock = Block.create_at_start(ifOp.elseRegion, []) + with InsertionPoint(elseBlock): + cc.ContinueOp([count]) + nextCount = ifOp.result + self.symbolTable.endBlock() + cc.ContinueOp([i, nextCount]) + + loop = self.createForLoop( + [i64Ty, i64Ty], + filteredBodyBuilder, [c0, c0], lambda args: arith.CmpIOp( + IntegerAttr.get(i64Ty, 2), args[0], iterableSize).result, + lambda args: [arith.AddIOp(args[0], c1).result, args[1]]) + finalCount = loop.results[1] + res = cc.StdvecInitOp(resultVecTy, listValue, length=finalCount).result self.pushValue(res) return diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py index 82b78fd7381..4abc9fb8733 100644 --- a/python/cudaq/kernel/kernel_builder.py +++ b/python/cudaq/kernel/kernel_builder.py @@ -653,7 +653,6 @@ def functor(op): cloned = otherST[calleeName].operation.clone() if 'cudaq-entrypoint' in cloned.operation.attributes: cloned.operation.attributes.__delitem__('cudaq-entrypoint') - print("adding", cloned) currentModule.body.append(cloned) visitAllCallOps(cloned) @@ -682,6 +681,12 @@ def __applyControlOrAdjoint(self, target, isAdjoint, controls, *args): otherFuncCloned, otherModule = self.__cloneOrGetFunction( target.name, self.module, target) assert isinstance(otherFuncCloned, func.FuncOp) + # Same as __addAllCalledFunctionsRecursively does for + # transitively called functions: a sub-kernel merged into this + # module is no longer an `entrypoint`. + if 'cudaq-entrypoint' in otherFuncCloned.operation.attributes: + otherFuncCloned.operation.attributes.__delitem__( + 'cudaq-entrypoint') self.__addAllCalledFunctionsRecursively(otherFuncCloned, self.module, otherModule) @@ -1113,24 +1118,6 @@ def reset(self, target): 'reset operation broadcasting on qvector not supported yet.' ) - @staticmethod - def _get_measurement_type(targets): - """ - Compute the appropriate measurement type for the given targets. 
- """ - if len(targets) == 1 and quake.RefType.isinstance(targets[0].type): - return quake.MeasureType.get() - total_size = 0 - all_known = True - for t in targets: - if quake.isConstantQuantumRefType(t.type): - total_size += quake.getAllocationSize(t.type) - else: - all_known = False - if all_known and total_size > 0: - return quake.MeasurementsType.get(total_size) - return quake.MeasurementsType.get() - def mz(self, target, regName=None): """ Measure the given qubit or qubits in the Z-basis. The optional @@ -1163,10 +1150,13 @@ def mz(self, target, regName=None): """ with self.ctx, self.insertPoint, self.loc: i1Ty = IntegerType.get_signless(1) - measTy = PyKernel._get_measurement_type([target.mlirValue]) + qubitTy = target.mlirValue.type retTy = i1Ty - if quake.MeasurementsType.isinstance(measTy): - retTy = cc.StdvecType.get(i1Ty) + measTy = quake.MeasureType.get() + stdvecTy = cc.StdvecType.get(i1Ty) + if quake.VeqType.isinstance(target.mlirValue.type): + retTy = stdvecTy + measTy = cc.StdvecType.get(measTy) if regName is not None: res = quake.MzOp(measTy, [], [target.mlirValue], registerName=StringAttr.get(regName, @@ -1207,10 +1197,13 @@ def mx(self, target, regName=None): """ with self.ctx, self.insertPoint, self.loc: i1Ty = IntegerType.get_signless(1) - measTy = PyKernel._get_measurement_type([target.mlirValue]) + qubitTy = target.mlirValue.type retTy = i1Ty - if quake.MeasurementsType.isinstance(measTy): - retTy = cc.StdvecType.get(i1Ty) + measTy = quake.MeasureType.get() + stdvecTy = cc.StdvecType.get(i1Ty) + if quake.VeqType.isinstance(target.mlirValue.type): + retTy = stdvecTy + measTy = cc.StdvecType.get(measTy) if regName is not None: res = quake.MxOp(measTy, [], [target.mlirValue], registerName=StringAttr.get(regName, @@ -1252,10 +1245,13 @@ def my(self, target, regName=None): """ with self.ctx, self.insertPoint, self.loc: i1Ty = IntegerType.get_signless(1) - measTy = PyKernel._get_measurement_type([target.mlirValue]) + qubitTy = target.mlirValue.type 
retTy = i1Ty - if quake.MeasurementsType.isinstance(measTy): - retTy = cc.StdvecType.get(i1Ty) + measTy = quake.MeasureType.get() + stdvecTy = cc.StdvecType.get(i1Ty) + if quake.VeqType.isinstance(target.mlirValue.type): + retTy = stdvecTy + measTy = cc.StdvecType.get(measTy) if regName is not None: res = quake.MyOp(measTy, [], [target.mlirValue], registerName=StringAttr.get(regName, diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py index 11a03e5d6fd..aea0b992974 100644 --- a/python/cudaq/kernel/kernel_decorator.py +++ b/python/cudaq/kernel/kernel_decorator.py @@ -24,7 +24,8 @@ from .analysis import FunctionDefVisitor from .kernel_signature import CapturedLinkedKernel, CapturedVariable, KernelSignature from .ast_bridge import compile_to_mlir -from .utils import (emitFatalError, emitErrorIfInvalidPauli, get_module_name, +from .utils import (emitFatalError, emitErrorIfInvalidPauli, + get_function_source_or_raise, get_module_name, globalRegisteredTypes, mlirTypeFromPyType, mlirTypeToPyType, nvqppPrefix, getMLIRContext, recover_func_op, recover_value_of) @@ -736,14 +737,7 @@ def isa_kernel_decorator(object): def _get_source(function): if function is None: return None, None - # Get the function source location - location = (inspect.getfile(function), inspect.getsourcelines(function)[1]) - # Get the function source - src = inspect.getsource(function) - # Strip off the extra tabs - leadingSpaces = len(src) - len(src.lstrip()) - src = '\n'.join([line[leadingSpaces:] for line in src.split('\n')]) - return src, location + return get_function_source_or_raise(function) def _recover_defining_frame(): diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index 2e1f4d0b0fb..47c1eb3dfb5 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -265,6 +265,64 @@ def emitWarning(msg): Color.END + '\n\nOffending code:\n' + offendingSrc[0]) +def _format_missing_source_error(function, filename): + """ 
+    Build a user-facing diagnostic explaining why source for `function` could
+    not be retrieved. Distinguishes between three buckets:
+    - Interactive interpreter-defined (`<stdin>` or `<python-input-...>`).
+    - Other synthetic filenames (code compiled with a non-file name).
+    - Real paths that failed to read (missing file, frozen module,
+      compiled extension).
+    """
+    qualname = getattr(function, '__qualname__',
+                       getattr(function, '__name__', '<unknown>'))
+    if filename is None:
+        return (f"@cudaq.kernel could not determine a source location for "
+                f"function `{qualname}`. `@cudaq.kernel` requires source that "
+                f"Python's `inspect` module can recover. Move the kernel into "
+                f"a `.py` module.")
+    is_repl = filename == '<stdin>' or filename.startswith('<python-input-')
+    if is_repl:
+        return (f"@cudaq.kernel could not retrieve source for function "
+                f"`{qualname}` because it is defined in the Python REPL, "
+                f"which does not preserve source code that `inspect` can "
+                f"recover. To use `@cudaq.kernel`, either run from a "
+                f"Jupyter/IPython session (which preserves source via "
+                f"`linecache`) or move the kernel into a `.py` module.")
+    # Any other angle-bracketed filename (e.g. `<string>`) is a synthetic,
+    # non-file source that `inspect` cannot read from disk.
+    is_synthetic = filename.startswith('<') and filename.endswith('>')
+    if is_synthetic:
+        return (f"@cudaq.kernel could not retrieve source for function "
+                f"`{qualname}`: it is defined in a non-file context "
+                f"(`{filename}`). `@cudaq.kernel` requires source that "
+                f"`inspect` can recover. Move the kernel into a `.py` "
+                f"module.")
+    return (f"@cudaq.kernel could not read source for function "
+            f"`{qualname}` at `{filename}` (the file may be missing, "
+            f"frozen, or a compiled extension).")
+
+
+def get_function_source_or_raise(function):
+    """
+    Return `(dedented_source, (filename, first_lineno))` for `function`.
+    Wraps `inspect.getfile`, `inspect.getsourcelines`, and
+    `inspect.getsource`. If any fail (most commonly because `function` was
+    defined in the interactive Python interpreter), raise `RuntimeError`
+    with a diagnostic
+ """ + filename = None + try: + filename = inspect.getfile(function) + first_line = inspect.getsourcelines(function)[1] + src = inspect.getsource(function) + except OSError as e: + raise RuntimeError(_format_missing_source_error(function, + filename)) from e + leadingSpaces = len(src) - len(src.lstrip()) + src = '\n'.join([line[leadingSpaces:] for line in src.split('\n')]) + return src, (filename, first_line) + + def mlirTryCreateStructType(mlirEleTypes, name=None, context=None): """ Creates either a `quake.StruqType` or a `cc.StructType` used to represent diff --git a/python/cudaq/operators/scalar/scalar_op.py b/python/cudaq/operators/scalar/scalar_op.py index c95927e1900..2c0245badb2 100644 --- a/python/cudaq/operators/scalar/scalar_op.py +++ b/python/cudaq/operators/scalar/scalar_op.py @@ -7,11 +7,11 @@ # ============================================================================ # from __future__ import annotations -import inspect, numpy # type: ignore -from typing import Any, Callable, Mapping, Optional +import numpy # type: ignore +from typing import Any, Callable, Mapping from numpy.typing import NDArray -from ..helpers import NumericType, _aggregate_parameters, _args_from_kwargs, _parameter_docs +from ..helpers import NumericType, _aggregate_parameters from cudaq.mlir._mlir_libs._quakeDialects.cudaq_runtime import ScalarOperator @@ -21,7 +21,7 @@ def _const_init(cls, constant_value: NumericType) -> ScalarOperator: """ if not isinstance(constant_value, NumericType): raise ValueError("argument must be a numeric constant") - return cls(constant_value) + return cls(complex(constant_value)) ScalarOperator.const = classmethod(_const_init) @@ -65,7 +65,7 @@ def _compose( if self.is_constant(): return ScalarOperator.const(fct(self.evaluate(), other)) generator = lambda **kwargs: fct(self.evaluate(**kwargs), other) - return ScalarOperator(generator, self.parameters) + return ScalarOperator(generator, **self.parameters) elif type(other) == ScalarOperator: if 
self.is_constant() and other.is_constant(): return ScalarOperator.const(fct(self.evaluate(), other.evaluate())) @@ -73,7 +73,7 @@ def _compose( other.evaluate(**kwargs)) parameter_info = _aggregate_parameters( [self.parameters, other.parameters]) - return ScalarOperator(generator, parameter_info) + return ScalarOperator(generator, **parameter_info) return NotImplemented @@ -97,44 +97,3 @@ def _compose( v2: v2 + v1) ScalarOperator.__rsub__ = lambda self, other: _compose(self, other, lambda v1, v2: v2 - v1) - - -def _instantiate(cls, - generator: NumericType | Callable[..., NumericType], - parameter_info: Optional[Mapping[str, str]] = None) -> None: - """ - Instantiates a scalar operator. - - Arguments: - generator: The value of the scalar operator as a function of its - parameters. The generator may take any number of complex-valued - arguments and must return a number. Each parameter must be passed - as a keyword argument when evaluating the operator. - """ - instance = super(ScalarOperator, cls).__new__(cls) - if isinstance(generator, NumericType): - instance.__init__(numpy.complex128(generator)) - else: - # A variable number of arguments (i.e. `*args`) cannot be supported - # for generators; it would prevent proper argument handling while - # supporting additions and multiplication of all kinds of operators. 
- arg_spec = inspect.getfullargspec(generator) - if arg_spec.varargs is not None: - raise ValueError( - f"the function defining a scalar operator must not take *args") - if parameter_info is None: - parameter_info = {} - for arg_name in arg_spec.args + arg_spec.kwonlyargs: - parameter_info[arg_name] = _parameter_docs( - arg_name, generator.__doc__) - - def generator_wrapper(kwargs: dict[str, NumericType]): - generator_args, remaining_kwargs = _args_from_kwargs( - generator, **kwargs) - return generator(*generator_args, **remaining_kwargs) - - instance.__init__(generator_wrapper, **parameter_info) - return instance - - -ScalarOperator.__new__ = staticmethod(_instantiate) diff --git a/python/cudaq/runtime/sample.py b/python/cudaq/runtime/sample.py index 0e44761557f..b2c97d97800 100644 --- a/python/cudaq/runtime/sample.py +++ b/python/cudaq/runtime/sample.py @@ -224,6 +224,9 @@ def sample_async(decorator, explicit_measurements (Optional[bool]): A flag to indicate whether or not to concatenate measurements in execution order for the returned sample result. + noise_model (Optional[`NoiseModel`]): The optional :class:`NoiseModel` + to add noise to the kernel execution on the simulator. Defaults to + an empty noise model. `qpu_id` (Optional[int]): The optional identification for which QPU on the platform to target. Defaults to zero. Key-word only. diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt index 52bb22a14db..d5bad6f9ac1 100644 --- a/python/extension/CMakeLists.txt +++ b/python/extension/CMakeLists.txt @@ -13,6 +13,43 @@ endif() include(HandleLLVMOptions) include(AddMLIRPython) +function(add_mlir_python_extension libname extname) + cmake_parse_arguments(ARG + "" + "INSTALL_COMPONENT;INSTALL_DIR;OUTPUT_DIRECTORY" + "SOURCES;LINK_LIBS" + ${ARGN}) + + # Use nanobind for CUDA-Q's own extension (_quakeDialects) and pybind11 + # for upstream MLIR extensions (AsyncPasses, RegisterEverything, etc.). 
+ if(libname MATCHES "_quakeDialects") + nanobind_add_module(${libname} NB_STATIC ${ARG_SOURCES}) + target_compile_options(${libname} PRIVATE -frtti -fexceptions -Wno-cast-qual) + else() + pybind11_add_module(${libname} MODULE ${ARG_SOURCES}) + target_compile_options(${libname} PRIVATE -frtti -fexceptions) + endif() + + set_target_properties(${libname} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${ARG_OUTPUT_DIRECTORY} + OUTPUT_NAME "${extname}" + NO_SONAME ON + ) + + target_link_libraries(${libname} PRIVATE ${ARG_LINK_LIBS}) + target_link_options(${libname} PRIVATE + $<$:LINKER:--exclude-libs,ALL> + ) + + if(ARG_INSTALL_DIR) + install(TARGETS ${libname} + COMPONENT ${ARG_INSTALL_COMPONENT} + LIBRARY DESTINATION "${ARG_INSTALL_DIR}" + RUNTIME DESTINATION "${ARG_INSTALL_DIR}" + ) + endif() +endfunction() + # Specifies that all MLIR packages are co-located under the cudaq # top level package (the API has been embedded in a relocatable way). add_compile_definitions("MLIR_PYTHON_PACKAGE_PREFIX=cudaq.mlir.") @@ -92,6 +129,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension ../../runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp ../../runtime/cudaq/platform/default/python/QPU.cpp ../../runtime/internal/compiler/ArgumentConversion.cpp + ../../runtime/internal/compiler/CompiledModuleHelper.cpp ../../runtime/internal/compiler/LayoutInfo.cpp ../../runtime/internal/compiler/RuntimeMLIR.cpp ../../runtime/internal/compiler/RuntimePyMLIR.cpp @@ -112,8 +150,8 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension cudaq-mlir-runtime-headers ) -target_include_directories(CUDAQuantumPythonSources.Extension INTERFACE - ${CMAKE_SOURCE_DIR}/python +target_include_directories(CUDAQuantumPythonSources.Extension INTERFACE + ${CMAKE_SOURCE_DIR}/python ${CMAKE_SOURCE_DIR}/python/utils ${CMAKE_SOURCE_DIR}/runtime ) @@ -161,6 +199,10 @@ add_mlir_python_modules(CUDAQuantumPythonModules CUDAQuantumPythonCAPI ) +if(TARGET nanobind-static) + 
target_compile_options(nanobind-static PRIVATE -Wno-cast-qual -Wno-covered-switch-default) +endif() + ## The Python bindings module for Quake dialect depends on CUDAQ libraries ## which it can't locate since they are in "../../lib" and the 'rpath' is set ## to '$ORIGIN' by default. diff --git a/python/extension/CUDAQuantumExtension.cpp b/python/extension/CUDAQuantumExtension.cpp index 679c191a7a6..ac1ca729446 100644 --- a/python/extension/CUDAQuantumExtension.cpp +++ b/python/extension/CUDAQuantumExtension.cpp @@ -43,24 +43,26 @@ #include "runtime/cudaq/qis/py_pauli_word.h" #include "runtime/cudaq/target/py_runtime_target.h" #include "runtime/cudaq/target/py_testing_utils.h" -#include "runtime/interop/PythonCppInterop.h" +#include "runtime/interop/PythonCppInteropDecls.h" #include "runtime/mlir/py_register_dialects.h" #include "utils/LinkedLibraryHolder.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/Parser/Parser.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" -#include -#include -#include - -namespace py = pybind11; +#include +#include +#include +#include +#include +#include +#include using namespace cudaq; static std::unique_ptr holder; -PYBIND11_MODULE(_quakeDialects, m) { +NB_MODULE(_quakeDialects, m) { holder = std::make_unique(); bindRegisterDialects(m); @@ -92,8 +94,10 @@ PYBIND11_MODULE(_quakeDialects, m) { holder->setTarget(*target, extraConfig); } }, - py::arg("option") = py::none(), py::arg("emulate") = py::none(), - py::arg("target") = py::none(), "Initialize the CUDA-Q environment."); + nanobind::arg("option") = nanobind::none(), + nanobind::arg("emulate") = nanobind::none(), + nanobind::arg("target") = nanobind::none(), + "Initialize the CUDA-Q environment."); bindRuntimeTarget(cudaqRuntime, *holder.get()); bindMeasureCounts(cudaqRuntime); @@ -197,41 +201,46 @@ PYBIND11_MODULE(_quakeDialects, m) { auto orcaSubmodule = 
cudaqRuntime.def_submodule("orca"); orcaSubmodule.def( "sample", - py::overload_cast &, std::vector &, - std::vector &, std::vector &, int, - std::size_t>(&orca::sample), + nanobind::overload_cast &, + std::vector &, std::vector &, + std::vector &, int, std::size_t>( + &orca::sample), "Performs Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("ps_angles"), py::arg("n_samples") = 10000, - py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("ps_angles"), + nanobind::arg("n_samples") = 10000, nanobind::arg("qpu_id") = 0); orcaSubmodule.def( "sample", - py::overload_cast &, std::vector &, - std::vector &, int, std::size_t>(&orca::sample), + nanobind::overload_cast &, + std::vector &, std::vector &, + int, std::size_t>(&orca::sample), "Performs Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("n_samples") = 10000, py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("n_samples") = 10000, + nanobind::arg("qpu_id") = 0); orcaSubmodule.def( "sample_async", - py::overload_cast &, std::vector &, - std::vector &, std::vector &, int, - std::size_t>(&orca::sample_async), + nanobind::overload_cast &, + std::vector &, std::vector &, + std::vector &, int, std::size_t>( + &orca::sample_async), "Performs Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("ps_angles"), py::arg("n_samples") = 10000, - py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("ps_angles"), + nanobind::arg("n_samples") = 10000, 
nanobind::arg("qpu_id") = 0); orcaSubmodule.def( "sample_async", - py::overload_cast &, std::vector &, - std::vector &, int, std::size_t>( - &orca::sample_async), + nanobind::overload_cast &, + std::vector &, std::vector &, + int, std::size_t>(&orca::sample_async), "Performs Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("n_samples") = 10000, py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("n_samples") = 10000, + nanobind::arg("qpu_id") = 0); auto photonicsSubmodule = cudaqRuntime.def_submodule("photonics"); photonicsSubmodule.def( @@ -239,7 +248,7 @@ PYBIND11_MODULE(_quakeDialects, m) { [](std::size_t &level) { return getExecutionManager()->allocateQudit(level); }, - "Allocate a qudit of given level.", py::arg("level")); + "Allocate a qudit of given level.", nanobind::arg("level")); photonicsSubmodule.def( "apply_operation", [](const std::string &name, std::vector ¶ms, @@ -254,20 +263,21 @@ PYBIND11_MODULE(_quakeDialects, m) { spin_op::identity()); }, "Apply the input photonics operation on the target qudits.", - py::arg("name"), py::arg("params"), py::arg("targets")); + nanobind::arg("name"), nanobind::arg("params"), nanobind::arg("targets")); photonicsSubmodule.def( "measure", [](std::size_t level, std::size_t id, const std::string ®Name) { return getExecutionManager()->measure(QuditInfo(level, id), regName); }, - "Measure the input qudit(s).", py::arg("level"), py::arg("qudit"), - py::arg("register_name") = ""); + "Measure the input qudit(s).", nanobind::arg("level"), + nanobind::arg("qudit"), nanobind::arg("register_name") = ""); photonicsSubmodule.def( "release_qudit", [](std::size_t level, std::size_t id) { getExecutionManager()->returnQudit(QuditInfo(level, id)); }, - "Release a qudit of given id.", py::arg("level"), py::arg("id")); + "Release a qudit of given 
id.", nanobind::arg("level"), + nanobind::arg("id")); cudaqRuntime.def("cloneModule", [](MlirModule mod) { return wrap(unwrap(mod).clone()); }); cudaqRuntime.def("isTerminator", [](MlirOperation op) { diff --git a/python/runtime/common/py_AnalogHamiltonian.cpp b/python/runtime/common/py_AnalogHamiltonian.cpp index b9049f6e961..ec182338e03 100644 --- a/python/runtime/common/py_AnalogHamiltonian.cpp +++ b/python/runtime/common/py_AnalogHamiltonian.cpp @@ -9,78 +9,78 @@ #include "py_AnalogHamiltonian.h" #include "common/AnalogHamiltonian.h" #include "common/JsonConvert.h" -#include - -namespace py = pybind11; +#include +#include +#include namespace cudaq { /// @brief Binds the `cudaq::ahs` classes. -void bindAnalogHamiltonian(py::module &mod) { +void bindAnalogHamiltonian(nanobind::module_ &mod) { - py::class_(mod, "AtomArrangement") - .def(py::init<>()) - .def_readwrite("sites", &cudaq::ahs::AtomArrangement::sites) - .def_readwrite("filling", &cudaq::ahs::AtomArrangement::filling); + nanobind::class_(mod, "AtomArrangement") + .def(nanobind::init<>()) + .def_rw("sites", &cudaq::ahs::AtomArrangement::sites) + .def_rw("filling", &cudaq::ahs::AtomArrangement::filling); - py::class_(mod, "SetUp") - .def(py::init<>()) - .def_readwrite("ahs_register", &cudaq::ahs::Setup::ahs_register); + nanobind::class_(mod, "SetUp") + .def(nanobind::init<>()) + .def_rw("ahs_register", &cudaq::ahs::Setup::ahs_register); - py::class_(mod, "TimeSeries") - .def(py::init<>()) - .def(py::init>>()) - .def_readwrite("values", &cudaq::ahs::TimeSeries::values) - .def_readwrite("times", &cudaq::ahs::TimeSeries::times); + nanobind::class_(mod, "TimeSeries") + .def(nanobind::init<>()) + .def(nanobind::init>>()) + .def_rw("values", &cudaq::ahs::TimeSeries::values) + .def_rw("times", &cudaq::ahs::TimeSeries::times); - py::class_(mod, "FieldPattern") + nanobind::class_(mod, "FieldPattern") /// NOTE: Other constructors not required from Python interface - .def(py::init<>()) - .def_readwrite("patternStr", 
&cudaq::ahs::FieldPattern::patternStr) - .def_readwrite("patternVals", &cudaq::ahs::FieldPattern::patternVals); - - py::class_(mod, "PhysicalField") - .def(py::init<>()) - .def_readwrite("time_series", &cudaq::ahs::PhysicalField::time_series) - .def_readwrite("pattern", &cudaq::ahs::PhysicalField::pattern); - - py::class_(mod, "DrivingField") - .def(py::init<>()) - .def_readwrite("amplitude", &cudaq::ahs::DrivingField::amplitude) - .def_readwrite("phase", &cudaq::ahs::DrivingField::phase) - .def_readwrite("detuning", &cudaq::ahs::DrivingField::detuning); - - py::class_(mod, "LocalDetuning") - .def(py::init<>()) - .def_readwrite("magnitude", &cudaq::ahs::LocalDetuning::magnitude); - - py::class_(mod, "Hamiltonian") - .def(py::init<>()) - .def_readwrite("drivingFields", &cudaq::ahs::Hamiltonian::drivingFields) - .def_readwrite("localDetuning", &cudaq::ahs::Hamiltonian::localDetuning); - - py::class_(mod, "Program") - .def(py::init<>()) - .def_readwrite("setup", &cudaq::ahs::Program::setup) - .def_readwrite("hamiltonian", &cudaq::ahs::Program::hamiltonian) + .def(nanobind::init<>()) + .def_rw("patternStr", &cudaq::ahs::FieldPattern::patternStr) + .def_rw("patternVals", &cudaq::ahs::FieldPattern::patternVals); + + nanobind::class_(mod, "PhysicalField") + .def(nanobind::init<>()) + .def_rw("time_series", &cudaq::ahs::PhysicalField::time_series) + .def_rw("pattern", &cudaq::ahs::PhysicalField::pattern); + + nanobind::class_(mod, "DrivingField") + .def(nanobind::init<>()) + .def_rw("amplitude", &cudaq::ahs::DrivingField::amplitude) + .def_rw("phase", &cudaq::ahs::DrivingField::phase) + .def_rw("detuning", &cudaq::ahs::DrivingField::detuning); + + nanobind::class_(mod, "LocalDetuning") + .def(nanobind::init<>()) + .def_rw("magnitude", &cudaq::ahs::LocalDetuning::magnitude); + + nanobind::class_(mod, "Hamiltonian") + .def(nanobind::init<>()) + .def_rw("drivingFields", &cudaq::ahs::Hamiltonian::drivingFields) + .def_rw("localDetuning", 
&cudaq::ahs::Hamiltonian::localDetuning); + + nanobind::class_(mod, "Program") + .def(nanobind::init<>()) + .def_rw("setup", &cudaq::ahs::Program::setup) + .def_rw("hamiltonian", &cudaq::ahs::Program::hamiltonian) .def( "to_json", [](const cudaq::ahs::Program &p) { return json(p).dump(); }, "Convert Program to JSON"); - py::class_(mod, "ShotMetadata") - .def(py::init<>()) - .def_readwrite("shotStatus", &cudaq::ahs::ShotMetadata::shotStatus); + nanobind::class_(mod, "ShotMetadata") + .def(nanobind::init<>()) + .def_rw("shotStatus", &cudaq::ahs::ShotMetadata::shotStatus); - py::class_(mod, "ShotResult") - .def(py::init<>()) - .def_readwrite("preSequence", &cudaq::ahs::ShotResult::preSequence) - .def_readwrite("postSequence", &cudaq::ahs::ShotResult::postSequence); + nanobind::class_(mod, "ShotResult") + .def(nanobind::init<>()) + .def_rw("preSequence", &cudaq::ahs::ShotResult::preSequence) + .def_rw("postSequence", &cudaq::ahs::ShotResult::postSequence); - py::class_(mod, "ShotMeasurement") - .def(py::init<>()) - .def_readwrite("shotMetadata", &cudaq::ahs::ShotMeasurement::shotMetadata) - .def_readwrite("shotResult", &cudaq::ahs::ShotMeasurement::shotResult); + nanobind::class_(mod, "ShotMeasurement") + .def(nanobind::init<>()) + .def_rw("shotMetadata", &cudaq::ahs::ShotMeasurement::shotMetadata) + .def_rw("shotResult", &cudaq::ahs::ShotMeasurement::shotResult); /// TODO: Add other classes if needed } diff --git a/python/runtime/common/py_AnalogHamiltonian.h b/python/runtime/common/py_AnalogHamiltonian.h index bfc098955a7..a1e039a8fa5 100644 --- a/python/runtime/common/py_AnalogHamiltonian.h +++ b/python/runtime/common/py_AnalogHamiltonian.h @@ -6,13 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Binds the `cudaq::ahs` classes. 
-void bindAnalogHamiltonian(py::module &mod); +void bindAnalogHamiltonian(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_CustomOpRegistry.cpp b/python/runtime/common/py_CustomOpRegistry.cpp index 35c5132695b..6d09cd8d69b 100644 --- a/python/runtime/common/py_CustomOpRegistry.cpp +++ b/python/runtime/common/py_CustomOpRegistry.cpp @@ -7,9 +7,10 @@ ******************************************************************************/ #include "py_CustomOpRegistry.h" #include "common/CustomOp.h" -#include -#include -#include +#include +#include +#include +#include namespace cudaq { struct py_unitary_operation : public unitary_operation { @@ -22,7 +23,7 @@ struct py_unitary_operation : public unitary_operation { } }; -void bindCustomOpRegistry(py::module &mod) { +void bindCustomOpRegistry(nanobind::module_ &mod) { mod.def( "register_custom_operation", [&](const std::string &opName) { diff --git a/python/runtime/common/py_CustomOpRegistry.h b/python/runtime/common/py_CustomOpRegistry.h index dcd4f2c2b2e..f9b6d2003eb 100644 --- a/python/runtime/common/py_CustomOpRegistry.h +++ b/python/runtime/common/py_CustomOpRegistry.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the custom operation registry to Python. 
-void bindCustomOpRegistry(py::module &mod); +void bindCustomOpRegistry(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_EvolveResult.cpp b/python/runtime/common/py_EvolveResult.cpp index 007acf6577e..6a57cebaa92 100644 --- a/python/runtime/common/py_EvolveResult.cpp +++ b/python/runtime/common/py_EvolveResult.cpp @@ -9,36 +9,36 @@ #include "py_EvolveResult.h" #include "common/EvolveResult.h" #include "cudaq/algorithms/evolve_internal.h" -#include -#include - -namespace py = pybind11; +#include +#include +#include namespace cudaq { /// @brief Bind the `cudaq::evolve_result` and `cudaq::async_evolve_result` /// data classes to python as `cudaq.EvolveResult` and /// `cudaq.AsyncEvolveResult`. -void bindEvolveResult(py::module &mod) { - py::class_( +void bindEvolveResult(nanobind::module_ &mod) { + nanobind::class_( mod, "EvolveResult", "Stores the execution data from an invocation of :func:`evolve`.\n") // IMPORTANT: state overloads must be provided before vector // overloads. Otherwise, Python might try to access the __len__ of state // during overload resolution. __len__ is not always well-defined for all // state types and may raise an exception. - .def(py::init()) - .def(py::init>()) - .def(py::init>()) - .def(py::init>()) - .def(py::init, - std::vector>>()) - .def(py::init, std::vector>>()) + .def(nanobind::init()) + .def(nanobind::init>()) + .def(nanobind::init>()) + .def(nanobind::init>()) + .def(nanobind::init, + std::vector>>()) + .def(nanobind::init, + std::vector>>()) .def( "final_state", - [](evolve_result &self) -> py::object { + [](evolve_result &self) -> nanobind::object { if (!self.states.has_value() || self.states->empty()) - return py::none(); - return py::cast(self.states->back()); + return nanobind::none(); + return nanobind::cast(self.states->back()); }, "Stores the final state produced by a call to :func:`evolve`. 
" "Represent the state of a quantum system after time evolution under " @@ -54,11 +54,11 @@ void bindEvolveResult(py::module &mod) { ":func:`evolve`.\n") .def( "final_expectation_values", - [](evolve_result &self) -> py::object { + [](evolve_result &self) -> nanobind::object { if (!self.expectation_values.has_value() || self.expectation_values->empty()) - return py::none(); - return py::cast(self.expectation_values->back()); + return nanobind::none(); + return nanobind::cast(self.expectation_values->back()); }, "Stores the final expectation values, that is the results produced " "by " @@ -81,12 +81,12 @@ void bindEvolveResult(py::module &mod) { "if no intermediate results were requested, or if no observables " "were specified in the call.\n"); - py::class_( + nanobind::class_( mod, "AsyncEvolveResult", "Stores the execution data from an invocation of :func:`evolve_async`.\n") .def( "get", [](async_evolve_result &self) { return self.get(); }, - py::call_guard(), + nanobind::call_guard(), "Retrieve the evolution result from the asynchronous evolve " "execution\n."); } diff --git a/python/runtime/common/py_EvolveResult.h b/python/runtime/common/py_EvolveResult.h index 48ddfb9950b..1bafe73cd2d 100644 --- a/python/runtime/common/py_EvolveResult.h +++ b/python/runtime/common/py_EvolveResult.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Binds `cudaq.EvolveResult` and `cudaq.AsyncEvolveResult`. 
-void bindEvolveResult(py::module &mod); +void bindEvolveResult(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_ExecutionContext.cpp b/python/runtime/common/py_ExecutionContext.cpp index 83b71fbea5e..132462462de 100644 --- a/python/runtime/common/py_ExecutionContext.cpp +++ b/python/runtime/common/py_ExecutionContext.cpp @@ -12,10 +12,10 @@ #include "cudaq/utils/cudaq_utils.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" #include -#include -#include - -namespace py = pybind11; +#include +#include +#include +#include namespace nvqir { std::string_view getQirOutputLog(); @@ -28,30 +28,29 @@ class PersistJITEngine {}; namespace cudaq { -void bindExecutionContext(py::module &mod) { - py::class_(mod, "ExecutionContext") - .def(py::init()) - .def(py::init(), py::arg("name"), - py::arg("shots"), py::arg("qpu_id") = 0) - .def_readwrite("kernelName", &cudaq::ExecutionContext::kernelName) - .def_readonly("result", &cudaq::ExecutionContext::result) - .def_readwrite("asyncExec", &cudaq::ExecutionContext::asyncExec) - .def_readonly("asyncResult", &cudaq::ExecutionContext::asyncResult) - .def_readwrite("hasConditionalsOnMeasureResults", - &cudaq::ExecutionContext::hasConditionalsOnMeasureResults) - .def_readwrite("totalIterations", - &cudaq::ExecutionContext::totalIterations) - .def_readwrite("batchIteration", &cudaq::ExecutionContext::batchIteration) - .def_readwrite("numberTrajectories", - &cudaq::ExecutionContext::numberTrajectories) - .def_readwrite("explicitMeasurements", - &cudaq::ExecutionContext::explicitMeasurements) - .def_readwrite("allowJitEngineCaching", - &cudaq::ExecutionContext::allowJitEngineCaching) - .def_readwrite("useParametricJit", - &cudaq::ExecutionContext::useParametricJit) - .def_readonly("invocationResultBuffer", - &cudaq::ExecutionContext::invocationResultBuffer) +void bindExecutionContext(nanobind::module_ &mod) { + nanobind::class_(mod, "ExecutionContext") + .def(nanobind::init()) + .def(nanobind::init(), + 
nanobind::arg("name"), nanobind::arg("shots"), + nanobind::arg("qpu_id") = 0) + .def_rw("kernelName", &cudaq::ExecutionContext::kernelName) + .def_ro("result", &cudaq::ExecutionContext::result) + .def_rw("asyncExec", &cudaq::ExecutionContext::asyncExec) + .def_ro("asyncResult", &cudaq::ExecutionContext::asyncResult) + .def_rw("hasConditionalsOnMeasureResults", + &cudaq::ExecutionContext::hasConditionalsOnMeasureResults) + .def_rw("totalIterations", &cudaq::ExecutionContext::totalIterations) + .def_rw("batchIteration", &cudaq::ExecutionContext::batchIteration) + .def_rw("numberTrajectories", + &cudaq::ExecutionContext::numberTrajectories) + .def_rw("explicitMeasurements", + &cudaq::ExecutionContext::explicitMeasurements) + .def_rw("allowJitEngineCaching", + &cudaq::ExecutionContext::allowJitEngineCaching) + .def_rw("useParametricJit", &cudaq::ExecutionContext::useParametricJit) + .def_ro("invocationResultBuffer", + &cudaq::ExecutionContext::invocationResultBuffer) .def("unset_jit_engine", [&](cudaq::ExecutionContext &execCtx) { if (execCtx.jitEng) { @@ -68,44 +67,50 @@ void bindExecutionContext(py::module &mod) { [](cudaq::ExecutionContext &ctx) { return ctx.expectationValue; }) // ----- Context management using with blocks ----- // Unlike in C++, we do not support nested execution contexts in Python. - .def("__enter__", - [](cudaq::ExecutionContext &ctx) -> ExecutionContext & { - if (cudaq::getExecutionContext()) { - throw std::runtime_error("Context already set. Nested execution " - "contexts are not supported in Python"); - } - auto &platform = cudaq::get_platform(); - platform.configureExecutionContext(ctx); - cudaq::detail::setExecutionContext(&ctx); - platform.beginExecution(); - return ctx; - }) - .def("__exit__", [](cudaq::ExecutionContext &ctx, py::object type, - py::object value, py::object traceback) { - if (type.is_none()) { - // Normal exit: finalize results, clean up the simulator, - // and reset the context (guaranteed even if finalize throws). 
- auto &platform = cudaq::get_platform(); - detail::try_finally( - [&] { + .def( + "__enter__", + [](cudaq::ExecutionContext &ctx) -> ExecutionContext & { + if (cudaq::getExecutionContext()) { + throw std::runtime_error("Context already set. Nested execution " + "contexts are not supported in Python"); + } + auto &platform = cudaq::get_platform(); + platform.configureExecutionContext(ctx); + cudaq::detail::setExecutionContext(&ctx); + platform.beginExecution(); + return ctx; + }, + nanobind::rv_policy::reference) + .def( + "__exit__", + [](cudaq::ExecutionContext &ctx, nanobind::object type, + nanobind::object value, nanobind::object traceback) { + if (type.is_none()) { + // Normal exit: finalize results, clean up the simulator, + // and reset the context (guaranteed even if finalize throws). + auto &platform = cudaq::get_platform(); + detail::try_finally( + [&] { + platform.finalizeExecutionContext(ctx); + platform.endExecution(); + }, + detail::resetExecutionContext); + } else { + // The kernel threw. Still need to tear down the platform so + // the simulator doesn't carry stale state into the next run. + // Separate invoke_no_throw so the context reset always runs. + detail::invoke_no_throw([&] { + auto &platform = cudaq::get_platform(); platform.finalizeExecutionContext(ctx); platform.endExecution(); - }, - detail::resetExecutionContext); - } else { - // The kernel threw. Still need to tear down the platform so - // the simulator doesn't carry stale state into the next run. - // Separate invoke_no_throw so the context reset always runs. - detail::invoke_no_throw([&] { - auto &platform = cudaq::get_platform(); - platform.finalizeExecutionContext(ctx); - platform.endExecution(); - }); - // Always reset context, even if the above cleanup failed. - detail::invoke_no_throw(detail::resetExecutionContext); - } - return false; - }); + }); + // Always reset context, even if the above cleanup failed. 
+ detail::invoke_no_throw(detail::resetExecutionContext); + } + return false; + }, + nanobind::arg("type").none(), nanobind::arg("value").none(), + nanobind::arg("traceback").none()); mod.def("supportsExplicitMeasurements", []() { auto &platform = cudaq::get_platform(); return platform.supports_explicit_measurements(); @@ -121,33 +126,35 @@ void bindExecutionContext(py::module &mod) { return !isRemoteSimulator && (platform.is_remote() || platform.is_emulated()); }, - py::arg("qpuId") = 0); + nanobind::arg("qpuId") = 0); mod.def("getQirOutputLog", []() { return nvqir::getQirOutputLog(); }); mod.def("clearQirOutputLog", []() { nvqir::clearQirOutputLog(); }); mod.def("decodeQirOutputLog", - [](const std::string &outputLog, py::buffer decodedResults) { + [](const std::string &outputLog, nanobind::bytearray decodedResults) { cudaq::RecordLogParser parser; parser.parse(outputLog); - auto info = decodedResults.request(); - // Get the buffer and length of buffer (in bytes) from the parser. auto *origBuffer = parser.getBufferPtr(); const std::size_t bufferSize = parser.getBufferSize(); - std::memcpy(info.ptr, origBuffer, bufferSize); + std::memcpy(decodedResults.data(), origBuffer, bufferSize); }); - py::class_( + nanobind::class_( mod, "reuse_compiler_artifacts", "Within this context, CUDAQ will blindly reuse compiled objects." 
"It is up to the user to ensure that there are never two distinct" "computations launched within a single context.") - .def(py::init()) + .def(nanobind::init<>()) .def("__enter__", [](PersistJITEngine &ctx) -> void { cudaq::compiler_artifact::enablePersistentJITEngine(); }) - .def("__exit__", [](PersistJITEngine &ctx, py::object type, - py::object value, py::object traceback) { - cudaq::compiler_artifact::disablePersistentJITEngine(); - }); + .def( + "__exit__", + [](PersistJITEngine &ctx, nanobind::object type, + nanobind::object value, nanobind::object traceback) { + cudaq::compiler_artifact::disablePersistentJITEngine(); + }, + nanobind::arg("type").none(), nanobind::arg("value").none(), + nanobind::arg("traceback").none()); } } // namespace cudaq diff --git a/python/runtime/common/py_ExecutionContext.h b/python/runtime/common/py_ExecutionContext.h index d4004941135..7df4e909b43 100644 --- a/python/runtime/common/py_ExecutionContext.h +++ b/python/runtime/common/py_ExecutionContext.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindExecutionContext(py::module &mod); +void bindExecutionContext(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_NoiseModel.cpp b/python/runtime/common/py_NoiseModel.cpp index 82563f86284..cf4f96b85cc 100644 --- a/python/runtime/common/py_NoiseModel.cpp +++ b/python/runtime/common/py_NoiseModel.cpp @@ -10,43 +10,41 @@ #include "common/NoiseModel.h" #include "cudaq.h" #include -#include -#include -#include +#include +#include +#include +#include +#include +#include namespace cudaq { -/// @brief Extract the array data from a buffer_info into our +/// @brief Extract the array data from a 2-d ndarray into our /// own allocated data pointer. /// This supports 2-d array in either row or column major. 
-void extractKrausData(py::buffer_info &info, complex *data) { - if (info.format != py::format_descriptor::format()) - throw std::runtime_error( - "Incompatible buffer format, must be np.complex128."); - - if (info.ndim != 2) - throw std::runtime_error("Incompatible buffer shape " + - std::to_string(info.ndim) + "."); +void extractKrausData(nanobind::ndarray, nanobind::ndim<2>, + nanobind::c_contig> + arr, + complex *data) { + auto rows = arr.shape(0); + auto cols = arr.shape(1); + auto *srcData = static_cast *>(arr.data()); constexpr bool rowMajor = true; - typedef Eigen::MatrixXcd::Scalar Scalar; typedef Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> RowMajorMat; auto strides = Eigen::Stride( - info.strides[rowMajor ? 0 : 1] / (py::ssize_t)sizeof(Scalar), - info.strides[rowMajor ? 1 : 0] / (py::ssize_t)sizeof(Scalar)); - auto map = - Eigen::Map>( - static_cast(info.ptr), info.shape[0], info.shape[1], - strides); + arr.stride(rowMajor ? 0 : 1), arr.stride(rowMajor ? 1 : 0)); + auto map = Eigen::Map>( + srcData, rows, cols, strides); RowMajorMat eigenMat(map); - memcpy(data, eigenMat.data(), - sizeof(complex) * (info.shape[0] * info.shape[1])); + memcpy(data, eigenMat.data(), sizeof(complex) * (rows * cols)); } /// @brief Bind the cudaq::noise_model, kraus_op, and kraus_channel. 
-void bindNoiseModel(py::module &mod) { +void bindNoiseModel(nanobind::module_ &mod) { mod.def("set_noise", &set_noise, "Set the underlying noise model."); mod.def("unset_noise", &unset_noise, @@ -54,86 +52,87 @@ void bindNoiseModel(py::module &mod) { mod.def( "get_noise", []() { return cudaq::get_platform().get_noise(); }, "Get the underlying noise model."); - py::class_( + nanobind::class_( mod, "NoiseModel", "The `NoiseModel` defines a set of :class:`KrausChannel`'s applied to " "specific qubits after the invocation of specified quantum operations.") - .def(py::init<>([mod]() { - // Create the noise model - auto model = std::make_unique(); - - // Define a map of channel names to generator functions - static std::map &)>> - channelGenerators = { - {"DepolarizationChannel", - [](const std::vector &p) -> kraus_channel { - return depolarization_channel(p); - }}, - {"AmplitudeDampingChannel", - [](const std::vector &p) -> kraus_channel { - return amplitude_damping_channel(p); - }}, - {"BitFlipChannel", - [](const std::vector &p) -> kraus_channel { - return bit_flip_channel(p); - }}, - {"PhaseFlipChannel", - [](const std::vector &p) -> kraus_channel { - return phase_flip_channel(p); - }}, - {"XError", - [](const std::vector &p) -> kraus_channel { - return x_error(p); - }}, - {"YError", - [](const std::vector &p) -> kraus_channel { - return y_error(p); - }}, - {"ZError", - [](const std::vector &p) -> kraus_channel { - return z_error(p); - }}, - {"PhaseDamping", - [](const std::vector &p) -> kraus_channel { - return phase_damping(p); - }}, - {"Pauli1", - [](const std::vector &p) -> kraus_channel { - return pauli1(p); - }}, - {"Pauli2", - [](const std::vector &p) -> kraus_channel { - return pauli2(p); - }}, - {"Depolarization1", - [](const std::vector &p) -> kraus_channel { - return depolarization1(p); - }}, - {"Depolarization2", - [](const std::vector &p) -> kraus_channel { - return depolarization2(p); - }}}; - - // Register each channel generator - for (const auto 
&[name, generator] : channelGenerators) { - if (py::hasattr(mod, name.c_str())) { - py::type channelType = py::getattr(mod, name.c_str()); - auto key = py::hash(channelType); - model->register_channel(key, generator); - } - } - - return model; - }), - "Construct a noise model with all built-in channels pre-registered.") + .def( + "__init__", + [mod](noise_model *self) { + new (self) noise_model(); + + // Define a map of channel names to generator functions + static std::map &)>> + channelGenerators = { + {"DepolarizationChannel", + [](const std::vector &p) -> kraus_channel { + return depolarization_channel(p); + }}, + {"AmplitudeDampingChannel", + [](const std::vector &p) -> kraus_channel { + return amplitude_damping_channel(p); + }}, + {"BitFlipChannel", + [](const std::vector &p) -> kraus_channel { + return bit_flip_channel(p); + }}, + {"PhaseFlipChannel", + [](const std::vector &p) -> kraus_channel { + return phase_flip_channel(p); + }}, + {"XError", + [](const std::vector &p) -> kraus_channel { + return x_error(p); + }}, + {"YError", + [](const std::vector &p) -> kraus_channel { + return y_error(p); + }}, + {"ZError", + [](const std::vector &p) -> kraus_channel { + return z_error(p); + }}, + {"PhaseDamping", + [](const std::vector &p) -> kraus_channel { + return phase_damping(p); + }}, + {"Pauli1", + [](const std::vector &p) -> kraus_channel { + return pauli1(p); + }}, + {"Pauli2", + [](const std::vector &p) -> kraus_channel { + return pauli2(p); + }}, + {"Depolarization1", + [](const std::vector &p) -> kraus_channel { + return depolarization1(p); + }}, + {"Depolarization2", + [](const std::vector &p) -> kraus_channel { + return depolarization2(p); + }}}; + + // Register each channel generator + for (const auto &[name, generator] : channelGenerators) { + if (nanobind::hasattr(mod, name.c_str())) { + nanobind::type_object channelType = + nanobind::borrow( + nanobind::getattr(mod, name.c_str())); + auto key = nanobind::hash(channelType); + 
self->register_channel(key, generator); + } + } + }, + "Construct a noise model with all built-in channels pre-registered.") .def( "register_channel", - [](noise_model &self, const py::type krausT) { - auto key = py::hash(krausT); + [](noise_model &self, const nanobind::type_object krausT) { + auto key = nanobind::hash(krausT); std::function &)> lambda = [krausT](const std::vector &p) -> kraus_channel { - return krausT(p).cast(); + return nanobind::cast(krausT(p)); }; self.register_channel(key, lambda); }, @@ -144,7 +143,8 @@ void bindNoiseModel(py::module &mod) { std::vector &qubits, kraus_channel &channel) { self.add_channel(opName, qubits, channel); }, - py::arg("operator"), py::arg("qubits"), py::arg("channel"), + nanobind::arg("operator"), nanobind::arg("qubits"), + nanobind::arg("channel"), R"#(Add the given :class:`KrausChannel` to be applied after invocation of the specified quantum operation. @@ -159,7 +159,7 @@ of the specified quantum operation. const noise_model::PredicateFuncTy &pre) { self.add_channel(opName, pre); }, - py::arg("operator"), py::arg("pre"), + nanobind::arg("operator"), nanobind::arg("pre"), R"#(Add the given :class:`KrausChannel` generator callback to be applied after invocation of the specified quantum operation. @@ -173,7 +173,8 @@ of the specified quantum operation. std::size_t num_controls = 0) { self.add_all_qubit_channel(opName, channel, num_controls); }, - py::arg("operator"), py::arg("channel"), py::arg("num_controls") = 0, + nanobind::arg("operator"), nanobind::arg("channel"), + nanobind::arg("num_controls") = 0, R"#(Add the given :class:`KrausChannel` to be applied after invocation of the specified quantum operation on arbitrary qubits. @@ -189,7 +190,7 @@ of the specified quantum operation on arbitrary qubits. 
const std::vector &qubits) { return self.get_channels(op, qubits); }, - py::arg("operator"), py::arg("qubits"), + nanobind::arg("operator"), nanobind::arg("qubits"), "Return the :class:`KrausChannel`'s that make up this noise model.") .def( "get_channels", @@ -198,35 +199,44 @@ of the specified quantum operation on arbitrary qubits. const std::vector &controls) { return self.get_channels(op, qubits, controls); }, - py::arg("operator"), py::arg("qubits"), py::arg("controls"), + nanobind::arg("operator"), nanobind::arg("qubits"), + nanobind::arg("controls"), "Return the :class:`KrausChannel`'s that make up this noise model."); } -void bindKrausOp(py::module &mod) { - py::class_( - mod, "KrausOperator", py::buffer_protocol(), +void bindKrausOp(nanobind::module_ &mod) { + nanobind::class_( + mod, "KrausOperator", "The `KrausOperator` is represented by a matrix and serves as an element " "of a quantum channel such that :code:`Sum Ki Ki^dag = I.`") - .def_buffer([](kraus_op &op) -> py::buffer_info { - return py::buffer_info(op.data.data(), sizeof(complex), - py::format_descriptor::format(), 2, - {op.nRows, op.nCols}, - {sizeof(complex) * op.nCols, sizeof(complex)}); - }) - .def(py::init([](const py::buffer &b) { - py::buffer_info info = b.request(); - std::vector v(info.shape[0] * info.shape[1]); - extractKrausData(info, v.data()); - return kraus_op(v); - }), - "Create a :class:`KrausOperator` from a buffer of data, like a " - "numpy array.") - .def_readonly("row_count", &kraus_op::nRows, - "The number of rows in the matrix representation of this " - ":class:`KrausOperator`.") - .def_readonly("col_count", &kraus_op::nCols, - "The number of columns in the matrix representation of " - "this :class:`KrausOperator`."); + .def( + "__array__", + [](kraus_op &op, nanobind::object dtype_obj, + nanobind::object copy_obj) { + size_t shape[2] = {op.nRows, op.nCols}; + return nanobind::ndarray>( + op.data.data(), 2, shape, nanobind::handle()); + }, + nanobind::arg("dtype") = 
nanobind::none(), + nanobind::arg("copy") = nanobind::none()) + .def( + "__init__", + [](kraus_op *self, + nanobind::ndarray, nanobind::ndim<2>, + nanobind::c_contig> + arr) { + std::vector v(arr.shape(0) * arr.shape(1)); + extractKrausData(arr, v.data()); + new (self) kraus_op(v); + }, + "Create a :class:`KrausOperator` from a buffer of data, like a " + "numpy array.") + .def_ro("row_count", &kraus_op::nRows, + "The number of rows in the matrix representation of this " + ":class:`KrausOperator`.") + .def_ro("col_count", &kraus_op::nCols, + "The number of columns in the matrix representation of " + "this :class:`KrausOperator`."); } // Need a trampoline class to make this sub-class-able from Python @@ -235,8 +245,8 @@ class PyKrausChannel : public kraus_channel { using kraus_channel::kraus_channel; }; -void bindNoiseChannels(py::module &mod) { - py::enum_(mod, "NoiseModelType") +void bindNoiseChannels(nanobind::module_ &mod) { + nanobind::enum_(mod, "NoiseModelType") .value("Unknown", cudaq::noise_model_type::unknown) .value("DepolarizationChannel", cudaq::noise_model_type::depolarization_channel) @@ -254,46 +264,50 @@ void bindNoiseChannels(py::module &mod) { .value("Depolarization1", cudaq::noise_model_type::depolarization1) .value("Depolarization2", cudaq::noise_model_type::depolarization2); - py::class_( - mod, "KrausChannel", py::dynamic_attr(), + nanobind::class_( + mod, "KrausChannel", "The `KrausChannel` is composed of a list of " ":class:`KrausOperator`'s and " "is applied to a specific qubit or set of qubits.") - .def(py::init<>(), "Create an empty :class:`KrausChannel`") - .def(py::init &>(), + .def(nanobind::init<>(), "Create an empty :class:`KrausChannel`") + .def(nanobind::init &>(), "Create a :class:`KrausChannel` composed of a list of " ":class:`KrausOperator`'s.") - .def(py::init([](py::list ops) { - std::vector kops; - for (std::size_t i = 0; i < ops.size(); i++) { - auto buffer = ops[i].cast(); - auto info = buffer.request(); - auto shape = 
info.shape; - std::vector v(shape[0] * shape[1]); - extractKrausData(info, v.data()); - kops.emplace_back(v); - } - return kraus_channel(kops); - }), - "Create a :class:`KrausChannel` given a list of " - ":class:`KrausOperator`'s.") - .def_readwrite("parameters", &kraus_channel::parameters) - .def_readwrite("noise_type", &kraus_channel::noise_type) + .def( + "__init__", + [](kraus_channel *self, nanobind::list ops) { + std::vector kops; + for (std::size_t i = 0; i < ops.size(); i++) { + auto arr = nanobind::cast, nanobind::ndim<2>, nanobind::c_contig>>( + ops[i]); + auto rows = arr.shape(0); + auto cols = arr.shape(1); + std::vector v(rows * cols); + extractKrausData(arr, v.data()); + kops.emplace_back(v); + } + new (self) kraus_channel(kops); + }, + "Create a :class:`KrausChannel` given a list of " + ":class:`KrausOperator`'s.") + .def_rw("parameters", &kraus_channel::parameters) + .def_rw("noise_type", &kraus_channel::noise_type) .def("get_ops", &kraus_channel::get_ops, "Return the :class:`KrausOperator`'s in this :class:`KrausChannel`.") .def( "__getitem__", [](kraus_channel &self, std::size_t idx) { return self[idx]; }, - py::arg("index"), + nanobind::arg("index"), "Return the :class:`KrausOperator` at the given index in this " ":class:`KrausChannel`.") .def( "append", [](kraus_channel &self, kraus_op op) { self.push_back(op); }, - py::arg("operator"), + nanobind::arg("operator"), "Add a :class:`KrausOperator` to this :class:`KrausChannel`."); - py::class_( + nanobind::class_( mod, "DepolarizationChannel", R"#(Models the decoherence of the qubit state and phase into a mixture " of the computational basis states, `|0>` and `|1>`. @@ -318,15 +332,15 @@ void bindNoiseChannels(py::module &mod) { For `probability = 0.0`, the channel will behave noise-free. For `probability = 0.75`, the channel will fully depolarize the state. 
For `probability = 1.0`, the channel will be uniform.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `DepolarizationChannel` with the provided " "`probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &depolarization_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "AmplitudeDampingChannel", R"#(Models the dissipation of energy due to system interactions with the environment. @@ -341,15 +355,15 @@ void bindNoiseChannels(py::module &mod) { representing the probability that the qubit will decay to its ground state. The probability of the qubit remaining in the same state is therefore `1 - probability`.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `AmplitudeDampingChannel` with the provided " "`probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &litude_damping_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "BitFlipChannel", R"#(Models the decoherence of the qubit state. 
Its constructor expects a float value, `probability`, representing the probability that the qubit @@ -364,14 +378,14 @@ void bindNoiseChannels(py::module &mod) { The probability of the qubit remaining in the same state is therefore `1 - probability`.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `BitFlipChannel` with the provided `probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &bit_flip_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "PhaseFlipChannel", R"#(Models the decoherence of the qubit phase. Its constructor expects a float value, `probability`, representing the probability of a random @@ -385,95 +399,95 @@ void bindNoiseChannels(py::module &mod) { The probability of the qubit phase remaining untouched is therefore `1 - probability`.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `PhaseFlipChannel` with the provided `probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &phase_flip_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "PhaseDamping", R"#(A Kraus channel that models the single-qubit phase damping error. This is similar to AmplitudeDamping, but for phase.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &phase_damping::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "ZError", R"#(A Pauli error that applies the Z operator when an error occurs. 
It is the same as PhaseFlipChannel.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &z_error::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "XError", R"#(A Pauli error that applies the X operator when an error occurs. It is the same as BitFlipChannel.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &x_error::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "YError", R"#(A Pauli error that applies the Y operator when an error occurs.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &y_error::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Pauli1", R"#(A single-qubit Pauli error that applies either an X error, Y error, or Z error. The probability of each X, Y, or Z error is supplied as a parameter.)#") - .def(py::init>()) - .def_readonly_static( + .def(nanobind::init>()) + .def_ro_static( "num_parameters", &pauli1::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Pauli2", R"#(A 2-qubit Pauli error that applies one of the following errors, with the probabilities specified as a vector. 
Possible errors: IX, IY, IZ, XI, XX, XY, XZ, YI, YX, YY, YZ, ZI, ZX, ZY, and ZZ.)#") - .def(py::init>()) - .def_readonly_static( + .def(nanobind::init>()) + .def_ro_static( "num_parameters", &pauli2::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Depolarization1", R"#(The same as DepolarizationChannel (single qubit depolarization))#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &depolarization1::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Depolarization2", R"#(A 2-qubit depolarization error that applies one of the following errors. Possible errors: IX, IY, IZ, XI, XX, XY, XZ, YI, YX, YY, YZ, ZI, ZX, ZY, and ZZ.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &depolarization2::num_parameters, "The number of parameters this channel requires at construction."); } -void bindNoise(py::module &mod) { +void bindNoise(nanobind::module_ &mod) { bindNoiseModel(mod); bindKrausOp(mod); bindNoiseChannels(mod); diff --git a/python/runtime/common/py_NoiseModel.h b/python/runtime/common/py_NoiseModel.h index 75c0f0b8df7..cc03a52e138 100644 --- a/python/runtime/common/py_NoiseModel.h +++ b/python/runtime/common/py_NoiseModel.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the cudaq::noise_model data-type to Python. 
-void bindNoise(py::module &mod); +void bindNoise(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_ObserveResult.cpp b/python/runtime/common/py_ObserveResult.cpp index d26185673ee..5383391b9dc 100644 --- a/python/runtime/common/py_ObserveResult.cpp +++ b/python/runtime/common/py_ObserveResult.cpp @@ -11,21 +11,23 @@ #include "common/ObserveResult.h" #include "cudaq/algorithms/observe.h" -namespace py = pybind11; +#include +#include + namespace { // FIXME(OperatorCpp): Remove this when the operator class is implemented in // C++ -cudaq::spin_op to_spin_op(py::object &obj) { - if (py::hasattr(obj, "_to_spinop")) - return obj.attr("_to_spinop")().cast(); - return obj.cast(); +cudaq::spin_op to_spin_op(nanobind::object &obj) { + if (nanobind::hasattr(obj, "_to_spinop")) + return nanobind::cast(obj.attr("_to_spinop")()); + return nanobind::cast(obj); } -cudaq::spin_op to_spin_op_term(py::object &obj) { +cudaq::spin_op to_spin_op_term(nanobind::object &obj) { auto op = cudaq::spin_op::empty(); - if (py::hasattr(obj, "_to_spinop")) - op = obj.attr("_to_spinop")().cast(); + if (nanobind::hasattr(obj, "_to_spinop")) + op = nanobind::cast(obj.attr("_to_spinop")()); else - op = obj.cast(); + op = nanobind::cast(obj); if (op.num_terms() != 1) throw std::invalid_argument("expecting a spin op with a single term"); return *op.begin(); @@ -46,21 +48,23 @@ namespace cudaq { /// @brief Bind the `cudaq::observe_result` and `cudaq::async_observe_result` /// data classes to python as `cudaq.ObserveResult` and /// `cudaq.AsyncObserveResult`. -void bindObserveResult(py::module &mod) { - py::class_( +void bindObserveResult(nanobind::module_ &mod) { + nanobind::class_( mod, "ObserveResult", "A data-type containing the results of a call to :func:`observe`. 
" "This includes any measurement counts data, as well as the global " "expectation value of the user-defined `spin_operator`.\n") - .def(py::init()) - .def(py::init( - [](double exp_val, const spin_op &spin_op, sample_result result) { - return observe_result(exp_val, spin_op, result); - })) - .def(py::init( - [](double exp_val, py::object spin_op, sample_result result) { - return observe_result(exp_val, to_spin_op(spin_op), result); - })) + .def(nanobind::init()) + .def("__init__", + [](observe_result *self, double exp_val, const spin_op &spin_op, + sample_result result) { + new (self) observe_result(exp_val, spin_op, result); + }) + .def("__init__", + [](observe_result *self, double exp_val, nanobind::object spin_op, + sample_result result) { + new (self) observe_result(exp_val, to_spin_op(spin_op), result); + }) /// @brief Bind the member functions of `cudaq.ObserveResult`. .def("dump", &observe_result::dump, "Dump the raw data from the :class:`SampleResult` that are stored " @@ -79,18 +83,18 @@ void bindObserveResult(py::module &mod) { [](observe_result &self, const spin_op_term &sub_term) { return self.counts(sub_term); }, - py::arg("sub_term"), "") + nanobind::arg("sub_term"), "") .def( "counts", - [](observe_result &self, py::object sub_term) { + [](observe_result &self, nanobind::object sub_term) { return self.counts(to_spin_op_term(sub_term)); }, - py::arg("sub_term"), - R"#(Given a `sub_term` of the global `spin_operator` that was passed + nanobind::arg("sub_term"), + R"#(Given a `sub_term` of the global `spin_operator` that was passed to :func:`observe`, return its measurement counts. Args: - sub_term (`SpinOperator`): An individual sub-term of the + sub_term (`SpinOperator`): An individual sub-term of the `spin_operator`. Returns: @@ -104,7 +108,7 @@ to :func:`observe`, return its measurement counts. 
1); return self.counts(sub_term); }, - py::arg("sub_term"), + nanobind::arg("sub_term"), "Deprecated - ensure to pass a SpinOperatorTerm instead of a " "SpinOperator") .def( @@ -117,22 +121,22 @@ to :func:`observe`, return its measurement counts. [](observe_result &self, const spin_op_term &spin_term) { return self.expectation(spin_term); }, - py::arg("sub_term"), "") + nanobind::arg("sub_term"), "") .def( "expectation", - [](observe_result &self, py::object spin_term) { + [](observe_result &self, nanobind::object spin_term) { return self.expectation(to_spin_op_term(spin_term)); }, - py::arg("sub_term"), - R"#(Return the expectation value of an individual `sub_term` of the + nanobind::arg("sub_term"), + R"#(Return the expectation value of an individual `sub_term` of the global `spin_operator` that was passed to :func:`observe`. Args: - sub_term (:class:`SpinOperatorTerm`): An individual sub-term of the + sub_term (:class:`SpinOperatorTerm`): An individual sub-term of the `spin_operator`. Returns: - float : The expectation value of the `sub_term` with respect to the + float : The expectation value of the `sub_term` with respect to the :class:`Kernel` that was passed to :func:`observe`.)#") .def( "expectation", @@ -144,36 +148,37 @@ global `spin_operator` that was passed to :func:`observe`. return self.expectation(spin_term); }, - py::arg("sub_term"), + nanobind::arg("sub_term"), "Deprecated - ensure to pass a SpinOperatorTerm instead of a " "SpinOperator"); - py::class_( + nanobind::class_( mod, "AsyncObserveResult", - R"#(A data-type containing the results of a call to :func:`observe_async`. - -The `AsyncObserveResult` contains a future, whose :class:`ObserveResult` -may be returned via an invocation of the `get` method. + R"#(A data-type containing the results of a call to :func:`observe_async`. + +The `AsyncObserveResult` contains a future, whose :class:`ObserveResult` +may be returned via an invocation of the `get` method. 
This kicks off a wait on the current thread until the results are available. See `future `_ for more information on this programming pattern.)#") - .def(py::init([](std::string inJson, spin_op op) { - async_observe_result f(&op); - std::istringstream is(inJson); - is >> f; - return f; - })) - .def(py::init([](std::string inJson, py::object op) { - auto as_spin_op = to_spin_op(op); - async_observe_result f(&as_spin_op); - std::istringstream is(inJson); - is >> f; - return f; - })) + .def("__init__", + [](async_observe_result *self, std::string inJson, spin_op op) { + new (self) async_observe_result(&op); + std::istringstream is(inJson); + is >> *self; + }) + .def("__init__", + [](async_observe_result *self, std::string inJson, + nanobind::object op) { + auto as_spin_op = to_spin_op(op); + new (self) async_observe_result(&as_spin_op); + std::istringstream is(inJson); + is >> *self; + }) .def("get", &async_observe_result::get, - py::call_guard(), + nanobind::call_guard(), "Returns the :class:`ObserveResult` from the asynchronous observe " "execution.") .def("__str__", [](async_observe_result &self) { diff --git a/python/runtime/common/py_ObserveResult.h b/python/runtime/common/py_ObserveResult.h index 920a09c78e1..823d0b0ee6a 100644 --- a/python/runtime/common/py_ObserveResult.h +++ b/python/runtime/common/py_ObserveResult.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Binds `cudaq.ObserveResult` and `cudaq.AsyncObserveResult`. 
-void bindObserveResult(py::module &mod); +void bindObserveResult(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_Resources.cpp b/python/runtime/common/py_Resources.cpp index c777b185aaa..07098a83377 100644 --- a/python/runtime/common/py_Resources.cpp +++ b/python/runtime/common/py_Resources.cpp @@ -6,8 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include +#include +#include +#include +#include +#include #include "py_Resources.h" @@ -17,14 +20,14 @@ namespace cudaq { -void bindResources(py::module &mod) { +void bindResources(nanobind::module_ &mod) { using namespace cudaq; - py::class_( + nanobind::class_( mod, "Resources", - R"#(A data-type containing the results of a call to :func:`estimate_resources`. + R"#(A data-type containing the results of a call to :func:`estimate_resources`. This includes all gate counts.)#") - .def(py::init<>()) + .def(nanobind::init<>()) .def( "dump", [](Resources &self) { self.dump(); }, "Print a string of the raw resource counts data to the " @@ -59,41 +62,33 @@ This includes all gate counts.)#") "to_dict", [](Resources &self) { return self.gateCounts(); }, "Return a dictionary of the raw resource counts that are stored in " "`self`.\n") - .def_property_readonly( - "num_qubits", &Resources::getNumQubits, - "The total number of qubits allocated in the kernel.\n") - .def_property_readonly( - "num_used_qubits", &Resources::getNumUsedQubits, - "The number of qubits touched by at least one quantum " - "operation.\n") - .def_property_readonly( - "depth", &Resources::getCircuitDepth, - "The circuit depth (longest gate chain on any qubit).\n") - .def_property_readonly( + .def_prop_ro("num_qubits", &Resources::getNumQubits, + "The total number of qubits allocated in the kernel.\n") + .def_prop_ro("num_used_qubits", &Resources::getNumUsedQubits, + "The number of qubits touched 
by at least one quantum " + "operation.\n") + .def_prop_ro("depth", &Resources::getCircuitDepth, + "The circuit depth (longest gate chain on any qubit).\n") + .def_prop_ro( "gate_count_by_arity", - [](Resources &self) { - return py::dict(py::cast(self.getGateCountsByArity())); - }, + [](Resources &self) { return self.getGateCountsByArity(); }, "Gate counts by qubit arity, as a dict mapping arity to count.\n") .def("gate_count_for_arity", &Resources::getGateCountByArity, - py::arg("arity"), + nanobind::arg("arity"), "Get gate count for a specific qubit arity (total qubits " "including controls and targets). Returns 0 if no gates of " "that arity exist.") - .def("depth_for_arity", &Resources::getDepthByArity, py::arg("arity"), + .def("depth_for_arity", &Resources::getDepthByArity, + nanobind::arg("arity"), "Get circuit depth considering only gates of a specific qubit " "arity. Returns 0 if no gates of that arity exist.") - .def_property_readonly("multi_qubit_gate_count", - &Resources::getMultiQubitGateCount, - "Total count of gates with 2 or more qubits.\n") - .def_property_readonly("multi_qubit_depth", - &Resources::getMultiQubitDepth, - "Max depth across all gate widths >= 2.\n") - .def_property_readonly( + .def_prop_ro("multi_qubit_gate_count", &Resources::getMultiQubitGateCount, + "Total count of gates with 2 or more qubits.\n") + .def_prop_ro("multi_qubit_depth", &Resources::getMultiQubitDepth, + "Max depth across all gate widths >= 2.\n") + .def_prop_ro( "per_qubit_depth", - [](Resources &self) { - return py::dict(py::cast(self.getPerQubitDepth())); - }, + [](Resources &self) { return self.getPerQubitDepth(); }, "Per-qubit circuit depth (all gates), as a dict mapping qubit " "index to depth.\n") .def("clear", &Resources::clear, "Clear out all metadata from `self`.\n"); diff --git a/python/runtime/common/py_Resources.h b/python/runtime/common/py_Resources.h index cf5f6e7fdaf..4ea7546e1a3 100644 --- a/python/runtime/common/py_Resources.h +++ 
b/python/runtime/common/py_Resources.h @@ -5,11 +5,9 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind `cudaq.Resources` to python. -void bindResources(py::module &mod); +void bindResources(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_SampleResult.cpp b/python/runtime/common/py_SampleResult.cpp index 6196502bb63..47b65d5226e 100644 --- a/python/runtime/common/py_SampleResult.cpp +++ b/python/runtime/common/py_SampleResult.cpp @@ -6,8 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include +#include +#include +#include +#include +#include #include "py_SampleResult.h" @@ -17,26 +20,22 @@ namespace cudaq { -void bindMeasureCounts(py::module &mod) { +void bindMeasureCounts(nanobind::module_ &mod) { using namespace cudaq; // TODO Bind the variants of this functions that take the register name // as input. - py::class_( + nanobind::class_( mod, "SampleResult", - R"#(A data-type containing the results of a call to :func:`sample`. -This includes all measurement counts data from both mid-circuit and + R"#(A data-type containing the results of a call to :func:`sample`. +This includes all measurement counts data from both mid-circuit and terminal measurements. Note: - Conditional logic on mid-circuit measurements is no longer supported with - `sample`. Use `run` instead. 
- -Attributes: - register_names (List[str]): A list of the names of each measurement - register that are stored in `self`.)#") - .def_property_readonly("register_names", &sample_result::register_names) - .def(py::init<>()) + Conditional logic on mid-circuit measurements is no longer supported with + `sample`. Use `run` instead.)#") + .def_prop_ro("register_names", &sample_result::register_names) + .def(nanobind::init<>()) .def( "dump", [](sample_result &self) { self.dump(); }, "Print a string of the raw measurement counts data to the " @@ -63,19 +62,19 @@ terminal measurements. auto map = self.to_map(); auto iter = map.find(bitstring); if (iter == map.end()) - throw py::key_error("bitstring '" + bitstring + - "' does not exist"); + throw nanobind::key_error( + ("bitstring '" + bitstring + "' does not exist").c_str()); return iter->second; }, - py::arg("bitstring"), + nanobind::arg("bitstring"), R"#(Return the measurement counts for the given `bitstring`. Args: bitstring (str): The binary string to return the measurement data of. Returns: - float: The number of times the given `bitstring` was measured + float: The number of times the given `bitstring` was measured during the `shots_count` number of executions on the QPU.)#") .def( "__len__", [](sample_result &self) { return self.to_map().size(); }, @@ -84,12 +83,14 @@ terminal measurements. .def( "__iter__", [](sample_result &self) { - return py::make_key_iterator(self.begin(), self.end()); + return nanobind::make_key_iterator(nanobind::type(), + "key_iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Iterate through the :class:`SampleResult` dictionary.\n") .def("expectation", &sample_result::expectation, - py::arg("register_name") = GlobalRegisterName, + nanobind::arg("register_name") = GlobalRegisterName, "Return the expectation value in the Z-basis of the :class:`Kernel` " "that was sampled.\n") .def( @@ -102,45 +103,46 @@ terminal measurements. 
1); return self.expectation(); }, - py::arg("register_name") = GlobalRegisterName, + nanobind::arg("register_name") = GlobalRegisterName, "Return the expectation value in the Z-basis of the :class:`Kernel` " "that was sampled.\n") .def("probability", &sample_result::probability, "Return the probability of observing the given bit string.\n", - py::arg("bitstring"), py::arg("register_name") = GlobalRegisterName, + nanobind::arg("bitstring"), + nanobind::arg("register_name") = GlobalRegisterName, R"#(Return the probability of measuring the given `bitstring`. Args: - bitstring (str): The binary string to return the measurement + bitstring (str): The binary string to return the measurement probability of. - register_name (Optional[str]): The optional measurement register - name to extract the probability from. Defaults to the '__global__' + register_name (Optional[str]): The optional measurement register + name to extract the probability from. Defaults to the '__global__' register. Returns: - float: - The probability of measuring the given `bitstring`. Equivalent - to the proportion of the total times the bitstring was measured + float: + The probability of measuring the given `bitstring`. Equivalent + to the proportion of the total times the bitstring was measured vs. the number of experiments (`shots_count`).)#") .def("most_probable", &sample_result::most_probable, - py::arg("register_name") = GlobalRegisterName, - R"#(Return the bitstring that was measured most frequently in the + nanobind::arg("register_name") = GlobalRegisterName, + R"#(Return the bitstring that was measured most frequently in the experiment. Args: - register_name (Optional[str]): The optional measurement register - name to extract the most probable bitstring from. Defaults to the + register_name (Optional[str]): The optional measurement register + name to extract the most probable bitstring from. Defaults to the '__global__' register. 
Returns: str: The most frequently measured binary string during the experiment.)#") - .def("count", &sample_result::count, py::arg("bitstring"), - py::arg("register_name") = GlobalRegisterName, + .def("count", &sample_result::count, nanobind::arg("bitstring"), + nanobind::arg("register_name") = GlobalRegisterName, R"#(Return the number of times the given bitstring was observed. Args: bitstring (str): The binary string to return the measurement counts for. - register_name (Optional[str]): The optional measurement register name to + register_name (Optional[str]): The optional measurement register name to extract the probability from. Defaults to the '__global__' register. Returns: @@ -149,21 +151,21 @@ experiment. static_cast &, const std::string_view) const>( &sample_result::get_marginal), - py::arg("marginal_indices"), py::kw_only(), - py::arg("register_name") = GlobalRegisterName, - R"#(Extract the measurement counts data for the provided subset of + nanobind::arg("marginal_indices"), nanobind::kw_only(), + nanobind::arg("register_name") = GlobalRegisterName, + R"#(Extract the measurement counts data for the provided subset of qubits (`marginal_indices`). Args: - marginal_indices (list[int]): A list of the qubit indices to extract the + marginal_indices (list[int]): A list of the qubit indices to extract the measurement data from. - register_name (Optional[str]): The optional measurement register name to extract + register_name (Optional[str]): The optional measurement register name to extract the counts data from. Defaults to the '__global__' register. Returns: - :class:`SampleResult`: + :class:`SampleResult`: A new `SampleResult` dictionary containing the extracted measurement data.)#") .def("get_sequential_data", &sample_result::sequential_data, - py::arg("register_name") = GlobalRegisterName, + nanobind::arg("register_name") = GlobalRegisterName, "Return the data from the given register (`register_name`) as it " "was collected sequentially. 
A list of measurement results, not " "collated into a map.\n") @@ -174,26 +176,30 @@ qubits (`marginal_indices`). ExecutionResult res(cd); return sample_result(res); }, - py::arg("register_name"), + nanobind::arg("register_name"), "Extract the provided sub-register (`register_name`) as a new " ":class:`SampleResult`.\n") .def( "items", [](sample_result &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "item_iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Return the key/value pairs in this :class:`SampleResult` " "dictionary.\n") .def( "values", [](sample_result &self) { - return py::make_value_iterator(self.begin(), self.end()); + return nanobind::make_value_iterator( + nanobind::type(), "value_iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Return all values (the counts) in this :class:`SampleResult` " "dictionary.\n") - .def(py::self += py::self) + .def(nanobind::self += nanobind::self) .def("clear", &sample_result::clear, "Clear out all metadata from `self`.\n"); } diff --git a/python/runtime/common/py_SampleResult.h b/python/runtime/common/py_SampleResult.h index 2cc72487900..832acf3e40c 100644 --- a/python/runtime/common/py_SampleResult.h +++ b/python/runtime/common/py_SampleResult.h @@ -5,13 +5,11 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include +#include #include "utils/LinkedLibraryHolder.h" -namespace py = pybind11; - namespace cudaq { /// @brief Bind `cudaq.MeasureCounts` to python. 
-void bindMeasureCounts(py::module &mod); +void bindMeasureCounts(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_draw.cpp b/python/runtime/cudaq/algorithms/py_draw.cpp index be13796efd9..94d01c1b151 100644 --- a/python/runtime/cudaq/algorithms/py_draw.cpp +++ b/python/runtime/cudaq/algorithms/py_draw.cpp @@ -11,14 +11,12 @@ #include "cudaq/platform/nvqpp_interface.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -namespace py = pybind11; - /// @brief Run `cudaq::contrib::draw`'s string overload on the provided kernel. /// \p kernel is a kernel decorator object and \p args are the arguments to /// launch \p kernel. static std::string pyDraw(const std::string &format, const std::string &shortName, MlirModule mod, - py::args runtimeArgs) { + nanobind::args runtimeArgs) { if (format != "ascii" && format != "latex") throw std::runtime_error("format argument must be \"ascii\" or \"latex\"."); @@ -31,11 +29,11 @@ static std::string pyDraw(const std::string &format, } /// @brief Bind the draw cudaq function -void cudaq::bindPyDraw(py::module &mod) { +void cudaq::bindPyDraw(nanobind::module_ &mod) { mod.def( "draw_impl", [](const std::string &format, const std::string &shortName, - MlirModule mod, py::args runtimeArgs) { + MlirModule mod, nanobind::args runtimeArgs) { return pyDraw(format, shortName, mod, runtimeArgs); }, R"#( @@ -47,7 +45,7 @@ string. Args: format (str): The format of the output. Can be 'ascii' or 'latex'. kernel (:class:`Kernel`): The :class:`Kernel` to draw. - *arguments (Optional[Any]): The concrete values to evaluate the kernel + *arguments (Optional[Any]): The concrete values to evaluate the kernel function at. Leave empty if the kernel doesn't accept any arguments. Returns: @@ -66,12 +64,12 @@ string. 
mz(q) print(cudaq.draw(bell_pair)) # Output - # ╭───╮ + # ╭───╮ # q0 : ┤ h ├──●── # ╰───╯╭─┴─╮ # q1 : ─────┤ x ├ # ╰───╯ - + # Example with arguments import cudaq @cudaq.kernel diff --git a/python/runtime/cudaq/algorithms/py_draw.h b/python/runtime/cudaq/algorithms/py_draw.h index f6bd76d4a3b..cc5c37df87e 100644 --- a/python/runtime/cudaq/algorithms/py_draw.h +++ b/python/runtime/cudaq/algorithms/py_draw.h @@ -9,8 +9,8 @@ #pragma once #include "utils/OpaqueArguments.h" -#include +#include namespace cudaq { -void bindPyDraw(pybind11::module &mod); +void bindPyDraw(nanobind::module_ &mod); } diff --git a/python/runtime/cudaq/algorithms/py_evolve.cpp b/python/runtime/cudaq/algorithms/py_evolve.cpp index b243287a0db..80e54f3edc7 100644 --- a/python/runtime/cudaq/algorithms/py_evolve.cpp +++ b/python/runtime/cudaq/algorithms/py_evolve.cpp @@ -11,13 +11,17 @@ #include "cudaq/algorithms/evolve_internal.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include namespace cudaq { @@ -26,17 +30,18 @@ using spin_op_creator = std::function)>; // Helper to determine if an object is a Python kernel builder object (PyKernel) -static bool isPyKernelObject(py::object &kernel) { +static bool isPyKernelObject(nanobind::object &kernel) { const std::string kernelTypeName = - py::hasattr(kernel, "__class__") - ? kernel.attr("__class__").attr("__name__").cast() + nanobind::hasattr(kernel, "__class__") + ? 
nanobind::cast( + kernel.attr("__class__").attr("__name__")) : ""; return (kernelTypeName == "PyKernel"); } template evolve_result -pyEvolve(state initial_state, py::object kernel, +pyEvolve(state initial_state, nanobind::object kernel, std::map params, std::vector> observables = {}, int shots_count = -1) { @@ -44,11 +49,11 @@ pyEvolve(state initial_state, py::object kernel, throw std::runtime_error( "The provided kernel to pyEvolve is not a valid PyKernel object."); - if (py::hasattr(kernel, "compile")) + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelName = kernel.attr("name").cast(); - auto kernelMod = unwrap(kernel.attr("module").cast()); + auto kernelName = nanobind::cast(kernel.attr("name")); + auto kernelMod = unwrap(nanobind::cast(kernel.attr("module"))); std::vector spin_ops = {}; for (auto &observable : observables) { @@ -70,23 +75,24 @@ pyEvolve(state initial_state, py::object kernel, template evolve_result -pyEvolve(state initial_state, std::vector kernels, +pyEvolve(state initial_state, std::vector kernels, std::vector> params, std::vector> observables = {}, int shots_count = -1, bool save_intermediate_states = true) { - if (!std::all_of(kernels.begin(), kernels.end(), - [](py::object &kernel) { return isPyKernelObject(kernel); })) + if (!std::all_of( + kernels.begin(), kernels.end(), + [](nanobind::object &kernel) { return isPyKernelObject(kernel); })) throw std::runtime_error( "One or more of the provided kernels to pyEvolve is not a valid " "PyKernel object."); std::vector> launchFcts = {}; - for (py::object kernel : kernels) { - if (py::hasattr(kernel, "compile")) + for (nanobind::object kernel : kernels) { + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelName = kernel.attr("name").cast(); - auto kernelMod = unwrap(kernel.attr("module").cast()); + auto kernelName = nanobind::cast(kernel.attr("name")); + auto kernelMod = unwrap(nanobind::cast(kernel.attr("module"))); 
launchFcts.push_back([kernelMod, kernelName](state state) mutable { auto *argData = new cudaq::OpaqueArguments(); @@ -112,7 +118,7 @@ pyEvolve(state initial_state, std::vector kernels, template async_evolve_result -pyEvolveAsync(state initial_state, py::object kernel, +pyEvolveAsync(state initial_state, nanobind::object kernel, std::map params, std::vector> observables = {}, std::size_t qpu_id = 0, @@ -122,18 +128,19 @@ pyEvolveAsync(state initial_state, py::object kernel, throw std::runtime_error( "The provided kernel to pyEvolveAsync is not a valid PyKernel object."); - if (py::hasattr(kernel, "compile")) + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelMod = unwrap(kernel.attr("module").cast()).clone(); - auto kernelName = kernel.attr("name").cast(); + auto kernelMod = + unwrap(nanobind::cast(kernel.attr("module"))).clone(); + auto kernelName = nanobind::cast(kernel.attr("name")); std::vector spin_ops = {}; for (auto observable : observables) { spin_ops.push_back(observable(params)); } - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return __internal__::evolve_async( initial_state, [kernelMod, kernelName](state state) mutable { @@ -148,27 +155,29 @@ pyEvolveAsync(state initial_state, py::object kernel, template async_evolve_result -pyEvolveAsync(state initial_state, std::vector kernels, +pyEvolveAsync(state initial_state, std::vector kernels, std::vector> params, std::vector> observables = {}, std::size_t qpu_id = 0, std::optional noise_model = std::nullopt, int shots_count = -1, bool save_intermediate_states = true) { - if (!std::all_of(kernels.begin(), kernels.end(), - [](py::object &kernel) { return isPyKernelObject(kernel); })) + if (!std::all_of( + kernels.begin(), kernels.end(), + [](nanobind::object &kernel) { return isPyKernelObject(kernel); })) throw std::runtime_error( "One or more of the provided kernels to pyEvolveAsync is not a valid " "PyKernel object."); std::vector> launchFcts = {}; - 
for (py::object kernel : kernels) { - if (py::hasattr(kernel, "compile")) + for (nanobind::object kernel : kernels) { + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); // IMPORTANT: we need to make sure no Python data is accessed in the async. // functor. - auto kernelMod = unwrap(kernel.attr("module").cast()).clone(); - auto kernelName = kernel.attr("name").cast(); + auto kernelMod = + unwrap(nanobind::cast(kernel.attr("module"))).clone(); + auto kernelName = nanobind::cast(kernel.attr("name")); launchFcts.push_back( [kernelMod = std::move(kernelMod), kernelName](state state) mutable { cudaq::OpaqueArguments argData; @@ -187,7 +196,7 @@ pyEvolveAsync(state initial_state, std::vector kernels, spin_ops.push_back(std::move(ops)); } - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return __internal__::evolve_async(initial_state, launchFcts, spin_ops, qpu_id, noise_model, shots_count, save_intermediate_states); @@ -196,7 +205,7 @@ pyEvolveAsync(state initial_state, std::vector kernels, #define DEFINE_PARAM_TYPE_OVERLOAD_VEC(type, pyMod) \ pyMod.def( \ "evolve", \ - [](state initial_state, std::vector kernels, \ + [](state initial_state, std::vector kernels, \ std::vector> params = {}, \ std::vector> observables = {}, \ int shots_count = -1, bool save_intermediate_states = true) { \ @@ -205,16 +214,16 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Evolve the given initial_state with the provided kernel and " \ "parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::vector>{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("shots_count") = -1, \ - py::arg("save_intermediate_states") = true); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::vector>{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("shots_count") = -1, \ + nanobind::arg("save_intermediate_states") = true); #define 
DEFINE_PARAM_TYPE_OVERLOAD(type, pyMod) \ pyMod.def( \ "evolve", \ - [](state initial_state, py::object kernel, \ + [](state initial_state, nanobind::object kernel, \ std::map params = {}, \ std::vector> observables = {}, \ int shots_count = -1) { \ @@ -223,15 +232,15 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Evolve the given initial_state with the provided kernel and " \ "parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::map{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("shots_count") = -1); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::map{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("shots_count") = -1); #define DEFINE_ASYNC_PARAM_TYPE_OVERLOAD_VEC(type, pyMod) \ pyMod.def( \ "evolve_async", \ - [](state initial_state, std::vector kernels, \ + [](state initial_state, std::vector kernels, \ std::vector> params = {}, \ std::vector> observables = {}, \ std::size_t qpu_id = 0, \ @@ -243,17 +252,18 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Asynchronously evolve the given initial_state with " \ "the provided kernel and parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::vector>{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("qpu_id") = 0, py::arg("noise_model") = std::nullopt, \ - py::arg("shots_count") = -1, \ - py::arg("save_intermediate_states") = true); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::vector>{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("qpu_id") = 0, \ + nanobind::arg("noise_model") = std::nullopt, \ + nanobind::arg("shots_count") = -1, \ + nanobind::arg("save_intermediate_states") = true); #define DEFINE_ASYNC_PARAM_TYPE_OVERLOAD(type, pyMod) \ pyMod.def( \ "evolve_async", \ - [](state initial_state, py::object kernel, \ + [](state 
initial_state, nanobind::object kernel, \ std::map params = {}, \ std::vector> observables = {}, \ std::size_t qpu_id = 0, \ @@ -264,14 +274,15 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Asynchronously evolve the given initial_state with " \ "the provided kernel and parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::map{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("qpu_id") = 0, py::arg("noise_model") = std::nullopt, \ - py::arg("shots_count") = -1); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::map{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("qpu_id") = 0, \ + nanobind::arg("noise_model") = std::nullopt, \ + nanobind::arg("shots_count") = -1); /// @brief Bind the evolve cudaq function for circuit simulator -void bindPyEvolve(py::module &mod) { +void bindPyEvolve(nanobind::module_ &mod) { // Sync evolve overloads DEFINE_PARAM_TYPE_OVERLOAD_VEC(long, mod); DEFINE_PARAM_TYPE_OVERLOAD_VEC(double, mod); diff --git a/python/runtime/cudaq/algorithms/py_evolve.h b/python/runtime/cudaq/algorithms/py_evolve.h index 869806c41b4..4af37da5b0c 100644 --- a/python/runtime/cudaq/algorithms/py_evolve.h +++ b/python/runtime/cudaq/algorithms/py_evolve.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindPyEvolve(py::module &mod); +void bindPyEvolve(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_observe_async.cpp b/python/runtime/cudaq/algorithms/py_observe_async.cpp index 58c5ce37454..19586bce198 100644 --- a/python/runtime/cudaq/algorithms/py_observe_async.cpp +++ b/python/runtime/cudaq/algorithms/py_observe_async.cpp @@ -13,14 +13,15 @@ #include "cudaq/Todo.h" #include "cudaq/algorithms/observe.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" 
-#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include -#include - -namespace py = pybind11; +#include +#include +#include +#include using namespace cudaq; @@ -67,14 +68,14 @@ static async_observe_result pyObserveAsync(const std::string &shortName, mlir::ModuleOp mod, const spin_op &spin_operator, std::size_t qpu_id, int shots, - py::args args) { + nanobind::args args) { auto &platform = get_platform(); args = simplifiedValidateInputArguments(args); auto fnOp = getKernelFuncOp(mod, shortName); auto opaques = marshal_arguments_for_module_launch(mod, args, fnOp); // Launch the asynchronous execution. - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return details::runObservationAsync( detail::make_copyable_function([opaques = std::move(opaques), shortName, mod = mod.clone()]() mutable { @@ -86,17 +87,16 @@ static async_observe_result pyObserveAsync(const std::string &shortName, spin_operator, platform, shots, shortName, qpu_id); } -static async_observe_result observe_async_impl(const std::string &shortName, - MlirModule module, - py::object &spin_operator_obj, - std::size_t qpu_id, int shots, - py::args args) { +static async_observe_result +observe_async_impl(const std::string &shortName, MlirModule module, + nanobind::object &spin_operator_obj, std::size_t qpu_id, + int shots, nanobind::args args) { // FIXME(OperatorCpp): Remove this when the operator class is implemented in // C++ - spin_op spin_operator = [](py::object &obj) -> spin_op { - if (py::hasattr(obj, "_to_spinop")) - return obj.attr("_to_spinop")().cast(); - return obj.cast(); + spin_op spin_operator = [](nanobind::object &obj) -> spin_op { + if (nanobind::hasattr(obj, "_to_spinop")) + return nanobind::cast(obj.attr("_to_spinop")()); + return nanobind::cast(obj); }(spin_operator_obj); auto mod = unwrap(module); return pyObserveAsync(shortName, mod, spin_operator, qpu_id, shots, args); @@ -106,7 +106,7 @@ 
static async_observe_result observe_async_impl(const std::string &shortName, static observe_result pyObservePar(const PyParType &type, const std::string &shortName, mlir::ModuleOp module, spin_op &spin_operator, int shots, - std::optional noise, py::args args) { + std::optional noise, nanobind::args args) { // Ensure the user input is correct. auto &platform = get_platform(); if (!platform.supports_task_distribution()) @@ -163,11 +163,14 @@ pyObservePar(const PyParType &type, const std::string &shortName, /// Observe can be a single observe call, a parallel observe call, or a observe /// broadcast. All these variants are handled here. -static observe_result -observe_parallel_impl(const std::string &shortName, MlirModule module, - py::type execution, spin_op &spin_operator, int shots, - std::optional noise, py::args arguments) { - std::string applicatorKey = py::str(execution.attr("__name__")); +static observe_result observe_parallel_impl(const std::string &shortName, + MlirModule module, + nanobind::type_object execution, + spin_op &spin_operator, int shots, + std::optional noise, + nanobind::args arguments) { + std::string applicatorKey = + nanobind::cast(execution.attr("__name__")); auto mod = unwrap(module); if (applicatorKey == "thread") return pyObservePar(PyParType::thread, shortName, mod, spin_operator, shots, @@ -178,14 +181,14 @@ observe_parallel_impl(const std::string &shortName, MlirModule module, throw std::runtime_error("invalid parallel execution context"); } -void cudaq::bindObserveAsync(py::module &mod) { +void cudaq::bindObserveAsync(nanobind::module_ &mod) { auto parallelSubmodule = mod.def_submodule("parallel"); - py::class_( + nanobind::class_( parallelSubmodule, "mpi", "Type indicating that the :func:`observe` function should distribute its " "expectation value computations across available MPI ranks and GPUs for " "each term."); - py::class_( + nanobind::class_( parallelSubmodule, "thread", "Type indicating that the :func:`observe` function 
should distribute its " "term " diff --git a/python/runtime/cudaq/algorithms/py_observe_async.h b/python/runtime/cudaq/algorithms/py_observe_async.h index 44cb0a63048..ebd599b6ab3 100644 --- a/python/runtime/cudaq/algorithms/py_observe_async.h +++ b/python/runtime/cudaq/algorithms/py_observe_async.h @@ -8,8 +8,8 @@ #pragma once -#include +#include namespace cudaq { -void bindObserveAsync(pybind11::module &mod); +void bindObserveAsync(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_optimizer.cpp b/python/runtime/cudaq/algorithms/py_optimizer.cpp index 79064fbf867..339b33e81ae 100644 --- a/python/runtime/cudaq/algorithms/py_optimizer.cpp +++ b/python/runtime/cudaq/algorithms/py_optimizer.cpp @@ -5,8 +5,12 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include +#include +#include +#include +#include +#include +#include #include "common/JsonConvert.h" #include "cudaq/algorithms/gradients/central_difference.h" @@ -19,23 +23,26 @@ namespace cudaq { -/// @brief Bind the `cudaq::optimization_result` typedef. -void bindOptimizationResult(py::module &mod) { - py::class_(mod, "OptimizationResult"); +/// @brief optimization_result is a typedef for std::tuple> which is automatically converted by nanobind's +/// stl/tuple type caster. +void bindOptimizationResult(nanobind::module_ &mod) { + mod.attr("OptimizationResult") = + nanobind::handle(reinterpret_cast(&PyTuple_Type)); } -void bindGradientStrategies(py::module &mod) { +void bindGradientStrategies(nanobind::module_ &mod) { // Binding under the `cudaq.gradients` namespace in python. auto gradients_submodule = mod.def_submodule("gradients"); // Have to bind the parent class, `cudaq::gradient`, to allow // for the passing of arbitrary `cudaq::gradients::` around. 
// Note: this class lives under `cudaq.gradients.gradient` // in python. - py::class_(gradients_submodule, "gradient"); + nanobind::class_(gradients_submodule, "gradient"); // Gradient strategies derive from the `cudaq::gradient` class. - py::class_(gradients_submodule, - "CentralDifference") - .def(py::init<>()) + nanobind::class_(gradients_submodule, + "CentralDifference") + .def(nanobind::init<>()) .def( "to_json", [](const gradients::central_difference &p) { return json(p).dump(); }, @@ -51,18 +58,20 @@ void bindGradientStrategies(py::module &mod) { .def( "compute", [](cudaq::gradient &grad, const std::vector &x, - py::function &func, double funcAtX) { + nanobind::callable &func, double funcAtX) { auto function = - func.cast)>>(); + nanobind::cast)>>( + func); return grad.compute(x, function, funcAtX); }, - py::arg("parameter_vector"), py::arg("function"), py::arg("funcAtX"), + nanobind::arg("parameter_vector"), nanobind::arg("function"), + nanobind::arg("funcAtX"), "Compute the gradient of the provided `parameter_vector` with " "respect to " "its loss function, using the `CentralDifference` method.\n"); - py::class_(gradients_submodule, - "ForwardDifference") - .def(py::init<>()) + nanobind::class_(gradients_submodule, + "ForwardDifference") + .def(nanobind::init<>()) .def( "to_json", [](const gradients::forward_difference &p) { return json(p).dump(); }, @@ -78,18 +87,20 @@ void bindGradientStrategies(py::module &mod) { .def( "compute", [](cudaq::gradient &grad, const std::vector &x, - py::function &func, double funcAtX) { + nanobind::callable &func, double funcAtX) { auto function = - func.cast)>>(); + nanobind::cast)>>( + func); return grad.compute(x, function, funcAtX); }, - py::arg("parameter_vector"), py::arg("function"), py::arg("funcAtX"), + nanobind::arg("parameter_vector"), nanobind::arg("function"), + nanobind::arg("funcAtX"), "Compute the gradient of the provided `parameter_vector` with " "respect to " "its loss function, using the 
`ForwardDifference` method.\n"); - py::class_(gradients_submodule, - "ParameterShift") - .def(py::init<>()) + nanobind::class_(gradients_submodule, + "ParameterShift") + .def(nanobind::init<>()) .def( "to_json", [](const gradients::parameter_shift &p) { return json(p).dump(); }, @@ -105,12 +116,14 @@ void bindGradientStrategies(py::module &mod) { .def( "compute", [](cudaq::gradient &grad, const std::vector &x, - py::function &func, double funcAtX) { + nanobind::callable &func, double funcAtX) { auto function = - func.cast)>>(); + nanobind::cast)>>( + func); return grad.compute(x, function, funcAtX); }, - py::arg("parameter_vector"), py::arg("function"), py::arg("funcAtX"), + nanobind::arg("parameter_vector"), nanobind::arg("function"), + nanobind::arg("funcAtX"), "Compute the gradient of the provided `parameter_vector` with " "respect to " "its loss function, using the `ParameterShift` method.\n"); @@ -121,9 +134,10 @@ void bindGradientStrategies(py::module &mod) { /// Can now define its member functions on /// that submodule. template -py::class_ addPyOptimizer(py::module &mod, std::string &&name) { - return py::class_(mod, name.c_str()) - .def(py::init<>()) +nanobind::class_ addPyOptimizer(nanobind::module_ &mod, + std::string &&name) { + return nanobind::class_(mod, name.c_str()) + .def(nanobind::init<>()) .def( "to_json", [](const OptimizerT &p) { return json(p).dump(); }, "Convert optimizer to JSON string") @@ -135,15 +149,15 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { return p; }, "Convert JSON string to optimizer") - .def_readwrite("max_iterations", &OptimizerT::max_eval, R"doc( + .def_rw("max_iterations", &OptimizerT::max_eval, R"doc( int: Maximum number of optimizer iterations (default: unlimited). Sets an upper bound on the number of function evaluations or iterations the optimizer will perform. If not set, the optimizer may run until convergence or until another stopping criterion is met. 
)doc") - .def_readwrite("initial_parameters", &OptimizerT::initial_parameters, - R"doc( + .def_rw("initial_parameters", &OptimizerT::initial_parameters, + R"doc( list[float]: Initial values for the optimization parameters (optional). Provides a starting point for the optimization. If not specified, the @@ -156,7 +170,7 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { optimizer.initial_parameters = [0.5, -0.3, 1.2] )doc") - .def_readwrite("lower_bounds", &OptimizerT::lower_bounds, R"doc( + .def_rw("lower_bounds", &OptimizerT::lower_bounds, R"doc( list[float]: Lower bounds for optimization parameters (optional). Constrains the search space by specifying minimum allowed values for @@ -168,7 +182,7 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { optimizer.lower_bounds = [-2.0, -2.0] # For 2D problem )doc") - .def_readwrite("upper_bounds", &OptimizerT::upper_bounds, R"doc( + .def_rw("upper_bounds", &OptimizerT::upper_bounds, R"doc( list[float]: Upper bounds for optimization parameters (optional). Constrains the search space by specifying maximum allowed values for @@ -197,21 +211,22 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { )doc") .def( "optimize", - [](OptimizerT &opt, const int dim, py::function &func) { + [](OptimizerT &opt, const int dim, nanobind::callable &func) { return opt.optimize(dim, [&](std::vector x, std::vector &grad) { // Call the function. auto ret = func(x); // Does it return a tuple? - auto isTupleReturn = py::isinstance(ret); + auto isTupleReturn = nanobind::isinstance(ret); // If we don't need gradients, and it does, just grab the value // and return. if (!opt.requiresGradients() && isTupleReturn) - return ret.cast()[0].cast(); + return nanobind::cast( + nanobind::cast(ret)[0]); // If we don't need gradients and it doesn't return tuple, then // just pass what we got. 
if (!opt.requiresGradients() && !isTupleReturn) - return ret.cast(); + return nanobind::cast(ret); // Throw an error if we need gradients and they weren't provided. if (opt.requiresGradients() && !isTupleReturn) @@ -220,16 +235,16 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { "(float, list[float]) for gradient-based optimizers"); // If here, we require gradients, and the signature is right. - auto tuple = ret.cast(); + auto tuple = nanobind::cast(ret); auto val = tuple[0]; - auto gradIn = tuple[1].cast(); + auto gradIn = nanobind::cast(tuple[1]); for (std::size_t i = 0; i < gradIn.size(); i++) - grad[i] = gradIn[i].cast(); + grad[i] = nanobind::cast(gradIn[i]); - return val.cast(); + return nanobind::cast(val); }); }, - py::arg("dimensions"), py::arg("function"), R"doc( + nanobind::arg("dimensions"), nanobind::arg("function"), R"doc( Run the optimization procedure. Args: @@ -267,14 +282,14 @@ Run the optimization procedure. )doc"); } -void bindOptimizers(py::module &mod) { +void bindOptimizers(nanobind::module_ &mod) { // Binding the `cudaq::optimizers` class to `_pycudaq` as a submodule // so it's accessible directly in the cudaq namespace. auto optimizers_submodule = mod.def_submodule("optimizers"); - py::class_(optimizers_submodule, "optimizer"); + nanobind::class_(optimizers_submodule, "optimizer"); addPyOptimizer(optimizers_submodule, "COBYLA") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Constrained Optimization BY Linear Approximations (COBYLA). COBYLA is a gradient-free derivative-free optimization algorithm that uses @@ -297,7 +312,7 @@ This optimizer does not require gradients from the objective function. )doc"); addPyOptimizer(optimizers_submodule, "NelderMead") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Nelder-Mead simplex optimization algorithm. 
The Nelder-Mead method is a gradient-free simplex-based optimization algorithm @@ -320,7 +335,7 @@ This optimizer does not require gradients from the objective function. )doc"); addPyOptimizer(optimizers_submodule, "LBFGS") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) optimizer. L-BFGS is a quasi-Newton method that approximates the Hessian matrix using @@ -346,7 +361,7 @@ This optimizer requires gradients from the objective function. addPyOptimizer(optimizers_submodule, "GradientDescent") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Basic gradient descent optimization algorithm. Gradient descent iteratively moves in the direction of steepest descent @@ -373,7 +388,7 @@ This optimizer requires gradients from the objective function. // Have to bind extra optimizer parameters to the following manually: auto py_spsa = addPyOptimizer(optimizers_submodule, "SPSA") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Simultaneous Perturbation Stochastic Approximation (SPSA) optimizer. SPSA is a gradient-free optimization algorithm that uses simultaneous @@ -404,15 +419,15 @@ This optimizer does not require gradients from the objective function. function=objective ) )doc"); - py_spsa.def_readwrite("gamma", &cudaq::optimizers::spsa::gamma, R"doc( + py_spsa.def_rw("gamma", &cudaq::optimizers::spsa::gamma, R"doc( float: Scaling exponent for the step size schedule (default: 0.101). Controls how the step size decreases over iterations. The step size at iteration k is proportional to (A + k + 1)^(-gamma), where A is a stability constant. Common values are in the range [0.1, 0.6]. )doc"); - py_spsa.def_readwrite("step_size", &cudaq::optimizers::spsa::eval_step_size, - R"doc( + py_spsa.def_rw("step_size", &cudaq::optimizers::spsa::eval_step_size, + R"doc( float: Evaluation step size for gradient approximation (default: 0.3). 
Controls the magnitude of perturbations used to approximate gradients. @@ -421,7 +436,7 @@ to noise. Typical values range from 0.1 to 0.5. )doc"); auto py_adam = addPyOptimizer(optimizers_submodule, "Adam") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Adaptive Moment Estimation (Adam) optimizer. Adam is an adaptive learning rate optimization algorithm that computes @@ -458,8 +473,8 @@ function must return a tuple of (value, gradient_vector). function=objective_with_grad ) )doc"); - py_adam.def_readwrite("batch_size", &cudaq::optimizers::adam::batch_size, - R"doc( + py_adam.def_rw("batch_size", &cudaq::optimizers::adam::batch_size, + R"doc( int: Number of samples per batch (default: 1). For stochastic optimization, determines how many samples are used to @@ -467,28 +482,28 @@ compute each gradient estimate. Batch size of 1 corresponds to online learning. Larger batch sizes can provide more stable gradient estimates but require more computation per iteration. )doc"); - py_adam.def_readwrite("beta1", &cudaq::optimizers::adam::beta1, R"doc( + py_adam.def_rw("beta1", &cudaq::optimizers::adam::beta1, R"doc( float: Exponential decay rate for the first moment estimates (default: 0.9). Controls the exponential moving average of past gradients (momentum term). Values are typically in the range [0.9, 0.999]. Higher values give more weight to past gradients, providing smoother updates but slower adaptation. )doc"); - py_adam.def_readwrite("beta2", &cudaq::optimizers::adam::beta2, R"doc( + py_adam.def_rw("beta2", &cudaq::optimizers::adam::beta2, R"doc( float: Exponential decay rate for the second moment estimates (default: 0.999). Controls the exponential moving average of past squared gradients. Values are typically in the range [0.99, 0.9999]. Higher values provide more stable learning rates but slower adaptation to changing gradient magnitudes. 
)doc"); - py_adam.def_readwrite("epsilon", &cudaq::optimizers::adam::eps, R"doc( + py_adam.def_rw("epsilon", &cudaq::optimizers::adam::eps, R"doc( float: Small constant for numerical stability (default: 1e-8). Added to the denominator to prevent division by zero when computing adaptive learning rates. Should be a small positive value, typically between 1e-8 and 1e-6. )doc"); - py_adam.def_readwrite("step_size", &cudaq::optimizers::adam::step_size, R"doc( + py_adam.def_rw("step_size", &cudaq::optimizers::adam::step_size, R"doc( float: Learning rate (step size) for parameter updates (default: 0.01). Controls the magnitude of parameter updates at each iteration. Typical @@ -496,7 +511,7 @@ values range from 0.001 to 0.1. The effective learning rate is adapted per parameter based on gradient history. Start with 0.001 or 0.01 and adjust based on convergence behavior. )doc"); - py_adam.def_readwrite("f_tol", &cudaq::optimizers::adam::f_tol, R"doc( + py_adam.def_rw("f_tol", &cudaq::optimizers::adam::f_tol, R"doc( float: Convergence tolerance on the objective function value (default: 1e-4). Optimization terminates when the change in objective function value between @@ -505,7 +520,7 @@ convergence but may require more iterations. )doc"); auto py_sgd = addPyOptimizer(optimizers_submodule, "SGD") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Stochastic Gradient Descent (SGD) optimizer. SGD is a fundamental optimization algorithm that updates parameters by taking @@ -539,7 +554,7 @@ function must return a tuple of (value, gradient_vector). function=objective_with_grad ) )doc"); - py_sgd.def_readwrite("batch_size", &cudaq::optimizers::sgd::batch_size, R"doc( + py_sgd.def_rw("batch_size", &cudaq::optimizers::sgd::batch_size, R"doc( int: Number of samples per batch (default: 1). For stochastic optimization, determines how many samples are used to @@ -548,7 +563,7 @@ stochastic gradient descent. 
Larger batch sizes (mini-batch SGD) can provide more stable gradient estimates but require more computation per iteration. )doc"); - py_sgd.def_readwrite("step_size", &cudaq::optimizers::sgd::step_size, R"doc( + py_sgd.def_rw("step_size", &cudaq::optimizers::sgd::step_size, R"doc( float: Learning rate (step size) for parameter updates (default: 0.01). Controls the magnitude of parameter updates at each iteration. The update @@ -556,7 +571,7 @@ rule is: x_new = x_old - step_size * gradient. Typical values range from 0.001 to 0.1. Too large values can cause divergence, while too small values lead to slow convergence. )doc"); - py_sgd.def_readwrite("f_tol", &cudaq::optimizers::sgd::f_tol, R"doc( + py_sgd.def_rw("f_tol", &cudaq::optimizers::sgd::f_tol, R"doc( float: Convergence tolerance on the objective function value (default: 1e-4). Optimization terminates when the change in objective function value between @@ -566,7 +581,7 @@ gradients, convergence may be noisy. )doc"); } -void bindOptimizerWrapper(py::module &mod) { +void bindOptimizerWrapper(nanobind::module_ &mod) { bindOptimizationResult(mod); bindGradientStrategies(mod); bindOptimizers(mod); diff --git a/python/runtime/cudaq/algorithms/py_optimizer.h b/python/runtime/cudaq/algorithms/py_optimizer.h index bd90e44e3af..10ec35d46cd 100644 --- a/python/runtime/cudaq/algorithms/py_optimizer.h +++ b/python/runtime/cudaq/algorithms/py_optimizer.h @@ -8,11 +8,9 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the `cudaq::optimizers::` to python. 
-void bindOptimizerWrapper(py::module &mod); +void bindOptimizerWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_resource_count.cpp b/python/runtime/cudaq/algorithms/py_resource_count.cpp index 2f30d7d6b87..53af2405cf5 100644 --- a/python/runtime/cudaq/algorithms/py_resource_count.cpp +++ b/python/runtime/cudaq/algorithms/py_resource_count.cpp @@ -10,17 +10,16 @@ #include "common/Resources.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" #include "utils/LinkedLibraryHolder.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" -#include - -namespace py = pybind11; +#include "utils/NanobindAdaptors.h" +#include +#include using namespace cudaq; static Resources estimate_resources_impl(const std::string &kernelName, MlirModule kernelMod, std::optional> choice, - py::args args) { + nanobind::args args) { auto &platform = cudaq::get_platform(); args = simplifiedValidateInputArguments(args); @@ -60,7 +59,7 @@ estimate_resources_impl(const std::string &kernelName, MlirModule kernelMod, return counts; } -void cudaq::bindCountResources(py::module &mod) { +void cudaq::bindCountResources(nanobind::module_ &mod) { mod.def("estimate_resources_impl", estimate_resources_impl, "See python documentation for estimate_resources."); } diff --git a/python/runtime/cudaq/algorithms/py_resource_count.h b/python/runtime/cudaq/algorithms/py_resource_count.h index af03edd8476..d307c83ed9c 100644 --- a/python/runtime/cudaq/algorithms/py_resource_count.h +++ b/python/runtime/cudaq/algorithms/py_resource_count.h @@ -8,8 +8,8 @@ #pragma once -#include +#include namespace cudaq { -void bindCountResources(pybind11::module &mod); +void bindCountResources(nanobind::module_ &mod); } diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index 153d9c50fbc..5609ebe325a 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ b/python/runtime/cudaq/algorithms/py_run.cpp @@ -11,26 +11,27 @@ 
#include "cudaq/algorithms/run.h" #include "cudaq_internal/compiler/LayoutInfo.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include using namespace cudaq; using namespace cudaq_internal::compiler; -static std::vector readRunResults(mlir::ModuleOp module, - mlir::Type ty, - details::RunResultSpan &results, - std::size_t count) { - std::vector ret; +static std::vector +readRunResults(mlir::ModuleOp module, mlir::Type ty, + details::RunResultSpan &results, std::size_t count) { + std::vector ret; std::size_t byteSize = results.lengthInBytes / count; for (std::size_t i = 0; i < results.lengthInBytes; i += byteSize) { - py::object obj = convertResult(module, ty, results.data + i); + nanobind::object obj = convertResult(module, ty, results.data + i); ret.push_back(obj); } return ret; @@ -90,19 +91,18 @@ pyRunTheKernel(const std::string &name, quantum_platform &platform, return results; } -static std::vector pyReadResults(details::RunResultSpan results, - mlir::ModuleOp mod, - std::size_t shots_count, - const std::string &name) { +static std::vector +pyReadResults(details::RunResultSpan results, mlir::ModuleOp mod, + std::size_t shots_count, const std::string &name) { auto returnTy = recoverReturnType(mod, name); return readRunResults(mod, returnTy, results, shots_count); } /// @brief Run `cudaq::run` on the provided kernel. -static std::vector +static std::vector run_impl(const std::string &shortName, MlirModule module, std::size_t shots_count, std::optional noise_model, - std::size_t qpu_id, py::args runtimeArgs) { + std::size_t qpu_id, nanobind::args runtimeArgs) { if (shots_count == 0) return {}; @@ -133,7 +133,7 @@ namespace { // When the `ready` future is set, the content of the buffer is filled. 
struct async_run_result { std::future ready; - std::vector *results; + std::vector *results; std::string *error; }; } // namespace @@ -142,7 +142,7 @@ struct async_run_result { static async_run_result run_async_impl(const std::string &shortName, MlirModule module, std::size_t shots_count, std::optional noise_model, - std::size_t qpu_id, py::args runtimeArgs) { + std::size_t qpu_id, nanobind::args runtimeArgs) { if (!shots_count) return {}; @@ -162,7 +162,7 @@ run_async_impl(const std::string &shortName, MlirModule module, "Noise model is not supported on remote platforms."); async_run_result result; - result.results = new std::vector(); + result.results = new std::vector(); result.error = new std::string(); if (shots_count == 0) { @@ -184,7 +184,7 @@ run_async_impl(const std::string &shortName, MlirModule module, { // Release GIL to allow c++ threads, all code inside the scope is c++, so // there is no need to re-acquire the GIL inside the thread. - py::gil_scoped_release gil_release{}; + nanobind::gil_scoped_release gil_release{}; QuantumTask wrapped = detail::make_copyable_function( [sp = std::move(spanPromise), ep = std::move(errorPromise), noise_model = std::move(noise_model), qpu_id, name = shortName, @@ -214,7 +214,7 @@ run_async_impl(const std::string &shortName, MlirModule module, { // Release GIL to allow c++ threads, re-acquire for conversion of the // results to python objects. 
- py::gil_scoped_release gil_release{}; + nanobind::gil_scoped_release gil_release{}; auto resultFuture = std::async(std::launch::deferred, [sf = std::move(spanFuture), ef = std::move(errorFuture), @@ -224,7 +224,7 @@ run_async_impl(const std::string &shortName, MlirModule module, std::swap(*errorPtr, error); if (error.empty()) { auto span = sf.get(); - py::gil_scoped_acquire gil{}; + nanobind::gil_scoped_acquire gil{}; auto results = pyReadResults(span, mod, shots_count, shortName); std::swap(*resultsPtr, results); @@ -237,7 +237,7 @@ run_async_impl(const std::string &shortName, MlirModule module, } /// @brief Bind the run cudaq function. -void cudaq::bindPyRun(py::module &mod) { +void cudaq::bindPyRun(nanobind::module_ &mod) { mod.def("run_impl", run_impl, R"#( Run the provided `kernel` with the given kernel arguments over the specified @@ -255,8 +255,8 @@ number of circuit executions (`shots_count`). } /// @brief Bind the run_async cudaq function. -void cudaq::bindPyRunAsync(py::module &mod) { - py::class_(mod, "AsyncRunResultImpl", "") +void cudaq::bindPyRunAsync(nanobind::module_ &mod) { + nanobind::class_(mod, "AsyncRunResultImpl", "") .def( "get", [](async_run_result &self) { diff --git a/python/runtime/cudaq/algorithms/py_run.h b/python/runtime/cudaq/algorithms/py_run.h index 3e2c10df6b4..c1070f1f443 100644 --- a/python/runtime/cudaq/algorithms/py_run.h +++ b/python/runtime/cudaq/algorithms/py_run.h @@ -8,9 +8,9 @@ #pragma once -#include +#include namespace cudaq { -void bindPyRun(pybind11::module &mod); -void bindPyRunAsync(pybind11::module &mod); +void bindPyRun(nanobind::module_ &mod); +void bindPyRunAsync(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_sample_async.cpp b/python/runtime/cudaq/algorithms/py_sample_async.cpp index dfe50fcb4b5..43deba6c1ce 100644 --- a/python/runtime/cudaq/algorithms/py_sample_async.cpp +++ b/python/runtime/cudaq/algorithms/py_sample_async.cpp @@ -10,21 +10,21 @@ #include 
"common/DeviceCodeRegistry.h" #include "cudaq/algorithms/sample.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include -#include - -namespace py = pybind11; +#include +#include +#include using namespace cudaq; static async_sample_result sample_async_impl( const std::string &shortName, MlirModule module, std::size_t shots_count, std::optional noise_model, bool explicit_measurements, - std::size_t qpu_id, py::args runtimeArgs) { + std::size_t qpu_id, nanobind::args runtimeArgs) { mlir::ModuleOp mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -40,7 +40,7 @@ static async_sample_result sample_async_impl( auto opaques = marshal_arguments_for_module_launch(mod, runtimeArgs, fnOp); // Should only have C++ going on here, safe to release the GIL - py::gil_scoped_release release; + nanobind::gil_scoped_release release; // Use runSamplingAsync with noise model support. // The noise_model is passed by value to runSamplingAsync, which captures @@ -60,7 +60,7 @@ static async_sample_result sample_async_impl( std::move(noise_model)); } -void cudaq::bindSampleAsync(py::module &mod) { +void cudaq::bindSampleAsync(nanobind::module_ &mod) { // Async. result wrapper for Python kernels, which also holds the Python MLIR // context. // @@ -74,8 +74,8 @@ void cudaq::bindSampleAsync(py::module &mod) { // then track a reference (ref count) to the context of the temporary (rval) // kernel. - py::class_(mod, "AsyncSampleResultImpl", - R"#( + nanobind::class_(mod, "AsyncSampleResultImpl", + R"#( A data-type containing the results of a call to :func:`sample_async`. The `AsyncSampleResult` models a future-like type, whose :class:`SampleResult` may be returned via an invocation of the `get` method. 
This kicks off a wait on the @@ -83,14 +83,15 @@ current thread until the results are available. See `future `_ for more information on this programming pattern. )#") - .def(py::init([](std::string inJson) { - async_sample_result f; - std::istringstream is(inJson); - is >> f; - return f; - })) + .def("__init__", + [](async_sample_result *self, std::string inJson) { + async_sample_result f; + std::istringstream is(inJson); + is >> f; + new (self) async_sample_result(std::move(f)); + }) .def("get", &async_sample_result::get, - py::call_guard(), + nanobind::call_guard(), "Return the :class:`SampleResult` from the asynchronous sample " "execution.\n") .def( diff --git a/python/runtime/cudaq/algorithms/py_sample_async.h b/python/runtime/cudaq/algorithms/py_sample_async.h index b494b2631bb..ec1c69476ac 100644 --- a/python/runtime/cudaq/algorithms/py_sample_async.h +++ b/python/runtime/cudaq/algorithms/py_sample_async.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindSampleAsync(py::module &mod); +void bindSampleAsync(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp b/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp index 4bf979ac8d9..064672787bc 100644 --- a/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp +++ b/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp @@ -20,13 +20,15 @@ #include "cudaq/ptsbe/strategies/OrderedSamplingStrategy.h" #include "cudaq/ptsbe/strategies/ProbabilisticSamplingStrategy.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include - -namespace py = pybind11; +#include +#include +#include +#include +#include using namespace cudaq; @@ -38,9 +40,9 @@ static ptsbe::sample_result pySamplePTSBE(const std::string &shortName, 
MlirModule module, std::size_t shots_count, noise_model noiseModel, std::optional max_trajectories, - py::object sampling_strategy, py::object shot_allocation_obj, - bool return_execution_data, bool include_sequential_data, - py::args runtimeArgs) { + nanobind::object sampling_strategy, + nanobind::object shot_allocation_obj, bool return_execution_data, + bool include_sequential_data, nanobind::args runtimeArgs) { if (shots_count == 0) return ptsbe::sample_result(); @@ -51,11 +53,12 @@ pySamplePTSBE(const std::string &shortName, MlirModule module, if (!sampling_strategy.is_none()) ptsbe_options.strategy = - sampling_strategy.cast>(); + nanobind::cast>( + sampling_strategy); if (!shot_allocation_obj.is_none()) ptsbe_options.shot_allocation = - shot_allocation_obj.cast(); + nanobind::cast(shot_allocation_obj); auto mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -104,13 +107,12 @@ struct AsyncPTSBESampleResultImpl { } // namespace /// @brief Run PTSBE sampling asynchronously from Python. 
-static AsyncPTSBESampleResultImpl -pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, - std::size_t shots_count, noise_model &noiseModel, - std::optional max_trajectories, - py::object sampling_strategy, py::object shot_allocation_obj, - bool return_execution_data, bool include_sequential_data, - py::args runtimeArgs) { +static AsyncPTSBESampleResultImpl pySampleAsyncPTSBE( + const std::string &shortName, MlirModule module, std::size_t shots_count, + noise_model &noiseModel, std::optional max_trajectories, + nanobind::object sampling_strategy, nanobind::object shot_allocation_obj, + bool return_execution_data, bool include_sequential_data, + nanobind::args runtimeArgs) { ptsbe::PTSBEOptions ptsbe_options; ptsbe_options.return_execution_data = return_execution_data; @@ -119,11 +121,12 @@ pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, if (!sampling_strategy.is_none()) ptsbe_options.strategy = - sampling_strategy.cast>(); + nanobind::cast>( + sampling_strategy); if (!shot_allocation_obj.is_none()) ptsbe_options.shot_allocation = - shot_allocation_obj.cast(); + nanobind::cast(shot_allocation_obj); auto mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -135,7 +138,7 @@ pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, std::string kernelName = shortName; // Release GIL before launching async C++ work - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return AsyncPTSBESampleResultImpl(ptsbe::detail::runSamplingAsyncPTSBE( [opaques = std::move(opaques), kernelName, mod = mod.clone()]() mutable { [[maybe_unused]] auto result = @@ -145,20 +148,19 @@ pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, noiseModel)); } -void cudaq::bindSamplePTSBE(py::module &mod) { +void cudaq::bindSamplePTSBE(nanobind::module_ &mod) { auto ptsbe = mod.def_submodule( "ptsbe", "PTSBE (Pre-Trajectory Sampling with Batch Execution)"); // Base strategy class 
(abstract, not directly constructible) - py::class_>( + nanobind::class_( ptsbe, "PTSSamplingStrategy", "Base class for trajectory sampling strategies.") .def("name", &ptsbe::PTSSamplingStrategy::name, "Get the name of this strategy."); // Shot allocation strategy - py::enum_( + nanobind::enum_( ptsbe, "ShotAllocationType", "Strategy type for allocating shots across trajectories.") .value("PROPORTIONAL", ptsbe::ShotAllocationStrategy::Type::PROPORTIONAL, @@ -172,33 +174,36 @@ void cudaq::bindSamplePTSBE(py::module &mod) { ptsbe::ShotAllocationStrategy::Type::HIGH_WEIGHT_BIAS, "Bias toward high-weight error trajectories."); - py::class_( + nanobind::class_( ptsbe, "ShotAllocationStrategy", "Strategy for allocating shots across selected trajectories.") - .def(py::init<>(), "Create a default (PROPORTIONAL) strategy.") - .def(py::init([](ptsbe::ShotAllocationStrategy::Type t, double bias, - std::optional seed) { - return ptsbe::ShotAllocationStrategy(t, bias, seed); - }), - py::arg("type"), py::arg("bias_strength") = 2.0, - py::arg("seed") = py::none(), - "Create a strategy with specified type, optional bias strength, " - "and optional random seed. When seed is None (default), uses " - "CUDA-Q's global random seed.") - .def_readwrite("type", &ptsbe::ShotAllocationStrategy::type, - "The allocation strategy type.") - .def_readwrite( - "bias_strength", &ptsbe::ShotAllocationStrategy::bias_strength, - "Bias factor for weighted strategies. Default value is 2.0."); + .def(nanobind::init<>(), "Create a default (PROPORTIONAL) strategy.") + .def( + "__init__", + [](ptsbe::ShotAllocationStrategy *self, + ptsbe::ShotAllocationStrategy::Type t, double bias, + std::optional seed) { + new (self) ptsbe::ShotAllocationStrategy(t, bias, seed); + }, + nanobind::arg("type"), nanobind::arg("bias_strength") = 2.0, + nanobind::arg("seed") = nanobind::none(), + "Create a strategy with specified type, optional bias strength, " + "and optional random seed. 
When seed is None (default), uses " + "CUDA-Q's global random seed.") + .def_rw("type", &ptsbe::ShotAllocationStrategy::type, + "The allocation strategy type.") + .def_rw("bias_strength", &ptsbe::ShotAllocationStrategy::bias_strength, + "Bias factor for weighted strategies. Default value is 2.0."); // Concrete strategies - py::class_>( + nanobind::class_( ptsbe, "ProbabilisticSamplingStrategy", "Sample trajectories randomly based on their occurrence probabilities.") - .def(py::init, std::optional>(), - py::arg("seed") = py::none(), - py::arg("max_trajectory_samples") = py::none(), + .def(nanobind::init, + std::optional>(), + nanobind::arg("seed") = nanobind::none(), + nanobind::arg("max_trajectory_samples") = nanobind::none(), "Create a probabilistic strategy with optional random seed and " "max trajectory sample count. When seed is None (default), uses " "CUDA-Q's global random seed. " @@ -206,20 +211,19 @@ void cudaq::bindSamplePTSBE(py::module &mod) { "The loop stops early once max_trajectories unique patterns are " "found. 
When None (default), a budget is auto-calculated."); - py::class_>( + nanobind::class_( ptsbe, "OrderedSamplingStrategy", "Sample trajectories sorted by probability in descending order.") - .def(py::init<>(), "Create an ordered strategy."); + .def(nanobind::init<>(), "Create an ordered strategy."); - py::class_>( + nanobind::class_( ptsbe, "ExhaustiveSamplingStrategy", "Enumerate all possible trajectories in lexicographic order.") - .def(py::init<>(), "Create an exhaustive strategy."); + .def(nanobind::init<>(), "Create an exhaustive strategy."); // Trace instruction type enum - py::enum_( + nanobind::enum_( ptsbe, "TraceInstructionType", "Type discriminator for trace instructions.") .value("Gate", ptsbe::TraceInstructionType::Gate) @@ -228,47 +232,48 @@ void cudaq::bindSamplePTSBE(py::module &mod) { .export_values(); // Trace instruction - py::class_( + nanobind::class_( ptsbe, "TraceInstruction", "Single operation in the execution trace.") - .def_property_readonly( + .def_prop_ro( "type", [](const ptsbe::TraceInstruction &self) { return self.type; }) - .def_property_readonly( + .def_prop_ro( "name", [](const ptsbe::TraceInstruction &self) { return self.name; }) - .def_property_readonly("targets", - [](const ptsbe::TraceInstruction &self) { - return std::vector( - self.targets.begin(), self.targets.end()); - }) - .def_property_readonly("controls", - [](const ptsbe::TraceInstruction &self) { - return std::vector( - self.controls.begin(), self.controls.end()); - }) - .def_property_readonly("params", - [](const ptsbe::TraceInstruction &self) { - return std::vector(self.params.begin(), - self.params.end()); - }) + .def_prop_ro("targets", + [](const ptsbe::TraceInstruction &self) { + return std::vector(self.targets.begin(), + self.targets.end()); + }) + .def_prop_ro("controls", + [](const ptsbe::TraceInstruction &self) { + return std::vector(self.controls.begin(), + self.controls.end()); + }) + .def_prop_ro("params", + [](const ptsbe::TraceInstruction &self) { + return 
std::vector(self.params.begin(), + self.params.end()); + }) .def("__repr__", [](const ptsbe::TraceInstruction &self) { return "TraceInstruction(" + self.name + " on " + std::to_string(self.targets.size()) + " qubits)"; }); // Kraus selection (cudaq:: namespace) - py::class_(ptsbe, "KrausSelection", - "Reference to a single Kraus operator selection.") - .def_property_readonly( + nanobind::class_( + ptsbe, "KrausSelection", + "Reference to a single Kraus operator selection.") + .def_prop_ro( "circuit_location", [](const KrausSelection &self) { return self.circuit_location; }) - .def_property_readonly( + .def_prop_ro( "kraus_operator_index", [](const KrausSelection &self) { return self.kraus_operator_index; }) - .def_property_readonly( - "is_error", [](const KrausSelection &self) { return self.is_error; }) - .def_property_readonly( - "qubits", [](const KrausSelection &self) { return self.qubits; }) - .def_property_readonly( - "op_name", [](const KrausSelection &self) { return self.op_name; }) + .def_prop_ro("is_error", + [](const KrausSelection &self) { return self.is_error; }) + .def_prop_ro("qubits", + [](const KrausSelection &self) { return self.qubits; }) + .def_prop_ro("op_name", + [](const KrausSelection &self) { return self.op_name; }) .def("__repr__", [](const KrausSelection &self) { return "KrausSelection(loc=" + std::to_string(self.circuit_location) + ", idx=" + std::to_string(self.kraus_operator_index) + @@ -276,27 +281,25 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }); // Kraus trajectory (cudaq:: namespace) - py::class_( + nanobind::class_( ptsbe, "KrausTrajectory", "Complete specification of one noise trajectory with outcomes.") - .def_property_readonly( + .def_prop_ro( "trajectory_id", [](const KrausTrajectory &self) { return self.trajectory_id; }) - .def_property_readonly( - "probability", - [](const KrausTrajectory &self) { return self.probability; }) - .def_property_readonly( - "num_shots", - [](const KrausTrajectory &self) { return 
self.num_shots; }) - .def_readonly("multiplicity", &KrausTrajectory::multiplicity, - "Number of times this trajectory was sampled.") - .def_readonly("weight", &KrausTrajectory::weight, - "Allocation weight for shot distribution.") - .def_property_readonly( + .def_prop_ro("probability", + [](const KrausTrajectory &self) { return self.probability; }) + .def_prop_ro("num_shots", + [](const KrausTrajectory &self) { return self.num_shots; }) + .def_ro("multiplicity", &KrausTrajectory::multiplicity, + "Number of times this trajectory was sampled.") + .def_ro("weight", &KrausTrajectory::weight, + "Allocation weight for shot distribution.") + .def_prop_ro( "kraus_selections", [](const KrausTrajectory &self) { return self.kraus_selections; }, - py::return_value_policy::reference_internal) - .def_property_readonly( + nanobind::rv_policy::reference_internal) + .def_prop_ro( "measurement_counts", [](const KrausTrajectory &self) { return self.measurement_counts; }) .def("__repr__", [](const KrausTrajectory &self) { @@ -306,34 +309,35 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }); // PTSBE execution data container - py::class_( + nanobind::class_( ptsbe, "PTSBEExecutionData", "Container for PTSBE execution data including circuit structure, " "trajectory specifications, and per-trajectory measurement outcomes.") - .def_property_readonly( + .def_prop_ro( "instructions", [](const ptsbe::PTSBEExecutionData &self) -> const std::vector & { return self.instructions; }, - py::return_value_policy::reference_internal) - .def_property_readonly( + nanobind::rv_policy::reference_internal) + .def_prop_ro( "trajectories", [](const ptsbe::PTSBEExecutionData &self) -> const std::vector & { return self.trajectories; }, - py::return_value_policy::reference_internal) + nanobind::rv_policy::reference_internal) .def( "count_instructions", [](const ptsbe::PTSBEExecutionData &self, - ptsbe::TraceInstructionType type, py::object name) -> std::size_t { + ptsbe::TraceInstructionType type, + 
nanobind::object name) -> std::size_t { std::optional nameOpt; if (!name.is_none()) - nameOpt = name.cast(); + nameOpt = nanobind::cast(name); return self.count_instructions(type, nameOpt); }, - py::arg("type"), py::arg("name") = py::none(), + nanobind::arg("type"), nanobind::arg("name") = nanobind::none(), "Count instructions of a given type.") .def( "get_trajectory", @@ -344,7 +348,8 @@ void cudaq::bindSamplePTSBE(py::module &mod) { return nullptr; return &result.value().get(); }, - py::return_value_policy::reference_internal, py::arg("trajectory_id"), + nanobind::rv_policy::reference_internal, + nanobind::arg("trajectory_id"), "Look up a trajectory by its ID. Returns None if not found.") .def("__repr__", [](const ptsbe::PTSBEExecutionData &self) { @@ -358,10 +363,10 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }); // PTSBE sample result (subclass of sample_result) - py::class_( + nanobind::class_( ptsbe, "PTSBESampleResult", "PTSBE sample result with optional execution data.") - .def_property_readonly( + .def_prop_ro( "ptsbe_execution_data", [](const ptsbe::sample_result &self) -> const ptsbe::PTSBEExecutionData * { @@ -371,31 +376,36 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }, // reference_internal ties the returned object's lifetime to self, // so the pointer into internal data stays valid. 
- py::return_value_policy::reference_internal, + nanobind::rv_policy::reference_internal, "PTSBE execution data if return_execution_data was True, None " "otherwise.") .def("has_execution_data", &ptsbe::sample_result::has_execution_data, "Check if execution data is available."); // Async PTSBE sample result wrapper - py::class_( + nanobind::class_( ptsbe, "AsyncSampleResultImpl", "Future-like wrapper for asynchronous PTSBE sampling.") .def("get", &AsyncPTSBESampleResultImpl::get, - py::call_guard(), + nanobind::call_guard(), "Block until the PTSBE sampling result is available and return it."); // PTSBE sample implementation - ptsbe.def("sample_impl", pySamplePTSBE, + ptsbe.def("sample_impl", pySamplePTSBE, nanobind::arg("kernel_name"), + nanobind::arg("module"), nanobind::arg("shots_count"), + nanobind::arg("noise_model"), nanobind::arg("max_trajectories"), + nanobind::arg("sampling_strategy").none(), + nanobind::arg("shot_allocation").none(), + nanobind::arg("return_execution_data"), + nanobind::arg("include_sequential_data"), R"pbdoc( Run PTSBE sampling on the provided kernel. Args: kernel_name: The kernel name. module: The MLIR module. - return_type: The MLIR return type. shots_count: The number of shots. - noise_model: Optional noise model for gate-based noise; may be None. + noise_model: The noise model. max_trajectories: Maximum unique trajectories, or None to use shots. sampling_strategy: Sampling strategy or None for default (probabilistic). shot_allocation: Shot allocation strategy or None for default (proportional). @@ -409,6 +419,13 @@ Run PTSBE sampling on the provided kernel. 
// PTSBE async sample implementation ptsbe.def("sample_async_impl", pySampleAsyncPTSBE, + nanobind::arg("kernel_name"), nanobind::arg("module"), + nanobind::arg("shots_count"), nanobind::arg("noise_model"), + nanobind::arg("max_trajectories"), + nanobind::arg("sampling_strategy").none(), + nanobind::arg("shot_allocation").none(), + nanobind::arg("return_execution_data"), + nanobind::arg("include_sequential_data"), "Run PTSBE sampling asynchronously. Returns an " "AsyncSampleResultImpl."); } diff --git a/python/runtime/cudaq/algorithms/py_sample_ptsbe.h b/python/runtime/cudaq/algorithms/py_sample_ptsbe.h index 2c5f2869486..ad8386efd64 100644 --- a/python/runtime/cudaq/algorithms/py_sample_ptsbe.h +++ b/python/runtime/cudaq/algorithms/py_sample_ptsbe.h @@ -8,8 +8,8 @@ #pragma once -#include +#include namespace cudaq { -void bindSamplePTSBE(pybind11::module &mod); +void bindSamplePTSBE(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp index 38fa842f020..d1099e692be 100644 --- a/python/runtime/cudaq/algorithms/py_state.cpp +++ b/python/runtime/cudaq/algorithms/py_state.cpp @@ -13,8 +13,16 @@ #include "cudaq/algorithms/get_state.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" +#include +#include +#include +#include +#include +#include +#include +#include using namespace cudaq; @@ -41,7 +49,7 @@ static std::vector bitStringToIntVec(const std::string &bitString) { /// @brief Run `cudaq::get_state` on the provided kernel and spin operator. 
static state get_state_impl(const std::string &shortName, MlirModule mod, - py::args args) { + nanobind::args args) { auto closure = [=]() { return marshal_and_launch_module(shortName, mod, args); }; @@ -51,7 +59,7 @@ static state get_state_impl(const std::string &shortName, MlirModule mod, static std::future get_state_async_impl(const std::string &shortName, MlirModule module, std::size_t qpu_id, - py::args args) { + nanobind::args args) { // Launch the asynchronous execution. auto mod = unwrap(module); std::string kernelName = shortName; @@ -59,7 +67,7 @@ static std::future get_state_async_impl(const std::string &shortName, auto fnOp = getKernelFuncOp(mod, shortName); auto opaques = marshal_arguments_for_module_launch(mod, args, fnOp); - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return details::runGetStateAsync( detail::make_copyable_function([opaques = std::move(opaques), kernelName, mod = mod.clone()]() mutable { @@ -131,12 +139,12 @@ class PyRemoteSimulationState : public RemoteSimulationState { /// @brief Run `cudaq::get_state` for remote execution targets on the provided /// kernel and args -state pyGetStateRemote(py::object kernel, py::args args) { - if (py::hasattr(kernel, "compile")) +state pyGetStateRemote(nanobind::object kernel, nanobind::args args) { + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelName = kernel.attr("uniqName").cast(); - auto kernelMod = kernel.attr("qkeModule").cast(); + auto kernelName = nanobind::cast(kernel.attr("uniqName")); + auto kernelMod = nanobind::cast(kernel.attr("qkeModule")); args = simplifiedValidateInputArguments(args); auto *argData = toOpaqueArgs(args, kernelMod, kernelName); #if 0 @@ -170,7 +178,7 @@ class PyQPUState : public QPUState { /// @brief Run `cudaq::get_state` for qpu targets on the provided /// kernel and args state pyGetStateQPU(const std::string &kernelName, MlirModule kernelMod, - py::args args) { + nanobind::args args) { auto moduleOp = 
unwrap(kernelMod); std::string mlirCode; llvm::raw_string_ostream outStr(mlirCode); @@ -182,45 +190,62 @@ state pyGetStateQPU(const std::string &kernelName, MlirModule kernelMod, return state(new PyQPUState(kernelName, mlirCode, argData)); } -state pyGetStateLibraryMode(py::object kernel, py::args args) { +state pyGetStateLibraryMode(nanobind::object kernel, nanobind::args args) { return details::extractState([&]() mutable { if (0 == args.size()) kernel(); else { - std::vector argsData; + std::vector argsData; for (size_t i = 0; i < args.size(); i++) { - py::object arg = args[i]; - argsData.emplace_back(std::forward(arg)); + nanobind::object arg = args[i]; + argsData.emplace_back(std::forward(arg)); } kernel(std::move(argsData)); } }); } -static py::buffer_info getCupyBufferInfo(py::buffer cupy_buffer) { - // Note: cupy 13.5+ arrays will bind (overload resolution) to a py::buffer - // type. However, we cannot access the underlying buffer info via a +/// @brief Helper struct to hold buffer metadata, analogous to Python's +/// buffer_info. +struct BufferInfo { + void *ptr = nullptr; + std::size_t itemsize = 0; + std::string format; + std::size_t ndim = 0; + std::vector shape; + std::vector strides; + bool readonly = false; + std::size_t size = 0; // total number of elements +}; + +static BufferInfo getCupyBufferInfo(nanobind::object cupy_buffer) { + // Note: cupy 13.5+ arrays will bind (overload resolution) to a + // nanobind::object type. However, we cannot access the underlying buffer info + // via a // `.request()` as it will throw unless that is managed memory. Here, we - // retrieve and construct buffer_info from the CuPy array interface. + // retrieve and construct BufferInfo from the CuPy array interface. 
- if (!py::hasattr(cupy_buffer, "__cuda_array_interface__")) { + if (!nanobind::hasattr(cupy_buffer, "__cuda_array_interface__")) { throw std::runtime_error("Buffer is not a CuPy array"); } - py::dict cupy_array_info = cupy_buffer.attr("__cuda_array_interface__"); + nanobind::dict cupy_array_info = nanobind::cast( + cupy_buffer.attr("__cuda_array_interface__")); // Ref: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html // example: {'shape': (2, 2), 'typestr': '(); - void *dataPtr = (void *)dataInfo[0].cast(); - const bool readOnly = dataInfo[1].cast(); - auto shapeTuple = cupy_array_info["shape"].cast(); + nanobind::tuple dataInfo = + nanobind::cast(cupy_array_info["data"]); + void *dataPtr = (void *)nanobind::cast(dataInfo[0]); + const bool readOnly = nanobind::cast(dataInfo[1]); + auto shapeTuple = nanobind::cast(cupy_array_info["shape"]); std::vector extents; for (std::size_t i = 0; i < shapeTuple.size(); i++) { - extents.push_back(shapeTuple[i].cast()); + extents.push_back(nanobind::cast(shapeTuple[i])); } - const std::string typeStr = cupy_array_info["typestr"].cast(); + const std::string typeStr = + nanobind::cast(cupy_array_info["typestr"]); if (typeStr != "), - py::format_descriptor>::format()) - : std::make_tuple( - sizeof(std::complex), - py::format_descriptor>::format()); + std::size_t dataTypeSize = isDoublePrecision ? sizeof(std::complex) + : sizeof(std::complex); + std::string desc = isDoublePrecision ? 
"Zd" : "Zf"; std::vector strides(extents.size(), dataTypeSize); for (size_t i = 1; i < extents.size(); ++i) strides[i] = strides[i - 1] * extents[i - 1]; - return py::buffer_info(dataPtr, dataTypeSize, /*itemsize */ - desc, extents.size(), /* ndim */ - extents, /* shape */ - strides, /* strides */ - readOnly /* readonly */ - ); + std::size_t totalSize = 1; + for (auto e : extents) + totalSize *= e; + + BufferInfo info; + info.ptr = dataPtr; + info.itemsize = dataTypeSize; + info.format = desc; + info.ndim = extents.size(); + info.shape = extents; + info.strides = strides; + info.readonly = readOnly; + info.size = totalSize; + return info; } -static cudaq::state createStateFromPyBuffer(py::buffer data, +/// @brief Helper to get BufferInfo from a numpy array via Python buffer +/// protocol. +static BufferInfo getNumpyBufferInfo(nanobind::object numpy_array) { + nanobind::module_ np = nanobind::module_::import_("numpy"); + auto dtype = numpy_array.attr("dtype"); + std::string dtypeStr = nanobind::cast(dtype.attr("name")); + + BufferInfo info; + if (dtypeStr == "complex64") { + info.itemsize = sizeof(std::complex); + info.format = "Zf"; + } else if (dtypeStr == "complex128") { + info.itemsize = sizeof(std::complex); + info.format = "Zd"; + } else { + info.format = dtypeStr; + info.itemsize = nanobind::cast(dtype.attr("itemsize")); + } + auto shapeTuple = nanobind::cast(numpy_array.attr("shape")); + info.ndim = shapeTuple.size(); + info.size = 1; + for (std::size_t i = 0; i < shapeTuple.size(); i++) { + auto ext = nanobind::cast(shapeTuple[i]); + info.shape.push_back(ext); + info.size *= ext; + } + auto stridesTuple = + nanobind::cast(numpy_array.attr("strides")); + for (std::size_t i = 0; i < stridesTuple.size(); i++) { + info.strides.push_back(nanobind::cast(stridesTuple[i])); + } + // Get the raw data pointer via numpy's ctypes interface + info.ptr = reinterpret_cast( + nanobind::cast(numpy_array.attr("ctypes").attr("data"))); + info.readonly = false; + return info; 
+} + +static cudaq::state createStateFromPyBuffer(nanobind::object data, LinkedLibraryHolder &holder) { - const bool isHostData = !py::hasattr(data, "__cuda_array_interface__"); + const bool isHostData = !nanobind::hasattr(data, "__cuda_array_interface__"); // Check that the target is GPU-based, i.e., can handle device // pointer. if (!holder.getTarget().config.GpuRequired && !isHostData) @@ -259,12 +325,11 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, fmt::format("Current target '{}' does not support CuPy arrays.", holder.getTarget().name)); - auto info = isHostData ? data.request() : getCupyBufferInfo(data); + auto info = isHostData ? getNumpyBufferInfo(data) : getCupyBufferInfo(data); if (info.shape.size() > 2) throw std::runtime_error( "state.from_data only supports 1D or 2D array data."); - if (info.format != py::format_descriptor>::format() && - info.format != py::format_descriptor>::format()) + if (info.format != "Zf" && info.format != "Zd") throw std::runtime_error( "A numpy array with only floating point elements passed to " "`state.from_data`. Input must be of complex float type. Please add to " @@ -273,7 +338,7 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, "`dtype=cudaq.complex()` for precision-agnostic code."); if (!isHostData || info.shape.size() == 1) { - if (info.format == py::format_descriptor>::format()) + if (info.format == "Zf") return state::from_data(std::make_pair( reinterpret_cast *>(info.ptr), info.size)); @@ -286,8 +351,7 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, throw std::runtime_error( "state.from_data 2D array (density matrix) input must be " "square matrix data."); - const bool isDoublePrecision = - info.format == py::format_descriptor>::format(); + const bool isDoublePrecision = (info.format == "Zd"); const int64_t dataSize = isDoublePrecision ? 
sizeof(std::complex) : sizeof(std::complex); const bool rowMajor = @@ -313,15 +377,15 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, } /// @brief Bind the get_state cudaq function -void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { - py::enum_(mod, "InitialStateType", - "Enumeration describing the initial state " - "type to be created in the backend") +void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { + nanobind::enum_(mod, "InitialStateType", + "Enumeration describing the initial state " + "type to be created in the backend") .value("ZERO", InitialState::ZERO) .value("UNIFORM", InitialState::UNIFORM) .export_values(); - py::class_( + nanobind::class_( mod, "Tensor", "The `Tensor` describes a pointer to simulation data as well as the rank " "and extents for that tensorial data it represents.") @@ -329,87 +393,112 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { [](SimulationState::Tensor &tensor) { return reinterpret_cast(tensor.data); }) - .def_readonly("extents", &SimulationState::Tensor::extents) + .def_ro("extents", &SimulationState::Tensor::extents) .def("get_rank", &SimulationState::Tensor::get_rank) .def("get_element_size", &SimulationState::Tensor::element_size) .def("get_num_elements", &SimulationState::Tensor::get_num_elements); - py::class_( - mod, "State", py::buffer_protocol(), + nanobind::class_( + mod, "State", "A data-type representing the quantum state of the internal simulator. " "This type is not user-constructible and instances can only be retrieved " "via the `cudaq.get_state(...)` function or the static " "`cudaq.State.from_data()` method.\n") - .def_buffer([](const state &self) { - if (self.get_num_tensors() != 1) - throw std::runtime_error("Numpy interop is only supported for vector " - "and matrix state data."); - - // This method is used by Pybind to enable interoperability with NumPy - // array data. 
We therefore must be careful since the state data may - // actually be on GPU device. - - // Get the data pointer. - // Data may be on GPU device, if so we must make a copy to host. - // If users do not want this copy, they will have to operate apart - // from Numpy - void *dataPtr = nullptr; - auto stateVector = self.get_tensor(); - auto precision = self.get_precision(); - if (self.is_on_gpu()) { - // This is device data, transfer to host, which gives us - // ownership of a new data pointer on host. Store it globally - // here so we ensure that it gets cleaned up. - auto numElements = stateVector.get_num_elements(); - if (precision == SimulationState::precision::fp32) { - auto *hostData = new std::complex[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast(hostData); - } else { - auto *hostData = new std::complex[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast(hostData); - } - hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) { - CUDAQ_INFO("freeing data that was copied from GPU device for " - "compatibility with NumPy"); - // Use delete[] to match new[] allocation (not free()) - if (precision == SimulationState::precision::fp32) - delete[] static_cast *>(data); - else - delete[] static_cast *>(data); - }); - } else { - dataPtr = self.get_tensor().data; - } - - // We need to know the precision of the simulation data to get the - // data type size and the format descriptor - auto [dataTypeSize, desc] = - precision == SimulationState::precision::fp32 - ? std::make_tuple( - sizeof(std::complex), - py::format_descriptor>::format()) - : std::make_tuple( - sizeof(std::complex), - py::format_descriptor>::format()); - - // Get the shape of the data. Return buffer info in a correctly - // shaped manner. 
- auto shape = self.get_tensor().extents; - if (shape.size() != 1) - return py::buffer_info(dataPtr, dataTypeSize, /*itemsize */ - desc, 2, /* ndim */ - {shape[0], shape[1]}, /* shape */ - {dataTypeSize * static_cast(shape[1]), - dataTypeSize}, /* strides */ - true /* readonly */ - ); - return py::buffer_info(dataPtr, dataTypeSize, /*itemsize */ - desc, 1, /* ndim */ - {shape[0]}, /* shape */ - {dataTypeSize}); - }) + .def( + "__array__", + [](const state &self, nanobind::object dtype_obj, + nanobind::object copy_obj) { + if (self.get_num_tensors() != 1) + throw std::runtime_error( + "Numpy interop is only supported for vector " + "and matrix state data."); + + // This method enables interoperability with NumPy array data. + // We must be careful since the state data may actually be on GPU + // device. + + nanobind::module_ np = nanobind::module_::import_("numpy"); + auto stateVector = self.get_tensor(); + auto precision = self.get_precision(); + auto shape = self.get_tensor().extents; + + // Determine numpy dtype + nanobind::object np_dtype = + precision == SimulationState::precision::fp32 + ? 
np.attr("complex64") + : np.attr("complex128"); + + if (self.is_on_gpu()) { + // This is device data, transfer to host + auto numElements = stateVector.get_num_elements(); + nanobind::object arr; + if (precision == SimulationState::precision::fp32) { + auto *hostData = new std::complex[numElements]; + self.to_host(hostData, numElements); + // Create numpy array and copy data + if (shape.size() != 1) { + nanobind::tuple np_shape = + nanobind::make_tuple(shape[0], shape[1]); + arr = np.attr("empty")(np_shape, np_dtype); + } else { + nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); + arr = np.attr("empty")(np_shape, np_dtype); + } + auto *destPtr = reinterpret_cast *>( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::memcpy(destPtr, hostData, + numElements * sizeof(std::complex)); + delete[] hostData; + } else { + auto *hostData = new std::complex[numElements]; + self.to_host(hostData, numElements); + if (shape.size() != 1) { + nanobind::tuple np_shape = + nanobind::make_tuple(shape[0], shape[1]); + arr = np.attr("empty")(np_shape, np_dtype); + } else { + nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); + arr = np.attr("empty")(np_shape, np_dtype); + } + auto *destPtr = reinterpret_cast *>( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::memcpy(destPtr, hostData, + numElements * sizeof(std::complex)); + delete[] hostData; + } + return arr; + } + + // Host data path - wrap existing memory + void *dataPtr = self.get_tensor().data; + auto numElements = stateVector.get_num_elements(); + if (shape.size() != 1) { + nanobind::tuple np_shape = + nanobind::make_tuple(shape[0], shape[1]); + // Use np.frombuffer-like approach: create array from pointer + nanobind::object arr = np.attr("empty")(np_shape, np_dtype); + auto *destPtr = reinterpret_cast( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::size_t dataTypeSize = + precision == SimulationState::precision::fp32 + ? 
sizeof(std::complex) + : sizeof(std::complex); + std::memcpy(destPtr, dataPtr, numElements * dataTypeSize); + return arr; + } + nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); + nanobind::object arr = np.attr("empty")(np_shape, np_dtype); + auto *destPtr = reinterpret_cast( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::size_t dataTypeSize = + precision == SimulationState::precision::fp32 + ? sizeof(std::complex) + : sizeof(std::complex); + std::memcpy(destPtr, dataPtr, numElements * dataTypeSize); + return arr; + }, + nanobind::arg("dtype") = nanobind::none(), + nanobind::arg("copy") = nanobind::none()) .def( "__len__", [](state &self) { @@ -433,16 +522,28 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { "Convert the address of the state object to an integer.") .def_static( "from_data", - [&](py::buffer data) { + [&](nanobind::object data) { + // Reject Python sequences (list/tuple) overload — they should be + // dispatched to the vector overload below. In pybind11, py::buffer + // excluded lists; nanobind::object accepts anything, so we must + // guard explicitly. + if (nanobind::isinstance(data) || + nanobind::isinstance(data)) + throw nanobind::next_overload(); return createStateFromPyBuffer(data, holder); }, "Return a state from data.") .def_static( "from_data", - [&holder](const std::vector &tensors) { + [&holder](const std::vector &tensors) { + // Reject SimulationState::Tensor objects overload — they're handled + // by the next overload and don't have numpy/cupy buffer attributes. + if (!tensors.empty() && + nanobind::isinstance(tensors[0])) + throw nanobind::next_overload(); const bool isHostData = tensors.empty() || - !py::hasattr(tensors[0], "__cuda_array_interface__"); + !nanobind::hasattr(tensors[0], "__cuda_array_interface__"); // Check that the target is GPU-based, i.e., can handle device // pointer. 
if (!holder.getTarget().config.GpuRequired && !isHostData) @@ -451,8 +552,8 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { holder.getTarget().name)); TensorStateData tensorData; for (auto &tensor : tensors) { - auto info = - isHostData ? tensor.request() : getCupyBufferInfo(tensor); + auto info = isHostData ? getNumpyBufferInfo(tensor) + : getCupyBufferInfo(tensor); const std::vector extents(info.shape.begin(), info.shape.end()); tensorData.emplace_back( @@ -477,36 +578,38 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { "Return a state from matrix product state tensor data.") .def_static( "from_data", - [](const py::list &tensors) { - // Note: we must use Python type (py::list) for proper overload - // resolution. The overload for py::object, intended for cupy arrays - // (implementing Python array interface), may be overshadowed by any - // std::vector overloads. + [](const nanobind::list &tensors) { + // Note: we must use Python type (nanobind::list) for proper + // overload resolution. The overload for nanobind::object, intended + // for cupy arrays (implementing Python array interface), may be + // overshadowed by any std::vector overloads. TensorStateData tensorData; - for (auto &tensor : tensors) { + for (auto tensor : tensors) { // Make sure this is a CuPy array - if (!py::hasattr(tensor, "data")) + if (!nanobind::hasattr(tensor, "data")) throw std::runtime_error( - "invalid from_data operation on py::object - " + "invalid from_data operation on nanobind::object - " "only cupy array supported."); auto data = tensor.attr("data"); - if (!py::hasattr(data, "ptr")) + if (!nanobind::hasattr(data, "ptr")) throw std::runtime_error( - "invalid from_data operation on py::object tensors - " + "invalid from_data operation on nanobind::object tensors - " "only cupy array supported."); // We know this is a cupy device pointer. 
Start by ensuring it is // of proper complex type - auto typeStr = py::str(tensor.attr("dtype")).cast(); + auto typeStr = nanobind::cast( + tensor.attr("dtype").attr("name")); if (typeStr != "complex128") throw std::runtime_error( - "invalid from_data operation on py::object tensors - " + "invalid from_data operation on nanobind::object tensors - " "only cupy complex128 tensors supported."); - auto shape = tensor.attr("shape").cast(); + auto shape = + nanobind::cast(tensor.attr("shape")); std::vector extents; for (auto el : shape) - extents.emplace_back(el.cast()); - long ptr = data.attr("ptr").cast(); + extents.emplace_back(nanobind::cast(el)); + long ptr = nanobind::cast(data.attr("ptr")); tensorData.emplace_back( std::pair>{ reinterpret_cast *>(ptr), extents}); @@ -517,24 +620,24 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { "ndarray).") .def_static( "from_data", - [&holder](py::object opaqueData) { + [&holder](nanobind::object opaqueData) { // Note: This overload is no longer needed from cupy 13.5+ onward. // We can remove it in future releases. // Make sure this is a CuPy array - if (!py::hasattr(opaqueData, "data")) + if (!nanobind::hasattr(opaqueData, "data")) throw std::runtime_error( - "invalid from_data operation on py::object - " + "invalid from_data operation on nanobind::object - " "only cupy array supported."); auto data = opaqueData.attr("data"); - if (!py::hasattr(data, "ptr")) + if (!nanobind::hasattr(data, "ptr")) throw std::runtime_error( - "invalid from_data operation on py::object - " + "invalid from_data operation on nanobind::object - " "only cupy array supported."); // We know this is a cupy device pointer. 
Start by ensuring it is of // complex type - auto typeStr = - py::str(opaqueData.attr("dtype")).cast(); + auto typeStr = nanobind::cast( + opaqueData.attr("dtype").attr("name")); if (typeStr.find("float") != std::string::npos) throw std::runtime_error( "CuPy array with only floating point elements passed to " @@ -546,16 +649,17 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { // Compute the number of elements in the array std::vector extents; auto numElements = [&]() { - auto shape = opaqueData.attr("shape").cast(); + auto shape = + nanobind::cast(opaqueData.attr("shape")); std::size_t numElements = 1; for (auto el : shape) { - numElements *= el.cast(); - extents.emplace_back(el.cast()); + numElements *= nanobind::cast(el); + extents.emplace_back(nanobind::cast(el)); } return numElements; }(); - long ptr = data.attr("ptr").cast(); + long ptr = nanobind::cast(data.attr("ptr")); if (holder.getTarget().name == "dynamics") { // For dynamics, we need to send on the extents to distinguish // state vector vs density matrix. @@ -587,7 +691,7 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { .def( "getTensor", [](state &self, std::size_t idx) { return self.get_tensor(idx); }, - py::arg("idx") = 0, + nanobind::arg("idx") = 0, "Return the `idx` tensor making up this state representation.") .def( "getTensors", [](state &self) { return self.get_tensors(); }, @@ -699,7 +803,7 @@ index pair. [](state &self) { std::stringstream ss; self.dump(ss); - py::print(ss.str()); + nanobind::print(ss.str().c_str()); }, "Print the state to the console.") .def("__str__", @@ -714,7 +818,7 @@ index pair. "Compute the overlap between the provided :class:`State`'s.") .def( "overlap", - [&holder](state &self, py::buffer &other) { + [&holder](state &self, nanobind::object &other) { if (self.get_num_tensors() != 1) throw std::runtime_error("overlap NumPy interop only supported " "for vector and matrix state data."); @@ -724,24 +828,25 @@ index pair. 
"Compute the overlap between the provided :class:`State`'s.") .def( "overlap", - [](state &self, py::object other) { + [](state &self, nanobind::object other) { // Note: This overload is no longer needed from cupy 13.5+ onward. // We can remove it in future releases. Make sure this is a CuPy // array - if (!py::hasattr(other, "data")) + if (!nanobind::hasattr(other, "data")) throw std::runtime_error( - "invalid overlap operation on py::object - " + "invalid overlap operation on nanobind::object - " "only cupy array supported."); auto data = other.attr("data"); - if (!py::hasattr(data, "ptr")) + if (!nanobind::hasattr(data, "ptr")) throw std::runtime_error( - "invalid overlap operation on py::object - " + "invalid overlap operation on nanobind::object - " "only cupy array supported."); // We know this is a cupy device pointer. // Start by ensuring it is of complex type - auto typeStr = py::str(other.attr("dtype")).cast(); + auto typeStr = + nanobind::cast(other.attr("dtype").attr("name")); if (typeStr.find("float") != std::string::npos) throw std::runtime_error( "CuPy array with only floating point elements passed to " @@ -765,15 +870,15 @@ index pair. // Compute the number of elements in the other array auto numOtherElements = [&]() { - auto shape = other.attr("shape").cast(); + auto shape = nanobind::cast(other.attr("shape")); std::size_t numElements = 1; for (auto el : shape) - numElements *= el.cast(); + numElements *= nanobind::cast(el); return numElements; }(); // Cast the device ptr and perform the overlap - long ptr = data.attr("ptr").cast(); + long ptr = nanobind::cast(data.attr("ptr")); if (precision == SimulationState::precision::fp32) return self.overlap(state::from_data( std::make_pair(reinterpret_cast *>(ptr), @@ -787,7 +892,8 @@ index pair. mod.def( "get_state_impl", - [&](const std::string &shortName, MlirModule module, py::args args) { + [&](const std::string &shortName, MlirModule module, + nanobind::args args) { // Check for unsupported cases. 
if (holder.getTarget().name == "remote-mqpu" || holder.getTarget().name == "orca-photonics") @@ -800,7 +906,7 @@ index pair. }, "See the python documentation for get_state."); - py::class_( + nanobind::class_( mod, "AsyncStateResult", R"#(A data-type containing the results of a call to :func:`get_state_async`. The `AsyncStateResult` models a future-like type, whose @@ -810,14 +916,14 @@ See `future `_ for more information on this programming pattern.)#") .def( "get", [](async_state_result &self) { return self.get(); }, - py::call_guard(), + nanobind::call_guard(), "Return the :class:`State` from the asynchronous `get_state` " "accessor execution.\n"); mod.def( "get_state_async_impl", [&](const std::string &shortName, MlirModule module, std::size_t qpu_id, - py::args args) { + nanobind::args args) { // Check for unsupported cases. if (holder.getTarget().name == "remote-mqpu" || holder.getTarget().name == "nvqc" || diff --git a/python/runtime/cudaq/algorithms/py_state.h b/python/runtime/cudaq/algorithms/py_state.h index e290aa35e1a..7a7152f8d1f 100644 --- a/python/runtime/cudaq/algorithms/py_state.h +++ b/python/runtime/cudaq/algorithms/py_state.h @@ -8,11 +8,11 @@ #pragma once -#include -#include +#include +#include namespace cudaq { class LinkedLibraryHolder; -void bindPyState(pybind11::module &mod, LinkedLibraryHolder &holder); +void bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_translate.cpp b/python/runtime/cudaq/algorithms/py_translate.cpp index 4d5f834ed0c..503cbc38cce 100644 --- a/python/runtime/cudaq/algorithms/py_translate.cpp +++ b/python/runtime/cudaq/algorithms/py_translate.cpp @@ -13,8 +13,8 @@ #include "cudaq/platform/default/python/QPU.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include 
"mlir/Pass/PassManager.h" #include "mlir/Target/LLVMIR/Export.h" @@ -23,7 +23,7 @@ using namespace mlir; /// @brief Run `cudaq::translate` on the provided kernel. static std::string translate_impl(const std::string &shortName, MlirModule module, const std::string &format, - py::args runtimeArguments) { + nanobind::args runtimeArguments) { StringRef format_ = format; auto formatPair = format_.split(':'); auto mod = unwrap(module); @@ -66,7 +66,7 @@ static std::string translate_impl(const std::string &shortName, } /// @brief Bind the translate cudaq function -void cudaq::bindPyTranslate(py::module &mod) { +void cudaq::bindPyTranslate(nanobind::module_ &mod) { mod.def("translate_impl", translate_impl, "See python documentation for translate."); // Internal translation to QIR for testing and internal use. Not intended to diff --git a/python/runtime/cudaq/algorithms/py_translate.h b/python/runtime/cudaq/algorithms/py_translate.h index 67b43598744..041167f7017 100644 --- a/python/runtime/cudaq/algorithms/py_translate.h +++ b/python/runtime/cudaq/algorithms/py_translate.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindPyTranslate(py::module &mod); +void bindPyTranslate(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_unitary.cpp b/python/runtime/cudaq/algorithms/py_unitary.cpp index fad6bd1d0c7..3aefbbc957d 100644 --- a/python/runtime/cudaq/algorithms/py_unitary.cpp +++ b/python/runtime/cudaq/algorithms/py_unitary.cpp @@ -10,26 +10,25 @@ #include "cudaq/algorithms/unitary.h" #include "runtime/cudaq/operators/py_helpers.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" - -namespace py = pybind11; +#include "utils/NanobindAdaptors.h" using namespace cudaq; /// Compute the unitary of this kernel module. 
-static py::array get_unitary_impl(const std::string &shortName, - MlirModule module, py::args args) { +static nanobind::object get_unitary_impl(const std::string &shortName, + MlirModule module, + nanobind::args args) { auto f = [=]() { return cudaq::marshal_and_launch_module(shortName, module, args); }; // Return as numpy array (dim, dim), complex128 auto temp = contrib::get_unitary_cmat(std::move(f)); - return details::cmat_to_numpy(temp); + return nanobind::cast(details::cmat_to_numpy(temp)); } /// Bind the get_unitary cudaq function -void cudaq::bindPyUnitary(py::module &mod) { +void cudaq::bindPyUnitary(nanobind::module_ &mod) { mod.def("get_unitary_impl", get_unitary_impl, "See python documentation for get_unitary()."); } diff --git a/python/runtime/cudaq/algorithms/py_unitary.h b/python/runtime/cudaq/algorithms/py_unitary.h index ea2ffeca055..fccac11e42b 100644 --- a/python/runtime/cudaq/algorithms/py_unitary.h +++ b/python/runtime/cudaq/algorithms/py_unitary.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindPyUnitary(py::module &mod); +void bindPyUnitary(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_utils.cpp b/python/runtime/cudaq/algorithms/py_utils.cpp index 0c6b16ec7c7..e396f93c3a5 100644 --- a/python/runtime/cudaq/algorithms/py_utils.cpp +++ b/python/runtime/cudaq/algorithms/py_utils.cpp @@ -8,52 +8,56 @@ #include "py_utils.h" #include "cudaq/utils/cudaq_utils.h" -#include -#include +#include +#include +#include +#include +#include namespace cudaq { -py::dict get_serializable_var_dict() { - py::object json = py::module_::import("json"); - py::dict serialized_dict; +nanobind::dict get_serializable_var_dict() { + nanobind::object json = nanobind::module_::import_("json"); + nanobind::dict serialized_dict; auto try_to_add_item = [&](const auto item) { try { auto key = item.first; auto value = item.second; - if (key.template cast().starts_with("__")) 
{ + if (nanobind::cast(key).starts_with("__")) { // Ignore items that start with "__" (like Python __builtins__, etc.) - } else if (py::hasattr(value, "to_json")) { - auto type = value.get_type(); + } else if (nanobind::hasattr(value, "to_json")) { + auto type = value.type(); std::string module = - type.attr("__module__").template cast(); - std::string name = type.attr("__name__").template cast(); - auto type_name = py::str(module + "." + name); - auto json_key_name = py::str(key) + py::str("/") + type_name; + nanobind::cast(type.attr("__module__")); + std::string name = nanobind::cast(type.attr("__name__")); + auto type_name = nanobind::str((module + "." + name).c_str()); + auto json_key_name = nanobind::str(nanobind::str(key).c_str()) + + nanobind::str("/") + type_name; serialized_dict[json_key_name] = json.attr("loads")(value.attr("to_json")()); - } else if (py::hasattr(value, "tolist")) { + } else if (nanobind::hasattr(value, "tolist")) { serialized_dict[key] = json.attr("loads")(json.attr("dumps")(value.attr("tolist")())); } else { serialized_dict[key] = json.attr("loads")(json.attr("dumps")(value)); } - } catch (const py::error_already_set &e) { + } catch (const nanobind::python_error &e) { // Uncomment the following lines for debug, but all this really means is // that we won't send this to the remote server. 
// std::cout << "Failed to serialize key '" - // << item.first.template cast() + // << nanobind::cast(item.first) // << "' : " + std::string(e.what()) << std::endl; } }; - for (const auto item : py::globals()) + for (const auto item : nanobind::globals()) try_to_add_item(item); - py::object inspect = py::module::import("inspect"); - std::vector frame_vec; + nanobind::object inspect = nanobind::module_::import_("inspect"); + std::vector frame_vec; auto current_frame = inspect.attr("currentframe")(); while (current_frame && !current_frame.is_none()) { frame_vec.push_back(current_frame); @@ -64,7 +68,8 @@ py::dict get_serializable_var_dict() { // globals first to locals last. This ensures that the overwrites give // precedence to closest-to-locals. for (auto it = frame_vec.rbegin(); it != frame_vec.rend(); ++it) { - py::dict f_locals = it->attr("f_locals"); + nanobind::dict f_locals = + nanobind::cast(it->attr("f_locals")); for (const auto item : f_locals) try_to_add_item(item); } @@ -104,56 +109,60 @@ static std::size_t strip_leading_whitespace(std::string &source_code) { return min_indent; } -std::string get_source_code(const py::function &func) { +std::string get_source_code(const nanobind::callable &func) { // Get the source code - py::module_ analysis = py::module_::import("cudaq.kernel.analysis"); - py::object FetchDepFuncsSourceCode = analysis.attr("FetchDepFuncsSourceCode"); - py::object source_code; + nanobind::module_ analysis = + nanobind::module_::import_("cudaq.kernel.analysis"); + nanobind::object FetchDepFuncsSourceCode = + analysis.attr("FetchDepFuncsSourceCode"); + nanobind::object source_code; try { source_code = FetchDepFuncsSourceCode.attr("fetch")(func); - } catch (py::error_already_set &e) { + } catch (nanobind::python_error &e) { throw std::runtime_error("Failed to get source code: " + std::string(e.what())); } - std::string source = source_code.cast(); + std::string source = nanobind::cast(source_code); strip_leading_whitespace(source); return 
source; } -std::string get_var_name_for_handle(const py::handle &h) { - py::object inspect = py::module::import("inspect"); +std::string get_var_name_for_handle(const nanobind::handle &h) { + nanobind::object inspect = nanobind::module_::import_("inspect"); // Search locals first, walking up the call stack auto current_frame = inspect.attr("currentframe")(); while (current_frame && !current_frame.is_none()) { - py::dict f_locals = current_frame.attr("f_locals"); + nanobind::dict f_locals = + nanobind::cast(current_frame.attr("f_locals")); for (auto item : f_locals) if (item.second.is(h)) - return py::str(item.first); + return nanobind::cast(nanobind::str(item.first)); current_frame = current_frame.attr("f_back"); } // Search globals now current_frame = inspect.attr("currentframe")(); - py::dict f_globals = current_frame.attr("f_globals"); + nanobind::dict f_globals = + nanobind::cast(current_frame.attr("f_globals")); for (auto item : f_globals) if (item.second.is(h)) - return py::str(item.first); + return nanobind::cast(nanobind::str(item.first)); return std::string(); } -std::unordered_map> +std::unordered_map> DataClassRegistry::classes{}; /// @brief Bind the dataclass registry -void bindPyDataClassRegistry(py::module &mod) { - py::class_(mod, "DataClassRegistry", - R"#(Registry for dataclasses used in kernels)#") +void bindPyDataClassRegistry(nanobind::module_ &mod) { + nanobind::class_( + mod, "DataClassRegistry", R"#(Registry for dataclasses used in kernels)#") .def_static("registerClass", &DataClassRegistry::registerClass, "Register class\n") .def_static("isRegisteredClass", &DataClassRegistry::isRegisteredClass, "Is class registered\n") .def_static("getClassAttributes", &DataClassRegistry::getClassAttributes, "Find registered class and its attributes\n") - .def_readonly_static("classes", &DataClassRegistry::classes); + .def_ro_static("classes", &DataClassRegistry::classes); } } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_utils.h 
b/python/runtime/cudaq/algorithms/py_utils.h index 84dc1e6455c..2abd81d122a 100644 --- a/python/runtime/cudaq/algorithms/py_utils.h +++ b/python/runtime/cudaq/algorithms/py_utils.h @@ -8,35 +8,36 @@ #pragma once -#include +#include +#include #include #include -namespace py = pybind11; - namespace cudaq { /// @brief Get a JSON-encoded dictionary of a combination of all local /// and global variables that are JSON compatible -py::dict get_serializable_var_dict(); +nanobind::dict get_serializable_var_dict(); -/// @brief Fetch the Python source code from a `py::function` -std::string get_source_code(const py::function &func); +/// @brief Fetch the Python source code from a `nanobind::callable` +std::string get_source_code(const nanobind::callable &func); /// @brief Find the variable name for a given Python object handle. It searches /// locally first, walks up the call stack, and finally checks the global /// namespace. If not found, it returns an empty string. -std::string get_var_name_for_handle(const py::handle &h); +std::string get_var_name_for_handle(const nanobind::handle &h); /// @brief Registry for python data classes used in kernels class DataClassRegistry { public: - static std::unordered_map> + static std::unordered_map> classes; /// @brief Register class object - static void registerClass(std::string &name, py::object cls) { - classes[name] = {cls, cls.attr("__annotations__").cast()}; + static void registerClass(std::string &name, nanobind::object cls) { + classes[name] = { + cls, nanobind::cast(cls.attr("__annotations__"))}; } /// @brief Is data class name registered @@ -45,12 +46,12 @@ class DataClassRegistry { } /// @brief Find registered data class object and its attributes - static std::tuple + static std::tuple getClassAttributes(std::string &name) { return classes[name]; } }; -void bindPyDataClassRegistry(py::module &mod); +void bindPyDataClassRegistry(nanobind::module_ &mod); } // namespace cudaq diff --git 
a/python/runtime/cudaq/domains/plugins/CMakeLists.txt b/python/runtime/cudaq/domains/plugins/CMakeLists.txt index 675919e25ca..3bd2e991655 100644 --- a/python/runtime/cudaq/domains/plugins/CMakeLists.txt +++ b/python/runtime/cudaq/domains/plugins/CMakeLists.txt @@ -15,10 +15,12 @@ else() endif() add_library(cudaq-pyscf SHARED PySCFDriver.cpp) +target_compile_options(cudaq-pyscf PRIVATE -Wno-cast-qual) + if (SKBUILD) target_link_libraries(cudaq-pyscf PRIVATE - pybind11::pybind11 Python::Module + nanobind-static Python::Module cudaq-chemistry cudaq-operator cudaq cudaq-py-utils cudaq-platform-default) # Apple's linker (ld64) doesn't support --unresolved-symbols flag if (NOT APPLE) @@ -31,7 +33,7 @@ else() endif() target_link_libraries(cudaq-pyscf PRIVATE - Python::Python pybind11::pybind11 + nanobind-static Python::Python cudaq-chemistry cudaq-operator cudaq cudaq-py-utils cudaq-platform-default) endif() diff --git a/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp b/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp index 1cd6e142a83..8f99b59e231 100644 --- a/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp +++ b/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp @@ -9,20 +9,18 @@ #include "cudaq/domains/chemistry/MoleculePackageDriver.h" #include "cudaq/target_control.h" #include -#include +#include +#include +#include -namespace py = pybind11; using namespace cudaq; namespace { -/// @brief Reference to the pybind11 scoped interpreter -thread_local static std::unique_ptr interp; - -/// @brief Map an OpenFermion QubitOperator represented as a py::object +/// @brief Map an OpenFermion QubitOperator represented as a nanobind::object /// to a CUDA-Q spin_op -spin_op fromOpenFermionQubitOperator(const py::object &op) { - if (!py::hasattr(op, "terms")) +spin_op fromOpenFermionQubitOperator(const nanobind::object &op) { + if (!nanobind::hasattr(op, "terms")) throw std::runtime_error( "This is not an openfermion operator, must have 'terms' attribute."); 
std::map> creatorMap{ @@ -32,20 +30,21 @@ spin_op fromOpenFermionQubitOperator(const py::object &op) { auto terms = op.attr("terms"); auto H = spin_op::empty(); for (auto term : terms) { - auto termTuple = term.cast(); + auto termTuple = nanobind::cast(term); auto localTerm = spin_op::identity(); - for (auto &element : termTuple) { - auto casted = element.cast>(); + for (auto element : termTuple) { + auto casted = + nanobind::cast>(element); localTerm *= creatorMap[casted.second](casted.first); } - H += terms[term].cast() * localTerm; + H += nanobind::cast(terms[term]) * localTerm; } return H; } /// @brief Implement the CUDA-Q MoleculePackageDriver interface /// with support for generating molecular Hamiltonians via PySCF. We -/// achieve this via Pybind11's embedded interpreter capabilities. +/// achieve this via nanobind's Python API wrappers. class PySCFPackageDriver : public MoleculePackageDriver { protected: /// @brief The name of the chemistry python module. @@ -62,82 +61,83 @@ class PySCFPackageDriver : public MoleculePackageDriver { int multiplicity, int charge, std::optional nActiveElectrons = std::nullopt, std::optional nActiveOrbitals = std::nullopt) override { - if (!interp) - interp = std::make_unique(); + if (!Py_IsInitialized()) + Py_Initialize(); // Convert the molecular_geometry to a list[tuple(str,tuple)] - py::list pyGeometry(geometry.size()); - for (std::size_t counter = 0; auto &atom : geometry) { - py::tuple coordinate(3); + nanobind::list pyGeometry; + for (auto &atom : geometry) { + nanobind::object coordinate = nanobind::steal(PyTuple_New(3)); for (int i = 0; i < 3; i++) - coordinate[i] = atom.coordinates[i]; + PyTuple_SET_ITEM(coordinate.ptr(), i, + nanobind::cast(atom.coordinates[i]).release().ptr()); - pyGeometry[counter++] = py::make_tuple(atom.name, coordinate); + pyGeometry.append(nanobind::make_tuple(atom.name, coordinate)); } // We don't want to modify the platform, indicate so cudaq::__internal__::disableTargetModification(); // 
Import the cudaq python chemistry module - auto cudaqModule = py::module_::import(ChemistryModuleName); + auto cudaqModule = nanobind::module_::import_(ChemistryModuleName); // Reset it cudaq::__internal__::enableTargetModification(); // Setup the active space if requested. - py::object nElectrons = py::none(); - py::object nActive = py::none(); + nanobind::object nElectrons = nanobind::none(); + nanobind::object nActive = nanobind::none(); if (nActiveElectrons.has_value()) - nElectrons = py::int_(nActiveElectrons.value()); + nElectrons = nanobind::int_(nActiveElectrons.value()); if (nActiveOrbitals.has_value()) - nActive = py::int_(nActiveOrbitals.value()); + nActive = nanobind::int_(nActiveOrbitals.value()); // Run the openfermion-pyscf wrapper to create the hamiltonian + metadata auto hamiltonianGen = cudaqModule.attr(CreatorFunctionName); - auto resultTuple = hamiltonianGen(pyGeometry, basis, multiplicity, charge, - nElectrons, nActive) - .cast(); + auto resultTuple = nanobind::cast(hamiltonianGen( + pyGeometry, basis, multiplicity, charge, nElectrons, nActive)); // Get the spin_op representation - auto spinOp = fromOpenFermionQubitOperator(resultTuple[0]); + auto spinOp = + fromOpenFermionQubitOperator(nanobind::borrow(resultTuple[0])); // Get the OpenFermion molecule representation - auto openFermionMolecule = resultTuple[1]; + auto openFermionMolecule = nanobind::borrow(resultTuple[1]); // Extract the one-body integrals auto pyOneBody = openFermionMolecule.attr("one_body_integrals"); - auto shape = pyOneBody.attr("shape").cast(); - one_body_integrals oneBody( - {shape[0].cast(), shape[1].cast()}); + auto shape = nanobind::cast(pyOneBody.attr("shape")); + one_body_integrals oneBody({nanobind::cast(shape[0]), + nanobind::cast(shape[1])}); for (std::size_t i = 0; i < oneBody.shape[0]; i++) for (std::size_t j = 0; j < oneBody.shape[1]; j++) - oneBody(i, j) = - pyOneBody.attr("__getitem__")(py::make_tuple(i, j)).cast(); + oneBody(i, j) = nanobind::cast( + 
pyOneBody.attr("__getitem__")(nanobind::make_tuple(i, j))); // Extract the two-body integrals auto pyTwoBody = openFermionMolecule.attr("two_body_integrals"); - shape = pyTwoBody.attr("shape").cast(); - two_body_integals twoBody( - {shape[0].cast(), shape[1].cast(), - shape[2].cast(), shape[3].cast()}); + shape = nanobind::cast(pyTwoBody.attr("shape")); + two_body_integals twoBody({nanobind::cast(shape[0]), + nanobind::cast(shape[1]), + nanobind::cast(shape[2]), + nanobind::cast(shape[3])}); for (std::size_t i = 0; i < twoBody.shape[0]; i++) for (std::size_t j = 0; j < twoBody.shape[1]; j++) for (std::size_t k = 0; k < twoBody.shape[2]; k++) for (std::size_t l = 0; l < twoBody.shape[3]; l++) - twoBody(i, j, k, l) = - pyTwoBody.attr("__getitem__")(py::make_tuple(i, j, k, l)) - .cast(); + twoBody(i, j, k, l) = nanobind::cast(pyTwoBody.attr( + "__getitem__")(nanobind::make_tuple(i, j, k, l))); // return a new molecular_hamiltonian return molecular_hamiltonian{ spinOp, std::move(oneBody), std::move(twoBody), - openFermionMolecule.attr("n_electrons").cast(), - openFermionMolecule.attr("n_orbitals").cast(), - openFermionMolecule.attr("nuclear_repulsion").cast(), - openFermionMolecule.attr("hf_energy").cast(), - openFermionMolecule.attr("fci_energy").cast()}; + nanobind::cast(openFermionMolecule.attr("n_electrons")), + nanobind::cast(openFermionMolecule.attr("n_orbitals")), + nanobind::cast(openFermionMolecule.attr("nuclear_repulsion")), + nanobind::cast(openFermionMolecule.attr("hf_energy")), + nanobind::cast(openFermionMolecule.attr("fci_energy"))}; } }; diff --git a/python/runtime/cudaq/dynamics/CMakeLists.txt b/python/runtime/cudaq/dynamics/CMakeLists.txt index c56a4c3672d..d7910fdf586 100644 --- a/python/runtime/cudaq/dynamics/CMakeLists.txt +++ b/python/runtime/cudaq/dynamics/CMakeLists.txt @@ -7,13 +7,9 @@ # ============================================================================ # find_package(Python COMPONENTS Interpreter Development) -find_package(pybind11 
CONFIG) -pybind11_add_module(nvqir_dynamics_bindings pyDynamics.cpp) -target_include_directories(nvqir_dynamics_bindings PRIVATE - ${PYTHON_INCLUDE_DIRS} - ${pybind11_INCLUDE_DIRS} -) +nanobind_add_module(nvqir_dynamics_bindings NB_STATIC pyDynamics.cpp) + target_include_directories(nvqir_dynamics_bindings PRIVATE ${CMAKE_SOURCE_DIR}/runtime diff --git a/python/runtime/cudaq/dynamics/pyDynamics.cpp b/python/runtime/cudaq/dynamics/pyDynamics.cpp index 5a4ee1380ac..1fdccbedcaa 100644 --- a/python/runtime/cudaq/dynamics/pyDynamics.cpp +++ b/python/runtime/cudaq/dynamics/pyDynamics.cpp @@ -15,10 +15,14 @@ #include "cudaq/algorithms/base_integrator.h" #include "cudaq/algorithms/integrator.h" #include "cudaq/schedule.h" -#include -#include +#include +#include +#include +#include +#include +#include +#include -namespace py = pybind11; namespace { cudaq::CuDensityMatState *asCudmState(cudaq::state &cudaqState) { auto *simState = cudaq::state_helper::getSimulationState(&cudaqState); @@ -30,7 +34,7 @@ cudaq::CuDensityMatState *asCudmState(cudaq::state &cudaqState) { } // namespace // Internal dynamics bindings -PYBIND11_MODULE(nvqir_dynamics_bindings, m) { +NB_MODULE(nvqir_dynamics_bindings, m) { class PyCuDensityMatTimeStepper : public cudaq::CuDensityMatTimeStepper { public: PyCuDensityMatTimeStepper(cudensitymatHandle_t handle, @@ -42,76 +46,80 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { }; // Time stepper bindings - py::class_(m, "TimeStepper") - .def(py::init( - [](cudaq::schedule schedule, std::vector modeExtents, - cudaq::sum_op hamiltonian, - std::vector> collapse_ops, - bool is_master_equation) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian( - {hamiltonian}, {collapse_ops}, - modeExtents, params, is_master_equation); - return 
PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) - .def(py::init([](cudaq::schedule schedule, - std::vector modeExtents, - cudaq::super_op superOp) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = - cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian({superOp}, modeExtents, params); - return PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) - .def(py::init([](cudaq::schedule schedule, - std::vector modeExtents, - const std::vector> - &hamiltonians, - const std::vector< - std::vector>> - &list_collapse_ops, - bool is_master_equation) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = - cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian(hamiltonians, list_collapse_ops, - modeExtents, params, is_master_equation); - return PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) - .def(py::init([](cudaq::schedule schedule, - std::vector modeExtents, - const std::vector &superOps) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = - cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian(superOps, modeExtents, params); - return PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) + nanobind::class_(m, "TimeStepper") + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, + 
cudaq::sum_op hamiltonian, + std::vector> collapse_ops, + bool is_master_equation) { + std::unordered_map> params; + for (const auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian({hamiltonian}, {collapse_ops}, + modeExtents, params, + is_master_equation); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, cudaq::super_op superOp) { + std::unordered_map> params; + for (const auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian({superOp}, modeExtents, params); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, + const std::vector> + &hamiltonians, + const std::vector>> &list_collapse_ops, + bool is_master_equation) { + std::unordered_map> params; + for (const auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian(hamiltonians, list_collapse_ops, + modeExtents, params, + is_master_equation); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, + const std::vector &superOps) { + std::unordered_map> params; + for (const 
auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian(superOps, modeExtents, params); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) .def("compute", [](PyCuDensityMatTimeStepper &self, cudaq::state &inputState, double t) { @@ -124,7 +132,6 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { .def("compute", [](PyCuDensityMatTimeStepper &self, cudaq::state &inputState, double t, cudaq::state &outputState) { - // Compute into the provided output state std::unordered_map> params; for (const auto ¶m : self.m_schedule.get_parameters()) { params[param] = self.m_schedule.get_value_function()(param, t); @@ -151,24 +158,26 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { }); // System dynamics data class - py::class_(m, "SystemDynamics") - .def(py::init<>()) - .def_readwrite("modeExtents", &cudaq::SystemDynamics::modeExtents) - .def_readwrite("hamiltonian", &cudaq::SystemDynamics::hamiltonian) - .def_readwrite("collapseOps", &cudaq::SystemDynamics::collapseOps) - .def_readwrite("parameters", &cudaq::SystemDynamics::parameters) - .def_readwrite("superOp", &cudaq::SystemDynamics::superOp); + nanobind::class_(m, "SystemDynamics") + .def(nanobind::init<>()) + .def_rw("modeExtents", &cudaq::SystemDynamics::modeExtents) + .def_rw("hamiltonian", &cudaq::SystemDynamics::hamiltonian) + .def_rw("collapseOps", &cudaq::SystemDynamics::collapseOps) + .def_rw("parameters", &cudaq::SystemDynamics::parameters) + .def_rw("superOp", &cudaq::SystemDynamics::superOp); // Expectation calculation - py::class_(m, "CuDensityMatExpectation") - .def(py::init([](cudaq::sum_op &obs, - const std::vector &modeExtents) { - return cudaq::CuDensityMatExpectation( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - 
cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .convertToCudensitymatOperator({}, obs, modeExtents)); - })) + nanobind::class_(m, "CuDensityMatExpectation") + .def("__init__", + [](cudaq::CuDensityMatExpectation *self, + cudaq::sum_op &obs, + const std::vector &modeExtents) { + new (self) cudaq::CuDensityMatExpectation( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .convertToCudensitymatOperator({}, obs, modeExtents)); + }) .def("prepare", [](cudaq::CuDensityMatExpectation &self, cudaq::state &state) { auto *cudmState = asCudmState(state); @@ -187,9 +196,9 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { }); // Schedule class - py::class_(m, "Schedule") - .def(py::init &, - const std::vector &>()); + nanobind::class_(m, "Schedule") + .def(nanobind::init &, + const std::vector &>()); // Helper to initialize a data buffer state m.def("initializeState", @@ -287,23 +296,24 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { return cudaq::__internal__::checkBatchingCompatibility(hamOps, listCollapseOps); }, - py::arg("hamiltonians"), py::arg("collapse_operators")); + nanobind::arg("hamiltonians"), nanobind::arg("collapse_operators")); m.def( "checkSuperOpBatchingCompatibility", [](const std::vector &super_operators) { return cudaq::__internal__::checkBatchingCompatibility(super_operators); }, - py::arg("super_operators")); + nanobind::arg("super_operators")); auto integratorsSubmodule = m.def_submodule("integrators"); // Runge-Kutta integrator - py::class_(integratorsSubmodule, - "runge_kutta") - .def(py::init>(), py::kw_only(), - py::arg("order") = cudaq::integrators::runge_kutta::default_order, - py::arg("max_step_size") = py::none()) + nanobind::class_(integratorsSubmodule, + "runge_kutta") + .def(nanobind::init>(), nanobind::kw_only(), + nanobind::arg("order") = + cudaq::integrators::runge_kutta::default_order, + nanobind::arg("max_step_size") = 
nanobind::none()) .def("setState", [](cudaq::integrators::runge_kutta &self, cudaq::state &state, double t) { self.setState(state, t); }) diff --git a/python/runtime/cudaq/operators/py_boson_op.cpp b/python/runtime/cudaq/operators/py_boson_op.cpp index 5a23d29a9af..6df75bd5a27 100644 --- a/python/runtime/cudaq/operators/py_boson_op.cpp +++ b/python/runtime/cudaq/operators/py_boson_op.cpp @@ -7,10 +7,17 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -19,7 +26,7 @@ namespace cudaq { -void bindBosonModule(py::module &mod) { +void bindBosonModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::boson` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. auto boson_submodule = mod.def_submodule("boson"); @@ -32,31 +39,32 @@ void bindBosonModule(py::module &mod) { "Returns product operator with constant value 1."); boson_submodule.def( "identity", [](std::size_t target) { return boson_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); boson_submodule.def( "identities", [](std::size_t first, std::size_t last) { return boson_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); boson_submodule.def( - "create", &boson_op::create, py::arg("target"), + "create", &boson_op::create, nanobind::arg("target"), "Returns a bosonic creation operator on the given target index."); boson_submodule.def( - "annihilate", &boson_op::annihilate, py::arg("target"), + "annihilate", &boson_op::annihilate, + 
nanobind::arg("target"), "Returns a bosonic annihilation operator on the given target index."); boson_submodule.def( - "number", &boson_op::number, py::arg("target"), + "number", &boson_op::number, nanobind::arg("target"), "Returns a bosonic number operator on the given target index."); boson_submodule.def( - "position", &boson_op::position, py::arg("target"), + "position", &boson_op::position, nanobind::arg("target"), "Returns a bosonic position operator on the given target index."); boson_submodule.def( - "momentum", &boson_op::momentum, py::arg("target"), + "momentum", &boson_op::momentum, nanobind::arg("target"), "Returns a bosonic momentum operator on the given target index."); boson_submodule.def( "canonicalized", @@ -90,50 +98,52 @@ void bindBosonModule(py::module &mod) { "degrees of freedom."); } -void bindBosonOperator(py::module &mod) { +void bindBosonOperator(nanobind::module_ &mod) { - auto boson_op_class = py::class_(mod, "BosonOperator"); + auto boson_op_class = nanobind::class_(mod, "BosonOperator"); auto boson_op_term_class = - py::class_(mod, "BosonOperatorTerm"); + nanobind::class_(mod, "BosonOperatorTerm"); boson_op_class .def( "__iter__", [](boson_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &boson_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &boson_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. 
" - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &boson_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &boson_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &boson_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &boson_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &boson_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &boson_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &boson_op::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &boson_op::num_terms, + "Returns the number of terms in the operator.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. 
A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -142,12 +152,12 @@ void bindBosonOperator(py::module &mod) { "identity. To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with no terms, reserving " "space for the given number of terms.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") .def( "copy", [](const boson_op &self) { return boson_op(self); }, "Creates a copy of the operator.") @@ -161,9 +171,9 @@ void bindBosonOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -173,13 +183,13 @@ void bindBosonOperator(py::module &mod) { .def( "to_matrix", [](const boson_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." 
"The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -192,9 +202,9 @@ void bindBosonOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -208,12 +218,12 @@ void bindBosonOperator(py::module &mod) { .def( "to_sparse_matrix", [](const boson_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -227,7 +237,7 @@ void bindBosonOperator(py::module &mod) { // comparisons - .def("__eq__", &boson_op::operator==, py::is_operator(), + .def("__eq__", &boson_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. 
Operators acting on different " @@ -239,91 +249,91 @@ void bindBosonOperator(py::module &mod) { [](const boson_op &self, const boson_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= boson_op_term(), py::is_operator()) - .def(py::self += boson_op_term(), py::is_operator()) - .def(py::self -= boson_op_term(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), 
nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= boson_op_term(), nanobind::is_operator()) + .def(nanobind::self += boson_op_term(), nanobind::is_operator()) + .def(nanobind::self -= boson_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), 
py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * boson_op_term(), py::is_operator()) - .def(py::self + boson_op_term(), py::is_operator()) - .def(py::self - boson_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * boson_op_term(), nanobind::is_operator()) + .def(nanobind::self + boson_op_term(), nanobind::is_operator()) + .def(nanobind::self - boson_op_term(), nanobind::is_operator()) + 
.def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -351,17 +361,17 @@ void 
bindBosonOperator(py::module &mod) { .def("dump", &boson_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &boson_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = parameter_map(), + .def("trim", &boson_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](boson_op &self, double tol, const py::kwargs &kwargs) { + [](boson_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -386,42 +396,44 @@ void bindBosonOperator(py::module &mod) { .def( "__iter__", [](boson_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &boson_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &boson_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &boson_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &boson_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &boson_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &boson_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &boson_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &boson_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &boson_op_term::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &boson_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( "term_id", &boson_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &boson_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -429,30 +441,32 @@ void bindBosonOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") - .def(py::init(), + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return boson_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](boson_op_term *self, const scalar_operator &scalar) { + new (self) boson_op_term(boson_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init(), py::arg("operator"), - py::arg("size") = 0, + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") @@ -463,7 +477,7 @@ void bindBosonOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &boson_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -474,9 +488,9 @@ void bindBosonOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -486,13 +500,13 @@ void bindBosonOperator(py::module &mod) { .def( "to_matrix", [](const boson_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -505,9 +519,9 @@ void bindBosonOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -521,12 +535,12 @@ void bindBosonOperator(py::module &mod) { .def( "to_sparse_matrix", [](const boson_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -540,7 +554,7 @@ void bindBosonOperator(py::module &mod) { // comparisons - .def("__eq__", &boson_op_term::operator==, py::is_operator(), + .def("__eq__", &boson_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. Operators acting on different " @@ -552,77 +566,78 @@ void bindBosonOperator(py::module &mod) { [](const boson_op_term &self, const boson_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= 
nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * boson_op(), py::is_operator()) - .def(py::self + boson_op(), py::is_operator()) - .def(py::self - boson_op(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + 
.def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * boson_op(), nanobind::is_operator()) + .def(nanobind::self + boson_op(), nanobind::is_operator()) + .def(nanobind::self - boson_op(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, 
nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &boson_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const boson_op_term &self) { return self.to_string(); }, "Returns the string representation of the operator.") @@ -645,12 +660,12 @@ void bindBosonOperator(py::module &mod) { "of freedom that are not included in the given set."); } -void bindBosonWrapper(py::module &mod) { +void bindBosonWrapper(nanobind::module_ &mod) { bindBosonOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, boson_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, boson_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindBosonModule(mod); } diff --git a/python/runtime/cudaq/operators/py_boson_op.h b/python/runtime/cudaq/operators/py_boson_op.h index 7952fcab9cf..7f74e49cbc0 100644 --- a/python/runtime/cudaq/operators/py_boson_op.h +++ b/python/runtime/cudaq/operators/py_boson_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of bosonic /// operators to python. -void bindBosonWrapper(py::module &mod); +void bindBosonWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_fermion_op.cpp b/python/runtime/cudaq/operators/py_fermion_op.cpp index 7c0e315d6f5..621f39c873f 100644 --- a/python/runtime/cudaq/operators/py_fermion_op.cpp +++ b/python/runtime/cudaq/operators/py_fermion_op.cpp @@ -7,10 +7,18 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -19,7 +27,7 @@ namespace cudaq { -void bindFermionModule(py::module &mod) { +void bindFermionModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::fermion` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. 
auto fermion_submodule = mod.def_submodule("fermion"); @@ -33,25 +41,26 @@ void bindFermionModule(py::module &mod) { fermion_submodule.def( "identity", [](std::size_t target) { return fermion_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); fermion_submodule.def( "identities", [](std::size_t first, std::size_t last) { return fermion_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); fermion_submodule.def( - "create", &fermion_op::create, py::arg("target"), + "create", &fermion_op::create, nanobind::arg("target"), "Returns a fermionic creation operator on the given target index."); fermion_submodule.def( - "annihilate", &fermion_op::annihilate, py::arg("target"), + "annihilate", &fermion_op::annihilate, + nanobind::arg("target"), "Returns a fermionic annihilation operator on the given target index."); fermion_submodule.def( - "number", &fermion_op::number, py::arg("target"), + "number", &fermion_op::number, nanobind::arg("target"), "Returns a fermionic number operator on the given target index."); fermion_submodule.def( "canonicalized", @@ -85,50 +94,52 @@ void bindFermionModule(py::module &mod) { "degrees of freedom."); } -void bindFermionOperator(py::module &mod) { +void bindFermionOperator(nanobind::module_ &mod) { - auto fermion_op_class = py::class_(mod, "FermionOperator"); + auto fermion_op_class = nanobind::class_(mod, "FermionOperator"); auto fermion_op_term_class = - py::class_(mod, "FermionOperatorTerm"); + nanobind::class_(mod, "FermionOperatorTerm"); fermion_op_class .def( "__iter__", [](fermion_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), 
"Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &fermion_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &fermion_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &fermion_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &fermion_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &fermion_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &fermion_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &fermion_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &fermion_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &fermion_op::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &fermion_op::num_terms, + "Returns the number of terms in the operator.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -137,12 +148,12 @@ void bindFermionOperator(py::module &mod) { "identity. To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with no terms, reserving " "space for the given number of terms.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") .def( "copy", [](const fermion_op &self) { return fermion_op(self); }, "Creates a copy of the operator.") @@ -156,9 +167,9 @@ void bindFermionOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -168,13 +179,13 @@ void bindFermionOperator(py::module &mod) { .def( "to_matrix", [](const fermion_op &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -187,9 +198,9 @@ void bindFermionOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -203,12 +214,12 @@ void bindFermionOperator(py::module &mod) { .def( "to_sparse_matrix", [](const fermion_op &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -222,7 +233,7 @@ void bindFermionOperator(py::module &mod) { // comparisons - .def("__eq__", &fermion_op::operator==, py::is_operator(), + .def("__eq__", &fermion_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. Operators acting on different " @@ -234,91 +245,91 @@ void bindFermionOperator(py::module &mod) { [](const fermion_op &self, const fermion_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= fermion_op_term(), py::is_operator()) - .def(py::self += fermion_op_term(), py::is_operator()) - .def(py::self -= fermion_op_term(), py::is_operator()) - 
.def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self += fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self -= fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - 
.def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * fermion_op_term(), py::is_operator()) - .def(py::self + fermion_op_term(), py::is_operator()) - .def(py::self - fermion_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / 
scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self + fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self - fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, 
nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -346,17 +357,17 @@ void bindFermionOperator(py::module &mod) { .def("dump", &fermion_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &fermion_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = parameter_map(), + .def("trim", &fermion_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](fermion_op &self, double tol, const py::kwargs &kwargs) { + [](fermion_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -381,42 +392,44 @@ void bindFermionOperator(py::module &mod) { .def( "__iter__", [](fermion_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &fermion_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", 
&fermion_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &fermion_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &fermion_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &fermion_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &fermion_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &fermion_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &fermion_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &fermion_op_term::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &fermion_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( "term_id", &fermion_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &fermion_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -424,30 +437,32 @@ void bindFermionOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") - .def(py::init(), + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return fermion_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](fermion_op_term *self, const scalar_operator &scalar) { + new (self) fermion_op_term(fermion_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init(), - py::arg("operator"), py::arg("size") = 0, + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") @@ -459,7 +474,7 @@ void bindFermionOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &fermion_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -470,9 +485,9 @@ void bindFermionOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -482,13 +497,13 @@ void bindFermionOperator(py::module &mod) { .def( "to_matrix", [](const fermion_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -501,9 +516,9 @@ void bindFermionOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -517,12 +532,12 @@ void bindFermionOperator(py::module &mod) { .def( "to_sparse_matrix", [](const fermion_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -536,7 +551,7 @@ void bindFermionOperator(py::module &mod) { // comparisons - .def("__eq__", &fermion_op_term::operator==, py::is_operator(), + .def("__eq__", &fermion_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. Operators acting on different " @@ -548,77 +563,78 @@ void bindFermionOperator(py::module &mod) { [](const fermion_op_term &self, const fermion_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= 
nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * fermion_op(), py::is_operator()) - .def(py::self + fermion_op(), py::is_operator()) - .def(py::self - fermion_op(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), 
nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * fermion_op(), nanobind::is_operator()) + .def(nanobind::self + fermion_op(), nanobind::is_operator()) + .def(nanobind::self - fermion_op(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) 
+ .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &fermion_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const fermion_op_term &self) { return self.to_string(); }, @@ -642,12 +658,12 @@ void bindFermionOperator(py::module &mod) { "of freedom that are not included in the given set."); } -void bindFermionWrapper(py::module &mod) { +void bindFermionWrapper(nanobind::module_ &mod) { bindFermionOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, fermion_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, fermion_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindFermionModule(mod); } diff --git a/python/runtime/cudaq/operators/py_fermion_op.h b/python/runtime/cudaq/operators/py_fermion_op.h index b54e406267a..45dbb8015d2 100644 --- a/python/runtime/cudaq/operators/py_fermion_op.h +++ b/python/runtime/cudaq/operators/py_fermion_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of fermionic /// operators to python. -void bindFermionWrapper(py::module &mod); +void bindFermionWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_handlers.cpp b/python/runtime/cudaq/operators/py_handlers.cpp index 71ca25a15aa..e8c2147e92b 100644 --- a/python/runtime/cudaq/operators/py_handlers.cpp +++ b/python/runtime/cudaq/operators/py_handlers.cpp @@ -7,11 +7,14 @@ ******************************************************************************/ #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "py_handlers.h" @@ -19,53 +22,56 @@ namespace cudaq { -void bindPauli(py::module mod) { - py::enum_(mod, "Pauli", - "An enumeration representing the types of Pauli matrices.") +void bindPauli(nanobind::module_ mod) { + nanobind::enum_( + mod, "Pauli", "An enumeration representing the types of Pauli matrices.") .value("X", pauli::X) .value("Y", pauli::Y) .value("Z", pauli::Z) .value("I", pauli::I); } -void bindOperatorHandlers(py::module &mod) { +void bindOperatorHandlers(nanobind::module_ &mod) { using matrix_callback = std::function &, const parameter_map &)>; - py::class_(mod, "MatrixOperatorElement") - .def_property_readonly( + nanobind::class_(mod, "MatrixOperatorElement") + .def_prop_ro( "id", [](const matrix_handler &self) { return self.to_string(false); }, "Returns the id used to define and instantiate the operator.") - .def_property_readonly("degrees", &matrix_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def_property_readonly("parameters", - &matrix_handler::get_parameter_descriptions, - "Returns a dictionary that maps each 
parameter " - "name to its description.") - .def_property_readonly("expected_dimensions", - &matrix_handler::get_expected_dimensions, - "The number of levels, that is the dimension, for " - "each degree of freedom " - "in canonical order that the operator acts on. A " - "value of zero or less " - "indicates that the operator is defined for any " - "dimension of that degree.") - .def(py::init(), + .def_prop_ro("degrees", &matrix_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def_prop_ro("parameters", &matrix_handler::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("expected_dimensions", + &matrix_handler::get_expected_dimensions, + "The number of levels, that is the dimension, for " + "each degree of freedom " + "in canonical order that the operator acts on. A " + "value of zero or less " + "indicates that the operator is defined for any " + "dimension of that degree.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init([](std::string operator_id, - std::vector degrees) { - return matrix_handler(std::move(operator_id), std::move(degrees)); - }), - py::arg("id"), py::arg("degrees"), - "Creates the matrix operator with the given id acting on the given " - "degrees of " - "freedom. Throws a runtime exception if no operator with that id " - "has been defined.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &matrix_handler::operator==, py::is_operator()) - .def("to_string", &matrix_handler::to_string, py::arg("include_degrees"), + .def( + "__init__", + [](matrix_handler *self, std::string operator_id, + std::vector degrees) { + new (self) + matrix_handler(std::move(operator_id), std::move(degrees)); + }, + nanobind::arg("id"), nanobind::arg("degrees"), + "Creates the matrix operator with the given id acting on the given " + "degrees of " + "freedom. 
Throws a runtime exception if no operator with that id " + "has been defined.") + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &matrix_handler::operator==, nanobind::is_operator()) + .def("to_string", &matrix_handler::to_string, + nanobind::arg("include_degrees"), "Returns the string representation of the operator.") .def( "to_matrix", @@ -74,18 +80,19 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const matrix_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator.") // tools for custom operators @@ -93,12 +100,12 @@ void bindOperatorHandlers(py::module &mod) { "_define", [](std::string operator_id, std::vector expected_dimensions, const matrix_callback &func, bool overwrite, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { // we need to make sure the python function that is stored in // the static dictionary containing the operator definitions // is properly cleaned up - otherwise python will hang on exit... 
- auto atexit = py::module_::import("atexit"); - atexit.attr("register")(py::cpp_function([operator_id]() { + auto atexit = nanobind::module_::import_("atexit"); + atexit.attr("register")(nanobind::cpp_function([operator_id]() { matrix_handler::remove_definition(operator_id); })); if (overwrite) @@ -107,24 +114,25 @@ void bindOperatorHandlers(py::module &mod) { std::move(operator_id), std::move(expected_dimensions), func, details::kwargs_to_param_description(kwargs)); }, - py::arg("operator_id"), py::arg("expected_dimensions"), - py::arg("callback"), py::arg("overwrite") = false, + nanobind::arg("operator_id"), nanobind::arg("expected_dimensions"), + nanobind::arg("callback"), nanobind::arg("overwrite") = false, + nanobind::arg("kwargs"), "Defines a matrix operator with the given name and dimensions whose" "matrix representation can be obtained by invoking the given " "callback function."); - py::class_(mod, "BosonOperatorElement") - .def_property_readonly( - "target", &boson_handler::target, - "Returns the degree of freedom that the operator targets.") - .def_property_readonly("degrees", &boson_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def(py::init(), + nanobind::class_(mod, "BosonOperatorElement") + .def_prop_ro("target", &boson_handler::target, + "Returns the degree of freedom that the operator targets.") + .def_prop_ro("degrees", &boson_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &boson_handler::operator==, py::is_operator()) - .def("to_string", &boson_handler::to_string, py::arg("include_degrees"), + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &boson_handler::operator==, nanobind::is_operator()) + .def("to_string", &boson_handler::to_string, + nanobind::arg("include_degrees"), 
"Returns the string representation of the operator.") .def( "to_matrix", @@ -133,32 +141,33 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const boson_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); - py::class_(mod, "FermionOperatorElement") - .def_property_readonly( - "target", &fermion_handler::target, - "Returns the degree of freedom that the operator targets.") - .def_property_readonly("degrees", &fermion_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def(py::init(), + nanobind::class_(mod, "FermionOperatorElement") + .def_prop_ro("target", &fermion_handler::target, + "Returns the degree of freedom that the operator targets.") + .def_prop_ro("degrees", &fermion_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &fermion_handler::operator==, py::is_operator()) - .def("to_string", &fermion_handler::to_string, py::arg("include_degrees"), + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &fermion_handler::operator==, nanobind::is_operator()) + .def("to_string", &fermion_handler::to_string, + 
nanobind::arg("include_degrees"), "Returns the string representation of the operator.") .def( "to_matrix", @@ -167,34 +176,35 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const fermion_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); - py::class_(mod, "SpinOperatorElement") - .def_property_readonly( - "target", &spin_handler::target, - "Returns the degree of freedom that the operator targets.") - .def_property_readonly("degrees", &spin_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def(py::init(), + nanobind::class_(mod, "SpinOperatorElement") + .def_prop_ro("target", &spin_handler::target, + "Returns the degree of freedom that the operator targets.") + .def_prop_ro("degrees", &spin_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &spin_handler::operator==, py::is_operator()) + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &spin_handler::operator==, nanobind::is_operator()) .def("as_pauli", &spin_handler::as_pauli, "Returns the Pauli representation of the operator.") - .def("to_string", &spin_handler::to_string, 
py::arg("include_degrees"), + .def("to_string", &spin_handler::to_string, + nanobind::arg("include_degrees"), "Returns the string representation of the operator.") .def( "to_matrix", @@ -203,22 +213,23 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const spin_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); } -void bindHandlersWrapper(py::module &mod) { +void bindHandlersWrapper(nanobind::module_ &mod) { bindPauli(mod); bindOperatorHandlers(mod); } diff --git a/python/runtime/cudaq/operators/py_handlers.h b/python/runtime/cudaq/operators/py_handlers.h index 3bcde5ad205..cd82dd92e44 100644 --- a/python/runtime/cudaq/operators/py_handlers.h +++ b/python/runtime/cudaq/operators/py_handlers.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of /// operator handlers to python. 
-void bindHandlersWrapper(py::module &mod); +void bindHandlersWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_helpers.cpp b/python/runtime/cudaq/operators/py_helpers.cpp index aecc5811074..b34212bce6e 100644 --- a/python/runtime/cudaq/operators/py_helpers.cpp +++ b/python/runtime/cudaq/operators/py_helpers.cpp @@ -8,18 +8,19 @@ #include "py_helpers.h" #include "cudaq/operators.h" +#include #include -#include -#include -#include +#include +#include +#include namespace cudaq::details { -cudaq::parameter_map kwargs_to_param_map(const py::kwargs &kwargs) { +cudaq::parameter_map kwargs_to_param_map(const nanobind::kwargs &kwargs) { cudaq::parameter_map params; - for (auto &[keyPy, valuePy] : kwargs) { - std::string key = py::str(keyPy); - std::complex value = valuePy.cast>(); + for (auto [keyPy, valuePy] : kwargs) { + std::string key = nanobind::str(keyPy).c_str(); + std::complex value = nanobind::cast>(valuePy); params.insert(params.end(), std::pair>(key, value)); } @@ -27,29 +28,34 @@ cudaq::parameter_map kwargs_to_param_map(const py::kwargs &kwargs) { } std::unordered_map -kwargs_to_param_description(const py::kwargs &kwargs) { +kwargs_to_param_description(const nanobind::kwargs &kwargs) { std::unordered_map param_desc; - for (auto &[keyPy, valuePy] : kwargs) { - std::string key = py::str(keyPy); - std::string value = py::str(valuePy); + for (auto [keyPy, valuePy] : kwargs) { + std::string key = nanobind::str(keyPy).c_str(); + std::string value = nanobind::str(valuePy).c_str(); param_desc.insert(param_desc.end(), std::pair(key, value)); } return param_desc; } -py::array_t> cmat_to_numpy(complex_matrix &cmat) { +nanobind::ndarray> +cmat_to_numpy(complex_matrix &cmat) { auto rows = cmat.rows(); auto cols = cmat.cols(); - auto data = cmat.get_data(complex_matrix::order::row_major); - std::vector shape = {static_cast(rows), - static_cast(cols)}; - std::vector strides = { - static_cast(sizeof(std::complex) * cols), - 
static_cast(sizeof(std::complex))}; - - // Return a numpy array without copying data - return py::array_t>(shape, strides, data); -}; + auto *src = cmat.get_data(complex_matrix::order::row_major); + std::size_t n = rows * cols; + std::size_t shape[2] = {rows, cols}; + + auto *copy = new std::complex[n]; + std::copy(src, src + n, copy); + + nanobind::capsule owner(copy, [](void *p) noexcept { + delete[] static_cast *>(p); + }); + + return nanobind::ndarray>(copy, 2, + shape, owner); +} } // namespace cudaq::details diff --git a/python/runtime/cudaq/operators/py_helpers.h b/python/runtime/cudaq/operators/py_helpers.h index 33b7463ae9a..e712281784f 100644 --- a/python/runtime/cudaq/operators/py_helpers.h +++ b/python/runtime/cudaq/operators/py_helpers.h @@ -7,14 +7,13 @@ ******************************************************************************/ #include "cudaq/operators.h" -#include -#include - -namespace py = pybind11; +#include +#include namespace cudaq::details { -cudaq::parameter_map kwargs_to_param_map(const py::kwargs &kwargs); +cudaq::parameter_map kwargs_to_param_map(const nanobind::kwargs &kwargs); std::unordered_map -kwargs_to_param_description(const py::kwargs &kwargs); -py::array_t> cmat_to_numpy(complex_matrix &cmat); +kwargs_to_param_description(const nanobind::kwargs &kwargs); +nanobind::ndarray> +cmat_to_numpy(complex_matrix &cmat); } // namespace cudaq::details diff --git a/python/runtime/cudaq/operators/py_matrix.cpp b/python/runtime/cudaq/operators/py_matrix.cpp index 6b2828c2973..48d37891e7f 100644 --- a/python/runtime/cudaq/operators/py_matrix.cpp +++ b/python/runtime/cudaq/operators/py_matrix.cpp @@ -6,10 +6,12 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators/matrix.h" #include "py_helpers.h" @@ -19,44 +21,25 @@ namespace cudaq { -/// @brief Extract the array data from a buffer_info into our -/// own allocated data pointer. -void extractMatrixData(py::buffer_info &info, std::complex *data) { - if (info.format != py::format_descriptor>::format()) - throw std::runtime_error( - "Incompatible buffer format, must be np.complex128."); - - if (info.ndim != 2) - throw std::runtime_error("Incompatible buffer shape."); - - memcpy(data, info.ptr, - sizeof(std::complex) * (info.shape[0] * info.shape[1])); -} - -void bindComplexMatrix(py::module &mod) { - py::class_( - mod, "ComplexMatrix", py::buffer_protocol(), +void bindComplexMatrix(nanobind::module_ &mod) { + nanobind::class_( + mod, "ComplexMatrix", "The :class:`ComplexMatrix` is a thin wrapper around a " "matrix of complex elements.") - /// The following makes this fully compatible with NumPy - .def_buffer([](complex_matrix &op) -> py::buffer_info { - return py::buffer_info( - op.get_data(complex_matrix::order::row_major), - sizeof(std::complex), - py::format_descriptor>::format(), 2, - {op.rows(), op.cols()}, - {sizeof(std::complex) * op.cols(), - sizeof(std::complex)}); - }) - .def(py::init([](const py::buffer &b) { - py::buffer_info info = b.request(); - complex_matrix m(info.shape[0], info.shape[1]); - extractMatrixData(info, - m.get_data(complex_matrix::order::row_major)); - return m; - }), - "Create a :class:`ComplexMatrix` from a buffer of data, such as a " - "numpy.ndarray.") + .def( + "__init__", + [](complex_matrix *self, + nanobind::ndarray, nanobind::ndim<2>, + nanobind::c_contig, nanobind::numpy> + arr) { + auto rows = arr.shape(0); + auto cols = arr.shape(1); + new (self) complex_matrix(rows, cols); + memcpy(self->get_data(complex_matrix::order::row_major), 
arr.data(), + sizeof(std::complex) * rows * cols); + }, + "Create a :class:`ComplexMatrix` from a buffer of data, such as a " + "numpy.ndarray.") .def( "num_rows", [](complex_matrix &m) { return m.rows(); }, "Returns the number of rows in the matrix.") @@ -85,7 +68,7 @@ void bindComplexMatrix(py::module &mod) { [](const complex_matrix &lhs, const complex_matrix &rhs) { return lhs == rhs; }, - py::is_operator()) + nanobind::is_operator()) .def("__str__", &complex_matrix::to_string, "Returns the string representation of the matrix.") .def( diff --git a/python/runtime/cudaq/operators/py_matrix.h b/python/runtime/cudaq/operators/py_matrix.h index 022a74fdbf8..baf93260e9e 100644 --- a/python/runtime/cudaq/operators/py_matrix.h +++ b/python/runtime/cudaq/operators/py_matrix.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of `cudaq::complex_matrix` /// to python. 
-void bindComplexMatrix(py::module &mod); +void bindComplexMatrix(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_matrix_op.cpp b/python/runtime/cudaq/operators/py_matrix_op.cpp index 187ab99f746..3883f86c9bd 100644 --- a/python/runtime/cudaq/operators/py_matrix_op.cpp +++ b/python/runtime/cudaq/operators/py_matrix_op.cpp @@ -7,10 +7,15 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -19,7 +24,7 @@ namespace cudaq { -void bindOperatorsModule(py::module &mod) { +void bindOperatorsModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::operators` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. auto operators_submodule = mod.def_submodule("operators"); @@ -33,34 +38,34 @@ void bindOperatorsModule(py::module &mod) { operators_submodule.def( "identity", [](std::size_t target) { return matrix_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); operators_submodule.def( "identities", [](std::size_t first, std::size_t last) { return matrix_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); operators_submodule.def( - "number", &matrix_op::number, py::arg("target"), + "number", &matrix_op::number, nanobind::arg("target"), "Returns a number operator on the given target index."); operators_submodule.def( - "parity", &matrix_op::parity, py::arg("target"), + "parity", &matrix_op::parity, nanobind::arg("target"), "Returns a parity operator on the given target index."); 
operators_submodule.def( - "position", &matrix_op::position, py::arg("target"), + "position", &matrix_op::position, nanobind::arg("target"), "Returns a position operator on the given target index."); operators_submodule.def( - "momentum", &matrix_op::momentum, py::arg("target"), + "momentum", &matrix_op::momentum, nanobind::arg("target"), "Returns a momentum operator on the given target index."); operators_submodule.def( - "squeeze", &matrix_op::squeeze, py::arg("target"), + "squeeze", &matrix_op::squeeze, nanobind::arg("target"), "Returns a squeezing operator on the given target index."); operators_submodule.def( - "displace", &matrix_op::displace, py::arg("target"), + "displace", &matrix_op::displace, nanobind::arg("target"), "Returns a displacement operator on the given target index."); operators_submodule.def( "canonicalized", @@ -94,41 +99,43 @@ void bindOperatorsModule(py::module &mod) { "degrees of freedom."); } -void bindMatrixOperator(py::module &mod) { +void bindMatrixOperator(nanobind::module_ &mod) { - auto matrix_op_class = py::class_(mod, "MatrixOperator"); + auto matrix_op_class = nanobind::class_(mod, "MatrixOperator"); auto matrix_op_term_class = - py::class_(mod, "MatrixOperatorTerm"); + nanobind::class_(mod, "MatrixOperatorTerm"); matrix_op_class .def( "__iter__", [](matrix_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &matrix_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &matrix_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def_property_readonly("min_degree", 
&matrix_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &matrix_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &matrix_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &matrix_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &matrix_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def_prop_ro("min_degree", &matrix_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &matrix_op::max_degree, + "Returns the largest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &matrix_op::num_terms, + "Returns the number of terms in the operator.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -137,15 +144,15 @@ void bindMatrixOperator(py::module &mod) { "identity. 
To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with no terms, reserving " "space for the given number of terms.") - .def(py::init()) - .def(py::init()) - .def(py::init()) - .def(py::init(), + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") .def( "copy", [](const matrix_op &self) { return matrix_op(self); }, "Creates a copy of the operator.") @@ -159,9 +166,9 @@ void bindMatrixOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -172,13 +179,13 @@ void bindMatrixOperator(py::module &mod) { .def( "to_matrix", [](const matrix_op &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." 
"The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -188,7 +195,7 @@ void bindMatrixOperator(py::module &mod) { // comparisons - .def("__eq__", &matrix_op::operator==, py::is_operator(), + .def("__eq__", &matrix_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "into account that addition is commutative and so is multiplication " @@ -202,85 +209,85 @@ void bindMatrixOperator(py::module &mod) { [](const matrix_op &self, const matrix_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= matrix_op_term(), py::is_operator()) - .def(py::self += matrix_op_term(), py::is_operator()) - .def(py::self -= 
matrix_op_term(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self += matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self -= matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - 
int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), 
nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -308,17 +315,17 @@ void bindMatrixOperator(py::module &mod) { .def("dump", &matrix_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &matrix_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = 
parameter_map(), + .def("trim", &matrix_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](matrix_op &self, double tol, const py::kwargs &kwargs) { + [](matrix_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -343,42 +350,44 @@ void bindMatrixOperator(py::module &mod) { .def( "__iter__", [](matrix_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &matrix_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &matrix_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &matrix_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &matrix_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &matrix_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &matrix_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &matrix_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &matrix_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &matrix_op_term::max_degree, + "Returns the largest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &matrix_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( "term_id", &matrix_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &matrix_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -386,33 +395,35 @@ void bindMatrixOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") - .def(py::init(), + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return matrix_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](matrix_op_term *self, const scalar_operator &scalar) { + new (self) matrix_op_term(matrix_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init()) - .def(py::init()) - .def(py::init()) - .def(py::init(), py::arg("operator"), - py::arg("size") = 0, + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") @@ -424,7 +435,7 @@ void bindMatrixOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &matrix_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -435,9 +446,9 @@ void bindMatrixOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -447,13 +458,13 @@ void bindMatrixOperator(py::module &mod) { .def( "to_matrix", [](const matrix_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -463,7 +474,7 @@ void bindMatrixOperator(py::module &mod) { // comparisons - .def("__eq__", &matrix_op_term::operator==, py::is_operator(), + .def("__eq__", &matrix_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. 
The equivalence " "check takes " "into account that multiplication of operators that act on " @@ -476,71 +487,72 @@ void bindMatrixOperator(py::module &mod) { [](const matrix_op_term &self, const matrix_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), 
py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + 
.def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &matrix_op_term::is_identity, "Checks if all operators in the product are the identity. 
" - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const matrix_op_term &self) { return self.to_string(); }, @@ -564,18 +576,18 @@ void bindMatrixOperator(py::module &mod) { "of freedom that are not included in the given set."); } -void bindOperatorsWrapper(py::module &mod) { +void bindOperatorsWrapper(nanobind::module_ &mod) { bindMatrixOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, matrix_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, matrix_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindOperatorsModule(mod); } diff --git a/python/runtime/cudaq/operators/py_matrix_op.h b/python/runtime/cudaq/operators/py_matrix_op.h index 97b154b720f..28df05d8efb 100644 --- a/python/runtime/cudaq/operators/py_matrix_op.h +++ b/python/runtime/cudaq/operators/py_matrix_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of matrix /// operators to python. 
-void bindOperatorsWrapper(py::module &mod); +void bindOperatorsWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_scalar_op.cpp b/python/runtime/cudaq/operators/py_scalar_op.cpp index fc92a836551..1ed437dc316 100644 --- a/python/runtime/cudaq/operators/py_scalar_op.cpp +++ b/python/runtime/cudaq/operators/py_scalar_op.cpp @@ -10,11 +10,15 @@ #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -23,47 +27,127 @@ namespace cudaq { -void bindScalarOperator(py::module &mod) { - using scalar_callback = - std::function(const parameter_map &)>; +namespace { + +std::pair, bool> +introspectCallable(const nanobind::callable &func) { + nanobind::module_ inspect = nanobind::module_::import_("inspect"); + nanobind::object argSpec = inspect.attr("getfullargspec")(func); + + if (!argSpec.attr("varargs").is_none()) + throw std::invalid_argument( + "the function defining a scalar operator must not take *args"); + + nanobind::module_ helpers = + nanobind::module_::import_("cudaq.operators.helpers"); + nanobind::object paramDocsFn = helpers.attr("_parameter_docs"); + nanobind::object docstring = func.attr("__doc__"); + + std::unordered_map paramDesc; + for (nanobind::handle name : argSpec.attr("args")) { + std::string n = nanobind::cast(name); + std::string doc = nanobind::cast( + paramDocsFn(nanobind::str(n.c_str()), docstring)); + paramDesc[n] = doc; + } + for (nanobind::handle name : argSpec.attr("kwonlyargs")) { + std::string n = nanobind::cast(name); + std::string doc = nanobind::cast( + paramDocsFn(nanobind::str(n.c_str()), docstring)); + paramDesc[n] = doc; + } + + bool acceptsKwargs = !argSpec.attr("varkw").is_none(); + return {std::move(paramDesc), acceptsKwargs}; +} + +scalar_callback wrapPythonCallable(nanobind::callable func, + const std::vector 
¶mNames, + bool acceptsKwargs) { + return [func = std::move(func), paramNames, + acceptsKwargs](const parameter_map ¶ms) -> std::complex { + nanobind::gil_scoped_acquire guard; + nanobind::dict pyKwargs; + if (acceptsKwargs) { + for (const auto &[k, v] : params) + pyKwargs[k.c_str()] = nanobind::cast(v); + } else { + for (const auto &name : paramNames) { + auto it = params.find(name); + if (it != params.end()) + pyKwargs[name.c_str()] = nanobind::cast(it->second); + } + } + nanobind::object result = func(**pyKwargs); + return nanobind::cast>(result); + }; +} + +} // anonymous namespace + +void bindScalarOperator(nanobind::module_ &mod) { - py::class_(mod, "ScalarOperator") + nanobind::class_(mod, "ScalarOperator") // properties - .def_property_readonly("parameters", - &scalar_operator::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") + .def_prop_ro("parameters", &scalar_operator::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") // constructors - .def(py::init<>(), "Creates a scalar operator with constant value 1.") - .def(py::init(), + .def(nanobind::init<>(), + "Creates a scalar operator with constant value 1.") + .def(nanobind::init(), "Creates a scalar operator with the given constant value.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a scalar operator with the given constant value.") - .def(py::init([](const scalar_callback &func, const py::kwargs &kwargs) { - return scalar_operator( - func, details::kwargs_to_param_description(kwargs)); - }), - py::arg("callback"), - "Creates a scalar operator where the given callback function is " - "invoked during evaluation.") - .def(py::init(), "Copy constructor.") + .def( + "__init__", + [](scalar_operator *self, nanobind::callable func) { + auto [paramDesc, acceptsKwargs] = introspectCallable(func); + std::vector paramNames; + for (const auto &[k, v] : paramDesc) + paramNames.push_back(k); 
+ auto callback = + wrapPythonCallable(std::move(func), paramNames, acceptsKwargs); + new (self) + scalar_operator(std::move(callback), std::move(paramDesc)); + }, + nanobind::arg("generator"), + "Creates a scalar operator from a callable. Parameter names are " + "introspected from the function signature.") + .def( + "__init__", + [](scalar_operator *self, nanobind::callable func, + const nanobind::kwargs &kwargs) { + auto [introspected, acceptsKwargs] = introspectCallable(func); + auto paramDesc = details::kwargs_to_param_description(kwargs); + std::vector paramNames; + for (const auto &[k, v] : paramDesc) + paramNames.push_back(k); + auto callback = + wrapPythonCallable(std::move(func), paramNames, acceptsKwargs); + new (self) + scalar_operator(std::move(callback), std::move(paramDesc)); + }, + "Creates a scalar operator from a callable with keyword argument " + "parameter descriptions.") + .def(nanobind::init(), "Copy constructor.") // evaluations .def( "evaluate", - [](const scalar_operator &self, const py::kwargs &kwargs) { + [](const scalar_operator &self, const nanobind::kwargs &kwargs) { return self.evaluate(details::kwargs_to_param_map(kwargs)); }, "Evaluated value of the operator.") // comparisons - .def("__eq__", &scalar_operator::operator==, py::is_operator()) + .def("__eq__", &scalar_operator::operator==, nanobind::is_operator()) // general utility functions @@ -73,10 +157,10 @@ void bindScalarOperator(py::module &mod) { "Returns the string representation of the operator."); } -void bindScalarWrapper(py::module &mod) { +void bindScalarWrapper(nanobind::module_ &mod) { bindScalarOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, scalar_operator>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, scalar_operator>(); } } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_scalar_op.h b/python/runtime/cudaq/operators/py_scalar_op.h index 046f23411c6..4197132a60c 100644 --- 
a/python/runtime/cudaq/operators/py_scalar_op.h +++ b/python/runtime/cudaq/operators/py_scalar_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of `cudaq::spin` /// and `cudaq::spin_op` to python. -void bindScalarWrapper(py::module &mod); +void bindScalarWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_spin_op.cpp b/python/runtime/cudaq/operators/py_spin_op.cpp index 0e35fb1e0d9..e901dcac0cd 100644 --- a/python/runtime/cudaq/operators/py_spin_op.cpp +++ b/python/runtime/cudaq/operators/py_spin_op.cpp @@ -7,10 +7,18 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -20,8 +28,8 @@ namespace cudaq { /// @brief Map an OpenFermion operator to our own spin operator -spin_op fromOpenFermionQubitOperator(py::object &op) { - if (!py::hasattr(op, "terms")) +spin_op fromOpenFermionQubitOperator(nanobind::object &op) { + if (!nanobind::hasattr(op, "terms")) throw std::runtime_error( "This is not an openfermion operator, must have 'terms' attribute."); std::map> creatorMap{ @@ -31,18 +39,19 @@ spin_op fromOpenFermionQubitOperator(py::object &op) { auto terms = op.attr("terms"); auto H = spin_op::empty(); for (auto term : terms) { - auto termTuple = term.cast(); + auto termTuple = nanobind::cast(term); auto localTerm = spin_op::identity(); - for (auto &element : termTuple) { - auto casted = element.cast>(); + for (auto element : termTuple) { + auto casted = + nanobind::cast>(element); localTerm *= 
creatorMap[casted.second](casted.first); } - H += terms[term].cast() * localTerm; + H += nanobind::cast(terms[term]) * localTerm; } return H; } -void bindSpinModule(py::module &mod) { +void bindSpinModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::spin` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. auto spin_submodule = mod.def_submodule("spin"); @@ -56,33 +65,35 @@ void bindSpinModule(py::module &mod) { // here for consistency with other operators spin_submodule.def( "identity", [](std::size_t target) { return spin_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); spin_submodule.def( "identities", [](std::size_t first, std::size_t last) { return spin_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); - spin_submodule.def("i", &spin_op::i, py::arg("target"), + spin_submodule.def("i", &spin_op::i, nanobind::arg("target"), "Returns a Pauli I spin operator on the given " "target qubit index."); spin_submodule.def( - "x", &spin_op::x, py::arg("target"), + "x", &spin_op::x, nanobind::arg("target"), "Returns a Pauli X spin operator on the given target qubit index."); spin_submodule.def( - "y", &spin_op::y, py::arg("target"), + "y", &spin_op::y, nanobind::arg("target"), "Returns a Pauli Y spin operator on the given target qubit index."); spin_submodule.def( - "z", &spin_op::z, py::arg("target"), + "z", &spin_op::z, nanobind::arg("target"), "Returns a Pauli Z spin operator on the given target qubit index."); - spin_submodule.def("plus", &spin_op::plus, py::arg("target"), + spin_submodule.def("plus", &spin_op::plus, + nanobind::arg("target"), "Return a sigma plus spin operator on the given " "target qubit index."); - spin_submodule.def("minus", 
&spin_op::minus, py::arg("target"), + spin_submodule.def("minus", &spin_op::minus, + nanobind::arg("target"), "Return a sigma minus spin operator on the given " "target qubit index."); spin_submodule.def( @@ -115,52 +126,55 @@ void bindSpinModule(py::module &mod) { "degrees of freedom."); } -void bindSpinOperator(py::module &mod) { +void bindSpinOperator(nanobind::module_ &mod) { - auto spin_op_class = py::class_(mod, "SpinOperator"); - auto spin_op_term_class = py::class_(mod, "SpinOperatorTerm"); + auto spin_op_class = nanobind::class_(mod, "SpinOperator"); + auto spin_op_term_class = + nanobind::class_(mod, "SpinOperatorTerm"); spin_op_class .def( "__iter__", [](spin_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", &spin_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &spin_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &spin_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &spin_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &spin_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &spin_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &spin_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &spin_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &spin_op::max_degree, + "Returns the largest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &spin_op::num_terms, + "Returns the number of terms in the operator.") // only exists for spin operators - .def_property_readonly( - "qubit_count", &spin_op::num_qubits, - "Return the number of qubits this operator acts on.") + .def_prop_ro("qubit_count", &spin_op::num_qubits, + "Return the number of qubits this operator acts on.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -169,27 +183,31 @@ void bindSpinOperator(py::module &mod) { "identity. 
To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), py::arg("size"), + .def(nanobind::init(), nanobind::arg("size"), "Creates a sum operator with no terms, reserving " "space for the given number of terms (size).") // NOTE: only supported on spin ops so far - .def(py::init &>(), py::arg("data"), + .def(nanobind::init &>(), nanobind::arg("data"), "Creates an operator based on a serialized data representation.") // NOTE: only supported on spin ops so far - .def(py::init([](const std::string &fileName) { - binary_spin_op_reader reader; - return reader.read(fileName); - }), - "Creates an operator based on a serialized data representation in " - "the given file.") - .def(py::init(), + .def( + "__init__", + [](spin_op *self, const std::string &fileName) { + binary_spin_op_reader reader; + new (self) spin_op(reader.read(fileName)); + }, + "Creates an operator based on a serialized data representation in " + "the given file.") + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") // NOTE: only supported on spin ops - .def(py::init([](py::object obj) { - return fromOpenFermionQubitOperator(obj); - }), - "Convert an OpenFermion operator to a CUDA-Q spin operator.") + .def( + "__init__", + [](spin_op *self, nanobind::object obj) { + new (self) spin_op(fromOpenFermionQubitOperator(obj)); + }, + "Convert an OpenFermion operator to a CUDA-Q spin operator.") .def( "copy", [](const spin_op &self) { return spin_op(self); }, "Creates a copy of the operator.") @@ -200,15 +218,16 @@ void bindSpinOperator(py::module &mod) { .def_static( "from_json", [](const std::string &json_str) { - py::object json = py::module_::import("json"); - auto data = py::list(json.attr("loads")(json_str)); - return spin_op(data.cast>()); + nanobind::object json = nanobind::module_::import_("json"); + auto data = 
nanobind::list(json.attr("loads")(json_str)); + return spin_op(nanobind::cast>(data)); }, "Convert JSON string ('[d1, d2, d3, ...]') to spin_op") // NOTE: only supported on spin ops .def_static( - "random", &spin_op::random, py::arg("qubit_count"), - py::arg("term_count"), py::arg("seed") = std::random_device{}(), + "random", &spin_op::random, + nanobind::arg("qubit_count"), nanobind::arg("term_count"), + nanobind::arg("seed") = std::random_device{}(), "Return a random spin operator with the given number of terms " "(`term_count`) where each term acts on all targets in the open " "range " @@ -223,9 +242,9 @@ void bindSpinOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -235,13 +254,13 @@ void bindSpinOperator(py::module &mod) { .def( "to_matrix", [](const spin_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -254,9 +273,9 @@ void bindSpinOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -270,12 +289,12 @@ void bindSpinOperator(py::module &mod) { .def( "to_sparse_matrix", [](const spin_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -289,7 +308,7 @@ void bindSpinOperator(py::module &mod) { // comparisons - .def("__eq__", &spin_op::operator==, py::is_operator(), + .def("__eq__", &spin_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. 
Operators acting on different " @@ -301,91 +320,91 @@ void bindSpinOperator(py::module &mod) { [](const spin_op &self, const spin_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= spin_op_term(), py::is_operator()) - .def(py::self += spin_op_term(), py::is_operator()) - .def(py::self -= spin_op_term(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) 
+ .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= spin_op_term(), nanobind::is_operator()) + .def(nanobind::self += spin_op_term(), nanobind::is_operator()) + .def(nanobind::self -= spin_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - 
.def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * spin_op_term(), py::is_operator()) - .def(py::self + spin_op_term(), py::is_operator()) - .def(py::self - spin_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * spin_op_term(), nanobind::is_operator()) + .def(nanobind::self + spin_op_term(), nanobind::is_operator()) + .def(nanobind::self - spin_op_term(), nanobind::is_operator()) + .def(nanobind::self * 
nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -420,22 +439,22 @@ void 
bindSpinOperator(py::module &mod) { .def( "to_json", [](const spin_op &self) { - py::object json = py::module_::import("json"); + nanobind::object json = nanobind::module_::import_("json"); auto data = self.get_data_representation(); return json.attr("dumps")(data); }, - "Convert spin_op to JSON string: '[d1, d2, d3, ...]'") - .def("trim", &spin_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = parameter_map(), + "Convert spin_op to a JSON string, e.g., '[d1, d2, d3, ...]'.") + .def("trim", &spin_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](spin_op &self, double tol, const py::kwargs &kwargs) { + [](spin_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -522,33 +541,37 @@ void bindSpinOperator(py::module &mod) { "(product operator) in future releases.") // constructor for old serialization format .def( - py::init([](const std::vector &data, std::size_t num_qubits) { + "__init__", + [](spin_op *self, const std::vector &data, + std::size_t num_qubits) { PyErr_WarnEx( PyExc_DeprecationWarning, "serialization format changed - use the constructor without a " "size_t argument to create a spin_op from the new format", 1); - return spin_op(data, num_qubits); - }), - py::arg("data"), py::arg("num_qubits"), + new (self) spin_op(data, num_qubits); + }, + nanobind::arg("data"), nanobind::arg("num_qubits"), "Deprecated - use constructor without the `num_qubits` argument " "instead.") // new constructor with deprecation warning provided only for backwards // compatibility (matching the deprecated data constructor for the old // 
serialization format above) - .def(py::init([](const std::string &fileName, bool legacy) { - binary_spin_op_reader reader; - PyErr_WarnEx( - PyExc_DeprecationWarning, - "overload provided for compatibility with the deprecated " - "serialization format - please migrate to the new format and " - "use the constructor without boolean argument", - 1); - return reader.read(fileName, legacy); - }), - py::arg("filename"), py::arg("legacy"), - "Constructor available for loading deprecated data representations " - "from file - will be removed in future releases.") + .def( + "__init__", + [](spin_op *self, const std::string &fileName, bool legacy) { + binary_spin_op_reader reader; + PyErr_WarnEx( + PyExc_DeprecationWarning, + "overload provided for compatibility with the deprecated " + "serialization format - please migrate to the new format and " + "use the constructor without boolean argument", + 1); + new (self) spin_op(reader.read(fileName, legacy)); + }, + nanobind::arg("filename"), nanobind::arg("legacy"), + "Constructor available for loading deprecated data representations " + "from file - will be removed in future releases.") .def_static( "empty_op", []() { @@ -568,27 +591,28 @@ void bindSpinOperator(py::module &mod) { 1); return self.to_string(print_coefficient); }, - py::arg("print_coefficient") = true, + nanobind::arg("print_coefficient") = true, "Deprecated - use the standard `str` conversion or `get_pauli_word` " "on each term instead.") .def( "for_each_term", - [](spin_op &self, py::function functor) { + [](spin_op &self, nanobind::callable functor) { PyErr_WarnEx(PyExc_DeprecationWarning, "use standard iteration instead", 1); self.for_each_term(functor); }, - py::arg("function"), "Deprecated - use standard iteration instead.") + nanobind::arg("function"), + "Deprecated - use standard iteration instead.") .def( "for_each_pauli", - [](spin_op &self, py::function functor) { + [](spin_op &self, nanobind::callable functor) { PyErr_WarnEx(PyExc_DeprecationWarning, 
"iterate over the sum to get each term and then " "iterate over the term(s) instead", 1); self.for_each_pauli(functor); }, - py::arg("function"), + nanobind::arg("function"), "Deprecated - iterator over sum and then iterator over term " "instead."); #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) @@ -602,49 +626,50 @@ void bindSpinOperator(py::module &mod) { .def( "__iter__", [](spin_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &spin_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &spin_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &spin_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &spin_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &spin_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &spin_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &spin_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &spin_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &spin_op_term::max_degree, + "Returns the largest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &spin_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( + "term_count", [](const spin_op_term &) { return 1; }, + "Returns the number of terms in the operator. 
Always returns 1.") // only exists for spin operators - .def_property_readonly( - "qubit_count", &spin_op_term::num_qubits, - "Return the number of qubits this operator acts on.") - .def_property_readonly( + .def_prop_ro("qubit_count", &spin_op_term::num_qubits, + "Return the number of qubits this operator acts on.") + .def_prop_ro( "term_id", &spin_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &spin_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -652,60 +677,66 @@ void bindSpinOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") // NOTE: only supported on spin ops so far - .def(py::init([](const std::vector &data) { - spin_op op(data); - if (op.num_terms() != 1) - throw std::runtime_error( - "invalid data representation for product operator"); - return *op.begin(); - }), - py::arg("data"), - "Creates an operator based on a serialized data representation.") + .def( + "__init__", + [](spin_op_term *self, const std::vector &data) { + spin_op op(data); + if (op.num_terms() != 1) + throw std::runtime_error( + "invalid data representation for product operator"); + new (self) spin_op_term(*op.begin()); + }, + nanobind::arg("data"), + "Creates an operator based on a serialized data representation.") // NOTE: only supported on spin ops so far - 
.def(py::init([](const std::string &fileName) { - binary_spin_op_reader reader; - spin_op op = reader.read(fileName); - if (op.num_terms() != 1) - throw std::runtime_error( - "invalid data representation for product operator"); - return *op.begin(); - }), - "Creates an operator based on a serialized data representation in " - "the given file.") - .def(py::init(), + .def( + "__init__", + [](spin_op_term *self, const std::string &fileName) { + binary_spin_op_reader reader; + spin_op op = reader.read(fileName); + if (op.num_terms() != 1) + throw std::runtime_error( + "invalid data representation for product operator"); + new (self) spin_op_term(*op.begin()); + }, + "Creates an operator based on a serialized data representation in " + "the given file.") + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return spin_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](spin_op_term *self, const scalar_operator &scalar) { + new (self) spin_op_term(spin_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init(), py::arg("operator"), - py::arg("size") = 0, + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") .def_static( "from_json", [](const std::string &json_str) { - py::object json = py::module_::import("json"); - auto data = py::list(json.attr("loads")(json_str)); - spin_op op(data.cast>()); + nanobind::object json = nanobind::module_::import_("json"); + auto data = nanobind::list(json.attr("loads")(json_str)); + spin_op op(nanobind::cast>(data)); if (op.num_terms() != 1) throw std::runtime_error( "invalid data representation for product operator"); @@ -719,7 +750,7 @@ void bindSpinOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &spin_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. 
The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -730,9 +761,9 @@ void bindSpinOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -742,13 +773,13 @@ void bindSpinOperator(py::module &mod) { .def( "to_matrix", [](const spin_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -761,9 +792,9 @@ void bindSpinOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -777,12 +808,12 @@ void bindSpinOperator(py::module &mod) { .def( "to_sparse_matrix", [](const spin_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -796,7 +827,7 @@ void bindSpinOperator(py::module &mod) { // comparisons - .def("__eq__", &spin_op_term::operator==, py::is_operator(), + .def("__eq__", &spin_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. 
Operators acting on different " @@ -808,77 +839,78 @@ void bindSpinOperator(py::module &mod) { [](const spin_op_term &self, const spin_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - 
.def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * spin_op(), py::is_operator()) - .def(py::self + spin_op(), py::is_operator()) - .def(py::self - spin_op(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - 
scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * spin_op(), nanobind::is_operator()) + .def(nanobind::self + spin_op(), nanobind::is_operator()) + .def(nanobind::self - spin_op(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * 
nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &spin_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const spin_op_term &self) { return self.to_string(); }, "Returns the string representation of the operator.") @@ -896,18 +928,18 @@ void bindSpinOperator(py::module &mod) { .def( "to_json", [](const spin_op_term &self) { - py::object json = py::module_::import("json"); + nanobind::object json = nanobind::module_::import_("json"); auto data = spin_op(self).get_data_representation(); return json.attr("dumps")(data); }, - "Convert spin_op to JSON string: '[d1, d2, d3, ...]'") + "Convert spin_op to a JSON string, e.g., '[d1, d2, d3, ...]'.") // only exists for spin operators .def( "get_pauli_word", [](spin_op_term &op, std::size_t pad_identities) { return op.get_pauli_word(pad_identities); }, - py::arg("pad_identities") = 0, + nanobind::arg("pad_identities") = 0, "Gets the Pauli word representation of this product operator.") // only exists for spin operators .def("get_binary_symplectic_form", @@ -973,7 +1005,7 @@ void bindSpinOperator(py::module &mod) { 1); return self.to_string(print_coefficient); }, - py::arg("print_coefficient") = true, + nanobind::arg("print_coefficient") = true, "Deprecated - use the standard `str` conversion or use " "`get_pauli_word` instead.") .def( @@ -985,18 +1017,19 @@ void bindSpinOperator(py::module &mod) { 1); return spin_op(op).distribute_terms(chunks); }, - py::arg("chunk_count"), + nanobind::arg("chunk_count"), "Deprecated - instantiate a `SpinOperator` from this " "`SpinOperatorTerm` " "and call distribute_terms on that.") .def( 
"for_each_pauli", - [](spin_op_term &self, py::function functor) { + [](spin_op_term &self, nanobind::callable functor) { PyErr_WarnEx(PyExc_DeprecationWarning, "use standard iteration instead", 1); spin_op(self).for_each_pauli(functor); }, - py::arg("function"), "Deprecated - use standard iteration instead."); + nanobind::arg("function"), + "Deprecated - use standard iteration instead."); #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) #pragma GCC diagnostic pop #endif @@ -1005,12 +1038,12 @@ void bindSpinOperator(py::module &mod) { #endif } -void bindSpinWrapper(py::module &mod) { +void bindSpinWrapper(nanobind::module_ &mod) { bindSpinOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, spin_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, spin_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindSpinModule(mod); } diff --git a/python/runtime/cudaq/operators/py_spin_op.h b/python/runtime/cudaq/operators/py_spin_op.h index 592458ca681..3d0b7df7a8b 100644 --- a/python/runtime/cudaq/operators/py_spin_op.h +++ b/python/runtime/cudaq/operators/py_spin_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of `cudaq::spin` /// and `cudaq::spin_op` to python. 
-void bindSpinWrapper(py::module &mod); +void bindSpinWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_super_op.cpp b/python/runtime/cudaq/operators/py_super_op.cpp index 730064dbb72..2c18dfbc820 100644 --- a/python/runtime/cudaq/operators/py_super_op.cpp +++ b/python/runtime/cudaq/operators/py_super_op.cpp @@ -7,10 +7,14 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "py_helpers.h" @@ -18,53 +22,54 @@ namespace cudaq { -void bindSuperOperatorWrapper(py::module &mod) { - auto super_op_class = py::class_(mod, "SuperOperator"); +void bindSuperOperatorWrapper(nanobind::module_ &mod) { + auto super_op_class = nanobind::class_(mod, "SuperOperator"); super_op_class - .def(py::init<>(), "Creates a default instantiated super-operator. A " - "default instantiated " - "super-operator means a no action linear map.") - .def_static( - "left_multiply", - py::overload_cast &>( - &super_op::left_multiply), - "Creates a super-operator representing a left " - "multiplication of the operator to the density matrix.") - .def_static( - "right_multiply", - py::overload_cast &>( - &super_op::right_multiply), - "Creates a super-operator representing a right " - "multiplication of the operator to the density matrix.") - .def_static( - "left_right_multiply", - py::overload_cast &, - const cudaq::product_op &>( - &super_op::left_right_multiply), - "Creates a super-operator representing a simultaneous left " - "multiplication of the first operator operand and right " - "multiplication of the second operator operand to the " - "density matrix.") + .def(nanobind::init<>(), + "Creates a default instantiated super-operator. 
A " + "default instantiated " + "super-operator means a no action linear map.") + .def_static("left_multiply", + nanobind::overload_cast< + const cudaq::product_op &>( + &super_op::left_multiply), + "Creates a super-operator representing a left " + "multiplication of the operator to the density matrix.") + .def_static("right_multiply", + nanobind::overload_cast< + const cudaq::product_op &>( + &super_op::right_multiply), + "Creates a super-operator representing a right " + "multiplication of the operator to the density matrix.") + .def_static("left_right_multiply", + nanobind::overload_cast< + const cudaq::product_op &, + const cudaq::product_op &>( + &super_op::left_right_multiply), + "Creates a super-operator representing a simultaneous left " + "multiplication of the first operator operand and right " + "multiplication of the second operator operand to the " + "density matrix.") .def_static( "left_multiply", - py::overload_cast &>( + nanobind::overload_cast &>( &super_op::left_multiply), "Creates a super-operator representing a left " "multiplication of the operator to the density matrix. The sum is " "distributed into a linear combination of super-operator actions.") .def_static( "right_multiply", - py::overload_cast &>( + nanobind::overload_cast &>( &super_op::right_multiply), "Creates a super-operator representing a right " "multiplication of the operator to the density matrix. 
The sum is " "distributed into a linear combination of super-operator actions.") .def_static( "left_right_multiply", - py::overload_cast &, - const cudaq::sum_op &>( + nanobind::overload_cast &, + const cudaq::sum_op &>( &super_op::left_right_multiply), "Creates a super-operator representing a simultaneous left " "multiplication of the first operator operand and right " @@ -74,11 +79,13 @@ void bindSuperOperatorWrapper(py::module &mod) { .def( "__iter__", [](super_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Loop through each term of the super-operator.") - .def(py::self += py::self, py::is_operator()); + .def(nanobind::self += nanobind::self, nanobind::is_operator()); } } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_super_op.h b/python/runtime/cudaq/operators/py_super_op.h index 32474d65639..da8c5e3ea3a 100644 --- a/python/runtime/cudaq/operators/py_super_op.h +++ b/python/runtime/cudaq/operators/py_super_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of super-operator to /// python. 
-void bindSuperOperatorWrapper(py::module &mod); +void bindSuperOperatorWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 9cf74e898c0..eb2dd7f63d5 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -9,7 +9,6 @@ #include "py_alt_launch_kernel.h" #include "common/AnalogHamiltonian.h" #include "common/ArgumentWrapper.h" -#include "common/CompiledModule.h" #include "common/Environment.h" #include "cudaq/Optimizer/Builder/Marshal.h" #include "cudaq/Optimizer/Builder/Runtime.h" @@ -24,6 +23,7 @@ #include "cudaq_internal/compiler/LayoutInfo.h" #include "runtime/cudaq/algorithms/py_utils.h" #include "utils/LinkedLibraryHolder.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" #include "utils/PyTypes.h" #include "llvm/MC/SubtargetFeature.h" @@ -31,7 +31,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/ExecutionEngine.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/ExecutionEngine/OptUtils.h" @@ -42,13 +41,20 @@ #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Transforms/Passes.h" #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -namespace py = pybind11; using namespace mlir; using namespace cudaq_internal::compiler; using cudaq::JitEngine; +using cudaq::PackingStyle; static std::function getTransportLayer = []() -> std::string { throw std::runtime_error("binding for kernel launch is incomplete"); @@ -128,53 +134,55 @@ void cudaq::setDataLayout(MlirModule module) { // The section is the implementation of functions declared in OpaqueArguments.h //===----------------------------------------------------------------------===// 
-py::args cudaq::simplifiedValidateInputArguments(py::args &args) { - py::args processed = py::tuple(args.size()); +nanobind::args cudaq::simplifiedValidateInputArguments(nanobind::args &args) { + nanobind::args processed = + nanobind::steal(PyTuple_New((Py_ssize_t)args.size())); for (std::size_t i = 0; i < args.size(); ++i) { - auto arg = args[i]; + nanobind::object arg = nanobind::borrow(args[i]); // Check if it has tolist, so it might be a 1d buffer (array / numpy // ndarray) - if (py::hasattr(args[i], "tolist")) { + if (nanobind::hasattr(args[i], "tolist")) { // This is a valid ndarray if it has tolist and shape - if (!py::hasattr(args[i], "shape")) + if (!nanobind::hasattr(args[i], "shape")) throw std::runtime_error( "Invalid input argument type, could not get shape of array."); // This is an ndarray with tolist() and shape attributes // get the shape and check its size - auto shape = args[i].attr("shape").cast(); + auto shape = nanobind::cast(args[i].attr("shape")); if (shape.size() != 1) throw std::runtime_error("Cannot pass ndarray with shape != (N,)."); arg = args[i].attr("tolist")(); - } else if (py::isinstance(arg)) { - arg = py::cast(arg); - } else if (py::isinstance(arg)) { - py::list arg_list = py::cast(arg); + } else if (nanobind::isinstance(arg)) { + arg = nanobind::cast(nanobind::cast(arg)); + } else if (nanobind::isinstance(arg)) { + nanobind::list arg_list = nanobind::cast(arg); const bool all_strings = [&]() { - for (auto &item : arg_list) - if (!py::isinstance(item)) + for (auto item : arg_list) + if (!nanobind::isinstance(item)) return false; return true; }(); if (all_strings) { std::vector pw_list; pw_list.reserve(arg_list.size()); - for (auto &item : arg_list) - pw_list.emplace_back(py::cast(item)); - arg = std::move(pw_list); + for (auto item : arg_list) + pw_list.emplace_back(nanobind::cast(item)); + arg = nanobind::cast(std::move(pw_list)); } } - processed[i] = arg; + PyTuple_SET_ITEM(processed.ptr(), (Py_ssize_t)i, arg.inc_ref().ptr()); 
} return processed; } +template void cudaq::handleStructMemberVariable(void *data, std::size_t offset, mlir::Type memberType, - py::object value) { + nanobind::object value) { auto appendValue = [](void *data, auto &&value, std::size_t offset) { std::memcpy(((char *)data) + offset, &value, sizeof(std::remove_cvref_t)); @@ -182,30 +190,37 @@ void cudaq::handleStructMemberVariable(void *data, std::size_t offset, llvm::TypeSwitch(memberType) .Case([&](mlir::IntegerType ty) { if (ty.isInteger(1)) { - appendValue(data, (bool)value.cast(), offset); + appendValue(data, nanobind::cast(value), offset); return; } - appendValue(data, (std::int64_t)value.cast(), offset); + appendValue(data, nanobind::cast(value), offset); }) .Case([&](mlir::Float64Type ty) { - appendValue(data, (double)value.cast(), offset); + appendValue(data, nanobind::cast(value), offset); }) .Case([&](cudaq::cc::StdvecType ty) { - auto appendVectorValue = [](py::object value, void *data, - std::size_t offset, T) { - auto asList = value.cast(); + auto appendVectorValue = [](nanobind::object value, + void *data, std::size_t offset, + T) { + auto asList = nanobind::cast(value); // Use the correct element type T (not always double). auto *values = new std::vector(asList.size()); - for (std::size_t i = 0; auto &v : asList) - (*values)[i++] = v.cast(); - - std::memcpy(((char *)data) + offset, values, 16); + for (std::size_t i = 0; auto v : asList) + (*values)[i++] = nanobind::cast(v); + + // synthesis path: span {ptr, size_t} + // argsCreator path: std::vector {ptr, ptr, ptr} + constexpr std::size_t copySize = + sizeof(std::conditional_t