From 7f549f856df0be87466856aae546f5d534640ec0 Mon Sep 17 00:00:00 2001
From: Viren Baraiya <virenx@gmail.com>
Date: Sun, 8 Mar 2026 23:44:39 -0700
Subject: [PATCH 1/2] Add update-task-v2 endpoint with tight execute loop and
 reduce log verbosity

---
 AGENTS.md                                     |  21 ++
 README.md                                     | 338 +++++++++---------
 .../design/WORKER_SDK_IMPLEMENTATION_GUIDE.md | 208 ++++++++++-
 pyproject.toml                                |   3 +
 src/conductor/__init__.py                     |   5 +
 .../client/automator/async_task_runner.py     |  87 ++---
 .../client/automator/task_handler.py          |  10 +
 src/conductor/client/automator/task_runner.py | 183 +++-------
 .../http/api/async_task_resource_api.py       |  83 +++++
 .../client/http/api/task_resource_api.py      |  83 +++++
 .../test_orkes_service_registry_client.py     |   3 +
 tests/integration/conftest.py                 | 115 ++++++
 .../metadata/test_schema_service.py           |   3 +
 .../metadata/test_task_metadata_service.py    |   3 +
 tests/integration/test_agentic_workflows.py   |   3 +
 tests/integration/test_ai_examples.py         |   3 +
 tests/integration/test_ai_task_types.py       |   3 +
 .../test_authorization_client_intg.py         |   3 +
 tests/integration/test_comprehensive_e2e.py   |  12 +-
 .../integration/test_workflow_client_intg.py  |   3 +
 .../unit/automator/test_async_task_runner.py  |  38 +-
 .../automator/test_task_handler_coverage.py   |  15 +-
 tests/unit/automator/test_task_runner.py      |  30 +-
 .../automator/test_task_runner_coverage.py    |  96 +++--
 24 files changed, 915 insertions(+), 436 deletions(-)
 create mode 100644 tests/integration/conftest.py

diff --git a/AGENTS.md b/AGENTS.md
index 767d0fd49..0656ad669 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -779,3 +779,24 @@ Remember: The goal is to make Conductor easy to use in every language while main
 
 ---
 
+## 🧪 Post-Session Testing Checklist
+
+**After every coding session, run the full test suite to ensure zero failures:**
+
+```bash
+# All suites — expect 0 failures, 0 errors
+python3 -m pytest tests/unit tests/backwardcompatibility tests/serdesertest tests/chaos tests/integration -v
+
+# Expected results:
+#   Unit tests:             ~626 passed
+#   Backward compatibility: ~1015 passed
+#   Serialization:          ~58 passed
+#   Chaos:                  2 skipped (require special setup)
+#   Integration:            128 skipped (require live Conductor server)
+#   TOTAL:                  0 failures, 0 errors
+```
+
+Integration tests skip gracefully when the Conductor server is not available (no `CONDUCTOR_SERVER_URL` / `CONDUCTOR_AUTH_KEY` / `CONDUCTOR_AUTH_SECRET` env vars). When a server is available, they run against it. **There should be NO failures in any suite.**
+
+---
+
diff --git a/README.md b/README.md
index 3468e73af..886142ac8 100644
--- a/README.md
+++ b/README.md
@@ -7,33 +7,30 @@
 
 Python SDK for [Conductor](https://www.conductor-oss.org/) (OSS and Orkes Conductor) — an orchestration platform for building distributed applications, AI agents, and workflow-driven microservices. Define workflows as code, run workers anywhere, and let Conductor handle retries, state management, and observability.
 
-If you find [Conductor](https://github.com/conductor-oss/conductor) useful, please consider giving it a star on GitHub -- it helps the project grow.
+If you find [Conductor](https://github.com/conductor-oss/conductor) useful, please consider giving it a star on GitHub — it helps the project grow.
 
 [![GitHub stars](https://img.shields.io/github/stars/conductor-oss/conductor.svg?style=social&label=Star&maxAge=)](https://GitHub.com/conductor-oss/conductor/)
 
-
-
 <!-- TOC -->
-* [Python SDK for Conductor](#python-sdk-for-conductor)
-  * [Start  Conductor server](#start--conductor-server)
-  * [Install the SDK](#install-the-sdk)
-  * [60-Second Quickstart](#60-second-quickstart)
-  * [Comprehensive worker example](#comprehensive-worker-example-)
-  * [Workers](#workers)
-  * [Monitoring Workers](#monitoring-workers)
-  * [Workflows](#workflows)
-  * [Troubleshooting](#troubleshooting)
-  * [AI & LLM Workflows](#ai--llm-workflows)
-  * [Examples](#examples)
-  * [API Journey Examples](#api-journey-examples)
-  * [Documentation](#documentation)
-  * [Support](#support)
-  * [Frequently Asked Questions](#frequently-asked-questions)
-  * [License](#license)
+* [Start Conductor Server](#start-conductor-server)
+* [Install the SDK](#install-the-sdk)
+* [60-Second Quickstart](#60-second-quickstart)
+* [Feature Showcase](#feature-showcase)
+  * [Workers: Sync and Async](#workers-sync-and-async)
+  * [Workflows with HTTP Calls and Waits](#workflows-with-http-calls-and-waits)
+  * [Long-Running Tasks with TaskContext](#long-running-tasks-with-taskcontext)
+  * [Monitoring with Metrics](#monitoring-with-metrics)
+  * [Managing Workflow Executions](#managing-workflow-executions)
+* [AI & LLM Workflows](#ai--llm-workflows)
+* [Why Conductor?](#why-conductor)
+* [Examples](#examples)
+* [Documentation](#documentation)
+* [Frequently Asked Questions](#frequently-asked-questions)
+* [Support](#support)
+* [License](#license)
 <!-- TOC -->
 
-
-## Start  Conductor server
+## Start Conductor Server
 
 If you don't already have a Conductor server running, pick one:
 
@@ -84,7 +81,7 @@ workflow.output_parameters({'result': greet_task.output('result')})
 workflow.register(overwrite=True)
 ```
 
-**Step 2: Write worker**
+**Step 2: Write a worker**
 
 Workers are just Python functions decorated with `@worker_task` that poll Conductor for tasks and execute them.
 
@@ -149,216 +146,201 @@ Run it:
 python quickstart.py
 ```
 
-> ### Using Orkes Conductor / Remote Server? 
+> ### Using Orkes Conductor / Remote Server?
 > Export your authentication credentials as well:
-> 
+>
 > ```shell
 > export CONDUCTOR_SERVER_URL="https://your-cluster.orkesconductor.io/api"
-> 
+>
 > # If using Orkes Conductor that requires auth key/secret
 > export CONDUCTOR_AUTH_KEY="your-key"
 > export CONDUCTOR_AUTH_SECRET="your-secret"
-> 
+>
 > # Optional — set to false to force HTTP/1.1 if your network environment has unstable long-lived HTTP/2 connections (default: true)
 > # export CONDUCTOR_HTTP2_ENABLED=false
 > ```
 > See [Configuration](#configuration) for details.
 
-That's it -- you just defined a worker, built a workflow, and executed it. Open the Conductor UI (default:
+That's it — you just defined a worker, built a workflow, and executed it. Open the Conductor UI (default:
 [http://localhost:8127](http://localhost:8127)) to see the execution.
 
-## Comprehensive worker example 
-The example includes sync + async workers, metrics, and long-running tasks
-
-See [examples/workers_e2e.py](examples/workers_e2e.py)
-
 ---
 
-## Workers
+## Feature Showcase
 
-Workers are Python functions that execute Conductor tasks. Decorate any function with `@worker_task` to:
+### Workers: Sync and Async
 
-- register it as a worker (auto-discovered by `TaskHandler`)
-- use it as a workflow task (call it with `task_ref_name=...`)
-
-Note: Workers can also be used by LLMs for tool calling (see [AI & LLM Workflows](#ai--llm-workflows)).
+The SDK automatically selects the right runner based on your function signature — `TaskRunner` (thread pool) for sync functions, `AsyncTaskRunner` (event loop) for async.
 
 ```python
 from conductor.client.worker.worker_task import worker_task
 
-@worker_task(task_definition_name='greet')
-def greet(name: str) -> str:
-    return f'Hello {name}'
-```
+# Sync worker — for CPU-bound work (uses ThreadPoolExecutor)
+@worker_task(task_definition_name='process_image', thread_count=4)
+def process_image(image_url: str) -> dict:
+    import PIL.Image, io, requests
+    img = PIL.Image.open(io.BytesIO(requests.get(image_url).content))
+    img.thumbnail((256, 256))
+    return {'width': img.width, 'height': img.height}
 
-**Async workers** for I/O-bound tasks — the SDK automatically uses `AsyncTaskRunner` (event loop, no thread overhead):
 
-```python
-import httpx
-
-@worker_task(task_definition_name='fetch_data')
+# Async worker — for I/O-bound work (uses AsyncTaskRunner, no thread overhead)
+@worker_task(task_definition_name='fetch_data', thread_count=50)
 async def fetch_data(url: str) -> dict:
+    import httpx
     async with httpx.AsyncClient() as client:
-        response = await client.get(url)
-    return response.json()
+        resp = await client.get(url)
+    return resp.json()
 ```
 
-**Start workers** with `TaskHandler`:
-
-Note: `@worker_task` functions are discovered only after their modules are imported. Either import your worker modules
-explicitly, or pass `import_modules=[...]` when constructing `TaskHandler`.
+Start workers with `TaskHandler` — it auto-discovers `@worker_task` functions and spawns one subprocess per worker:
 
 ```python
 from conductor.client.automator.task_handler import TaskHandler
 from conductor.client.configuration.configuration import Configuration
 
-api_config = Configuration()
-task_handler = TaskHandler(
-    workers=[],
-    configuration=api_config,
-    scan_for_annotated_workers=True,  # auto-discover @worker_task functions
-    # monitor_processes=True and restart_on_failure=True by default
-)
-task_handler.start_processes()
-try:
-    task_handler.join_processes()  # blocks (workers poll forever)
-finally:
-    task_handler.stop_processes()
+config = Configuration()
+with TaskHandler(configuration=config, scan_for_annotated_workers=True) as task_handler:
+    task_handler.start_processes()
+    task_handler.join_processes()  # blocks forever (workers poll continuously)
 ```
 
-Workers support complex inputs (dataclasses), long-running tasks (`TaskInProgress`), and hierarchical configuration via environment variables.
-
-**Resilience: auto-restart and health checks**
+See [examples/worker_example.py](examples/worker_example.py) and [examples/workers_e2e.py](examples/workers_e2e.py) for complete examples.
 
+### Workflows with HTTP Calls and Waits
 
-Workers are typically long-lived services. By default, `TaskHandler` monitors worker subprocesses and restarts them if
-they exit unexpectedly.
-
-For a `/healthcheck` endpoint, you can use:
+Chain custom workers with built-in system tasks — HTTP calls, waits, JavaScript, JQ transforms — all in one workflow:
 
 ```python
-task_handler.is_healthy()
-task_handler.get_worker_process_status()
-```
+from conductor.client.workflow.conductor_workflow import ConductorWorkflow
+from conductor.client.workflow.task.http_task import HttpTask
+from conductor.client.workflow.task.wait_task import WaitTask
 
-To disable monitoring/restarts (e.g., local debugging):
+workflow = ConductorWorkflow(name='order_pipeline', version=1, executor=executor)
 
-```python
-TaskHandler(..., monitor_processes=False, restart_on_failure=False)
-```
+# Custom worker task
+validate = validate_order(task_ref_name='validate', order_id=workflow.input('order_id'))
 
-**Worker Configuration**
+# Built-in HTTP task — call any API, no worker needed
+charge_payment = HttpTask(task_ref_name='charge_payment', http_input={
+    'uri': 'https://api.stripe.com/v1/charges',
+    'method': 'POST',
+    'headers': {'Authorization': ['Bearer ${workflow.input.stripe_key}']},
+    'body': {'amount': '${validate.output.amount}'}
+})
 
-Workers support hierarchical environment variable configuration — global settings that can be overridden per worker:
+# Built-in Wait task — pause the workflow for 10 seconds
+cool_down = WaitTask(task_ref_name='cool_down', wait_for_seconds=10)
 
-```shell
-# Global (all workers)
-export CONDUCTOR_WORKER_ALL_POLL_INTERVAL_MILLIS=250
-export CONDUCTOR_WORKER_ALL_THREAD_COUNT=20
-export CONDUCTOR_WORKER_ALL_DOMAIN=production
+# Another custom worker task
+notify = send_notification(task_ref_name='notify', message='Order complete')
 
-# Per-worker override
-export CONDUCTOR_WORKER_GREETINGS_THREAD_COUNT=50
+# Chain with >> operator
+workflow >> validate >> charge_payment >> cool_down >> notify
+
+# Execute synchronously and wait for the result
+result = workflow.execute(workflow_input={'order_id': 'ORD-123', 'stripe_key': 'sk_test_...'})
+print(result.output)
 ```
 
-See [WORKER_CONFIGURATION.md](WORKER_CONFIGURATION.md) for all options.
+See [examples/kitchensink.py](examples/kitchensink.py) for all task types (HTTP, JavaScript, JQ, Switch, Terminate) and [examples/workflow_ops.py](examples/workflow_ops.py) for lifecycle operations.
 
-## Monitoring Workers
+### Long-Running Tasks with TaskContext
 
-Enable Prometheus metrics:
+For tasks that take minutes or hours (batch processing, ML training, external approvals), use `TaskContext` to report progress and poll incrementally:
 
 ```python
-from conductor.client.automator.task_handler import TaskHandler
-from conductor.client.configuration.configuration import Configuration
-from conductor.client.configuration.settings.metrics_settings import MetricsSettings
+from typing import Union
+from conductor.client.worker.worker_task import worker_task
+from conductor.client.context.task_context import get_task_context, TaskInProgress
 
-api_config = Configuration()
-metrics_settings = MetricsSettings(directory='/tmp/conductor-metrics', http_port=8000)
+@worker_task(task_definition_name='batch_job')
+def batch_job(batch_id: str) -> Union[dict, TaskInProgress]:
+    ctx = get_task_context()
+    ctx.add_log(f"Processing batch {batch_id}, poll #{ctx.get_poll_count()}")
 
-task_handler = TaskHandler(configuration=api_config, metrics_settings=metrics_settings, scan_for_annotated_workers=True)
-task_handler.start_processes()
-# Metrics at http://localhost:8000/metrics
-try:
-    task_handler.join_processes()  # blocks (workers poll forever)
-finally:
-    task_handler.stop_processes()
+    if ctx.get_poll_count() < 3:
+        # Not done yet — re-queue and check again in 30 seconds
+        return TaskInProgress(callback_after_seconds=30, output={'progress': ctx.get_poll_count() * 33})
+
+    # Done after 3 polls
+    return {'status': 'completed', 'batch_id': batch_id}
 ```
 
-See [METRICS.md](METRICS.md) for details.
+`TaskContext` also provides access to task metadata, retry counts, workflow IDs, and the ability to add logs visible in the Conductor UI.
 
-**Learn more:**
-- [Worker Design & Architecture](docs/design/WORKER_DESIGN.md) — AsyncTaskRunner vs TaskRunner, discovery, lifecycle
-- [Worker Configuration](WORKER_CONFIGURATION.md) — Environment variable configuration system
-- [Complete Worker Guide](docs/WORKER.md) — All worker patterns (function, class, annotation, async)
+See [examples/task_context_example.py](examples/task_context_example.py) for all patterns (polling, retry-aware logic, async context, input access).
 
-## Workflows
+### Monitoring with Metrics
 
-Define workflows in Python using the `>>` operator to chain tasks:
+Enable Prometheus metrics with a single setting — the SDK exposes poll counts, execution times, error rates, and HTTP latency:
 
 ```python
+from conductor.client.automator.task_handler import TaskHandler
 from conductor.client.configuration.configuration import Configuration
-from conductor.client.orkes_clients import OrkesClients
-from conductor.client.workflow.conductor_workflow import ConductorWorkflow
+from conductor.client.configuration.settings.metrics_settings import MetricsSettings
 
-api_config = Configuration()
-clients = OrkesClients(configuration=api_config)
-workflow_executor = clients.get_workflow_executor()
+config = Configuration()
+metrics = MetricsSettings(directory='/tmp/conductor-metrics', http_port=8000)
 
-workflow = ConductorWorkflow(name='greetings', version=1, executor=workflow_executor)
-# Assuming greet is defined (see Workers section above).
-workflow >> greet(task_ref_name='greet_ref', name=workflow.input('name'))
-# Registering is required if you want to start/execute by name+version; optional if you only execute inline.
-workflow.register(overwrite=True)
+with TaskHandler(configuration=config, metrics_settings=metrics, scan_for_annotated_workers=True) as task_handler:
+    task_handler.start_processes()
+    task_handler.join_processes()
 ```
 
-**Execute workflows:**
-
-```python
-# Synchronous (waits for completion)
-result = workflow_executor.execute(name='greetings', version=1, workflow_input={'name': 'Orkes'})
-print(result.output)
+```shell
+# Prometheus-compatible endpoint
+curl http://localhost:8000/metrics
+```
 
-# Asynchronous (returns workflow ID immediately)
-from conductor.client.http.models import StartWorkflowRequest
-request = StartWorkflowRequest(name='greetings', version=1, input={'name': 'Orkes'})
-workflow_id = workflow_executor.start_workflow(request)
+See [examples/metrics_example.py](examples/metrics_example.py) and [METRICS.md](METRICS.md) for details on all tracked metrics.
 
-# Inline (sends the workflow definition with the request; no prior register required)
-run = workflow.execute(workflow_input={'name': 'Orkes'}, wait_for_seconds=10)
-print(run.output)
-```
+### Managing Workflow Executions
 
-**Manage running workflows and send signals:**
+Full lifecycle control — start, execute, pause, resume, terminate, retry, restart, rerun, signal, and search:
 
 ```python
+from conductor.client.configuration.configuration import Configuration
+from conductor.client.http.models import StartWorkflowRequest, RerunWorkflowRequest, TaskResult
 from conductor.client.orkes_clients import OrkesClients
 
-clients = OrkesClients(configuration=api_config)
+config = Configuration()
+clients = OrkesClients(configuration=config)
 workflow_client = clients.get_workflow_client()
+task_client = clients.get_task_client()
+executor = clients.get_workflow_executor()
 
+# Start async (returns workflow ID immediately)
+workflow_id = executor.start_workflow(StartWorkflowRequest(name='my_workflow', input={'key': 'value'}))
+
+# Execute sync (blocks until workflow completes)
+result = executor.execute(name='my_workflow', version=1, workflow_input={'key': 'value'})
+
+# Lifecycle management
 workflow_client.pause_workflow(workflow_id)
 workflow_client.resume_workflow(workflow_id)
 workflow_client.terminate_workflow(workflow_id, reason='no longer needed')
-workflow_client.retry_workflow(workflow_id)
-workflow_client.restart_workflow(workflow_id)
+workflow_client.retry_workflow(workflow_id)          # retry from last failed task
+workflow_client.restart_workflow(workflow_id)         # restart from the beginning
+workflow_client.rerun_workflow(workflow_id,           # rerun from a specific task
+    RerunWorkflowRequest(re_run_from_task_id=task_id))
+
+# Send a signal to a waiting workflow (complete a WAIT task externally)
+task_client.update_task(TaskResult(
+    workflow_instance_id=workflow_id,
+    task_id=wait_task_id,
+    status='COMPLETED',
+    output_data={'approved': True}
+))
+
+# Search workflows
+results = workflow_client.search(query='status IN (RUNNING) AND correlationId = "order-123"')
 ```
 
-**Learn more:**
-- [Workflow Management](docs/WORKFLOW.md) — Start, pause, resume, terminate, retry, search
-- [Workflow Testing](docs/WORKFLOW_TESTING.md) — Unit testing with mock task outputs
-- [Metadata Management](docs/METADATA.md) — Task & workflow definitions
-
-## Troubleshooting
-
-- Worker stops polling or crashes: `TaskHandler` monitors and restarts worker subprocesses by default. Consider exposing
-  a `/healthcheck` endpoint using `task_handler.is_healthy()` + `task_handler.get_worker_process_status()`. If you
-  enable metrics, alert on `worker_restart_total`.
-- `httpcore.RemoteProtocolError: <ConnectionTerminated ...>`: the SDK recreates the underlying HTTP client and retries
-  once for idempotent requests. If your environment is still unstable with HTTP/2, set
-  `CONDUCTOR_HTTP2_ENABLED=false` (forces HTTP/1.1) — see `docs/WORKER.md`.
-- FastAPI/Uvicorn: avoid running `uvicorn` with multiple web workers unless you explicitly want multiple independent
-  `TaskHandler`s polling Conductor (see `examples/fastapi_worker_service.py`).
+See [examples/workflow_ops.py](examples/workflow_ops.py) for a complete walkthrough of every operation.
+
 ---
+
 ## AI & LLM Workflows
 
 Conductor supports AI-native workflows including agentic tool calling, RAG pipelines, and multi-agent orchestration.
@@ -394,6 +376,23 @@ pip install "markitdown[pdf]"
 python examples/rag_workflow.py document.pdf "What are the key findings?"
 ```
 
+---
+
+## Why Conductor?
+
+| | |
+|---|---|
+| **Language agnostic** | Workers in Python, Java, Go, JS, C# — all in one workflow |
+| **Durable execution** | Survives crashes, retries automatically, never loses state |
+| **Built-in HTTP/Wait/JS tasks** | No code needed for common operations |
+| **Horizontal scaling** | Built at Netflix for millions of workflows |
+| **Full visibility** | UI shows every execution, every task, every retry |
+| **Sync + Async execution** | Start-and-forget OR wait-for-result |
+| **Human-in-the-loop** | WAIT tasks pause until an external signal |
+| **AI-native** | LLM chat, RAG pipelines, function calling, MCP tools built-in |
+
+---
+
 ## Examples
 
 See the [Examples Guide](examples/README.md) for the full catalog. Key examples:
@@ -401,17 +400,16 @@ See the [Examples Guide](examples/README.md) for the full catalog. Key examples:
 | Example | Description | Run |
 |---------|-------------|-----|
 | [workers_e2e.py](examples/workers_e2e.py) | End-to-end: sync + async workers, metrics | `python examples/workers_e2e.py` |
-| [fastapi_worker_service.py](examples/fastapi_worker_service.py) | FastAPI: expose a workflow as an API (+ workers) (deps: fastapi, uvicorn) | `uvicorn examples.fastapi_worker_service:app --port 8081 --workers 1` |
+| [kitchensink.py](examples/kitchensink.py) | All task types (HTTP, JS, JQ, Switch) | `python examples/kitchensink.py` |
+| [workflow_ops.py](examples/workflow_ops.py) | Pause, resume, terminate, retry, restart, rerun, signal | `python examples/workflow_ops.py` |
+| [task_context_example.py](examples/task_context_example.py) | Long-running tasks with TaskInProgress | `python examples/task_context_example.py` |
+| [metrics_example.py](examples/metrics_example.py) | Prometheus metrics collection | `python examples/metrics_example.py` |
+| [fastapi_worker_service.py](examples/fastapi_worker_service.py) | FastAPI: expose a workflow as an API (+ workers) | `uvicorn examples.fastapi_worker_service:app --port 8081 --workers 1` |
 | [helloworld.py](examples/helloworld/helloworld.py) | Minimal hello world | `python examples/helloworld/helloworld.py` |
 | [dynamic_workflow.py](examples/dynamic_workflow.py) | Build workflows programmatically | `python examples/dynamic_workflow.py` |
-| [llm_chat.py](examples/agentic_workflows/llm_chat.py) | AI multi-turn chat | `python examples/agentic_workflows/llm_chat.py` |
-| [rag_workflow.py](examples/rag_workflow.py) | RAG pipeline (PDF → pgvector → answer) | `python examples/rag_workflow.py file.pdf "question"` |
-| [task_context_example.py](examples/task_context_example.py) | Long-running tasks with TaskInProgress | `python examples/task_context_example.py` |
-| [workflow_ops.py](examples/workflow_ops.py) | Pause, resume, terminate workflows | `python examples/workflow_ops.py` |
 | [test_workflows.py](examples/test_workflows.py) | Unit testing workflows | `python -m unittest examples.test_workflows` |
-| [kitchensink.py](examples/kitchensink.py) | All task types (HTTP, JS, JQ, Switch) | `python examples/kitchensink.py` |
 
-## API Journey Examples
+**API Journey Examples**
 
 End-to-end examples covering all APIs for each domain:
 
@@ -441,13 +439,6 @@ End-to-end examples covering all APIs for each domain:
 | [Metrics](METRICS.md) | Prometheus metrics collection |
 | [Examples](examples/README.md) | Complete examples catalog |
 
-## Support
-
-- [Open an issue (SDK)](https://github.com/conductor-sdk/conductor-python/issues) for SDK bugs, questions, and feature requests
-- [Open an issue (Conductor server)](https://github.com/conductor-oss/conductor/issues) for Conductor OSS server issues
-- [Join the Conductor Slack](https://join.slack.com/t/orkes-conductor/shared_invite/zt-2vdbx239s-Eacdyqya9giNLHfrCavfaA) for community discussion and help
-- [Orkes Community Forum](https://community.orkes.io/) for Q&A
-
 ## Frequently Asked Questions
 
 **Is this the same as Netflix Conductor?**
@@ -472,7 +463,7 @@ No. While Conductor excels at asynchronous orchestration, it also supports synch
 
 **Do I need to use a Conductor-specific framework?**
 
-No. Conductor is language and framework agnostic. Use your preferred language and framework -- the [SDKs](https://github.com/conductor-oss/conductor#conductor-sdks) provide native integration for Python, Java, JavaScript, Go, C#, and more.
+No. Conductor is language and framework agnostic. Use your preferred language and framework — the [SDKs](https://github.com/conductor-oss/conductor#conductor-sdks) provide native integration for Python, Java, JavaScript, Go, C#, and more.
 
 **Can I mix workers written in different languages?**
 
@@ -484,16 +475,23 @@ Python 3.9 and above.
 
 **Should I use `def` or `async def` for my workers?**
 
-Use `async def` for I/O-bound tasks (API calls, database queries) -- the SDK uses `AsyncTaskRunner` with a single event loop for high concurrency with low overhead. Use regular `def` for CPU-bound or blocking work -- the SDK uses `TaskRunner` with a thread pool. The SDK selects the right runner automatically based on your function signature.
+Use `async def` for I/O-bound tasks (API calls, database queries) — the SDK uses `AsyncTaskRunner` with a single event loop for high concurrency with low overhead. Use regular `def` for CPU-bound or blocking work — the SDK uses `TaskRunner` with a thread pool. The SDK selects the right runner automatically based on your function signature.
 
 **How do I run workers in production?**
 
-Workers are standard Python processes. Deploy them as you would any Python application -- in containers, VMs, or bare metal. Workers poll the Conductor server for tasks, so no inbound ports need to be opened. See [Worker Design](docs/design/WORKER_DESIGN.md) for architecture details.
+Workers are standard Python processes. Deploy them as you would any Python application — in containers, VMs, or bare metal. Workers poll the Conductor server for tasks, so no inbound ports need to be opened. See [Worker Design](docs/design/WORKER_DESIGN.md) for architecture details.
 
 **How do I test workflows without running a full Conductor server?**
 
 The SDK provides a test framework that uses Conductor's `POST /api/workflow/test` endpoint to evaluate workflows with mock task outputs. See [Workflow Testing](docs/WORKFLOW_TESTING.md) for details.
 
+## Support
+
+- [Open an issue (SDK)](https://github.com/conductor-sdk/conductor-python/issues) for SDK bugs, questions, and feature requests
+- [Open an issue (Conductor server)](https://github.com/conductor-oss/conductor/issues) for Conductor OSS server issues
+- [Join the Conductor Slack](https://join.slack.com/t/orkes-conductor/shared_invite/zt-2vdbx239s-Eacdyqya9giNLHfrCavfaA) for community discussion and help
+- [Orkes Community Forum](https://community.orkes.io/) for Q&A
+
 ## License
 
 Apache 2.0
diff --git a/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md b/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md
index fdd81937b..3f28504df 100644
--- a/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md
+++ b/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md
@@ -203,6 +203,10 @@ Initialize → Register Task Def → Start Polling → Execute Tasks → Update
    - Manages lifecycle (start, stop, restart)
    - Provides configuration to workers
    - Coordinates metrics collection
+   - Monitors and auto-restarts crashed worker processes (see Section 4.4)
+   - Provides health check APIs for container orchestrators
+   - Supports `import_modules` to force-import modules before scanning for decorated workers
+   - Implements context manager protocol (`with TaskHandler(...) as th:`) for clean lifecycle
 
 2. **TaskRunner (Execution Engine)**
    - Runs in worker process
@@ -241,11 +245,20 @@ class TaskHandler {
     MetricsSettings metricsSettings
     List<EventListener> eventListeners
 
+    // Supervision settings
+    Bool monitorProcesses       // default: true
+    Bool restartOnFailure       // default: true
+    Int restartMaxAttempts      // default: 0 (unlimited)
+    Float restartBackoffSeconds // default: 5.0
+    Float restartBackoffMaxSeconds // default: 300.0
+
     // Methods
     discover_workers() → List<Worker>
     start_processes()
     stop_processes()
     join_processes()
+    is_healthy() → Bool
+    get_worker_process_status() → Map<String, ProcessStatus>
 }
 
 // Worker metadata
@@ -263,6 +276,7 @@ class Worker {
     Bool overwriteTaskDef
     Bool strictSchema
     Bool paused
+    Bool leaseExtendEnabled  // Auto-extend task lease for long-running tasks
 }
 
 // Execution engine (one per worker process)
@@ -340,6 +354,103 @@ FUNCTION detect_worker_type(worker_function):
 - Rust: Check for `async fn` keyword
 - JavaScript/TypeScript: Check for `async function`
 
+### 4.4 Worker Process Supervision
+
+**Key Principle:** Worker processes must be monitored and auto-restarted on failure for production reliability.
+
+**Architecture:**
+- TaskHandler spawns a background monitor thread after `start_processes()`
+- Monitor thread periodically checks if each worker process is alive
+- Dead processes are restarted with exponential backoff to prevent crash loops
+
+```
+FUNCTION start_monitor():
+    IF not monitor_processes:
+        RETURN
+
+    // Spawn background thread
+    monitor_thread = Thread(target=monitor_loop, daemon=True)
+    monitor_thread.start()
+
+FUNCTION monitor_loop():
+    WHILE not shutdown_requested:
+        check_and_restart_processes()
+        sleep(monitor_interval_seconds)  // default: 5s
+
+FUNCTION check_and_restart_processes():
+    LOCK process_lock:
+        FOR i, process IN enumerate(worker_processes):
+            IF process.is_alive():
+                CONTINUE
+
+            exitcode = process.exitcode
+            worker_name = workers[i].task_definition_name
+
+            log_warning("Worker process exited (worker={worker_name}, exitcode={exitcode})")
+
+            IF not restart_on_failure:
+                CONTINUE
+
+            restart_worker_process(i)
+
+FUNCTION restart_worker_process(index: Int):
+    // Enforce max attempts (0 = unlimited)
+    IF restart_max_attempts > 0 AND restart_counts[index] >= restart_max_attempts:
+        log_error("Max restart attempts reached for worker {worker_name}")
+        RETURN
+
+    // Exponential backoff per-worker to prevent tight crash loops
+    now = current_time()
+    IF now < next_restart_at[index]:
+        RETURN  // Still in backoff period
+
+    backoff = min(
+        restart_backoff_seconds * (2 ^ restart_counts[index]),
+        restart_backoff_max_seconds
+    )
+    next_restart_at[index] = now + backoff
+
+    // Reap old process (avoid zombie accumulation)
+    old_process.join(timeout=0)
+    old_process.close()
+
+    // Spawn new process
+    new_process = build_process_for_worker(workers[index])
+    worker_processes[index] = new_process
+    new_process.start()
+    restart_counts[index] += 1
+
+    // Metrics
+    increment_metric("worker_restart_total", {task_type: worker_name})
+
+    log_info("Restarted worker (worker={worker_name}, attempt={restart_counts[index]}, backoff={backoff}s)")
+```
+
+**Configuration:**
+
+| Property | Type | Default | Description |
+|----------|------|---------|-------------|
+| `monitor_processes` | Bool | true | Enable process supervision |
+| `restart_on_failure` | Bool | true | Auto-restart crashed workers |
+| `restart_max_attempts` | Int | 0 | Max restarts per worker (0 = unlimited) |
+| `restart_backoff_seconds` | Float | 5.0 | Initial backoff before restart |
+| `restart_backoff_max_seconds` | Float | 300.0 | Maximum backoff cap |
+| `monitor_interval_seconds` | Float | 5.0 | Health check interval |
+
+**Health Check API:**
+
+```
+FUNCTION is_healthy() → Bool:
+    FOR process IN worker_processes:
+        IF not process.is_alive():
+            RETURN false
+    RETURN true
+
+FUNCTION get_worker_process_status() → Map<String, ProcessStatus>:
+    // Returns per-worker status: alive, pid, exitcode, restart_count
+    // Useful for /healthcheck endpoints in web frameworks
+```
+
 ---
 
 ## 5. Polling & Execution Loop
@@ -415,7 +526,7 @@ FUNCTION batch_poll(count: Int) → List<Task>:
     params = {
         "workerid": worker_id,
         "count": count,
-        "timeout": 100  // ms, server-side long poll
+        "timeout": worker.poll_timeout  // ms, server-side long poll (default: 100)
     }
 
     // Only include domain if not null/empty
@@ -624,12 +735,66 @@ FUNCTION update_task(task_result: TaskResult):
 
 **Why This Matters:** Task was executed successfully, but Conductor doesn't know. External systems must handle recovery.
 
+### 5.5b v2 Update Endpoint & Task Chaining (Optimization)
+
+**Key Principle:** The v2 update endpoint returns the next task to process, eliminating a round-trip poll.
+
+Instead of the pattern: execute → update → poll → execute → update → poll, the v2 endpoint enables: execute → update+poll → execute → update+poll.
+
+```
+FUNCTION update_task_v2(task_result: TaskResult) → Task | null:
+    // Same retry logic as update_task (Section 5.5)
+    // BUT: response is a Task object (the next task to process) or null
+
+    FOR attempt IN [0, 1, 2, 3]:
+        IF attempt > 0:
+            sleep(attempt * 10 seconds)
+
+        TRY:
+            next_task = http_client.update_task_v2(task_result)
+            RETURN next_task  // May be null if no pending tasks
+
+        CATCH Exception:
+            // Same retry logic as v1
+
+    RETURN null
+```
+
+**Execute-Update Loop:**
+
+```
+FUNCTION execute_and_update_task(task: Task):
+    // Tight loop: execute → update_v2 (get next) → execute → ...
+    WHILE task is not null AND not shutdown:
+        result = execute_task(task)
+
+        // TaskInProgress or async: stop chaining
+        IF result is null OR result is TaskInProgress:
+            RETURN
+
+        // Update AND get next task in one call
+        task = update_task_v2(result)
+```
+
+**Benefits:**
+- ~50% fewer HTTP round-trips under load (update + poll combined)
+- Lower latency between consecutive tasks
+- Backward compatible: falls back to normal polling when v2 returns null
+
+**HTTP Endpoint:**
+```
+POST /api/tasks/update-v2
+Body: TaskResult (JSON)
+Response: Task | null (next task to process for same task type)
+```
+
 ### 5.6 Capacity Management
 
 **Key Principle:** Capacity represents end-to-end task handling (execute + update)
 
+**Async Workers (Explicit Semaphore):**
 ```
-// Semaphore/capacity held during BOTH execute and update
+// Semaphore held during BOTH execute and update
 FUNCTION execute_and_update_task(task: Task):
     ACQUIRE semaphore:  // Blocks if at capacity
         result = execute_task(task)
@@ -641,7 +806,23 @@ FUNCTION execute_and_update_task(task: Task):
         // Only then can new task be polled
 ```
 
-**Why:** Ensures we don't poll more tasks than we can fully handle (execute AND update).
+**Sync Workers (Implicit via Thread Pool):**
+```
+// Thread pool naturally provides capacity management.
+// Each thread runs execute_and_update_task — the thread stays
+// occupied during BOTH execute and update, so the pool size
+// (= thread_count) limits concurrency without an explicit semaphore.
+FUNCTION execute_and_update_task(task: Task):
+    // Runs inside ThreadPoolExecutor(max_workers=thread_count)
+    result = execute_task(task)
+
+    IF result is not TaskInProgress:
+        update_task(result)
+
+    // Thread returns to pool — capacity slot freed
+```
+
+**Why:** Ensures we don't poll more tasks than we can fully handle (execute AND update). Both approaches achieve the same goal — async uses explicit semaphore, sync uses thread pool sizing.
 
 ---
 
@@ -667,6 +848,7 @@ FUNCTION execute_and_update_task(task: Task):
 | `overwrite_task_def` | Bool | true | Overwrite existing task definitions |
 | `strict_schema` | Bool | false | Enforce strict JSON schema validation |
 | `paused` | Bool | false | Pause worker (stop polling) |
+| `lease_extend_enabled` | Bool | false | Auto-extend task lease for long-running tasks (alternative to TaskInProgress) |
 
 ### 6.3 Environment Variable Format
 
@@ -1144,6 +1326,8 @@ FUNCTION reset_auth_failures():
     auth_failures = 0
 ```
 
+**When to Reset:** Auth failures should be reset when a poll succeeds (200 response), regardless of whether tasks were returned. A successful HTTP response means authentication is working.
+
 ### 9.3 Adaptive Backoff for Empty Polls
 
 ```
@@ -1440,7 +1624,7 @@ Query Params:
 Response: List<Task>
 ```
 
-**Update Task:**
+**Update Task (v1):**
 ```
 POST /api/tasks
 Body: TaskResult (JSON)
@@ -1448,6 +1632,15 @@ Body: TaskResult (JSON)
 Response: string (task status)
 ```
 
+**Update Task (v2) — Recommended:**
+```
+POST /api/tasks/update-v2
+Body: TaskResult (JSON)
+
+Response: Task | null (next task to process for same task type)
+```
+The v2 endpoint combines update + poll: it updates the current task result and returns the next pending task (if any) for the same task type. This enables the execute-update loop optimization described in Section 5.5b.
+
 **Register Task Definition:**
 ```
 POST /api/metadata/taskdefs
@@ -1782,7 +1975,12 @@ FUNCTION validate_and_process_order(order_id: String) → Result:
 
 ### 16.2 Long-Running Tasks (TaskInProgress)
 
-**Pattern:** Return TaskInProgress to extend lease
+**Two approaches for long-running tasks:**
+
+1. **TaskInProgress (manual):** Worker returns `TaskInProgress` to re-queue itself with a callback delay. Best for tasks that need incremental progress tracking.
+2. **Lease Extension (automatic):** Set `lease_extend_enabled=true` on the worker — the SDK automatically extends the task lease periodically. Best for tasks that run continuously without needing poll-based progress.
+
+**Pattern 1: TaskInProgress — Return to re-queue**
 
 ```
 class TaskInProgress {
diff --git a/pyproject.toml b/pyproject.toml
index 9f88cb7cf..35756df8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -153,6 +153,9 @@ line-ending = "auto"
 "tests/**/*.py" = ["B", "C4", "SIM"]
 "examples/**/*.py" = ["B", "C4", "SIM"]
 
+[tool.pytest.ini_options]
+pythonpath = ["src"]
+
 [tool.coverage.run]
 source = ["src/conductor"]
 omit = [
diff --git a/src/conductor/__init__.py b/src/conductor/__init__.py
index 85edbea80..4b1a9c969 100644
--- a/src/conductor/__init__.py
+++ b/src/conductor/__init__.py
@@ -1 +1,6 @@
+from __future__ import annotations
+
+from pkgutil import extend_path
+
+__path__ = extend_path(__path__, __name__)
 __version__ = "1.1.10"
diff --git a/src/conductor/client/automator/async_task_runner.py b/src/conductor/client/automator/async_task_runner.py
index 85410c7b6..97c2f1ac5 100644
--- a/src/conductor/client/automator/async_task_runner.py
+++ b/src/conductor/client/automator/async_task_runner.py
@@ -212,9 +212,9 @@ async def __async_register_task_definition(self) -> None:
         """
         task_name = self.worker.get_task_definition_name()
 
-        logger.info("=" * 80)
-        logger.info(f"Registering task definition: {task_name}")
-        logger.info("=" * 80)
+        logger.debug("=" * 80)
+        logger.debug(f"Registering task definition: {task_name}")
+        logger.debug("=" * 80)
 
         try:
             # Create metadata client (sync client works in async context)
@@ -227,7 +227,7 @@ async def __async_register_task_definition(self) -> None:
             schema_registry_available = True
 
             if hasattr(self.worker, 'execute_function'):
-                logger.info(f"Generating JSON schemas from function signature...")
+                logger.debug(f"Generating JSON schemas from function signature...")
                 # Pass strict_schema flag to control additionalProperties
                 strict_mode = getattr(self.worker, 'strict_schema', False)
                 logger.debug(f"  strict_schema mode: {strict_mode}")
@@ -238,9 +238,9 @@ async def __async_register_task_definition(self) -> None:
                     has_output_schema = schemas.get('output') is not None
 
                     if has_input_schema or has_output_schema:
-                        logger.info(f"  ✓ Generated schemas: input={'Yes' if has_input_schema else 'No'}, output={'Yes' if has_output_schema else 'No'}")
+                        logger.debug(f"  ✓ Generated schemas: input={'Yes' if has_input_schema else 'No'}, output={'Yes' if has_output_schema else 'No'}")
                     else:
-                        logger.info(f"  ⚠ No schemas generated (type hints not fully supported)")
+                        logger.debug(f"  ⚠ No schemas generated (type hints not fully supported)")
                     # Register schemas with schema client
                     try:
                         logger.debug(f"Creating schema client...")
@@ -253,7 +253,7 @@ async def __async_register_task_definition(self) -> None:
                         schema_client = None
 
                     if schema_registry_available and schema_client:
-                        logger.info(f"Registering JSON schemas...")
+                        logger.debug(f"Registering JSON schemas...")
                         try:
                             # Register input schema
                             if schemas.get('input'):
@@ -267,7 +267,7 @@ async def __async_register_task_definition(self) -> None:
                                         data=schemas['input']
                                     )
                                     schema_client.register_schema(input_schema_def)
-                                    logger.info(f"  ✓ Registered input schema: {input_schema_name} (v1)")
+                                    logger.debug(f"  ✓ Registered input schema: {input_schema_name} (v1)")
 
                                 except Exception as e:
                                     # Check if this is a 404 (API endpoint doesn't exist on server)
@@ -292,7 +292,7 @@ async def __async_register_task_definition(self) -> None:
                                         data=schemas['output']
                                     )
                                     schema_client.register_schema(output_schema_def)
-                                    logger.info(f"  ✓ Registered output schema: {output_schema_name} (v1)")
+                                    logger.debug(f"  ✓ Registered output schema: {output_schema_name} (v1)")
 
                                 except Exception as e:
                                     # Check if this is a 404 (API endpoint doesn't exist on server)
@@ -307,12 +307,12 @@ async def __async_register_task_definition(self) -> None:
                         except Exception as e:
                             logger.debug(f"Could not register schemas for {task_name}: {e}")
                 else:
-                    logger.info(f"  ⚠ No schemas generated (unable to analyze function signature)")
+                    logger.debug(f"  ⚠ No schemas generated (unable to analyze function signature)")
             else:
-                logger.info(f"  ⚠ Class-based worker (no execute_function) - registering task without schemas")
+                logger.debug(f"  ⚠ Class-based worker (no execute_function) - registering task without schemas")
 
             # Create task definition
-            logger.info(f"Creating task definition for '{task_name}'...")
+            logger.debug(f"Creating task definition for '{task_name}'...")
 
             # Check if task_def_template is provided
             logger.debug(f"  task_def_template present: {hasattr(self.worker, 'task_def_template')}")
@@ -321,7 +321,7 @@ async def __async_register_task_definition(self) -> None:
 
             # Use provided task_def template if available, otherwise create minimal TaskDef
             if hasattr(self.worker, 'task_def_template') and self.worker.task_def_template:
-                logger.info(f"  Using provided TaskDef configuration:")
+                logger.debug(f"  Using provided TaskDef configuration:")
 
                 # Create a copy to avoid mutating the original
                 import copy
@@ -332,22 +332,22 @@ async def __async_register_task_definition(self) -> None:
 
                 # Log configuration being applied
                 if task_def.retry_count:
-                    logger.info(f"    - retry_count: {task_def.retry_count}")
+                    logger.debug(f"    - retry_count: {task_def.retry_count}")
                 if task_def.retry_logic:
-                    logger.info(f"    - retry_logic: {task_def.retry_logic}")
+                    logger.debug(f"    - retry_logic: {task_def.retry_logic}")
                 if task_def.timeout_seconds:
-                    logger.info(f"    - timeout_seconds: {task_def.timeout_seconds}")
+                    logger.debug(f"    - timeout_seconds: {task_def.timeout_seconds}")
                 if task_def.timeout_policy:
-                    logger.info(f"    - timeout_policy: {task_def.timeout_policy}")
+                    logger.debug(f"    - timeout_policy: {task_def.timeout_policy}")
                 if task_def.response_timeout_seconds:
-                    logger.info(f"    - response_timeout_seconds: {task_def.response_timeout_seconds}")
+                    logger.debug(f"    - response_timeout_seconds: {task_def.response_timeout_seconds}")
                 if task_def.concurrent_exec_limit:
-                    logger.info(f"    - concurrent_exec_limit: {task_def.concurrent_exec_limit}")
+                    logger.debug(f"    - concurrent_exec_limit: {task_def.concurrent_exec_limit}")
                 if task_def.rate_limit_per_frequency:
-                    logger.info(f"    - rate_limit: {task_def.rate_limit_per_frequency}/{task_def.rate_limit_frequency_in_seconds}s")
+                    logger.debug(f"    - rate_limit: {task_def.rate_limit_per_frequency}/{task_def.rate_limit_frequency_in_seconds}s")
             else:
                 # Create minimal task definition
-                logger.info(f"  Creating minimal TaskDef (no custom configuration)")
+                logger.debug(f"  Creating minimal TaskDef (no custom configuration)")
                 task_def = TaskDef(name=task_name)
 
             # Link schemas if they were generated (overrides any schemas in task_def_template)
@@ -383,7 +383,7 @@ async def __async_register_task_definition(self) -> None:
                         existing = metadata_client.get_task_def(task_name)
                         if existing:
                             logger.info(f"✓ Task definition '{task_name}' already exists - skipping (overwrite=False)")
-                            logger.info(f"  View at: {self.configuration.ui_host}/taskDef/{task_name}")
+                            logger.debug(f"  View at: {self.configuration.ui_host}/taskDef/{task_name}")
                             return
                     except Exception:
                         # Task doesn't exist, proceed to register
@@ -393,7 +393,7 @@ async def __async_register_task_definition(self) -> None:
                 # Print success message with link
                 task_def_url = f"{self.configuration.ui_host}/taskDef/{task_name}"
                 logger.info(f"✓ Registered/Updated task definition: {task_name} with {task_def.to_dict()}")
-                logger.info(f"  View at: {task_def_url}")
+                logger.debug(f"  View at: {task_def_url}")
 
                 if input_schema_name or output_schema_name:
                     schema_count = sum([1 for s in [input_schema_name, output_schema_name] if s])
@@ -406,7 +406,7 @@ async def __async_register_task_definition(self) -> None:
 
                     task_def_url = f"{self.configuration.ui_host}/taskDef/{task_name}"
                     logger.info(f"✓ Registered task definition: {task_name}")
-                    logger.info(f"  View at: {task_def_url}")
+                    logger.debug(f"  View at: {task_def_url}")
 
                     if input_schema_name or output_schema_name:
                         schema_count = sum([1 for s in [input_schema_name, output_schema_name] if s])
@@ -519,9 +519,8 @@ async def __async_batch_poll(self, count: int) -> list:
                 tasks_received=len(tasks) if tasks else 0
             ))
 
-            # Success - reset auth failure counter
-            if tasks:
-                self._auth_failures = 0
+            # Success - reset auth failure counter (any successful HTTP response means auth is working)
+            self._auth_failures = 0
 
             return tasks if tasks else []
 
@@ -564,18 +563,26 @@ async def __async_batch_poll(self, count: int) -> list:
             return []
 
     async def __async_execute_and_update_task(self, task: Task) -> None:
-        """Execute task and update result (async version - runs in event loop, not thread pool)."""
+        """Execute task and update result in a tight loop (async version).
+
+        Uses the v2 update endpoint which returns the next task to process.
+        Loops: execute -> update_v2 (get next task) -> execute -> ...
+        The semaphore is held for the entire loop duration, keeping the slot occupied.
+        """
         # Acquire semaphore for entire task lifecycle (execution + update)
         # This ensures we never exceed thread_count tasks in any stage of processing
         async with self._semaphore:
             try:
-                task_result = await self.__async_execute_task(task)
-                # If task returned TaskInProgress, don't update yet
-                if isinstance(task_result, TaskInProgress):
-                    logger.debug("Task %s is in progress, will update when complete", task.task_id)
-                    return
-                if task_result is not None:
-                    await self.__async_update_task(task_result)
+                while task is not None and not self._shutdown:
+                    task_result = await self.__async_execute_task(task)
+                    # If task returned TaskInProgress, don't update yet
+                    if isinstance(task_result, TaskInProgress):
+                        logger.debug("Task %s is in progress, will update when complete", task.task_id)
+                        return
+                    if task_result is None:
+                        return
+                    # Update task and get next task from v2 response
+                    task = await self.__async_update_task(task_result)
             except Exception as e:
                 logger.error(
                     "Error executing/updating task %s: %s",
@@ -786,7 +793,7 @@ def __merge_context_modifications(self, task_result: TaskResult, context_result:
                 task_result.output_data = context_result.output_data
 
     async def __async_update_task(self, task_result: TaskResult):
-        """Async update task result (async version of TaskRunner.__update_task)."""
+        """Async update task result using v2 endpoint. Returns the next Task to process, or None."""
         if not isinstance(task_result, TaskResult):
             return None
         task_definition_name = self.worker.get_task_definition_name()
@@ -808,15 +815,15 @@ async def __async_update_task(self, task_result: TaskResult):
                 # Exponential backoff: [10s, 20s, 30s] before retry
                 await asyncio.sleep(attempt * 10)
             try:
-                response = await self.async_task_client.update_task(body=task_result)
+                next_task = await self.async_task_client.update_task_v2(body=task_result)
                 logger.debug(
-                    "Updated async task, id: %s, workflow_instance_id: %s, task_definition_name: %s, response: %s",
+                    "Updated async task (v2), id: %s, workflow_instance_id: %s, task_definition_name: %s, next_task: %s",
                     task_result.task_id,
                     task_result.workflow_instance_id,
                     task_definition_name,
-                    response
+                    next_task.task_id if next_task else None
                 )
-                return response
+                return next_task
             except Exception as e:
                 last_exception = e
                 if self.metrics_collector is not None:
diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py
index 08ef6961b..ff43ecbc4 100644
--- a/src/conductor/client/automator/task_handler.py
+++ b/src/conductor/client/automator/task_handler.py
@@ -556,6 +556,16 @@ def __stop_process(self, process: Process):
         if process is None:
             return
         try:
+            # If the process was never started (pid is None), there's nothing to terminate.
+            if getattr(process, "pid", None) is None:
+                return
+            if not process.is_alive():
+                # Already stopped; attempt a short join to reap resources and return.
+                try:
+                    process.join(timeout=0.1)
+                except Exception:
+                    pass
+                return
             logger.debug("Terminating process: %s", process.pid)
             process.terminate()
             # Wait for graceful termination
diff --git a/src/conductor/client/automator/task_runner.py b/src/conductor/client/automator/task_runner.py
index 6165d0d94..16c8f4323 100644
--- a/src/conductor/client/automator/task_runner.py
+++ b/src/conductor/client/automator/task_runner.py
@@ -180,9 +180,9 @@ def __register_task_definition(self) -> None:
         """
         task_name = self.worker.get_task_definition_name()
 
-        logger.info("=" * 80)
-        logger.info(f"Registering task definition: {task_name}")
-        logger.info("=" * 80)
+        logger.debug("=" * 80)
+        logger.debug(f"Registering task definition: {task_name}")
+        logger.debug("=" * 80)
 
         try:
             # Create metadata client
@@ -195,7 +195,7 @@ def __register_task_definition(self) -> None:
             schema_registry_available = True
 
             if hasattr(self.worker, 'execute_function'):
-                logger.info(f"Generating JSON schemas from function signature...")
+                logger.debug(f"Generating JSON schemas from function signature...")
                 # Pass strict_schema flag to control additionalProperties
                 strict_mode = getattr(self.worker, 'strict_schema', False)
                 logger.debug(f"  strict_schema mode: {strict_mode}")
@@ -206,9 +206,9 @@ def __register_task_definition(self) -> None:
                     has_output_schema = schemas.get('output') is not None
 
                     if has_input_schema or has_output_schema:
-                        logger.info(f"  ✓ Generated schemas: input={'Yes' if has_input_schema else 'No'}, output={'Yes' if has_output_schema else 'No'}")
+                        logger.debug(f"  ✓ Generated schemas: input={'Yes' if has_input_schema else 'No'}, output={'Yes' if has_output_schema else 'No'}")
                     else:
-                        logger.info(f"  ⚠ No schemas generated (type hints not fully supported)")
+                        logger.debug(f"  ⚠ No schemas generated (type hints not fully supported)")
 
                     # Register schemas with schema client
                     try:
@@ -222,7 +222,7 @@ def __register_task_definition(self) -> None:
                         schema_client = None
 
                     if schema_registry_available and schema_client:
-                        logger.info(f"Registering JSON schemas...")
+                        logger.debug(f"Registering JSON schemas...")
                         try:
                             # Register input schema
                             if schemas.get('input'):
@@ -236,7 +236,7 @@ def __register_task_definition(self) -> None:
                                         data=schemas['input']
                                     )
                                     schema_client.register_schema(input_schema_def)
-                                    logger.info(f"  ✓ Registered input schema: {input_schema_name} (v1)")
+                                    logger.debug(f"  ✓ Registered input schema: {input_schema_name} (v1)")
 
                                 except Exception as e:
                                     # Check if this is a 404 (API endpoint doesn't exist on server)
@@ -261,7 +261,7 @@ def __register_task_definition(self) -> None:
                                         data=schemas['output']
                                     )
                                     schema_client.register_schema(output_schema_def)
-                                    logger.info(f"  ✓ Registered output schema: {output_schema_name} (v1)")
+                                    logger.debug(f"  ✓ Registered output schema: {output_schema_name} (v1)")
 
                                 except Exception as e:
                                     # Check if this is a 404 (API endpoint doesn't exist on server)
@@ -276,12 +276,12 @@ def __register_task_definition(self) -> None:
                         except Exception as e:
                             logger.debug(f"Could not register schemas for {task_name}: {e}")
                 else:
-                    logger.info(f"  ⚠ No schemas generated (unable to analyze function signature)")
+                    logger.debug(f"  ⚠ No schemas generated (unable to analyze function signature)")
             else:
-                logger.info(f"  ⚠ Class-based worker (no execute_function) - registering task without schemas")
+                logger.debug(f"  ⚠ Class-based worker (no execute_function) - registering task without schemas")
 
             # Create task definition
-            logger.info(f"Creating task definition for '{task_name}'...")
+            logger.debug(f"Creating task definition for '{task_name}'...")
 
             # Check if task_def_template is provided
             logger.debug(f"  task_def_template present: {hasattr(self.worker, 'task_def_template')}")
@@ -290,7 +290,7 @@ def __register_task_definition(self) -> None:
 
             # Use provided task_def template if available, otherwise create minimal TaskDef
             if hasattr(self.worker, 'task_def_template') and self.worker.task_def_template:
-                logger.info(f"  Using provided TaskDef configuration:")
+                logger.debug(f"  Using provided TaskDef configuration:")
 
                 # Create a copy to avoid mutating the original
                 import copy
@@ -301,22 +301,22 @@ def __register_task_definition(self) -> None:
 
                 # Log configuration being applied
                 if task_def.retry_count:
-                    logger.info(f"    - retry_count: {task_def.retry_count}")
+                    logger.debug(f"    - retry_count: {task_def.retry_count}")
                 if task_def.retry_logic:
-                    logger.info(f"    - retry_logic: {task_def.retry_logic}")
+                    logger.debug(f"    - retry_logic: {task_def.retry_logic}")
                 if task_def.timeout_seconds:
-                    logger.info(f"    - timeout_seconds: {task_def.timeout_seconds}")
+                    logger.debug(f"    - timeout_seconds: {task_def.timeout_seconds}")
                 if task_def.timeout_policy:
-                    logger.info(f"    - timeout_policy: {task_def.timeout_policy}")
+                    logger.debug(f"    - timeout_policy: {task_def.timeout_policy}")
                 if task_def.response_timeout_seconds:
-                    logger.info(f"    - response_timeout_seconds: {task_def.response_timeout_seconds}")
+                    logger.debug(f"    - response_timeout_seconds: {task_def.response_timeout_seconds}")
                 if task_def.concurrent_exec_limit:
-                    logger.info(f"    - concurrent_exec_limit: {task_def.concurrent_exec_limit}")
+                    logger.debug(f"    - concurrent_exec_limit: {task_def.concurrent_exec_limit}")
                 if task_def.rate_limit_per_frequency:
-                    logger.info(f"    - rate_limit: {task_def.rate_limit_per_frequency}/{task_def.rate_limit_frequency_in_seconds}s")
+                    logger.debug(f"    - rate_limit: {task_def.rate_limit_per_frequency}/{task_def.rate_limit_frequency_in_seconds}s")
             else:
                 # Create minimal task definition
-                logger.info(f"  Creating minimal TaskDef (no custom configuration)")
+                logger.debug(f"  Creating minimal TaskDef (no custom configuration)")
                 task_def = TaskDef(name=task_name)
 
             # Link schemas if they were generated (overrides any schemas in task_def_template)
@@ -352,7 +352,7 @@ def __register_task_definition(self) -> None:
                         existing = metadata_client.get_task_def(task_name)
                         if existing:
                             logger.info(f"✓ Task definition '{task_name}' already exists - skipping (overwrite=False)")
-                            logger.info(f"  View at: {self.configuration.ui_host}/taskDef/{task_name}")
+                            logger.debug(f"  View at: {self.configuration.ui_host}/taskDef/{task_name}")
                             return
                     except Exception:
                         # Task doesn't exist, proceed to register
@@ -362,7 +362,7 @@ def __register_task_definition(self) -> None:
                 # Print success message with link
                 task_def_url = f"{self.configuration.ui_host}/taskDef/{task_name}"
                 logger.info(f"✓ Registered/Updated task definition: {task_name} with {task_def.to_dict()}")
-                logger.info(f"  View at: {task_def_url}")
+                logger.debug(f"  View at: {task_def_url}")
 
                 if input_schema_name or output_schema_name:
                     schema_count = sum([1 for s in [input_schema_name, output_schema_name] if s])
@@ -375,7 +375,7 @@ def __register_task_definition(self) -> None:
 
                     task_def_url = f"{self.configuration.ui_host}/taskDef/{task_name}"
                     logger.info(f"✓ Registered task definition: {task_name}")
-                    logger.info(f"  View at: {task_def_url}")
+                    logger.debug(f"  View at: {task_def_url}")
 
                     if input_schema_name or output_schema_name:
                         schema_count = sum([1 for s in [input_schema_name, output_schema_name] if s])
@@ -488,8 +488,13 @@ def __check_completed_async_tasks(self) -> None:
                     output_size_bytes=output_size
                 ))
 
-                update_response = self.__update_task(task_result)
-                logger.debug("Successfully updated async task %s with output %s, response: %s", task_id, task_result.output_data, update_response)
+                next_task = self.__update_task(task_result)
+                logger.debug("Successfully updated async task %s with output %s, next_task: %s", task_id, task_result.output_data, next_task.task_id if next_task else None)
+
+                # If v2 returned a next task, submit it to the executor
+                if next_task is not None and next_task.task_id:
+                    future = self._executor.submit(self.__execute_and_update_task, next_task)
+                    self._running_tasks.add(future)
             except Exception as e:
                 logger.error(
                     "Error updating completed async task %s: %s",
@@ -498,19 +503,26 @@ def __check_completed_async_tasks(self) -> None:
                 )
 
     def __execute_and_update_task(self, task: Task) -> None:
-        """Execute task and update result (runs in thread pool)"""
+        """Execute task and update result in a tight loop (runs in thread pool).
+
+        Uses the v2 update endpoint which returns the next task to process.
+        Loops: execute -> update_v2 (get next task) -> execute -> ...
+        The loop breaks when no next task is available, the task is async/in-progress,
+        or shutdown is requested.
+        """
         try:
-            task_result = self.__execute_task(task)
-            # If task returned None, it's an async task running in background - don't update yet
-            # (Note: __execute_task returns None for async tasks, regardless of their actual return value)
-            if task_result is None:
-                logger.debug("Task %s is running async, will update when complete", task.task_id)
-                return
-            # If task returned TaskInProgress, it's running async - don't update yet
-            if isinstance(task_result, TaskInProgress):
-                logger.debug("Task %s is in progress, will update when complete", task.task_id)
-                return
-            self.__update_task(task_result)
+            while task is not None and not self._shutdown:
+                task_result = self.__execute_task(task)
+                # If task returned None, it's an async task running in background - don't update yet
+                if task_result is None:
+                    logger.debug("Task %s is running async, will update when complete", task.task_id)
+                    return
+                # If task returned TaskInProgress, it's running async - don't update yet
+                if isinstance(task_result, TaskInProgress):
+                    logger.debug("Task %s is in progress, will update when complete", task.task_id)
+                    return
+                # Update task and get next task from v2 response
+                task = self.__update_task(task_result)
         except Exception as e:
             logger.error(
                 "Error executing/updating task %s: %s",
@@ -565,9 +577,8 @@ def __batch_poll_tasks(self, count: int) -> list:
                 tasks_received=len(tasks) if tasks else 0
             ))
 
-            # Success - reset auth failure counter
-            if tasks:
-                self._auth_failures = 0
+            # Success - reset auth failure counter (any successful HTTP response means auth is working)
+            self._auth_failures = 0
 
             return tasks if tasks else []
 
@@ -609,87 +620,6 @@ def __batch_poll_tasks(self, count: int) -> list:
             )
             return []
 
-    def __poll_task(self) -> Task:
-        task_definition_name = self.worker.get_task_definition_name()
-        if self.worker.paused:
-            logger.debug("Stop polling task for: %s", task_definition_name)
-            return None
-
-        # Apply exponential backoff if we have recent auth failures
-        if self._auth_failures > 0:
-            now = time.time()
-            # Exponential backoff: 2^failures seconds (2s, 4s, 8s, 16s, 32s)
-            backoff_seconds = min(2 ** self._auth_failures, 60)  # Cap at 60s
-            time_since_last_failure = now - self._last_auth_failure
-
-            if time_since_last_failure < backoff_seconds:
-                # Still in backoff period - skip polling
-                time.sleep(0.1)  # Small sleep to prevent tight loop
-                return None
-
-        if self.metrics_collector is not None:
-            self.metrics_collector.increment_task_poll(
-                task_definition_name
-            )
-
-        try:
-            start_time = time.time()
-            domain = self.worker.get_domain()
-            params = {"workerid": self.worker.get_identity()}
-            # Only add domain if it's not None and not empty string
-            if domain is not None and domain != "":
-                params["domain"] = domain
-            task = self.task_client.poll(tasktype=task_definition_name, **params)
-            finish_time = time.time()
-            time_spent = finish_time - start_time
-            if self.metrics_collector is not None:
-                self.metrics_collector.record_task_poll_time(task_definition_name, time_spent)
-        except AuthorizationException as auth_exception:
-            # Track auth failure for backoff
-            self._auth_failures += 1
-            self._last_auth_failure = time.time()
-            backoff_seconds = min(2 ** self._auth_failures, 60)
-
-            if self.metrics_collector is not None:
-                self.metrics_collector.increment_task_poll_error(task_definition_name, type(auth_exception))
-
-            if auth_exception.invalid_token:
-                logger.error(
-                    f"Failed to poll task {task_definition_name} due to invalid auth token "
-                    f"(failure #{self._auth_failures}). Will retry with exponential backoff ({backoff_seconds}s). "
-                    "Please check your CONDUCTOR_AUTH_KEY and CONDUCTOR_AUTH_SECRET."
-                )
-            else:
-                logger.error(
-                    f"Failed to poll task {task_definition_name} error: {auth_exception.status} - {auth_exception.error_code} "
-                    f"(failure #{self._auth_failures}). Will retry with exponential backoff ({backoff_seconds}s)."
-                )
-            return None
-        except Exception as e:
-            if self.metrics_collector is not None:
-                self.metrics_collector.increment_task_poll_error(task_definition_name, type(e))
-            logger.error(
-                "Failed to poll task for: %s, reason: %s",
-                task_definition_name,
-                traceback.format_exc()
-            )
-            return None
-
-        # Success - reset auth failure counter
-        if task is not None:
-            self._auth_failures = 0
-            logger.trace(
-                "Polled task: %s, worker_id: %s, domain: %s",
-                task_definition_name,
-                self.worker.get_identity(),
-                self.worker.get_domain()
-            )
-        else:
-            # No task available - also reset auth failures since poll succeeded
-            self._auth_failures = 0
-
-        return task
-
     def __execute_task(self, task: Task) -> TaskResult:
         if not isinstance(task, Task):
             return None
@@ -894,6 +824,7 @@ def __merge_context_modifications(self, task_result: TaskResult, context_result:
                 task_result.output_data = context_result.output_data
 
     def __update_task(self, task_result: TaskResult):
+        """Update task result using v2 endpoint. Returns the next Task to process, or None."""
         if not isinstance(task_result, TaskResult):
             return None
         task_definition_name = self.worker.get_task_definition_name()
@@ -914,15 +845,15 @@ def __update_task(self, task_result: TaskResult):
                 # Exponential backoff: [10s, 20s, 30s] before retry
                 time.sleep(attempt * 10)
             try:
-                response = self.task_client.update_task(body=task_result)
+                next_task = self.task_client.update_task_v2(body=task_result)
                 logger.debug(
-                    "Updated task, id: %s, workflow_instance_id: %s, task_definition_name: %s, response: %s",
+                    "Updated task (v2), id: %s, workflow_instance_id: %s, task_definition_name: %s, next_task: %s",
                     task_result.task_id,
                     task_result.workflow_instance_id,
                     task_definition_name,
-                    response
+                    next_task.task_id if next_task else None
                 )
-                return response
+                return next_task
             except Exception as e:
                 last_exception = e
                 if self.metrics_collector is not None:
diff --git a/src/conductor/client/http/api/async_task_resource_api.py b/src/conductor/client/http/api/async_task_resource_api.py
index 1114a228e..6335c12ac 100644
--- a/src/conductor/client/http/api/async_task_resource_api.py
+++ b/src/conductor/client/http/api/async_task_resource_api.py
@@ -186,3 +186,86 @@ async def update_task_with_http_info(self, body, **kwargs):
             _preload_content=params.get('_preload_content', True),
             _request_timeout=params.get('_request_timeout'),
             collection_formats=collection_formats)
+
+    async def update_task_v2(self, body, **kwargs):
+        """Update a task and return the next available task (v2, async).
+
+        Combines task update + poll into a single HTTP call.
+        Returns the next Task to process, or None if no task is available (204).
+
+        :param TaskResult body: (required)
+        :return: Task or None
+        """
+        kwargs['_return_http_data_only'] = True
+        data = await self.update_task_v2_with_http_info(body, **kwargs)
+        # 204 No Content: deserializer returns empty Task() with task_id=None
+        if data is not None and hasattr(data, 'task_id') and data.task_id:
+            return data
+        return None
+
+    async def update_task_v2_with_http_info(self, body, **kwargs):
+        """Update a task and return the next available task (v2, async).
+
+        :param TaskResult body: (required)
+        :return: Task
+        """
+
+        all_params = ['body']
+        all_params.append('_return_http_data_only')
+        all_params.append('_preload_content')
+        all_params.append('_request_timeout')
+
+        params = locals()
+        for key, val in six.iteritems(params['kwargs']):
+            if key not in all_params:
+                raise TypeError(
+                    "Got an unexpected keyword argument '%s'"
+                    " to method update_task_v2" % key
+                )
+            params[key] = val
+        del params['kwargs']
+
+        # verify the required parameter 'body' is set
+        if ('body' not in params or
+                params['body'] is None):
+            raise ValueError("Missing the required parameter `body` when calling `update_task_v2`")
+
+        collection_formats = {}
+
+        path_params = {}
+
+        query_params = []
+
+        header_params = {}
+
+        form_params = []
+        local_var_files = {}
+
+        body_params = None
+        if 'body' in params:
+            body_params = params['body']
+        # HTTP header `Accept`
+        header_params['Accept'] = self.api_client.select_header_accept(
+            ['application/json'])
+
+        # HTTP header `Content-Type`
+        header_params['Content-Type'] = self.api_client.select_header_content_type(
+            ['application/json'])
+
+        # Authentication setting
+        auth_settings = []
+
+        return await self.api_client.call_api(
+            '/tasks/update-v2', 'POST',
+            path_params,
+            query_params,
+            header_params,
+            body=body_params,
+            post_params=form_params,
+            files=local_var_files,
+            response_type='Task',
+            auth_settings=auth_settings,
+            _return_http_data_only=params.get('_return_http_data_only'),
+            _preload_content=params.get('_preload_content', True),
+            _request_timeout=params.get('_request_timeout'),
+            collection_formats=collection_formats)
diff --git a/src/conductor/client/http/api/task_resource_api.py b/src/conductor/client/http/api/task_resource_api.py
index 0515cc89e..b3c2dfbcc 100644
--- a/src/conductor/client/http/api/task_resource_api.py
+++ b/src/conductor/client/http/api/task_resource_api.py
@@ -1476,6 +1476,89 @@ def update_task_with_http_info(self, body, **kwargs):  # noqa: E501
             _request_timeout=params.get('_request_timeout'),
             collection_formats=collection_formats)
 
+    def update_task_v2(self, body, **kwargs):  # noqa: E501
+        """Update a task and return the next available task (v2).  # noqa: E501
+
+        Combines task update + poll into a single HTTP call.
+        Returns the next Task to process, or None if no task is available (204).
+
+        :param TaskResult body: (required)
+        :return: Task or None
+        """
+        kwargs['_return_http_data_only'] = True
+        data = self.update_task_v2_with_http_info(body, **kwargs)  # noqa: E501
+        # 204 No Content: deserializer returns empty Task() with task_id=None
+        if data is not None and hasattr(data, 'task_id') and data.task_id:
+            return data
+        return None
+
+    def update_task_v2_with_http_info(self, body, **kwargs):  # noqa: E501
+        """Update a task and return the next available task (v2).  # noqa: E501
+
+        :param TaskResult body: (required)
+        :return: Task
+        """
+
+        all_params = ['body']  # noqa: E501
+        all_params.append('_return_http_data_only')
+        all_params.append('_preload_content')
+        all_params.append('_request_timeout')
+
+        params = locals()
+        for key, val in six.iteritems(params['kwargs']):
+            if key not in all_params:
+                raise TypeError(
+                    "Got an unexpected keyword argument '%s'"
+                    " to method update_task_v2" % key
+                )
+            params[key] = val
+        del params['kwargs']
+        # verify the required parameter 'body' is set
+        if ('body' not in params or
+                params['body'] is None):
+            raise ValueError("Missing the required parameter `body` when calling `update_task_v2`")  # noqa: E501
+
+        collection_formats = {}
+
+        path_params = {}
+
+        query_params = []
+
+        header_params = {}
+
+        form_params = []
+        local_var_files = {}
+
+        body_params = None
+        if 'body' in params:
+            body_params = params['body']
+        # HTTP header `Accept`
+        header_params['Accept'] = self.api_client.select_header_accept(
+            ['application/json'])  # noqa: E501
+
+        # HTTP header `Content-Type`
+        header_params['Content-Type'] = self.api_client.select_header_content_type(  # noqa: E501
+            ['application/json'])  # noqa: E501
+
+        # Authentication setting
+        auth_settings = []  # noqa: E501
+
+        return self.api_client.call_api(
+            '/tasks/update-v2', 'POST',
+            path_params,
+            query_params,
+            header_params,
+            body=body_params,
+            post_params=form_params,
+            files=local_var_files,
+            response_type='Task',  # noqa: E501
+            auth_settings=auth_settings,
+            async_req=params.get('async_req'),
+            _return_http_data_only=params.get('_return_http_data_only'),
+            _preload_content=params.get('_preload_content', True),
+            _request_timeout=params.get('_request_timeout'),
+            collection_formats=collection_formats)
+
     def update_task1(self, body, workflow_id, task_ref_name, status, **kwargs):  # noqa: E501
         """Update a task By Ref Name  # noqa: E501
 
diff --git a/tests/integration/client/orkes/test_orkes_service_registry_client.py b/tests/integration/client/orkes/test_orkes_service_registry_client.py
index c31d978e1..f7830a657 100644
--- a/tests/integration/client/orkes/test_orkes_service_registry_client.py
+++ b/tests/integration/client/orkes/test_orkes_service_registry_client.py
@@ -265,6 +265,9 @@ class TestOrkesServiceRegistryClientIntg(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         cls.config = get_configuration()
         logger.info(f'Setting up TestOrkesServiceRegistryClientIntg with config {cls.config}')
 
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 000000000..aede140e8
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,115 @@
+"""
+Shared fixtures and skip logic for integration tests.
+
+When the Conductor server or credentials are unavailable, all integration
+tests are skipped gracefully instead of failing with AuthorizationException
+or connection errors.
+"""
+
+import logging
+import unittest
+
+import pytest
+
+from conductor.client.configuration.configuration import Configuration
+from conductor.client.http.api_client import ApiClient
+from conductor.client.workflow.executor.workflow_executor import WorkflowExecutor
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Connectivity check (cached at module level, runs once per session)
+# ---------------------------------------------------------------------------
+
+_server_available = None  # None = not yet checked
+_skip_reason = ""
+
+
+def _check_server_connectivity():
+    """
+    Attempt a lightweight API call to verify the Conductor server is
+    reachable and credentials (if required) are valid.
+
+    Returns (is_available, skip_reason).
+    """
+    global _server_available, _skip_reason
+    if _server_available is not None:
+        return _server_available, _skip_reason
+
+    try:
+        config = Configuration()
+        from conductor.client.orkes.orkes_metadata_client import OrkesMetadataClient
+        client = OrkesMetadataClient(config)
+        client.get_all_task_defs()
+        _server_available = True
+        _skip_reason = ""
+        logger.info("Conductor server is available at %s", config.host)
+    except Exception as e:
+        _server_available = False
+        _skip_reason = f"Conductor server not available: {e}"
+        logger.warning(_skip_reason)
+
+    return _server_available, _skip_reason
+
+
+def skip_if_server_unavailable():
+    """
+    Call from unittest.TestCase.setUpClass to skip the entire test class
+    when the Conductor server is not available.
+
+    Usage::
+
+        @classmethod
+        def setUpClass(cls):
+            from tests.integration.conftest import skip_if_server_unavailable
+            skip_if_server_unavailable()
+            # ... rest of setup
+    """
+    available, reason = _check_server_connectivity()
+    if not available:
+        raise unittest.SkipTest(reason)
+
+
+# ---------------------------------------------------------------------------
+# Pytest session-scoped fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(scope="session")
+def conductor_config():
+    """Provide a Configuration connected to the Conductor server, or skip."""
+    available, reason = _check_server_connectivity()
+    if not available:
+        pytest.skip(reason)
+    return Configuration()
+
+
+@pytest.fixture(scope="session")
+def api_client(conductor_config):
+    """Provide an ApiClient instance for integration tests."""
+    return ApiClient(conductor_config)
+
+
+@pytest.fixture(scope="session")
+def workflow_executor(conductor_config):
+    """Provide a WorkflowExecutor instance for integration tests."""
+    return WorkflowExecutor(conductor_config)
+
+
+@pytest.fixture(scope="session")
+def workflow_quantity():
+    """Default number of workflows to run in execution tests."""
+    return 6
+
+
+# ---------------------------------------------------------------------------
+# Autouse fixture: skip ALL pytest-native integration tests when server
+# is unavailable. (unittest.TestCase tests need skip_if_server_unavailable()
+# in setUpClass instead — autouse fixtures don't apply to them.)
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(autouse=True)
+def _skip_integration_if_no_server():
+    """Skip every integration test when the Conductor server is not reachable."""
+    available, reason = _check_server_connectivity()
+    if not available:
+        pytest.skip(reason)
diff --git a/tests/integration/metadata/test_schema_service.py b/tests/integration/metadata/test_schema_service.py
index 8448de50e..e44ca8712 100644
--- a/tests/integration/metadata/test_schema_service.py
+++ b/tests/integration/metadata/test_schema_service.py
@@ -23,6 +23,9 @@ class TestOrkesSchemaClient(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         configuration = Configuration()
         cls.schema_client = OrkesSchemaClient(configuration)
 
diff --git a/tests/integration/metadata/test_task_metadata_service.py b/tests/integration/metadata/test_task_metadata_service.py
index 9a72f5563..e2b363434 100644
--- a/tests/integration/metadata/test_task_metadata_service.py
+++ b/tests/integration/metadata/test_task_metadata_service.py
@@ -81,6 +81,9 @@ class TestOrkesMetadataClient(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         configuration = Configuration()
         cls.metadata_client = OrkesMetadataClient(configuration)
 
diff --git a/tests/integration/test_agentic_workflows.py b/tests/integration/test_agentic_workflows.py
index d2c604d4c..732f50730 100644
--- a/tests/integration/test_agentic_workflows.py
+++ b/tests/integration/test_agentic_workflows.py
@@ -95,6 +95,9 @@ class AgenticWorkflowTests(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         cls.config = Configuration()
         cls.clients = OrkesClients(configuration=cls.config)
         cls.workflow_executor = cls.clients.get_workflow_executor()
diff --git a/tests/integration/test_ai_examples.py b/tests/integration/test_ai_examples.py
index 8ec11fb92..83fe5df8c 100644
--- a/tests/integration/test_ai_examples.py
+++ b/tests/integration/test_ai_examples.py
@@ -50,6 +50,9 @@ class TestAIExamples(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         cls.config = Configuration(server_api_url=SERVER_URL)
         cls.clients = OrkesClients(configuration=cls.config)
         cls.executor = WorkflowExecutor(configuration=cls.config)
diff --git a/tests/integration/test_ai_task_types.py b/tests/integration/test_ai_task_types.py
index d03e13b2b..40edce4b1 100644
--- a/tests/integration/test_ai_task_types.py
+++ b/tests/integration/test_ai_task_types.py
@@ -52,6 +52,9 @@ class TestAITaskTypeRegistration(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         cls.config = Configuration(server_api_url="http://localhost:7001/api")
         cls.clients = OrkesClients(configuration=cls.config)
         cls.executor = WorkflowExecutor(configuration=cls.config)
diff --git a/tests/integration/test_authorization_client_intg.py b/tests/integration/test_authorization_client_intg.py
index b3b2456c6..8e9146a6f 100644
--- a/tests/integration/test_authorization_client_intg.py
+++ b/tests/integration/test_authorization_client_intg.py
@@ -39,6 +39,9 @@ class TestOrkesAuthorizationClientIntg(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         cls.config = get_configuration()
         cls.client = OrkesAuthorizationClient(cls.config)
 
diff --git a/tests/integration/test_comprehensive_e2e.py b/tests/integration/test_comprehensive_e2e.py
index 586721372..f300b8c99 100644
--- a/tests/integration/test_comprehensive_e2e.py
+++ b/tests/integration/test_comprehensive_e2e.py
@@ -124,11 +124,14 @@ class TestComprehensiveE2E(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         logging.basicConfig(level=logging.INFO)
         cls.config = Configuration()
         cls.event_collector = EventCollector()
         cls.metrics_dir = '/tmp/conductor_test_metrics'
-        
+
         if os.path.exists(cls.metrics_dir):
             import shutil
             shutil.rmtree(cls.metrics_dir)
@@ -140,13 +143,6 @@ def setUpClass(cls):
             file_name='metrics.prom'  # Enable file-based metrics
         )
 
-        # Test server connection
-        try:
-            metadata_client = OrkesMetadataClient(cls.config)
-            print(f"✓ Connected to: {cls.config.host}")
-        except Exception as e:
-            raise RuntimeError(f"Server not available: {e}")
-
         cls.workers_started = False
 
     def test_01_create_workflow(self):
diff --git a/tests/integration/test_workflow_client_intg.py b/tests/integration/test_workflow_client_intg.py
index 3d7744b54..d38ca0802 100644
--- a/tests/integration/test_workflow_client_intg.py
+++ b/tests/integration/test_workflow_client_intg.py
@@ -33,6 +33,9 @@ class TestOrkesWorkflowClientIntg(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
         cls.config = get_configuration()
         cls.workflow_client = OrkesWorkflowClient(cls.config)
         logger.info(f'setting up TestOrkesWorkflowClientIntg with config {cls.config}')
diff --git a/tests/unit/automator/test_async_task_runner.py b/tests/unit/automator/test_async_task_runner.py
index be7f30359..3bd3d1c6e 100644
--- a/tests/unit/automator/test_async_task_runner.py
+++ b/tests/unit/automator/test_async_task_runner.py
@@ -97,7 +97,7 @@ async def run_test():
             runner.async_task_client.batch_poll = AsyncMock(return_value=mock_tasks)
 
             # Mock update_task to succeed
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             # Run one iteration
             await runner.run_once()
@@ -109,8 +109,8 @@ async def run_test():
             runner.async_task_client.batch_poll.assert_called_once()
 
             # Verify update_task was called with correct result
-            runner.async_task_client.update_task.assert_called_once()
-            call_args = runner.async_task_client.update_task.call_args
+            runner.async_task_client.update_task_v2.assert_called_once()
+            call_args = runner.async_task_client.update_task_v2.call_args
             task_result = call_args.kwargs['body']
 
             self.assertEqual(task_result.task_id, self.TASK_ID)
@@ -152,13 +152,13 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
 
             # Verify task completed with None result
-            call_args = runner.async_task_client.update_task.call_args
+            call_args = runner.async_task_client.update_task_v2.call_args
             task_result = call_args.kwargs['body']
 
             self.assertEqual(task_result.status, TaskResultStatus.COMPLETED)
@@ -256,7 +256,7 @@ async def batch_poll_mock(*args, **kwargs):
                     return []
 
             runner.async_task_client.batch_poll = AsyncMock(side_effect=batch_poll_mock)
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             # First run_once - poll 2 tasks
             await runner.run_once()
@@ -397,7 +397,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
@@ -408,7 +408,7 @@ async def run_test():
             self.assertIn("Intentional test error", str(failure_events[0].cause))
 
             # Verify update_task was called with FAILED status
-            call_args = runner.async_task_client.update_task.call_args
+            call_args = runner.async_task_client.update_task_v2.call_args
             task_result = call_args.kwargs['body']
             self.assertEqual(task_result.status, TaskResultStatus.FAILED)
 
@@ -444,7 +444,7 @@ async def batch_poll_respects_count(*args, **kwargs):
                 return [self.__create_task(task_id=f'task_{i}', input_data={'value': i}) for i in range(count)]
 
             runner.async_task_client.batch_poll = AsyncMock(side_effect=batch_poll_respects_count)
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             # First poll - should request 2 tasks (available_slots=2)
             await runner.run_once()
@@ -532,7 +532,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(3)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=mock_tasks)
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.2)
@@ -577,13 +577,13 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
 
             # Verify task result was serialized correctly
-            call_args = runner.async_task_client.update_task.call_args
+            call_args = runner.async_task_client.update_task_v2.call_args
             task_result = call_args.kwargs['body']
 
             self.assertIsInstance(task_result, TaskResult)
@@ -670,7 +670,7 @@ async def run_test():
 
             # Successful execution scenario
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
@@ -756,7 +756,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
@@ -811,7 +811,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
@@ -866,7 +866,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             # Should complete without raising exception (listener error isolated)
             await runner.run_once()
@@ -876,7 +876,7 @@ async def run_test():
             self.assertEqual(len(good_listener_events), 1)
 
             # Update task should still be called (worker execution not affected)
-            runner.async_task_client.update_task.assert_called_once()
+            runner.async_task_client.update_task_v2.assert_called_once()
 
         asyncio.run(run_test())
 
@@ -922,7 +922,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
@@ -997,7 +997,7 @@ async def run_test():
             runner._semaphore = asyncio.Semaphore(1)
 
             runner.async_task_client.batch_poll = AsyncMock(return_value=[mock_task])
-            runner.async_task_client.update_task = AsyncMock(return_value=self.UPDATE_TASK_RESPONSE)
+            runner.async_task_client.update_task_v2 = AsyncMock(return_value=None)
 
             await runner.run_once()
             await asyncio.sleep(0.1)
diff --git a/tests/unit/automator/test_task_handler_coverage.py b/tests/unit/automator/test_task_handler_coverage.py
index ae7284837..b97e00838 100644
--- a/tests/unit/automator/test_task_handler_coverage.py
+++ b/tests/unit/automator/test_task_handler_coverage.py
@@ -474,9 +474,11 @@ def test_stop_processes(self, mock_import, mock_logging):
         handler.queue = Mock()
         handler.logger_process = Mock()
 
-        # Mock the processes
-        for process in handler.task_runner_processes:
+        # Mock the processes with pid and is_alive so __stop_process doesn't short-circuit
+        for i, process in enumerate(handler.task_runner_processes):
             process.terminate = Mock()
+            process.is_alive = Mock(return_value=True)
+            type(process).pid = PropertyMock(return_value=10000 + i)
 
         handler.stop_processes()
 
@@ -508,10 +510,14 @@ def test_stop_processes_with_metrics(self, mock_import, mock_logging):
         handler.queue = Mock()
         handler.logger_process = Mock()
 
-        # Mock the terminate methods
+        # Mock the terminate methods and ensure pid/is_alive are set
         handler.metrics_provider_process.terminate = Mock()
-        for process in handler.task_runner_processes:
+        handler.metrics_provider_process.is_alive = Mock(return_value=True)
+        type(handler.metrics_provider_process).pid = PropertyMock(return_value=20000)
+        for i, process in enumerate(handler.task_runner_processes):
             process.terminate = Mock()
+            process.is_alive = Mock(return_value=True)
+            type(process).pid = PropertyMock(return_value=10000 + i)
 
         handler.stop_processes()
 
@@ -541,6 +547,7 @@ def test_stop_process_with_exception(self, mock_import, mock_logging):
         for process in handler.task_runner_processes:
             process.terminate = Mock(side_effect=Exception("terminate failed"))
             process.kill = Mock()
+            process.is_alive = Mock(return_value=True)
             # Use PropertyMock for pid
             type(process).pid = PropertyMock(return_value=12345)
 
diff --git a/tests/unit/automator/test_task_runner.py b/tests/unit/automator/test_task_runner.py
index fc6595652..c7399227d 100644
--- a/tests/unit/automator/test_task_runner.py
+++ b/tests/unit/automator/test_task_runner.py
@@ -120,8 +120,8 @@ def test_run_once(self):
         ):
             with patch.object(
                     TaskResourceApi,
-                    'update_task',
-                    return_value=self.UPDATE_TASK_RESPONSE
+                    'update_task_v2',
+                    return_value=None
             ):
                 task_runner = self.__get_valid_task_runner()
                 # With mocked sleep, we just verify the method runs without errors
@@ -138,23 +138,23 @@ def test_poll_task(self):
         expected_task = self.__get_valid_task()
         with patch.object(
                 TaskResourceApi,
-                'poll',
-                return_value=self.__get_valid_task()
+                'batch_poll',
+                return_value=[self.__get_valid_task()]
         ):
             task_runner = self.__get_valid_task_runner()
-            task = task_runner._TaskRunner__poll_task()
-            self.assertEqual(task, expected_task)
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
+            self.assertEqual(len(tasks), 1)
+            self.assertEqual(tasks[0], expected_task)
 
     def test_poll_task_with_faulty_task_api(self):
-        expected_task = None
         with patch.object(
                 TaskResourceApi,
-                'poll',
+                'batch_poll',
                 side_effect=Exception()
         ):
             task_runner = self.__get_valid_task_runner()
-            task = task_runner._TaskRunner__poll_task()
-            self.assertEqual(task, expected_task)
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
+            self.assertEqual(tasks, [])
 
     def test_execute_task_with_invalid_task(self):
         task_runner = self.__get_valid_task_runner()
@@ -200,23 +200,23 @@ def test_update_task_with_invalid_task_result(self):
     @patch('time.sleep', Mock(return_value=None))
     def test_update_task_with_faulty_task_api(self):
         expected_response = None
-        with patch.object(TaskResourceApi, 'update_task', side_effect=Exception()):
+        with patch.object(TaskResourceApi, 'update_task_v2', side_effect=Exception()):
             task_runner = self.__get_valid_task_runner()
             task_result = self.__get_valid_task_result()
             response = task_runner._TaskRunner__update_task(task_result)
             self.assertEqual(response, expected_response)
 
     def test_update_task(self):
-        expected_response = self.UPDATE_TASK_RESPONSE
+        mock_next_task = Task(task_id='next_task_id', workflow_instance_id='next_wf_id')
         with patch.object(
                 TaskResourceApi,
-                'update_task',
-                return_value=self.UPDATE_TASK_RESPONSE
+                'update_task_v2',
+                return_value=mock_next_task
         ):
             task_runner = self.__get_valid_task_runner()
             task_result = self.__get_valid_task_result()
             response = task_runner._TaskRunner__update_task(task_result)
-            self.assertEqual(response, expected_response)
+            self.assertEqual(response, mock_next_task)
 
     def test_wait_for_polling_interval_with_faulty_worker(self):
         expected_exception = Exception(
diff --git a/tests/unit/automator/test_task_runner_coverage.py b/tests/unit/automator/test_task_runner_coverage.py
index 766427da5..1686d12f7 100644
--- a/tests/unit/automator/test_task_runner_coverage.py
+++ b/tests/unit/automator/test_task_runner_coverage.py
@@ -258,8 +258,8 @@ def test_run_once_with_exception_handling(self):
         worker = MockWorker('test_task')
         task_runner = TaskRunner(worker=worker)
 
-        # Mock __poll_task to raise an exception
-        with patch.object(task_runner, '_TaskRunner__poll_task', side_effect=Exception("Test error")):
+        # Mock __batch_poll_tasks to raise an exception
+        with patch.object(task_runner, '_TaskRunner__batch_poll_tasks', side_effect=Exception("Test error")):
             # Should not raise, exception is caught
             task_runner.run_once()
 
@@ -280,15 +280,15 @@ def test_run_once_clears_task_definition_name_cache(self):
 
     @patch('time.sleep')
     def test_poll_task_when_worker_paused(self, mock_sleep):
-        """Test polling returns None when worker is paused"""
+        """Test polling returns empty list when worker is paused"""
         worker = MockWorker('test_task')
         worker.paused = True
 
         task_runner = TaskRunner(worker=worker)
 
-        task = task_runner._TaskRunner__poll_task()
+        tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-        self.assertIsNone(task)
+        self.assertEqual(tasks, [])
 
     @patch('time.sleep')
     def test_poll_task_with_auth_failure_backoff(self, mock_sleep):
@@ -302,11 +302,11 @@ def test_poll_task_with_auth_failure_backoff(self, mock_sleep):
         task_runner._auth_failures = 2
         task_runner._last_auth_failure = time.time()
 
-        with patch.object(TaskResourceApi, 'poll', return_value=None):
-            task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', return_value=[]):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-            # Should skip polling and return None due to backoff
-            self.assertIsNone(task)
+            # Should skip polling and return empty list due to backoff
+            self.assertEqual(tasks, [])
             mock_sleep.assert_called_once_with(0.1)
 
     @patch('time.sleep')
@@ -328,10 +328,10 @@ def test_poll_task_auth_failure_with_invalid_token(self, mock_sleep):
             http_resp=mock_http_resp
         )
 
-        with patch.object(TaskResourceApi, 'poll', side_effect=auth_exception):
-            task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', side_effect=auth_exception):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-            self.assertIsNone(task)
+            self.assertEqual(tasks, [])
             self.assertEqual(task_runner._auth_failures, 1)
             self.assertGreater(task_runner._last_auth_failure, 0)
 
@@ -354,10 +354,10 @@ def test_poll_task_auth_failure_without_invalid_token(self, mock_sleep):
             http_resp=mock_http_resp
         )
 
-        with patch.object(TaskResourceApi, 'poll', side_effect=auth_exception):
-            task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', side_effect=auth_exception):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-            self.assertIsNone(task)
+            self.assertEqual(tasks, [])
             self.assertEqual(task_runner._auth_failures, 1)
 
     @patch('time.sleep')
@@ -372,28 +372,29 @@ def test_poll_task_success_resets_auth_failures(self, mock_sleep):
 
         test_task = Task(task_id='test_id', workflow_instance_id='wf_id')
 
-        with patch.object(TaskResourceApi, 'poll', return_value=test_task):
-            task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', return_value=[test_task]):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-            self.assertEqual(task, test_task)
+            self.assertEqual(len(tasks), 1)
+            self.assertEqual(tasks[0], test_task)
             self.assertEqual(task_runner._auth_failures, 0)
 
     def test_poll_task_no_task_available_resets_auth_failures(self):
-        """Test that None result from successful poll resets auth failures"""
+        """Test that empty result from successful poll resets auth failures"""
         worker = MockWorker('test_task')
         task_runner = TaskRunner(worker=worker)
 
         # Set some auth failures
         task_runner._auth_failures = 2
 
-        with patch.object(TaskResourceApi, 'poll', return_value=None):
-            task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', return_value=[]):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-            self.assertIsNone(task)
+            self.assertEqual(tasks, [])
             self.assertEqual(task_runner._auth_failures, 0)
 
     def test_poll_task_with_metrics_collector(self):
-        """Test polling with metrics collection enabled"""
+        """Test polling with metrics collection enabled publishes events"""
         worker = MockWorker('test_task')
         metrics_settings = MetricsSettings()
         task_runner = TaskRunner(
@@ -403,17 +404,14 @@ def test_poll_task_with_metrics_collector(self):
 
         test_task = Task(task_id='test_id', workflow_instance_id='wf_id')
 
-        with patch.object(TaskResourceApi, 'poll', return_value=test_task):
-            with patch.object(task_runner.metrics_collector, 'increment_task_poll'):
-                with patch.object(task_runner.metrics_collector, 'record_task_poll_time'):
-                    task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', return_value=[test_task]):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-                    self.assertEqual(task, test_task)
-                    task_runner.metrics_collector.increment_task_poll.assert_called_once()
-                    task_runner.metrics_collector.record_task_poll_time.assert_called_once()
+            self.assertEqual(len(tasks), 1)
+            self.assertEqual(tasks[0], test_task)
 
     def test_poll_task_with_metrics_on_auth_error(self):
-        """Test metrics collection on authorization error"""
+        """Test that auth error during poll publishes PollFailure event"""
         worker = MockWorker('test_task')
         metrics_settings = MetricsSettings()
         task_runner = TaskRunner(
@@ -434,15 +432,14 @@ def test_poll_task_with_metrics_on_auth_error(self):
             http_resp=mock_http_resp
         )
 
-        with patch.object(TaskResourceApi, 'poll', side_effect=auth_exception):
-            with patch.object(task_runner.metrics_collector, 'increment_task_poll_error'):
-                task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', side_effect=auth_exception):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-                self.assertIsNone(task)
-                task_runner.metrics_collector.increment_task_poll_error.assert_called_once()
+            self.assertEqual(tasks, [])
+            self.assertEqual(task_runner._auth_failures, 1)
 
     def test_poll_task_with_metrics_on_general_error(self):
-        """Test metrics collection on general polling error"""
+        """Test that general error during poll publishes PollFailure event"""
         worker = MockWorker('test_task')
         metrics_settings = MetricsSettings()
         task_runner = TaskRunner(
@@ -450,12 +447,10 @@ def test_poll_task_with_metrics_on_general_error(self):
             metrics_settings=metrics_settings
         )
 
-        with patch.object(TaskResourceApi, 'poll', side_effect=Exception("General error")):
-            with patch.object(task_runner.metrics_collector, 'increment_task_poll_error'):
-                task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', side_effect=Exception("General error")):
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-                self.assertIsNone(task)
-                task_runner.metrics_collector.increment_task_poll_error.assert_called_once()
+            self.assertEqual(tasks, [])
 
     def test_poll_task_with_domain(self):
         """Test polling with domain parameter"""
@@ -466,10 +461,11 @@ def test_poll_task_with_domain(self):
 
         test_task = Task(task_id='test_id', workflow_instance_id='wf_id')
 
-        with patch.object(TaskResourceApi, 'poll', return_value=test_task) as mock_poll:
-            task = task_runner._TaskRunner__poll_task()
+        with patch.object(TaskResourceApi, 'batch_poll', return_value=[test_task]) as mock_poll:
+            tasks = task_runner._TaskRunner__batch_poll_tasks(1)
 
-            self.assertEqual(task, test_task)
+            self.assertEqual(len(tasks), 1)
+            self.assertEqual(tasks[0], test_task)
             # Verify domain was passed
             mock_poll.assert_called_once()
             call_kwargs = mock_poll.call_args[1]
@@ -767,15 +763,17 @@ def test_update_task_with_retry_success(self):
         )
         task_result.status = TaskResultStatus.COMPLETED
 
+        mock_next_task = Task(task_id='next_id', workflow_instance_id='next_wf_id')
+
         # First call fails, second succeeds
         with patch.object(
             TaskResourceApi,
-            'update_task',
-            side_effect=[Exception("Network error"), "SUCCESS"]
+            'update_task_v2',
+            side_effect=[Exception("Network error"), mock_next_task]
         ) as mock_update:
             response = task_runner._TaskRunner__update_task(task_result)
 
-            self.assertEqual(response, "SUCCESS")
+            self.assertEqual(response, mock_next_task)
             self.assertEqual(mock_update.call_count, 2)
 
     @patch('time.sleep', Mock(return_value=None))
@@ -794,7 +792,7 @@ def test_update_task_with_metrics_on_error(self):
             worker_id=worker.get_identity()
         )
 
-        with patch.object(TaskResourceApi, 'update_task', side_effect=Exception("Update failed")):
+        with patch.object(TaskResourceApi, 'update_task_v2', side_effect=Exception("Update failed")):
             with patch.object(task_runner.metrics_collector, 'increment_task_update_error'):
                 response = task_runner._TaskRunner__update_task(task_result)
 

From c0baba8badab5fb1a757d1b1f047f1fe6cf721ef Mon Sep 17 00:00:00 2001
From: Viren Baraiya <virenx@gmail.com>
Date: Tue, 10 Mar 2026 08:18:12 -0700
Subject: [PATCH 2/2] Create test_update_task_v2_perf.py

---
 tests/integration/test_update_task_v2_perf.py | 386 ++++++++++++++++++
 1 file changed, 386 insertions(+)
 create mode 100644 tests/integration/test_update_task_v2_perf.py

diff --git a/tests/integration/test_update_task_v2_perf.py b/tests/integration/test_update_task_v2_perf.py
new file mode 100644
index 000000000..de2a39d02
--- /dev/null
+++ b/tests/integration/test_update_task_v2_perf.py
@@ -0,0 +1,386 @@
+"""
+Performance test for update-task-v2 (tight execute loop).
+
+Measures task queue wait time — the time a task sits scheduled before a worker
+picks it up.  With the v2 endpoint the worker receives the next task in the
+update response, so consecutive same-type tasks should have near-zero queue
+latency (<20 ms).
+
+Workflow shape (10 tasks, 3 types, same-type tasks consecutive):
+
+    type_a_1 → type_a_2 → type_a_3 → type_a_4
+             → type_b_1 → type_b_2 → type_b_3
+             → type_c_1 → type_c_2 → type_c_3
+
+Run (fixed 1 000 workflows — default):
+
+    python -m pytest tests/integration/test_update_task_v2_perf.py -v -s
+
+Run (duration-based, 1 hour):
+
+    PERF_DURATION_MINUTES=60 python -m pytest tests/integration/test_update_task_v2_perf.py -v -s
+
+Env-vars:
+    PERF_WORKFLOW_COUNT   – number of workflows (default 1000, ignored when duration is set)
+    PERF_DURATION_MINUTES – run for this many minutes instead of a fixed count
+    PERF_RATE             – workflows submitted per second (default 20)
+    PERF_WORKER_THREADS   – thread_count per worker type (default 10)
+"""
+
+import logging
+import os
+import statistics
+import sys
+import time
+import threading
+import unittest
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from conductor.client.automator.task_handler import TaskHandler
+from conductor.client.configuration.configuration import Configuration
+from conductor.client.http.models.start_workflow_request import StartWorkflowRequest
+from conductor.client.http.models.workflow_def import WorkflowDef
+from conductor.client.http.models.workflow_task import WorkflowTask
+from conductor.client.orkes.orkes_metadata_client import OrkesMetadataClient
+from conductor.client.orkes.orkes_workflow_client import OrkesWorkflowClient
+from conductor.client.worker.worker_task import worker_task
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+WORKFLOW_NAME = "perf_v2_update_test"
+WORKFLOW_VERSION = 1
+
+TASK_SEQUENCE = [
+    ("perf_type_a", 4),
+    ("perf_type_b", 3),
+    ("perf_type_c", 3),
+]
+
+WORKER_THREADS = int(os.environ.get("PERF_WORKER_THREADS", "10"))
+WORKFLOW_COUNT = int(os.environ.get("PERF_WORKFLOW_COUNT", "1000"))
+DURATION_MINUTES = float(os.environ.get("PERF_DURATION_MINUTES", "0"))
+SUBMIT_RATE = float(os.environ.get("PERF_RATE", "20"))  # workflows/sec
+
+# ---------------------------------------------------------------------------
+# Workers — near-zero execution time so we isolate queue latency
+# ---------------------------------------------------------------------------
+
+@worker_task(task_definition_name="perf_type_a", thread_count=WORKER_THREADS, register_task_def=True)
+def perf_worker_a(task_index: int = 0) -> dict:
+    return {"worker": "perf_type_a", "task_index": task_index}
+
+
+@worker_task(task_definition_name="perf_type_b", thread_count=WORKER_THREADS, register_task_def=True)
+def perf_worker_b(task_index: int = 0) -> dict:
+    return {"worker": "perf_type_b", "task_index": task_index}
+
+
+@worker_task(task_definition_name="perf_type_c", thread_count=WORKER_THREADS, register_task_def=True)
+def perf_worker_c(task_index: int = 0) -> dict:
+    return {"worker": "perf_type_c", "task_index": task_index}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _build_workflow_tasks() -> list:
+    tasks = []
+    idx = 0
+    for task_type, count in TASK_SEQUENCE:
+        for i in range(count):
+            idx += 1
+            tasks.append(
+                WorkflowTask(
+                    name=task_type,
+                    task_reference_name=f"{task_type}_{i + 1}",
+                    input_parameters={"task_index": idx},
+                )
+            )
+    return tasks
+
+
+def _percentile(data: list, p: float) -> float:
+    if not data:
+        return 0.0
+    k = (len(data) - 1) * (p / 100.0)
+    f = int(k)
+    c = f + 1
+    if c >= len(data):
+        return data[f]
+    return data[f] + (k - f) * (data[c] - data[f])
+
+
+def _print_latency_report(queue_times_by_task: dict, workflow_durations: list,
+                           elapsed: float, total_workflows: int):
+    print("\n" + "=" * 90)
+    print(f"  PERFORMANCE REPORT  —  {total_workflows} workflows in {elapsed:.1f}s "
+          f"({total_workflows / max(elapsed, 0.001):.1f} wf/s)")
+    print("=" * 90)
+
+    print(f"\n{'Task Ref':<20} {'Count':>6} {'Mean':>8} {'p50':>8} {'p90':>8} "
+          f"{'p95':>8} {'p99':>8} {'Max':>8}")
+    print("-" * 90)
+
+    all_queue_times = []
+    for ref in sorted(queue_times_by_task.keys()):
+        times = sorted(queue_times_by_task[ref])
+        all_queue_times.extend(times)
+        mean = statistics.mean(times)
+        p50 = _percentile(times, 50)
+        p90 = _percentile(times, 90)
+        p95 = _percentile(times, 95)
+        p99 = _percentile(times, 99)
+        mx = max(times)
+        print(f"{ref:<20} {len(times):>6} {mean:>7.0f}ms {p50:>7.0f}ms "
+              f"{p90:>7.0f}ms {p95:>7.0f}ms {p99:>7.0f}ms {mx:>7.0f}ms")
+
+    all_queue_times.sort()
+    if all_queue_times:
+        print("-" * 90)
+        print(f"{'ALL TASKS':<20} {len(all_queue_times):>6} "
+              f"{statistics.mean(all_queue_times):>7.0f}ms "
+              f"{_percentile(all_queue_times, 50):>7.0f}ms "
+              f"{_percentile(all_queue_times, 90):>7.0f}ms "
+              f"{_percentile(all_queue_times, 95):>7.0f}ms "
+              f"{_percentile(all_queue_times, 99):>7.0f}ms "
+              f"{max(all_queue_times):>7.0f}ms")
+
+    if workflow_durations:
+        workflow_durations.sort()
+        print(f"\n{'Workflow E2E':<20} {len(workflow_durations):>6} "
+              f"{statistics.mean(workflow_durations):>7.0f}ms "
+              f"{_percentile(workflow_durations, 50):>7.0f}ms "
+              f"{_percentile(workflow_durations, 90):>7.0f}ms "
+              f"{_percentile(workflow_durations, 95):>7.0f}ms "
+              f"{_percentile(workflow_durations, 99):>7.0f}ms "
+              f"{max(workflow_durations):>7.0f}ms")
+
+    print("=" * 90 + "\n")
+
+
+# ---------------------------------------------------------------------------
+# Test
+# ---------------------------------------------------------------------------
+
+class TestUpdateTaskV2Perf(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        from tests.integration.conftest import skip_if_server_unavailable
+        skip_if_server_unavailable()
+
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s [%(process)d] %(name)-45s %(levelname)-10s %(message)s",
+        )
+        logging.getLogger("conductor.client").setLevel(logging.WARNING)
+
+        cls.config = Configuration()
+        cls.workflow_client = OrkesWorkflowClient(cls.config)
+        cls.metadata_client = OrkesMetadataClient(cls.config)
+
+    # ---- setup ----------------------------------------------------------
+
+    def test_0_register_workflow(self):
+        """Register the performance-test workflow definition."""
+        workflow = WorkflowDef(name=WORKFLOW_NAME, version=WORKFLOW_VERSION)
+        workflow._tasks = _build_workflow_tasks()
+        try:
+            self.metadata_client.update_workflow_def(workflow, overwrite=True)
+        except Exception:
+            self.metadata_client.register_workflow_def(workflow, overwrite=True)
+        print(f"\n✓ Registered workflow '{WORKFLOW_NAME}' with {len(workflow._tasks)} tasks")
+
+    # ---- main perf test -------------------------------------------------
+
+    def test_1_run_perf(self):
+        """Start workers, fire workflows at a controlled rate, collect queue-wait latencies."""
+
+        handler_ready = threading.Event()
+        handler_ref = {}
+
+        def _run_workers():
+            handler = TaskHandler(
+                configuration=self.config,
+                scan_for_annotated_workers=True,
+            )
+            handler_ref["h"] = handler
+            handler.start_processes()
+            handler_ready.set()
+            handler_ref["stop"] = threading.Event()
+            handler_ref["stop"].wait()
+            handler.stop_processes()
+
+        worker_thread = threading.Thread(target=_run_workers, daemon=True)
+        worker_thread.start()
+        handler_ready.wait(timeout=30)
+        self.assertTrue(handler_ready.is_set(), "Workers failed to start within 30s")
+        # Let workers warm up — establish polling loops
+        time.sleep(5)
+        print(f"\n✓ Workers started ({WORKER_THREADS} threads/type, warmed up)")
+
+        try:
+            if DURATION_MINUTES > 0:
+                self._run_duration_mode()
+            else:
+                self._run_fixed_count_mode()
+        finally:
+            handler_ref.get("stop", threading.Event()).set()
+            worker_thread.join(timeout=15)
+
+    # ---- fixed-count mode -----------------------------------------------
+
+    def _run_fixed_count_mode(self):
+        total = WORKFLOW_COUNT
+        interval = 1.0 / SUBMIT_RATE
+        print(f"\n→ Fixed-count mode: {total} workflows at {SUBMIT_RATE} wf/s")
+
+        workflow_ids = []
+        start_time = time.time()
+
+        for i in range(total):
+            wf_id = self._start_one_workflow(i)
+            if wf_id:
+                workflow_ids.append(wf_id)
+
+            # Pace submission
+            expected_time = start_time + (i + 1) * interval
+            sleep_for = expected_time - time.time()
+            if sleep_for > 0:
+                time.sleep(sleep_for)
+
+            done = len(workflow_ids)
+            if done % 100 == 0:
+                elapsed = time.time() - start_time
+                print(f"  Submitted {done}/{total} ({elapsed:.1f}s)")
+
+        submit_elapsed = time.time() - start_time
+        print(f"✓ All {len(workflow_ids)} workflows submitted in {submit_elapsed:.1f}s "
+              f"(actual rate: {len(workflow_ids) / submit_elapsed:.1f} wf/s)")
+
+        # Collect results
+        queue_times, wf_durations = self._collect_results(workflow_ids)
+        elapsed = time.time() - start_time
+        _print_latency_report(queue_times, wf_durations, elapsed, len(workflow_ids))
+
+    # ---- duration mode --------------------------------------------------
+
+    def _run_duration_mode(self):
+        deadline = time.time() + DURATION_MINUTES * 60
+        interval = 1.0 / SUBMIT_RATE
+        print(f"\n→ Duration mode: running for {DURATION_MINUTES} min at {SUBMIT_RATE} wf/s")
+
+        all_queue_times = defaultdict(list)
+        all_wf_durations = []
+        total_started = 0
+        report_num = 0
+        overall_start = time.time()
+
+        # Submit and collect in rolling windows
+        batch_ids = []
+        batch_start = time.time()
+        BATCH_SIZE = 200  # collect results every N workflows
+
+        while time.time() < deadline:
+            wf_id = self._start_one_workflow(total_started)
+            if wf_id:
+                batch_ids.append(wf_id)
+            total_started += 1
+
+            # Pace
+            expected_time = batch_start + len(batch_ids) * interval
+            sleep_for = expected_time - time.time()
+            if sleep_for > 0:
+                time.sleep(sleep_for)
+
+            # Collect in batches to avoid unbounded memory
+            if len(batch_ids) >= BATCH_SIZE:
+                report_num += 1
+                queue_times, wf_durations = self._collect_results(batch_ids)
+                for ref, times in queue_times.items():
+                    all_queue_times[ref].extend(times)
+                all_wf_durations.extend(wf_durations)
+                remaining = deadline - time.time()
+                print(f"  Report #{report_num}: {len(batch_ids)} wf, "
+                      f"total={total_started}, remaining={remaining:.0f}s")
+                batch_ids = []
+                batch_start = time.time()
+
+        # Final batch
+        if batch_ids:
+            queue_times, wf_durations = self._collect_results(batch_ids)
+            for ref, times in queue_times.items():
+                all_queue_times[ref].extend(times)
+            all_wf_durations.extend(wf_durations)
+
+        elapsed = time.time() - overall_start
+        _print_latency_report(dict(all_queue_times), all_wf_durations, elapsed, total_started)
+
+    # ---- helpers --------------------------------------------------------
+
+    def _start_one_workflow(self, index: int) -> str:
+        req = StartWorkflowRequest()
+        req.name = WORKFLOW_NAME
+        req.version = WORKFLOW_VERSION
+        req.input = {"run_index": index}
+        try:
+            return self.workflow_client.start_workflow(start_workflow_request=req)
+        except Exception as e:
+            logger.error("Failed to start workflow %d: %s", index, e)
+            return None
+
+    def _collect_results(self, workflow_ids: list, timeout_s: int = 120) -> tuple:
+        """
+        Poll completed workflows and extract per-task queue_wait_time.
+        Returns (queue_times_by_task_ref, workflow_durations_ms).
+        """
+        queue_times = defaultdict(list)
+        wf_durations = []
+        pending = set(workflow_ids)
+        deadline = time.time() + timeout_s
+
+        while pending and time.time() < deadline:
+            still_pending = set()
+            for wf_id in pending:
+                try:
+                    wf = self.workflow_client.get_workflow(wf_id, include_tasks=True)
+                except Exception:
+                    still_pending.add(wf_id)
+                    continue
+
+                if wf.status not in ("COMPLETED", "FAILED", "TERMINATED", "TIMED_OUT"):
+                    still_pending.add(wf_id)
+                    continue
+
+                if wf.start_time and wf.end_time:
+                    wf_durations.append(wf.end_time - wf.start_time)
+
+                for task in (wf.tasks or []):
+                    ref = task.reference_task_name
+                    qwt = getattr(task, "queue_wait_time", None)
+                    if qwt is not None and qwt >= 0:
+                        queue_times[ref].append(qwt)
+                    elif (task.scheduled_time and task.start_time
+                          and task.start_time >= task.scheduled_time):
+                        queue_times[ref].append(task.start_time - task.scheduled_time)
+
+            pending = still_pending
+            if pending:
+                time.sleep(0.5)
+
+        if pending:
+            logger.warning("%d workflows did not complete within timeout", len(pending))
+
+        return dict(queue_times), wf_durations
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)