diff --git a/changes/907.fix.md b/changes/907.fix.md new file mode 100644 index 00000000000..7d43ce13e2f --- /dev/null +++ b/changes/907.fix.md @@ -0,0 +1 @@ +Fix occasional random mismatch of cluster hostnames and actual container hostnames in cluster sessions under the host networking mode diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 78bd6f38a24..2d03bf0c7b0 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1304,7 +1304,7 @@ async def start_session( agent_addr = item.agent_alloc_ctx.agent_addr.replace( "tcp://", "" ).split(":", maxsplit=1)[0] - cluster_ssh_port_mapping[f"{cluster_role}{index+1}"] = ( + cluster_ssh_port_mapping[item.kernel.cluster_hostname] = ( agent_addr, port, )