From 908ccc1f047b501b2d62051278d0cf1a3a333944 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Wed, 7 Dec 2022 14:51:59 +0000 Subject: [PATCH 1/3] feat: Support host-mode networking in automatic TF_CONFIG --- .../runner-scripts/setup_multinode.py | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/python-tensorflow/runner-scripts/setup_multinode.py b/python-tensorflow/runner-scripts/setup_multinode.py index 181dbfc93..cbec5766f 100644 --- a/python-tensorflow/runner-scripts/setup_multinode.py +++ b/python-tensorflow/runner-scripts/setup_multinode.py @@ -1,22 +1,50 @@ import os import json +from pathlib import Path -if 'BACKENDAI_CLUSTER_HOST' in os.environ: # Start mutli-instance setup. - env = {} - env['cluster'] = {} - env['cluster']['worker'] = [] - env['cluster']['chief'] = [] # For compatibility - env['cluster']['chief'].append('main1:2220') - for container in os.environ['BACKENDAI_CLUSTER_HOSTS'].split(","): - if container != 'main1': - env['cluster']['worker'].append(container + ":2220") - env['task'] = {} - if os.environ['BACKENDAI_CLUSTER_ROLE'] == 'main': - env['task']['type'] = "chief" # For compatibility. Recent TF choose first worker as chief. - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 + +def get_ssh_port(mapping, cluster_host): + if mapping is None: + return 2220 + try: + _, port = mapping[cluster_host] # host-ip and port + return port + except KeyError: + return 2220 + + +if __name__ == "__main__": + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + try: + ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) + except FileNotFoundError: + ssh_port_mapping = None + + env = { + "cluster": { + "chief": ["main1:" + get_ssh_port(ssh_port_mapping, "main1")], + "worker": [], + }, + } + for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): + if cluster_host != "main1": + env["cluster"]["worker"].append( + cluster_host + ":" + get_ssh_port(ssh_port_mapping, cluster_host) + ) + + # TF's worker index starts from 0. + cluster_role_idx = str(int(os.environ["BACKENDAI_CLUSTER_IDX"]) - 1) + if os.environ["BACKENDAI_CLUSTER_ROLE"] == "main": + task = { + "type": "chief", + "index": cluster_role_idx, + } + else: + task = { + "type": "worker", + "index": cluster_role_idx, + } + env["task"] = task + print(json.dumps(env)) else: - env['task']['type'] = "worker" - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 - print(json.dumps(env)) -else: - print("") + print("") From 2fb496721fff828b210b5ecf8684b3e9b8b6ed20 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Wed, 7 Dec 2022 15:06:59 +0000 Subject: [PATCH 2/3] fix: type matching and formatting --- python-tensorflow/runner-scripts/setup_multinode.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python-tensorflow/runner-scripts/setup_multinode.py b/python-tensorflow/runner-scripts/setup_multinode.py index cbec5766f..84a9a4119 100644 --- a/python-tensorflow/runner-scripts/setup_multinode.py +++ b/python-tensorflow/runner-scripts/setup_multinode.py @@ -14,7 +14,7 @@ def get_ssh_port(mapping, cluster_host): if __name__ == "__main__": - if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. try: ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) except FileNotFoundError: @@ -22,14 +22,14 @@ def get_ssh_port(mapping, cluster_host): env = { "cluster": { - "chief": ["main1:" + get_ssh_port(ssh_port_mapping, "main1")], + "chief": ["main1:" + str(get_ssh_port(ssh_port_mapping, "main1"))], "worker": [], }, } for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): if cluster_host != "main1": env["cluster"]["worker"].append( - cluster_host + ":" + get_ssh_port(ssh_port_mapping, cluster_host) + cluster_host + ":" + str(get_ssh_port(ssh_port_mapping, cluster_host)) ) # TF's worker index starts from 0. @@ -48,3 +48,6 @@ def get_ssh_port(mapping, cluster_host): print(json.dumps(env)) else: print("") + + +# vim: et sts=4 sw=4 tw=105 From acd6d58fe40633dde14aba1b7e1aed5231047e13 Mon Sep 17 00:00:00 2001 From: Jonghyun Park Date: Wed, 7 Dec 2022 21:35:04 +0000 Subject: [PATCH 3/3] fix: update setup_multinode.py in other places --- python-ff/runner-scripts/setup_multinode.py | 67 ++++++++++++++----- .../runner-scripts/setup_multinode.py | 67 ++++++++++++++----- .../runner-scripts/setup_multinode.py | 64 +++++++++++++----- .../runner-scripts/setup_multinode.py | 64 +++++++++++++----- .../runner-scripts/setup_multinode.py | 67 ++++++++++++++----- 5 files changed, 245 insertions(+), 84 deletions(-) diff --git a/python-ff/runner-scripts/setup_multinode.py b/python-ff/runner-scripts/setup_multinode.py index 181dbfc93..84a9a4119 100644 --- a/python-ff/runner-scripts/setup_multinode.py +++ b/python-ff/runner-scripts/setup_multinode.py @@ -1,22 +1,53 @@ import os import json +from pathlib import Path -if 'BACKENDAI_CLUSTER_HOST' in os.environ: # Start mutli-instance setup. - env = {} - env['cluster'] = {} - env['cluster']['worker'] = [] - env['cluster']['chief'] = [] # For compatibility - env['cluster']['chief'].append('main1:2220') - for container in os.environ['BACKENDAI_CLUSTER_HOSTS'].split(","): - if container != 'main1': - env['cluster']['worker'].append(container + ":2220") - env['task'] = {} - if os.environ['BACKENDAI_CLUSTER_ROLE'] == 'main': - env['task']['type'] = "chief" # For compatibility. Recent TF choose first worker as chief. - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 + +def get_ssh_port(mapping, cluster_host): + if mapping is None: + return 2220 + try: + _, port = mapping[cluster_host] # host-ip and port + return port + except KeyError: + return 2220 + + +if __name__ == "__main__": + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + try: + ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) + except FileNotFoundError: + ssh_port_mapping = None + + env = { + "cluster": { + "chief": ["main1:" + str(get_ssh_port(ssh_port_mapping, "main1"))], + "worker": [], + }, + } + for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): + if cluster_host != "main1": + env["cluster"]["worker"].append( + cluster_host + ":" + str(get_ssh_port(ssh_port_mapping, cluster_host)) + ) + + # TF's worker index starts from 0. + cluster_role_idx = str(int(os.environ["BACKENDAI_CLUSTER_IDX"]) - 1) + if os.environ["BACKENDAI_CLUSTER_ROLE"] == "main": + task = { + "type": "chief", + "index": cluster_role_idx, + } + else: + task = { + "type": "worker", + "index": cluster_role_idx, + } + env["task"] = task + print(json.dumps(env)) else: - env['task']['type'] = "worker" - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 - print(json.dumps(env)) -else: - print("") + print("") + + +# vim: et sts=4 sw=4 tw=105 diff --git a/python-pytorch/runner-scripts/setup_multinode.py b/python-pytorch/runner-scripts/setup_multinode.py index 181dbfc93..84a9a4119 100644 --- a/python-pytorch/runner-scripts/setup_multinode.py +++ b/python-pytorch/runner-scripts/setup_multinode.py @@ -1,22 +1,53 @@ import os import json +from pathlib import Path -if 'BACKENDAI_CLUSTER_HOST' in os.environ: # Start mutli-instance setup. - env = {} - env['cluster'] = {} - env['cluster']['worker'] = [] - env['cluster']['chief'] = [] # For compatibility - env['cluster']['chief'].append('main1:2220') - for container in os.environ['BACKENDAI_CLUSTER_HOSTS'].split(","): - if container != 'main1': - env['cluster']['worker'].append(container + ":2220") - env['task'] = {} - if os.environ['BACKENDAI_CLUSTER_ROLE'] == 'main': - env['task']['type'] = "chief" # For compatibility. Recent TF choose first worker as chief. - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 + +def get_ssh_port(mapping, cluster_host): + if mapping is None: + return 2220 + try: + _, port = mapping[cluster_host] # host-ip and port + return port + except KeyError: + return 2220 + + +if __name__ == "__main__": + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + try: + ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) + except FileNotFoundError: + ssh_port_mapping = None + + env = { + "cluster": { + "chief": ["main1:" + str(get_ssh_port(ssh_port_mapping, "main1"))], + "worker": [], + }, + } + for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): + if cluster_host != "main1": + env["cluster"]["worker"].append( + cluster_host + ":" + str(get_ssh_port(ssh_port_mapping, cluster_host)) + ) + + # TF's worker index starts from 0. + cluster_role_idx = str(int(os.environ["BACKENDAI_CLUSTER_IDX"]) - 1) + if os.environ["BACKENDAI_CLUSTER_ROLE"] == "main": + task = { + "type": "chief", + "index": cluster_role_idx, + } + else: + task = { + "type": "worker", + "index": cluster_role_idx, + } + env["task"] = task + print(json.dumps(env)) else: - env['task']['type'] = "worker" - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 - print(json.dumps(env)) -else: - print("") + print("") + + +# vim: et sts=4 sw=4 tw=105 diff --git a/python-pytorchlightning/runner-scripts/setup_multinode.py b/python-pytorchlightning/runner-scripts/setup_multinode.py index 8977234b7..84a9a4119 100644 --- a/python-pytorchlightning/runner-scripts/setup_multinode.py +++ b/python-pytorchlightning/runner-scripts/setup_multinode.py @@ -1,19 +1,53 @@ import os import json +from pathlib import Path -if 'BACKENDAI_CLUSTER_HOST' in os.environ: # Start mutli-instance setup. - env = {} - env['cluster'] = {} - env['cluster']['worker'] = [] - for container in os.environ['BACKENDAI_CLUSTER_HOSTS'].split(","): - env['cluster']['worker'].append(container + ":2220") - env['task'] = {} - if os.environ['BACKENDAI_CLUSTER_ROLE'] == 'main': - env['task']['type'] = "worker" # Was chief. but recent TF choose first worker as chief. - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 + +def get_ssh_port(mapping, cluster_host): + if mapping is None: + return 2220 + try: + _, port = mapping[cluster_host] # host-ip and port + return port + except KeyError: + return 2220 + + +if __name__ == "__main__": + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + try: + ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) + except FileNotFoundError: + ssh_port_mapping = None + + env = { + "cluster": { + "chief": ["main1:" + str(get_ssh_port(ssh_port_mapping, "main1"))], + "worker": [], + }, + } + for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): + if cluster_host != "main1": + env["cluster"]["worker"].append( + cluster_host + ":" + str(get_ssh_port(ssh_port_mapping, cluster_host)) + ) + + # TF's worker index starts from 0. + cluster_role_idx = str(int(os.environ["BACKENDAI_CLUSTER_IDX"]) - 1) + if os.environ["BACKENDAI_CLUSTER_ROLE"] == "main": + task = { + "type": "chief", + "index": cluster_role_idx, + } + else: + task = { + "type": "worker", + "index": cluster_role_idx, + } + env["task"] = task + print(json.dumps(env)) else: - env['task']['type'] = "worker" - env['task']["index"] = os.environ['BACKENDAI_CLUSTER_IDX'] - print(json.dumps(env)) -else: - print("") + print("") + + +# vim: et sts=4 sw=4 tw=105 diff --git a/vendor/ngc-pytorch/runner-scripts/setup_multinode.py b/vendor/ngc-pytorch/runner-scripts/setup_multinode.py index 8977234b7..84a9a4119 100644 --- a/vendor/ngc-pytorch/runner-scripts/setup_multinode.py +++ b/vendor/ngc-pytorch/runner-scripts/setup_multinode.py @@ -1,19 +1,53 @@ import os import json +from pathlib import Path -if 'BACKENDAI_CLUSTER_HOST' in os.environ: # Start mutli-instance setup. - env = {} - env['cluster'] = {} - env['cluster']['worker'] = [] - for container in os.environ['BACKENDAI_CLUSTER_HOSTS'].split(","): - env['cluster']['worker'].append(container + ":2220") - env['task'] = {} - if os.environ['BACKENDAI_CLUSTER_ROLE'] == 'main': - env['task']['type'] = "worker" # Was chief. but recent TF choose first worker as chief. - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 + +def get_ssh_port(mapping, cluster_host): + if mapping is None: + return 2220 + try: + _, port = mapping[cluster_host] # host-ip and port + return port + except KeyError: + return 2220 + + +if __name__ == "__main__": + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + try: + ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) + except FileNotFoundError: + ssh_port_mapping = None + + env = { + "cluster": { + "chief": ["main1:" + str(get_ssh_port(ssh_port_mapping, "main1"))], + "worker": [], + }, + } + for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): + if cluster_host != "main1": + env["cluster"]["worker"].append( + cluster_host + ":" + str(get_ssh_port(ssh_port_mapping, cluster_host)) + ) + + # TF's worker index starts from 0. + cluster_role_idx = str(int(os.environ["BACKENDAI_CLUSTER_IDX"]) - 1) + if os.environ["BACKENDAI_CLUSTER_ROLE"] == "main": + task = { + "type": "chief", + "index": cluster_role_idx, + } + else: + task = { + "type": "worker", + "index": cluster_role_idx, + } + env["task"] = task + print(json.dumps(env)) else: - env['task']['type'] = "worker" - env['task']["index"] = os.environ['BACKENDAI_CLUSTER_IDX'] - print(json.dumps(env)) -else: - print("") + print("") + + +# vim: et sts=4 sw=4 tw=105 diff --git a/vendor/ngc-tensorflow/runner-scripts/setup_multinode.py b/vendor/ngc-tensorflow/runner-scripts/setup_multinode.py index 181dbfc93..84a9a4119 100644 --- a/vendor/ngc-tensorflow/runner-scripts/setup_multinode.py +++ b/vendor/ngc-tensorflow/runner-scripts/setup_multinode.py @@ -1,22 +1,53 @@ import os import json +from pathlib import Path -if 'BACKENDAI_CLUSTER_HOST' in os.environ: # Start mutli-instance setup. - env = {} - env['cluster'] = {} - env['cluster']['worker'] = [] - env['cluster']['chief'] = [] # For compatibility - env['cluster']['chief'].append('main1:2220') - for container in os.environ['BACKENDAI_CLUSTER_HOSTS'].split(","): - if container != 'main1': - env['cluster']['worker'].append(container + ":2220") - env['task'] = {} - if os.environ['BACKENDAI_CLUSTER_ROLE'] == 'main': - env['task']['type'] = "chief" # For compatibility. Recent TF choose first worker as chief. - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 + +def get_ssh_port(mapping, cluster_host): + if mapping is None: + return 2220 + try: + _, port = mapping[cluster_host] # host-ip and port + return port + except KeyError: + return 2220 + + +if __name__ == "__main__": + if "BACKENDAI_CLUSTER_HOST" in os.environ: # Start mutli-instance setup. + try: + ssh_port_mapping = json.loads(Path("/home/config/ssh/port-mapping.json").read_bytes()) + except FileNotFoundError: + ssh_port_mapping = None + + env = { + "cluster": { + "chief": ["main1:" + str(get_ssh_port(ssh_port_mapping, "main1"))], + "worker": [], + }, + } + for cluster_host in os.environ["BACKENDAI_CLUSTER_HOSTS"].split(","): + if cluster_host != "main1": + env["cluster"]["worker"].append( + cluster_host + ":" + str(get_ssh_port(ssh_port_mapping, cluster_host)) + ) + + # TF's worker index starts from 0. + cluster_role_idx = str(int(os.environ["BACKENDAI_CLUSTER_IDX"]) - 1) + if os.environ["BACKENDAI_CLUSTER_ROLE"] == "main": + task = { + "type": "chief", + "index": cluster_role_idx, + } + else: + task = { + "type": "worker", + "index": cluster_role_idx, + } + env["task"] = task + print(json.dumps(env)) else: - env['task']['type'] = "worker" - env['task']["index"] = str(int(os.environ['BACKENDAI_CLUSTER_IDX']) - 1) # Index starts from 0 - print(json.dumps(env)) -else: - print("") + print("") + + +# vim: et sts=4 sw=4 tw=105