Skip to content

Issue for getting embedding #10

@zxshi0102

Description

@zxshi0102

Hi there,
I am trying to create the embedding for my antibody sequence with ABodyBuilder3-LM, using the following code:

use_precomputed = False
if use_precomputed:
    embedding = torch.load("/home/user/package_download/abodybuilder3/data/structures/structures_plm/6yio_H0-L0.pt")[
        "plm_embedding"
    ]
else:
    plm = ProtT5()
    embedding = plm.get_embeddings(
        [
            heavy,
        ],
        [
            light,
        ],
    )

Running this code raises the following error:

RuntimeError                              Traceback (most recent call last)
Cell In[4], line 8
      4     embedding = torch.load("/home/user/package_download/abodybuilder3/data/structures/structures_plm/6yio_H0-L0.pt")[
      5         "plm_embedding"
      6     ]
      7 else:
----> 8     plm = ProtT5()
      9     embedding = plm.get_embeddings(
     10         [
     11             heavy,
   (...)
     15         ],
     16     )
     18 print(f"{embedding.shape=}")

File <string>:8, in __init__(self, weights_dir, model_type, paired, batch_size, device_map)

File ~/package_download/abodybuilder3/src/abodybuilder3/language/model.py:55, in ProtTrans.__post_init__(self)
     52 elif self.paired and self.model_type == "t5":
     53     self.seperator_token = "</s>"
---> 55 self.trainer = Trainer(num_nodes=1, devices=1)

File ~/package_download/abodybuilder3/.venv/lib/python3.9/site-packages/lightning/pytorch/utilities/argparse.py:70, in _defaults_from_env_vars.<locals>.insert_env_defaults(self, *args, **kwargs)
     67 kwargs = dict(list(env_variables.items()) + list(kwargs.items()))
     69 # all args were already moved to kwargs
---> 70 return fn(self, **kwargs)

File ~/package_download/abodybuilder3/.venv/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py:401, in Trainer.__init__(self, accelerator, strategy, devices, num_nodes, precision, logger, callbacks, fast_dev_run, max_epochs, min_epochs, max_steps, min_steps, max_time, limit_train_batches, limit_val_batches, limit_test_batches, limit_predict_batches, overfit_batches, val_check_interval, check_val_every_n_epoch, num_sanity_val_steps, log_every_n_steps, enable_checkpointing, enable_progress_bar, enable_model_summary, accumulate_grad_batches, gradient_clip_val, gradient_clip_algorithm, deterministic, benchmark, inference_mode, use_distributed_sampler, profiler, detect_anomaly, barebones, plugins, sync_batchnorm, reload_dataloaders_every_n_epochs, default_root_dir)
    398 # init connectors
    399 self._data_connector = _DataConnector(self)
--> 401 self._accelerator_connector = _AcceleratorConnector(
    402     devices=devices,
    403     accelerator=accelerator,
    404     strategy=strategy,
    405     num_nodes=num_nodes,
    406     sync_batchnorm=sync_batchnorm,
    407     benchmark=benchmark,
    408     use_distributed_sampler=use_distributed_sampler,
    409     deterministic=deterministic,
    410     precision=precision,
    411     plugins=plugins,
    412 )
    413 self._logger_connector = _LoggerConnector(self)
    414 self._callback_connector = _CallbackConnector(self)

File ~/package_download/abodybuilder3/.venv/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/accelerator_connector.py:158, in _AcceleratorConnector.__init__(self, devices, num_nodes, accelerator, strategy, plugins, precision, sync_batchnorm, benchmark, use_distributed_sampler, deterministic)
    155 self._set_parallel_devices_and_init_accelerator()
    157 # 3. Instantiate ClusterEnvironment
--> 158 self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment()
    160 # 4. Instantiate Strategy - Part 1
    161 if self._strategy_flag == "auto":

File ~/package_download/abodybuilder3/.venv/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/accelerator_connector.py:428, in _AcceleratorConnector._choose_and_init_cluster_environment(self)
    420 for env_type in (
    421     # TorchElastic has the highest priority since it can also be used inside SLURM
    422     TorchElasticEnvironment,
   (...)
    425     MPIEnvironment,
    426 ):
    427     if env_type.detect():
--> 428         return env_type()
    429 if _LIGHTNING_BAGUA_AVAILABLE:
    430     from lightning_bagua import BaguaEnvironment

File ~/package_download/abodybuilder3/.venv/lib/python3.9/site-packages/lightning/fabric/plugins/environments/slurm.py:52, in SLURMEnvironment.__init__(self, auto_requeue, requeue_signal)
     50 self.requeue_signal = requeue_signal
     51 self._validate_srun_used()
---> 52 self._validate_srun_variables()

File ~/package_download/abodybuilder3/.venv/lib/python3.9/site-packages/lightning/fabric/plugins/environments/slurm.py:210, in SLURMEnvironment._validate_srun_variables()
    208 print(os.environ)
    209 if ntasks > 1 and "SLURM_NTASKS_PER_NODE" not in os.environ:
--> 210     raise RuntimeError(
    211         f"You set `--ntasks={ntasks}` in your SLURM bash script, but this variable is not supported."
    212         f" HINT: Use `--ntasks-per-node={ntasks}` instead."
    213     )

RuntimeError: You set `--ntasks=64` in your SLURM bash script, but this variable is not supported. HINT: Use `--ntasks-per-node=64` instead.

I tried changing every `ntasks` to `ntasks-per-node` in abodybuilder3/.venv/lib/python3.9/site-packages/lightning/fabric/plugins/environments/slurm.py, but after that the package could no longer be imported at all. I cannot find another way to fix this; could you give me some hints? Thanks!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions