From c50ac1f2311d4aedc6fd0ceaf83931f1ceffda90 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Mon, 14 Oct 2024 13:53:33 +0200 Subject: [PATCH 01/25] chore: implemented dynamic adjustment of num_workers. updated gitignore --- .gitignore | 1 + molscribe/interface.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index b440a5e..d71eaad 100755 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ tmp/ **/checkpoints/ *.png **/.DS_Store +*.egg-info \ No newline at end of file diff --git a/molscribe/interface.py b/molscribe/interface.py index 54a3a2e..2f291b7 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -1,4 +1,5 @@ import argparse +import os from typing import List import cv2 @@ -109,6 +110,12 @@ def predict_images(self, input_images: List, return_atoms_bonds=False, return_co node_symbols = [pred['chartok_coords']['symbols'] for pred in predictions] edges = [pred['edges'] for pred in predictions] + # Dynamically adjust num_workers based on the number of images in list + if len(input_images) <= 100: + self.num_workers = 1 + else: + self.num_workers = os.cpu_count() - 2 # subtract 2 for main and other processes + smiles_list, molblock_list, r_success = convert_graph_to_smiles( node_coords, node_symbols, edges, images=input_images, num_workers=self.num_workers) From 266d31c679b7db33920c1424b129b539bf93c4fb Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Fri, 25 Oct 2024 15:15:20 +0200 Subject: [PATCH 02/25] chore: bumped dependency of rdkit to 2024.3.5 --- setup.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index d008def..e8e6386 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ from distutils.core import setup -from pathlib import Path setup(name='MolScribe', version='1.1.1', @@ -13,14 +12,14 @@ python_requires='>=3.7', setup_requires=['numpy'], install_requires=[ - "numpy", - "torch>=1.11.0", - "pandas", - "matplotlib", - "opencv-python>=4.5.5.64", - "SmilesPE==0.0.3", - "OpenNMT-py==2.2.0", - "rdkit-pypi>=2021.03.2", - "albumentations==1.1.0", - "timm==0.4.12" + "numpy", + "torch>=1.11.0", + "pandas", + "matplotlib", + "opencv-python>=4.5.5.64", + "SmilesPE==0.0.3", + "OpenNMT-py==2.2.0", + "rdkit>=2024.3.5", + "albumentations==1.1.0", + "timm==0.4.12" ]) From d8346d05fd64e044c814b535feac21ac0cf5f270 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 30 Oct 2024 15:27:25 +0100 Subject: [PATCH 03/25] chore: updated verson rdkit in requirements.txt, set num_workers equal to os.cpu_count for input images greater that 100 --- molscribe/interface.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 2f291b7..5aa7e6b 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -114,7 +114,7 @@ def predict_images(self, input_images: List, return_atoms_bonds=False, return_co if len(input_images) <= 100: self.num_workers = 1 else: - self.num_workers = os.cpu_count() - 2 # subtract 2 for main and other processes + self.num_workers = os.cpu_count() smiles_list, molblock_list, r_success = convert_graph_to_smiles( node_coords, node_symbols, edges, images=input_images, num_workers=self.num_workers) diff --git a/requirements.txt b/requirements.txt index 8e6273f..5ef28c1 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -9,6 +9,6 @@ huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 OpenNMT-py==2.2.0 -rdkit-pypi>=2021.03.2 +rdkit>=2024.3.5 albumentations @ git+https://github.com/albumentations-team/albumentations@37e714fd2e326f6f88778e425f98c2de8c8d5372 timm @ git+https://github.com/rwightman/pytorch-image-models.git@54a6cca27a9a3e092a07457f5d56709da56e3cf5 \ No newline at end of file From 2c0aa8d3015ef6db1614e98d228d8feb07e4e4ee Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 12 Mar 2025 11:41:49 +0100 Subject: [PATCH 04/25] fix: fixed version of transformers>=4.5.1,<=4.47.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5ef28c1..f2a2a95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ numpy>=1.19.5 pandas>=1.2.4 matplotlib>=3.5.3 opencv-python==4.5.5.64 -transformers>=4.5.1 +transformers>=4.5.1,<=4.47.0 huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 From b77f086b0e50af2db852ac48cc78bf3884c72d2d Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 12 Mar 2025 12:07:50 +0100 Subject: [PATCH 05/25] fix: fixed versions of opencv-python>=4.10.0.84 and numpy>=1.19.5,<2.0. 
Implemented function read_requirements in setup.py for reading requirements.txt --- requirements.txt | 4 ++-- setup.py | 19 +++++++------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index f2a2a95..30d7619 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ torch torchvision -numpy>=1.19.5 +numpy>=1.19.5,<2.0 pandas>=1.2.4 matplotlib>=3.5.3 -opencv-python==4.5.5.64 +opencv-python>=4.10.0.84 transformers>=4.5.1,<=4.47.0 huggingface-hub>=0.11.0 tensorboardX diff --git a/setup.py b/setup.py index e8e6386..1f394d2 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,10 @@ from distutils.core import setup +def read_requirements(): + """Read the requirements.txt file and return a list of dependencies.""" + with open("requirements.txt", "r", encoding="utf-8") as fh: + return fh.read().splitlines() + setup(name='MolScribe', version='1.1.1', description='MolScribe', @@ -11,15 +16,5 @@ package_data={'molscribe': ['vocab/*']}, python_requires='>=3.7', setup_requires=['numpy'], - install_requires=[ - "numpy", - "torch>=1.11.0", - "pandas", - "matplotlib", - "opencv-python>=4.5.5.64", - "SmilesPE==0.0.3", - "OpenNMT-py==2.2.0", - "rdkit>=2024.3.5", - "albumentations==1.1.0", - "timm==0.4.12" - ]) + install_requires=read_requirements(), + ) From 02f8014d7a9d5159a5f19de0105448c5fa434466 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Mon, 17 Mar 2025 12:17:43 +0100 Subject: [PATCH 06/25] feat: updated requirements. Tested updated version of dependencies. 
Trying to implement new versio of albumentations --- .gitignore | 3 ++- molscribe/augment.py | 46 ++++++++++++++++++++++++++++++++++++++++++-- requirements.txt | 10 +++++----- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index d71eaad..81d4edc 100755 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ tmp/ **/checkpoints/ *.png **/.DS_Store -*.egg-info \ No newline at end of file +*.egg-info +build/ \ No newline at end of file diff --git a/molscribe/augment.py b/molscribe/augment.py index a80ebc5..85491b5 100755 --- a/molscribe/augment.py +++ b/molscribe/augment.py @@ -1,12 +1,54 @@ import albumentations as A +# from albucore import maybe_process_in_chunks +# from albumentations.augmentations.utils import angle_2pi_range from albumentations.augmentations.geometric.functional import safe_rotate_enlarged_img_size, _maybe_process_in_chunks, \ - keypoint_rotate + keypoint_rotate import cv2 import math import random import numpy as np +# @angle_2pi_range +# def keypoint_shift_scale_rotate(keypoint, angle, scale, dx, dy, rows, cols, **params): +# ( +# x, +# y, +# a, +# s, +# ) = keypoint[:4] +# height, width = rows, cols +# center = (width / 2, height / 2) +# matrix = cv2.getRotationMatrix2D(center, angle, scale) +# matrix[0, 2] += dx * width +# matrix[1, 2] += dy * height +# +# x, y = cv2.transform(np.array([[[x, y]]]), matrix).squeeze() +# angle = a + math.radians(angle) +# scale = s * scale +# +# return x, y, angle, scale +# +# +# def safe_rotate_enlarged_img_size(angle: float, rows: int, cols: int): +# +# deg_angle = abs(angle) +# +# # The rotation angle +# angle = np.deg2rad(deg_angle % 90) +# +# # The width of the frame to contain the rotated image +# r_cols = cols * np.cos(angle) + rows * np.sin(angle) +# +# # The height of the frame to contain the rotated image +# r_rows = cols * np.sin(angle) + rows * np.cos(angle) +# +# # The above calculations work as is for 0<90 degrees, and for 90<180 the cols and rows are flipped +# 
if deg_angle > 90: +# return int(r_cols), int(r_rows) +# else: +# return int(r_rows), int(r_cols) + def safe_rotate( img: np.ndarray, angle: int = 0, @@ -271,7 +313,7 @@ def normalized_grid_distortion( return A.augmentations.functional.grid_distortion(img, num_steps, xsteps, ysteps, *args, **kwargs) -class NormalizedGridDistortion(A.augmentations.transforms.GridDistortion): +class NormalizedGridDistortion(A.augmentations.geometric.transforms.GridDistortion): def apply(self, img, stepsx=(), stepsy=(), interpolation=cv2.INTER_LINEAR, **params): return normalized_grid_distortion(img, self.num_steps, stepsx, stepsy, interpolation, self.border_mode, self.value) diff --git a/requirements.txt b/requirements.txt index 30d7619..f68888f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ -torch +torch>=2.5.0 torchvision numpy>=1.19.5,<2.0 -pandas>=1.2.4 +pandas>=2.2.3 matplotlib>=3.5.3 opencv-python>=4.10.0.84 -transformers>=4.5.1,<=4.47.0 +transformers>=4.49.0 huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 OpenNMT-py==2.2.0 rdkit>=2024.3.5 -albumentations @ git+https://github.com/albumentations-team/albumentations@37e714fd2e326f6f88778e425f98c2de8c8d5372 -timm @ git+https://github.com/rwightman/pytorch-image-models.git@54a6cca27a9a3e092a07457f5d56709da56e3cf5 \ No newline at end of file +albumentations==1.1.0 +timm==0.4.12 \ No newline at end of file From bcae48517865f7e6d20664b4e3bcdf0751bbdc19 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Mon, 17 Mar 2025 12:18:03 +0100 Subject: [PATCH 07/25] feat: updated requirements. Tested updated version of dependencies. 
Trying to implement new versio of albumentations --- .gitignore | 3 +-- molscribe/augment.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 81d4edc..d71eaad 100755 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,4 @@ tmp/ **/checkpoints/ *.png **/.DS_Store -*.egg-info -build/ \ No newline at end of file +*.egg-info \ No newline at end of file diff --git a/molscribe/augment.py b/molscribe/augment.py index 85491b5..c070232 100755 --- a/molscribe/augment.py +++ b/molscribe/augment.py @@ -1,6 +1,4 @@ import albumentations as A -# from albucore import maybe_process_in_chunks -# from albumentations.augmentations.utils import angle_2pi_range from albumentations.augmentations.geometric.functional import safe_rotate_enlarged_img_size, _maybe_process_in_chunks, \ keypoint_rotate import cv2 @@ -8,7 +6,8 @@ import random import numpy as np - +# from albucore import maybe_process_in_chunks +# from albumentations.augmentations.utils import angle_2pi_range # @angle_2pi_range # def keypoint_shift_scale_rotate(keypoint, angle, scale, dx, dy, rows, cols, **params): # ( From 77fe487c31aec4a9f6e2db4c628e7df4368cfaf8 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Mon, 17 Mar 2025 12:18:32 +0100 Subject: [PATCH 08/25] feat: updated requirements. Tested updated versions of dependencies. 
--- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 30d7619..f68888f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ -torch +torch>=2.5.0 torchvision numpy>=1.19.5,<2.0 -pandas>=1.2.4 +pandas>=2.2.3 matplotlib>=3.5.3 opencv-python>=4.10.0.84 -transformers>=4.5.1,<=4.47.0 +transformers>=4.49.0 huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 OpenNMT-py==2.2.0 rdkit>=2024.3.5 -albumentations @ git+https://github.com/albumentations-team/albumentations@37e714fd2e326f6f88778e425f98c2de8c8d5372 -timm @ git+https://github.com/rwightman/pytorch-image-models.git@54a6cca27a9a3e092a07457f5d56709da56e3cf5 \ No newline at end of file +albumentations==1.1.0 +timm==0.4.12 \ No newline at end of file From 1fabf640d319706c5d701d376e87927bdf740401 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Mon, 17 Mar 2025 15:09:28 +0100 Subject: [PATCH 09/25] fix: fixed requirements for transformers>=4.5.1,<=4.47.0 --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f68888f..7150c6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,8 @@ numpy>=1.19.5,<2.0 pandas>=2.2.3 matplotlib>=3.5.3 opencv-python>=4.10.0.84 -transformers>=4.49.0 +# Only these version of transformers are compatible with timm +transformers>=4.5.1,<=4.47.0 huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 From 25fb1a92b39b1d077b71b20a680a9f809da89133 Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Mon, 17 Mar 2025 15:12:55 +0100 Subject: [PATCH 10/25] style: removed redundand commented code --- molscribe/augment.py | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/molscribe/augment.py b/molscribe/augment.py index c070232..c637fbc 100755 --- a/molscribe/augment.py +++ b/molscribe/augment.py @@ -6,48 +6,6 @@ import random import numpy as np -# from albucore import maybe_process_in_chunks -# from albumentations.augmentations.utils import angle_2pi_range -# @angle_2pi_range -# def keypoint_shift_scale_rotate(keypoint, angle, scale, dx, dy, rows, cols, **params): -# ( -# x, -# y, -# a, -# s, -# ) = keypoint[:4] -# height, width = rows, cols -# center = (width / 2, height / 2) -# matrix = cv2.getRotationMatrix2D(center, angle, scale) -# matrix[0, 2] += dx * width -# matrix[1, 2] += dy * height -# -# x, y = cv2.transform(np.array([[[x, y]]]), matrix).squeeze() -# angle = a + math.radians(angle) -# scale = s * scale -# -# return x, y, angle, scale -# -# -# def safe_rotate_enlarged_img_size(angle: float, rows: int, cols: int): -# -# deg_angle = abs(angle) -# -# # The rotation angle -# angle = np.deg2rad(deg_angle % 90) -# -# # The width of the frame to contain the rotated image -# r_cols = cols * np.cos(angle) + rows * np.sin(angle) -# -# # The height of the frame to contain the rotated image -# r_rows = cols * np.sin(angle) + rows * np.cos(angle) -# -# # The above calculations work as is for 0<90 degrees, and for 90<180 the cols and rows are flipped -# if deg_angle > 90: -# return int(r_cols), int(r_rows) -# else: -# return int(r_rows), int(r_cols) - def safe_rotate( img: np.ndarray, angle: int = 0, From f69a5e4e9aaa9234c1f1e72f9ca94f613d09be99 Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Tue, 18 Mar 2025 10:04:16 +0100 Subject: [PATCH 11/25] fix: fixed version of albumentations, used proper import of class from albumentations --- molscribe/augment.py | 2 +- requirements.txt | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/molscribe/augment.py b/molscribe/augment.py index c637fbc..22c92b2 100755 --- a/molscribe/augment.py +++ b/molscribe/augment.py @@ -270,7 +270,7 @@ def normalized_grid_distortion( return A.augmentations.functional.grid_distortion(img, num_steps, xsteps, ysteps, *args, **kwargs) -class NormalizedGridDistortion(A.augmentations.geometric.transforms.GridDistortion): +class NormalizedGridDistortion(A.augmentations.transforms.GridDistortion): def apply(self, img, stepsx=(), stepsy=(), interpolation=cv2.INTER_LINEAR, **params): return normalized_grid_distortion(img, self.num_steps, stepsx, stepsy, interpolation, self.border_mode, self.value) diff --git a/requirements.txt b/requirements.txt index 7150c6c..279dce9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,12 +4,12 @@ numpy>=1.19.5,<2.0 pandas>=2.2.3 matplotlib>=3.5.3 opencv-python>=4.10.0.84 -# Only these version of transformers are compatible with timm -transformers>=4.5.1,<=4.47.0 huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 OpenNMT-py==2.2.0 rdkit>=2024.3.5 albumentations==1.1.0 -timm==0.4.12 \ No newline at end of file +timm==0.4.12 +# Only this version of transformers are compatible with timm +transformers>=4.5.1,<=4.47.0 \ No newline at end of file From 94b07c922837f2af20eda746abd9005cefe35894 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Fri, 20 Jun 2025 15:30:02 +0200 Subject: [PATCH 12/25] chore: implemented batch_size-32 for Apple MPS. 
Adjusted and tested timm, torch, transformers versions --- molscribe/interface.py | 11 ++++++++++- requirements.txt | 6 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 5aa7e6b..e931ad5 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -42,6 +42,9 @@ def __init__(self, model_path, device=None, num_workers=1): self.encoder, self.decoder = self._get_model(args, self.tokenizer, self.device, model_states) self.transform = get_transforms(args.input_size, augment=False) self.num_workers = num_workers + # MPS-specific optimizations + self.is_mps = str(device).startswith('mps') + self.optimal_batch_size = 32 if self.is_mps else 16 def _get_args(self, args_states=None): parser = argparse.ArgumentParser() @@ -91,7 +94,13 @@ def _get_model(self, args, tokenizer, device, states): decoder.eval() return encoder, decoder - def predict_images(self, input_images: List, return_atoms_bonds=False, return_confidence=False, batch_size=16): + def predict_images(self, input_images: List, return_atoms_bonds=False, return_confidence=False, batch_size=None): + """ + Optimized version of predict_images with better MPS performance + """ + if batch_size is None: + batch_size = self.optimal_batch_size + device = self.device predictions = [] self.decoder.compute_confidence = return_confidence diff --git a/requirements.txt b/requirements.txt index 279dce9..d04bb70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -torch>=2.5.0 +torch>=2.5.0,<=2.7.1 torchvision numpy>=1.19.5,<2.0 pandas>=2.2.3 @@ -10,6 +10,6 @@ SmilesPE==0.0.3 OpenNMT-py==2.2.0 rdkit>=2024.3.5 albumentations==1.1.0 -timm==0.4.12 +timm>=0.4.12,<=0.5.4 # Only this version of transformers are compatible with timm -transformers>=4.5.1,<=4.47.0 \ No newline at end of file +transformers>=4.5.1,<=4.52.4 \ No newline at end of file From 31e4dd197c6997abc41945e4f6491b65feb2ba25 Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Tue, 15 Jul 2025 13:49:47 +0200 Subject: [PATCH 13/25] chore: bumped version to 1.2.0. Implemented fix a bug about case like (CH2)5 --- molscribe/chemistry.py | 114 ++++++++++++++++++++++++++--------------- setup.py | 2 +- 2 files changed, 74 insertions(+), 42 deletions(-) diff --git a/molscribe/chemistry.py b/molscribe/chemistry.py index 5c541b8..21ab642 100644 --- a/molscribe/chemistry.py +++ b/molscribe/chemistry.py @@ -4,10 +4,11 @@ import multiprocessing import itertools -import rdkit +import rdkit.RDLogger as RDLogger import rdkit.Chem as Chem +import rdkit.Chem.AllChem as AllChem -rdkit.RDLogger.DisableLog('rdApp.*') +RDLogger.DisableLog('rdApp.*') from SmilesPE.pretokenizer import atomwise_tokenizer @@ -366,7 +367,11 @@ def get_smiles_from_symbol(symbol, mol, atom, bonds): total_bonds = int(sum([bond.GetBondTypeAsDouble() for bond in bonds])) formula_list = _expand_carbon(_parse_formula(symbol)) - smiles, bonds_left, num_trails, success = _condensed_formula_list_to_smiles(formula_list, total_bonds, None) + # smiles, bonds_left, num_trails, success = _condensed_formula_list_to_smiles(formula_list, total_bonds, None) + if len(bonds) != 2: + smiles, bonds_left, num_trails, success = _condensed_formula_list_to_smiles(formula_list, total_bonds, None) + else: + smiles, bonds_left, num_trails, success = _condensed_formula_list_to_smiles(formula_list, 1, None) if success: return smiles return None @@ -448,47 +453,72 @@ def _need_expand(mol, mappings): atom.SetIsotope(0) continue - # remove bonds connected to abbreviation/condensed formula - adjacent_indices = [bond.GetOtherAtomIdx(i) for bond in bonds] - for adjacent_idx in adjacent_indices: - mol_w.RemoveBond(i, adjacent_idx) - - adjacent_atoms = [mol_w.GetAtomWithIdx(adjacent_idx) for adjacent_idx in adjacent_indices] - for adjacent_atom, bond in zip(adjacent_atoms, bonds): - adjacent_atom.SetNumRadicalElectrons(int(bond.GetBondTypeAsDouble())) - - # get indices of atoms of main body that 
connect to substituent - bonding_atoms_w = adjacent_indices - # assume indices are concated after combine mol_w and mol_r - bonding_atoms_r = [mol_w.GetNumAtoms()] - for atm in mol_r.GetAtoms(): - if atm.GetNumRadicalElectrons() and atm.GetIdx() > 0: - bonding_atoms_r.append(mol_w.GetNumAtoms() + atm.GetIdx()) - - # combine main body and substituent into a single molecule object - combo = Chem.CombineMols(mol_w, mol_r) - - # connect substituent to main body with bonds - mol_w = Chem.RWMol(combo) - # if len(bonding_atoms_r) == 1: # substituent uses one atom to bond to main body - for atm in bonding_atoms_w: - bond_order = mol_w.GetAtomWithIdx(atm).GetNumRadicalElectrons() - mol_w.AddBond(atm, bonding_atoms_r[0], order=BOND_TYPES[bond_order]) - - # reset radical electrons - for atm in bonding_atoms_w: - mol_w.GetAtomWithIdx(atm).SetNumRadicalElectrons(0) - for atm in bonding_atoms_r: - mol_w.GetAtomWithIdx(atm).SetNumRadicalElectrons(0) - atoms_to_remove.append(i) + if "(" in symbol and len(bonds) == 2: + # Get connection information for the current atom + connected_info = [(neighbor.GetIdx(), mol_w.GetBondBetweenAtoms(i, neighbor.GetIdx()).GetBondType()) + for neighbor in atom.GetNeighbors()] + + # Create a new molecule by combining the current molecule with the expanded fragment + combined = Chem.RWMol(AllChem.CombineMols(mol_w, mol_r)) + + # Connect the first atom of the expanded fragment to the first neighbor + combined.AddBond(connected_info[0][0], mol_w.GetNumAtoms(), connected_info[0][1]) + + # Connect the last atom of the expanded fragment to the second neighbor + combined.AddBond(connected_info[1][0], mol_w.GetNumAtoms() + mol_r.GetNumAtoms() - 1, + connected_info[1][1]) + + # Update the working molecule + mol_w = combined + + # Mark the original atom for removal + atoms_to_remove.append(i) + + else: + # remove bonds connected to abbreviation/condensed formula + adjacent_indices = [bond.GetOtherAtomIdx(i) for bond in bonds] + for adjacent_idx in 
adjacent_indices: + mol_w.RemoveBond(i, adjacent_idx) + + adjacent_atoms = [mol_w.GetAtomWithIdx(adjacent_idx) for adjacent_idx in adjacent_indices] + for adjacent_atom, bond in zip(adjacent_atoms, bonds): + adjacent_atom.SetNumRadicalElectrons(int(bond.GetBondTypeAsDouble())) + + # get indices of atoms of main body that connect to substituent + bonding_atoms_w = adjacent_indices + # assume indices are concated after combine mol_w and mol_r + bonding_atoms_r = [mol_w.GetNumAtoms()] + for atm in mol_r.GetAtoms(): + if atm.GetNumRadicalElectrons() and atm.GetIdx() > 0: + bonding_atoms_r.append(mol_w.GetNumAtoms() + atm.GetIdx()) + + # combine main body and substituent into a single molecule object + combo = Chem.CombineMols(mol_w, mol_r) + + # connect substituent to main body with bonds + mol_w = Chem.RWMol(combo) + # if len(bonding_atoms_r) == 1: # substituent uses one atom to bond to main body + for atm in bonding_atoms_w: + bond_order = mol_w.GetAtomWithIdx(atm).GetNumRadicalElectrons() + mol_w.AddBond(atm, bonding_atoms_r[0], order=BOND_TYPES[bond_order]) + + # reset radical electrons + for atm in bonding_atoms_w: + mol_w.GetAtomWithIdx(atm).SetNumRadicalElectrons(0) + for atm in bonding_atoms_r: + mol_w.GetAtomWithIdx(atm).SetNumRadicalElectrons(0) + atoms_to_remove.append(i) # Remove atom in the end, otherwise the id will change # Reverse the order and remove atoms with larger id first atoms_to_remove.sort(reverse=True) for i in atoms_to_remove: mol_w.RemoveAtom(i) - smiles = Chem.MolToSmiles(mol_w) + + # Generate the updated molfile with expanded structure mol = mol_w.GetMol() + AllChem.Compute2DCoords(mol) + smiles = Chem.MolToSmiles(mol_w) else: smiles = Chem.MolToSmiles(mol) return smiles, mol @@ -551,10 +581,12 @@ def _convert_graph_to_smiles(coords, symbols, edges, image=None, debug=False): ratio = width / height coords = [[x * ratio * 10, y * 10] for x, y in coords] mol = _verify_chirality(mol, coords, symbols, edges, debug) - # molblock is obtained 
before expanding func groups, otherwise the expanded group won't have coordinates. - # TODO: make sure molblock has the abbreviation information - pred_molblock = Chem.MolToMolBlock(mol) - pred_smiles, mol = _expand_functional_group(mol, {}, debug) + + # First expand functional groups + pred_smiles, expanded_mol = _expand_functional_group(mol, {}, debug) + + # Generate molblock from the expanded molecule + pred_molblock = Chem.MolToMolBlock(expanded_mol) success = True except Exception as e: if debug: diff --git a/setup.py b/setup.py index 1f394d2..f4b944b 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def read_requirements(): return fh.read().splitlines() setup(name='MolScribe', - version='1.1.1', + version='1.2.0', description='MolScribe', author='Yujie Qian', author_email='yujieq@csail.mit.edu', From 65e92184e8a1283b58ae49ca585185a884f0baa9 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Tue, 15 Jul 2025 15:46:51 +0200 Subject: [PATCH 14/25] updated and tested versions of transformers>=4.47.0,<=4.52.4 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d04bb70..d56081e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ rdkit>=2024.3.5 albumentations==1.1.0 timm>=0.4.12,<=0.5.4 # Only this version of transformers are compatible with timm -transformers>=4.5.1,<=4.52.4 \ No newline at end of file +transformers>=4.47.0,<=4.52.4 \ No newline at end of file From f3b9825f85b5331ca5d9f330954ba690b62dc670 Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Tue, 3 Feb 2026 10:22:03 +0100 Subject: [PATCH 15/25] chore: made changed according to CrystalEye42 fork --- molscribe/constants.py | 5 +++-- molscribe/interface.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/molscribe/constants.py b/molscribe/constants.py index 6e470c8..b199fea 100644 --- a/molscribe/constants.py +++ b/molscribe/constants.py @@ -3,7 +3,7 @@ ORGANIC_SET = {'B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I'} -RGROUP_SYMBOLS = ['R', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12', +RGROUP_SYMBOLS = ['R', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12', "R'", 'Ra', 'Rb', 'Rc', 'Rd', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar'] PLACEHOLDER_ATOMS = ["Lv", "Lu", "Nd", "Yb", "At", "Fm", "Er"] @@ -21,8 +21,9 @@ def __init__(self, abbrvs, smarts, smiles, probability): SUBSTITUTIONS: List[Substitution] = [ Substitution(['NO2', 'O2N'], '[N+](=O)[O-]', "[N+](=O)[O-]", 0.5), + Substitution(['OCOCH3'], '[#8]-[#6](=[#8])-[#6]', "[O]C(=O)C]", 0.5), Substitution(['CHO', 'OHC'], '[CH1](=O)', "[CH1](=O)", 0.5), - Substitution(['CO2Et', 'COOEt'], 'C(=O)[OH0;D2][CH2;D2][CH3]', "[C](=O)OCC", 0.5), + Substitution(['CO2Et', 'COOEt', 'EtO2C'], 'C(=O)[OH0;D2][CH2;D2][CH3]', "[C](=O)OCC", 0.5), Substitution(['OAc'], '[OH0;X2]C(=O)[CH3]', "[O]C(=O)C", 0.7), Substitution(['NHAc'], '[NH1;D2]C(=O)[CH3]', "[NH]C(=O)C", 0.7), diff --git a/molscribe/interface.py b/molscribe/interface.py index e931ad5..5d90d6f 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -114,7 +114,9 @@ def predict_images(self, input_images: List, return_atoms_bonds=False, return_co batch_predictions = self.decoder.decode(features, hiddens) predictions += batch_predictions - smiles = [pred['chartok_coords']['smiles'] for pred in predictions] + return self.convert_graph_to_output(predictions, input_images, return_confidence, return_atoms_bonds) + + def convert_graph_to_output(self, predictions, 
input_images, return_confidence=False, return_atoms_bonds=False): node_coords = [pred['chartok_coords']['coords'] for pred in predictions] node_symbols = [pred['chartok_coords']['symbols'] for pred in predictions] edges = [pred['edges'] for pred in predictions] From 89f2fc889bd444c996cdfb4a24b8af376e6ae036 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 4 Feb 2026 14:56:54 +0100 Subject: [PATCH 16/25] chore: added R-groups from MolNexTR --- molscribe/constants.py | 5 +++-- molscribe/interface.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/molscribe/constants.py b/molscribe/constants.py index b199fea..d0cde7b 100644 --- a/molscribe/constants.py +++ b/molscribe/constants.py @@ -3,8 +3,9 @@ ORGANIC_SET = {'B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I'} -RGROUP_SYMBOLS = ['R', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12', "R'", - 'Ra', 'Rb', 'Rc', 'Rd', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar'] +RGROUP_SYMBOLS = ['R', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12', + 'Ra', 'Rb', 'Rc', 'Rd', 'Rf', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar', 'Ar1', 'Ar2', 'Ari', 'Ar3', 'Ar4','Ar5','Ar6','Ar7',"R'", + '1*', '2*','3*', '4*','5*', '6*','7*', '8*','9*', '10*','11*', '12*','[a*]', '[b*]','[c*]', '[d*]',"EWG",'Nu'] PLACEHOLDER_ATOMS = ["Lv", "Lu", "Nd", "Yb", "At", "Fm", "Er"] diff --git a/molscribe/interface.py b/molscribe/interface.py index 5d90d6f..2e3a463 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -44,7 +44,7 @@ def __init__(self, model_path, device=None, num_workers=1): self.num_workers = num_workers # MPS-specific optimizations self.is_mps = str(device).startswith('mps') - self.optimal_batch_size = 32 if self.is_mps else 16 + self.optimal_batch_size = 32 if self.is_mps else 4 def _get_args(self, args_states=None): parser = argparse.ArgumentParser() From 2e98d43bca77205455e6d0f1473b1e4c096613fe Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Wed, 4 Feb 2026 19:42:34 +0100 Subject: [PATCH 17/25] chore: bumped version from 1.2.0 to 1.2.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f4b944b..daa675f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def read_requirements(): return fh.read().splitlines() setup(name='MolScribe', - version='1.2.0', + version='1.2.1', description='MolScribe', author='Yujie Qian', author_email='yujieq@csail.mit.edu', From c8d33bb9612b1e0b4fde3e9ecb0e1af2ba8c9775 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 18 Feb 2026 08:08:35 +0100 Subject: [PATCH 18/25] chore: Removed dynamically adjusted num_workers based on the number of images in list. Reduced optimal_batch_size to 4 for CPU, due to error in inferenct at 16 --- molscribe/interface.py | 6 ------ requirements.txt | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 2e3a463..2ba6bee 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -121,12 +121,6 @@ def convert_graph_to_output(self, predictions, input_images, return_confidence=F node_symbols = [pred['chartok_coords']['symbols'] for pred in predictions] edges = [pred['edges'] for pred in predictions] - # Dynamically adjust num_workers based on the number of images in list - if len(input_images) <= 100: - self.num_workers = 1 - else: - self.num_workers = os.cpu_count() - smiles_list, molblock_list, r_success = convert_graph_to_smiles( node_coords, node_symbols, edges, images=input_images, num_workers=self.num_workers) diff --git a/requirements.txt b/requirements.txt index d56081e..d34d695 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ huggingface-hub>=0.11.0 tensorboardX SmilesPE==0.0.3 OpenNMT-py==2.2.0 -rdkit>=2024.3.5 +rdkit~=2025.3.6 albumentations==1.1.0 timm>=0.4.12,<=0.5.4 # Only this version of transformers are compatible with timm From 
be9cf7e5ea9a0a2ef3d3dd102ba007b510b9d8b1 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 18 Feb 2026 08:09:10 +0100 Subject: [PATCH 19/25] chore: Removed dynamically adjusted num_workers based on the number of images in list. Reduced optimal_batch_size to 4 for CPU, due to error in inferenct at 16 --- molscribe/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 2ba6bee..b61eb5b 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -44,7 +44,7 @@ def __init__(self, model_path, device=None, num_workers=1): self.num_workers = num_workers # MPS-specific optimizations self.is_mps = str(device).startswith('mps') - self.optimal_batch_size = 32 if self.is_mps else 4 + self. = 32 if self.is_mps else 4 def _get_args(self, args_states=None): parser = argparse.ArgumentParser() From 0ce05a2bfa06681bd7cc8c45561f4ecbb0390f82 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 18 Feb 2026 08:09:53 +0100 Subject: [PATCH 20/25] chore: Removed dynamically adjusted num_workers based on the number of images in list --- molscribe/interface.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 5d90d6f..76bf263 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -121,12 +121,6 @@ def convert_graph_to_output(self, predictions, input_images, return_confidence=F node_symbols = [pred['chartok_coords']['symbols'] for pred in predictions] edges = [pred['edges'] for pred in predictions] - # Dynamically adjust num_workers based on the number of images in list - if len(input_images) <= 100: - self.num_workers = 1 - else: - self.num_workers = os.cpu_count() - smiles_list, molblock_list, r_success = convert_graph_to_smiles( node_coords, node_symbols, edges, images=input_images, num_workers=self.num_workers) From 1c6278fbcc6a576e999c61fe3d374391d4c05f2e Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Wed, 18 Feb 2026 08:34:35 +0100 Subject: [PATCH 21/25] chore: Removed unnecessary AllChem.Compute2DCoords(mol) from _expand_functional_group method --- molscribe/chemistry.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/molscribe/chemistry.py b/molscribe/chemistry.py index 21ab642..602c5aa 100644 --- a/molscribe/chemistry.py +++ b/molscribe/chemistry.py @@ -514,11 +514,8 @@ def _need_expand(mol, mappings): atoms_to_remove.sort(reverse=True) for i in atoms_to_remove: mol_w.RemoveAtom(i) - - # Generate the updated molfile with expanded structure - mol = mol_w.GetMol() - AllChem.Compute2DCoords(mol) smiles = Chem.MolToSmiles(mol_w) + mol = mol_w.GetMol() else: smiles = Chem.MolToSmiles(mol) return smiles, mol From a5dcd5ba585fbcb6e802bbf3ec61363037925144 Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 18 Feb 2026 08:40:31 +0100 Subject: [PATCH 22/25] chore: Removed unnecessary AllChem.Compute2DCoords(mol) from _expand_functional_group method --- molscribe/chemistry.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/molscribe/chemistry.py b/molscribe/chemistry.py index 21ab642..602c5aa 100644 --- a/molscribe/chemistry.py +++ b/molscribe/chemistry.py @@ -514,11 +514,8 @@ def _need_expand(mol, mappings): atoms_to_remove.sort(reverse=True) for i in atoms_to_remove: mol_w.RemoveAtom(i) - - # Generate the updated molfile with expanded structure - mol = mol_w.GetMol() - AllChem.Compute2DCoords(mol) smiles = Chem.MolToSmiles(mol_w) + mol = mol_w.GetMol() else: smiles = Chem.MolToSmiles(mol) return smiles, mol From c7b5010fb91c91df0e6094e32773970b9d18bee2 Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Wed, 18 Feb 2026 10:09:11 +0100 Subject: [PATCH 23/25] fix: added accidentally removed self.optimal_batch_size param --- molscribe/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index b61eb5b..2ba6bee 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -44,7 +44,7 @@ def __init__(self, model_path, device=None, num_workers=1): self.num_workers = num_workers # MPS-specific optimizations self.is_mps = str(device).startswith('mps') - self. = 32 if self.is_mps else 4 + self.optimal_batch_size = 32 if self.is_mps else 4 def _get_args(self, args_states=None): parser = argparse.ArgumentParser() From 564973d04850af75049d75e1d55553aa03a1db5a Mon Sep 17 00:00:00 2001 From: "Dr. Aleksei Krasnov" Date: Wed, 18 Feb 2026 10:14:05 +0100 Subject: [PATCH 24/25] fix: added param num_workers in docstring of MolScribe class --- molscribe/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 76bf263..0a08bde 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -31,7 +31,7 @@ def __init__(self, model_path, device=None, num_workers=1): MolScribe Interface :param model_path: path of the model checkpoint. :param device: torch device, defaults to be CPU. - :param multiprocessing_enabled: uses multiprocessing to parallelize parts of the inference when enabled, defaults to False. + :param num_workers: number of workers for parallel processing, defaults to 1. """ model_states = torch.load(model_path, map_location=torch.device('cpu')) args = self._get_args(model_states['args']) From 784450edcb7173e159fd365576664c11cd6a42e9 Mon Sep 17 00:00:00 2001 From: "Dr. 
Aleksei Krasnov" Date: Wed, 18 Feb 2026 10:14:45 +0100 Subject: [PATCH 25/25] fix: added param num_workers in docstring of MolScribe class --- molscribe/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/molscribe/interface.py b/molscribe/interface.py index 2ba6bee..055f862 100644 --- a/molscribe/interface.py +++ b/molscribe/interface.py @@ -31,7 +31,7 @@ def __init__(self, model_path, device=None, num_workers=1): MolScribe Interface :param model_path: path of the model checkpoint. :param device: torch device, defaults to be CPU. - :param multiprocessing_enabled: uses multiprocessing to parallelize parts of the inference when enabled, defaults to False. + :param num_workers: number of workers for parallel processing, defaults to 1. """ model_states = torch.load(model_path, map_location=torch.device('cpu')) args = self._get_args(model_states['args'])