diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..cc53d48 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,76 @@ +name: CI +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + permissions: + pull-requests: write + contents: write + checks: write + + defaults: + run: + shell: bash -el {0} + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up conda + uses: conda-incubator/setup-miniconda@v3 + with: + miniconda-version: latest + activate-environment: cf-random + python-version: '3.12' + auto-activate-base: false + + - name: Cache conda environment + uses: actions/cache@v4 + id: conda-cache + with: + path: ${{ env.CONDA }}/envs/cf-random + key: conda-${{ runner.os }}-${{ hashFiles('install.sh') }} + + - name: Install dependencies + if: steps.conda-cache.outputs.cache-hit != 'true' + run: | + chmod +x install.sh + ./install.sh + + - name: Install dev dependencies + run: pip install -e ".[dev]" + + - name: Run tests with coverage + run: | + pytest tests/ --cov=cf_random --cov-report=xml:coverage.xml --cov-report=term-missing | tee pytest-coverage.txt + + - name: Upload coverage report + if: always() + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: | + coverage.xml + pytest-coverage.txt + + - name: Build package + run: python -m build + + - name: Upload to Test PyPI + if: github.event_name == 'pull_request' + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} + run: twine upload --repository testpypi dist/* + + - name: Upload to PyPI + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: twine upload dist/* \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..518bcac --- /dev/null +++ b/.gitignore @@ 
-0,0 +1,86 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Project specific +examples/*/results/ +blind_prediction/ +successful_predictions/ +failed_predictions/ +multimer_prediction/ +test_data/ +msa_folder/ +structures_all.csv + +# Foldseek +pdb* +pdb_* +tmp/ + + +params/ +msa_folder/ +range_fs_pairs_all.txt +*.pdb +*.a3m +2oug_C-search \ No newline at end of file diff --git a/Data/Fold-switch_hits-SPEACH_AF/pdb_pairs.csv b/Data/Fold-switch_hits-SPEACH_AF/pdb_pairs.csv deleted file mode 100644 index 95a47ab..0000000 --- a/Data/Fold-switch_hits-SPEACH_AF/pdb_pairs.csv +++ /dev/null @@ -1,8 +0,0 @@ -FOLD1,FOLD2 -1kct_A,3t1p_A -5ejb_C,1wp8_C -2a73_B,3l5n_B -6c6s_D,2oug_C -3j7w_B,3j7v_E -4y0m_J,4xws_D -2pbk_B,3njq_A diff --git a/Install/install_colabbatch_linux_101624.sh b/Install/install_colabbatch_linux_101624.sh deleted file mode 100644 index 070dd35..0000000 --- a/Install/install_colabbatch_linux_101624.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -e - -type wget 2>/dev/null || { echo "wget is not installed. Please install it using apt or yum." ; exit 1 ; } - -CURRENTPATH=`pwd` -COLABFOLDDIR="${CURRENTPATH}/localcolabfold" - -mkdir -p "${COLABFOLDDIR}" -cd "${COLABFOLDDIR}" -wget -q -P . 
https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -bash ./Miniforge3-Linux-x86_64.sh -b -p "${COLABFOLDDIR}/conda" -rm Miniforge3-Linux-x86_64.sh - -source "${COLABFOLDDIR}/conda/etc/profile.d/conda.sh" -export PATH="${COLABFOLDDIR}/conda/condabin:${PATH}" -conda update -n base conda -y -conda create -p "$COLABFOLDDIR/colabfold-conda" -c conda-forge -c bioconda \ - git python=3.10 openmm==7.7.0 pdbfixer \ - kalign2=2.04 hhsuite=3.3.0 mmseqs2=15.6f452 -y -conda activate "$COLABFOLDDIR/colabfold-conda" - -# install ColabFold and Jaxlib -"$COLABFOLDDIR/colabfold-conda/bin/pip" install --no-warn-conflicts \ - "colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold" -"$COLABFOLDDIR/colabfold-conda/bin/pip" install "colabfold[alphafold]" -"$COLABFOLDDIR/colabfold-conda/bin/pip" install --upgrade "jax[cuda12]"==0.4.28 -"$COLABFOLDDIR/colabfold-conda/bin/pip" install --upgrade tensorflow -"$COLABFOLDDIR/colabfold-conda/bin/pip" install silence_tensorflow - -# Download the updater -wget -qnc -O "$COLABFOLDDIR/update_linux.sh" \ - https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/main/update_linux.sh -chmod +x "$COLABFOLDDIR/update_linux.sh" - -pushd "${COLABFOLDDIR}/colabfold-conda/lib/python3.10/site-packages/colabfold" -# Use 'Agg' for non-GUI backend -sed -i -e "s#from matplotlib import pyplot as plt#import matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt#g" plot.py -# modify the default params directory -sed -i -e "s#appdirs.user_cache_dir(__package__ or \"colabfold\")#\"${COLABFOLDDIR}/colabfold\"#g" download.py -# suppress warnings related to tensorflow -sed -i -e "s#from io import StringIO#from io import StringIO\nfrom silence_tensorflow import silence_tensorflow\nsilence_tensorflow()#g" batch.py -# remove cache directory -rm -rf __pycache__ -popd - -# Download weights -"$COLABFOLDDIR/colabfold-conda/bin/python3" -m colabfold.download -echo "Download of alphafold2 weights 
finished." -echo "-----------------------------------------" -echo "Installation of ColabFold finished." -echo "Add ${COLABFOLDDIR}/colabfold-conda/bin to your PATH environment variable to run 'colabfold_batch'." -echo -e "i.e. for Bash:\n\texport PATH=\"${COLABFOLDDIR}/colabfold-conda/bin:\$PATH\"" -echo "For more details, please run 'colabfold_batch --help'." diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..3cb37fd --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include README.md +include LICENSE.md +recursive-include cf_random/data * +recursive-include examples * diff --git a/README.md b/README.md index 60b8ea2..2f136f7 100644 --- a/README.md +++ b/README.md @@ -1,134 +1,160 @@ -# Data and code for CF-random -General installation and usage guidance of CF-random for predicting the alternative conformation and fold-switching proteins.
-To run CF-random in a Colab notebook, please use following [link](https://colab.research.google.com/drive/16pD2tUMkUx1gwDxZXcSr9WOosYp0ZU6j?authuser=0).

+# CF-random + +CF-random predicts alternative protein conformations and fold-switching behavior using AlphaFold2 with varied MSA depths. + +To run CF-random in a Colab notebook: + - Open CF-random Colab + Open CF-random Colab +--- -# Installation -We currently not support the Windows and MacOS environment.
-Installation process including Colabfold, dependencies, and Foldseek is done with following commands. +## Installation +CF-random depends on ColabFold (structure prediction) and Foldseek (structure search). A convenience script sets up the conda environment and required packages: -Now create a conda new conda environment: -``` -conda create --name CF-random python=3.10 -conda activate CF-random -pip install textalloc tmtools adjustText thefuzz mdtraj biopython seaborn MDAnalysis Colabfold -conda install conda-forge::pymol-open-source -pip3 install -U scikit-learn -``` -Once the dependencies are installed, install Foldseek. -
-``` -conda install -c conda-forge -c bioconda foldseek +```bash +chmod +x install.sh +./install.sh ``` -
+--- -# Usage -* CF-random has different prediction modes such as fold-switching default, alternative conformation, and blind mode.
-* To execute all modes of CF-random, a multiple sequence alignment (MSA) is required. To avoid the overwriting the output files, we recommend using a different folder containing MSA.
-* PDB files for both fold1 (dominant conformation) and fold2 (alternative conformation) are required for TM-score measurement with reference files. Blind mode doesn't require PDB files, but default fold-switching and alternative conformation modes do.
-* ### All required PDB files and MSA file should be in same directory with provided Python scripts. -* Please make sure that a PDB file should have a single chain, not multiple chains. If PDB file has multiple chains, CF-random would be stopped. +## Usage -``` - --fname #### | folder name having a multiple sequence alignment (MSA) - --pname #### | project name for running blind mode (only for blind mode) - --pdb1 #### | dominant reference model used to calculate TM-score with predicted models - --pdb2 #### | alternative reference model used to calculate TM-score with predicted models - --nMSA #### | the number of additional samples for predicting the structure with MSAs, default = 0 - --type #### | can choose the model type of Colabfold. e.g.) ptm, monomer, and multimer - --options ### | AC: predicting alternative conformations of protein with references, FS: predicting the fold-switching protein with references, and blind: predicting the alternative conformations or fold-switching proteins without reference PDB files. -``` -* In default mode (fold-switching and alternative conformation), CF-ramdon produces the results of TM-scores (csv and png files), plDDT, and information of selected random MSA. If CF-random predicts the both folds, generated prediction files are deposited under successed_prediction/pdb1_name and additional_sampling/pdb1_name . If not, it would not generate anything.
-* Before running the default mode of fold-switching, setting the "range_fs_pairs_all.txt" file is required. The name of reference PDB files, residue ranges of reference pdb files, and residue ranges of prediction files. ColabFold generates the residue index starting from 1, so please choose the residue range of fold-switching region correctly. CF-random reads the residue index in PDB file, make sure that selection of residue range is correct.
- examples) pdb1, pdb2, XXX-XXX, XXX-XXX, XXX-XXX, XXX-XXX (residue range of reference 1, residue range of reference 2, residue range of prediction1, resodie range of prediction2)
-* --nMSA can be applied for all options, but --nESN cannot be used for blind mode. -* In blind mode, predicted files are deposited under blind_prediction/pdb1_name . CF-random with blind mode produces the comparison result with Foldseek.
-* ### For running the foldseek in blind mode, Foldseek parameter files and running Python scripts should be in same directory.
- -* Before running the CF-random, ensure that the CF-random conda environment is activated:
-``` -conda activate CF-random -``` -
+CF-random supports three modes: -# Examples -We provide some examples how users can run the CF-random with different modes.
-First two modes such as fold-switching and alternative conformation are default modes of CF-random and the last one is a blind mode. -## 1. For CF-random with fold-switching mode.
-For this example, RfaH would be predicted with two reference structures (i.e., 2oug_C.pdb and 6c6s_D.pdb) and a MSA file. -``` -python main.py --fname 2oug_C-search/ --pdb1 2oug_C.pdb --pdb2 6c6s_D.pdb --option FS -``` -### Used input files:
-* PDB1: 2oug_C.pdb
-* PDB2: 6c6s_D.pdb
-* MSA: 2oug_C-search/0.a3m (MSA file should be in a folder)
-* range_fs_pairs_all.txt (This file is required for reading the fold-switching region in refernece and predicted structures. Users should check the region before running this mode.)
- -*This takes <30 Minutes to run on an A100 GPU (generates 200 structures total).*
- -### Generated output files:
-_Predicted files from deep and random MSAs are deposited in 'predictions_all' directory._
-_If CF-random fails to find the selected random MSA, all generated files will be in 'predictions_all' directory._
-* TM-score plot of whole structure: TMscore_fs-region_full-MSA_2oug_C.png
-* TM-score plot of fold-switching region: TMscore_full-MSA_2oug_C.png
-* TM-score plot of fold-switching region with label of prediction rank: TMscore_fs-region_full-MSA_2oug_C_label.png
-* TM-scores and plDDT scores of predictions with deep MSA: TMs_plDDT_full_all_2oug_C.csv
-* TM-scores and plDDT scores of predictions with random MSAs: TMs_plDDT_rand_all_2oug_C.csv
-* Selection of random MSA: selected_MSA-size_2oug_C.csv (When CF-random finds the MSA depth) - - MSA depth information (e.g. # = max-seq:max-seq-extra) (0 = 1:2, 1 = 2:4, 2 = 4:8, 3 = 8:16, 4 = 16:32, 5 = 32:64, 6 = 64:128)
- - -## 2. For CF-random with alternative conformation mode.
-For this mode, Lactococcal OppA would be predicted with two reference structures (i.e., 3drh.pdb and 3drf.pdb) and an MSA file.
+- **FS** — fold-switching prediction with two reference PDBs +- **AC** — alternative conformation prediction with two reference PDBs +- **blind** — alternative conformation or fold-switching without reference PDBs + +### General notes + +- FS and AC modes require two reference PDB files (fold1 and fold2). +- Each target needs its own MSA directory; do not reuse output folders across runs. +- All required PDB files, MSA files, and Python scripts must be in the same directory. +- PDB files should contain a single chain. Multi-chain PDBs may be converted automatically in some workflows, but providing single-chain PDBs avoids issues. +- FS mode requires a `range_fs_pairs_all.txt` file describing the fold-switching region (see [Fold-switching mode](#1-fold-switching-mode-fs) below for format details). ColabFold uses 1-based residue indexing; ensure ranges match your PDB/sequence. +- `--num_msa` applies to all modes; `--num_ens` applies to every mode except blind. +- Activate the conda environment before running: + +```bash +conda activate cf-random ``` -python main.py --fname 5olw_A-search --pdb1 5olw_A.pdb --pdb2 5olx_A.pdb --option AC --nMSA 5 + +### Arguments + +| Argument | Required | Description | +|---|---|---| +| `--option` | Yes | Run mode: `AC`, `FS`, `inAC`, or `blind` | +| `--fname` | Yes | MSA folder name (output of ColabSearch) | +| `--pdb1` | FS / AC | Target crystal structure PDB | +| `--pdb2` | FS / AC | Alternative crystal structure PDB | +| `--fmname` | No | Multimer MSA folder name (output of ColabSearch) | +| `--pname` | blind | Job name for blind mode output naming | +| `--num_msa` | No | Number of additional MSA seeds to run, added to the default 5 | +| `--num_ens` | No | Number of ensemble samples to generate | +| `--model_type` | No | ColabFold model type: `ptm`, `monomer`, or `multimer` | + +--- + +## Examples + +### 1. Fold-switching mode (FS) + +Predicts RfaH using two reference structures and an MSA. 
+ +```bash +cf-random --fname 2oug_C-search/ --pdb1 2oug_C.pdb --pdb2 6c6s_D.pdb --option FS ``` -### Used input files:
-* PDB1: 5olw_A.pdb
-* PDB2: 5olx_A.pdb
-* MSA: 5olw_A-search/0.a3m (MSA file should be in a folder)
- -*This takes <70 Minutes to run on an A100 GPU (generates 200 structures total; protein is large: ~250 residues).*
- -### Generated output files:
-_Predicted files from deep and random MSAs are deposited in 'predictions_all' directory._
-_If CF-random fails to find the selected random MSA, all generated files will be in 'predictions_all' directory._
-* TM-score plot of whole structure: TMscore_full-MSA_5olw_A.png
-* TM-scores and plDDT scores of predictions with deep MSA: TMs_plDDT_full_all_5olw_A.csv
-* TM-scores and plDDT scores of predictions with random MSAs: TMs_plDDT_rand_all_5olw_A.csv
-* Selection of random MSA: selected_MSA-size_3drh_A.csv (When CF-random finds the MSA depth) - - MSA depth information (e.g. # = max-seq:max-seq-extra) (0 = 1:2, 1 = 2:4, 2 = 4:8, 3 = 8:16, 4 = 16:32, 5 = 32:64, 6 = 64:128)
- -## 3. For CF-random with blind mode covering both fold-switching and alternative conformation.
+ +**Input files:** +- `2oug_C.pdb` — dominant reference +- `6c6s_D.pdb` — alternative reference +- `2oug_C-search/0.a3m` — MSA +- `range_fs_pairs_all.txt` — fold-switching region definition + +**`range_fs_pairs_all.txt` format:** + +Each line defines the fold-switching region for a pair of reference structures: +pdb1, pdb2, XXX-XXX, XXX-XXX, XXX-XXX, XXX-XXX + +Fields are: PDB1 name, PDB2 name, residue range of reference 1, residue range of reference 2, residue range of prediction 1, residue range of prediction 2. ColabFold generates residue indices starting from 1; ensure your ranges match the residue numbering in your PDB files. + +*Generates 200 structures; takes under 30 minutes on an A100 GPU.* + +**Output files** (written to `predictions_all/`): + +| File | Description | +|---|---| +| `TMscore_full-MSA_2oug_C.png` | TM-score scatter plot, whole structure | +| `TMscore_fs-region_full-MSA_2oug_C.png` | TM-score scatter plot, fold-switching region | +| `TMscore_fs-region_full-MSA_2oug_C_label.png` | Same plot with prediction rank labels | +| `TMs_plDDT_full_all_2oug_C.csv` | TM-scores and pLDDT for deep MSA predictions | +| `TMs_plDDT_rand_all_2oug_C.csv` | TM-scores and pLDDT for random MSA predictions | +| `selected_MSA-size_2oug_C.csv` | Selected MSA depth (when CF-random identifies the optimal depth) | + +MSA depth encoding: `0=1:2`, `1=2:4`, `2=4:8`, `3=8:16`, `4=16:32`, `5=32:64`, `6=64:128` + +--- + +### 2. Alternative conformation mode (AC) + +Predicts Lactococcal OppA using two reference structures and an MSA. 
+ +```bash +cf-random --fname P71447-search/ --pdb1 2wfa_A.pdb --pdb2 2wf5_A.pdb --option AC --num_msa 5 ``` -python main.py --pname Mad2_test --fname 2vfx_L-search/ --option blind + +**Input files:** +- `2wfa_A.pdb` — dominant reference +- `2wf5_A.pdb` — alternative reference +- `P71447-search/P71447_converted.a3m` — MSA + +*Generates 200 structures; takes under 70 minutes on an A100 GPU (~250 residue protein).* + +**Output files** (written to `predictions_all/`): + +| File | Description | +|---|---| +| `TMscore_full-MSA_2wfa_A.png` | TM-score scatter plot, whole structure | +| `TMs_plDDT_full_all_2wfa_A.csv` | TM-scores and pLDDT for deep MSA predictions | +| `TMs_plDDT_rand_all_2wfa_A.csv` | TM-scores and pLDDT for random MSA predictions | +| `selected_MSA-size_2wfa_A.csv` | Selected MSA depth (when CF-random identifies the optimal depth) | + +MSA depth encoding: `0=1:2`, `1=2:4`, `2=4:8`, `3=8:16`, `4=16:32`, `5=32:64`, `6=64:128` + +--- + +### 3. Blind mode + +Predicts alternative conformations or fold-switching without reference PDBs. Uses Foldseek to cluster predicted structures and identify structures of interest. + +```bash +cf-random --pname Mad2_test --fname 2vfx_L-search/ --option blind ``` -*Before running this code, make a symbolic link to the foldseek pdb libraries in the directory where you run the command above.* +> **Note:** Before running blind mode, make a symbolic link to the Foldseek PDB libraries in the directory where you run the command. + +**Input files:** +- `2vfx_L-search/0.a3m` — MSA -### Used input files:
-MSA: 2vfx_L-search/0.a3m (MSA file should be in a folder)
+*Generates 200 structures + 200 Foldseek result files; takes under 70 minutes on an A100 GPU.* +**Output files** (written to `blind_prediction/Mad2_test/`): -### Generated output files:
-_Predicted files from deep and random MSAs are deposited in 'blind_prediction' directory._
-_If user uses the option '--pname', the name of output files would be entered '--pname'._
-* List of prediction files: Mad2-structures_of_interest.csv -* The best hit list of alternative conformations: Mad2-structures_of_interest.csv -* Cluster analysis result as an image file: Mad2-cluster.png +| File | Description | +|---|---| +| `Mad2_test-cluster.png` | PCA + HDBSCAN cluster plot | +| `Mad2_test-structures_of_interest.csv` | Representative structures per cluster | +| `Mad2_test-structures_of_interest.pse` | PyMOL session with aligned representatives | -*This takes <70 Minutes to run on an A100 GPU (generates 200 structures total + 200 foldseek files).*
+--- -# How to Cite -Lee, M., Schafer, J.W., Prabakaran, J. et al. Large-scale predictions of alternative protein conformations by AlphaFold2-based sequence association. Nat Commun 16, 5622 (2025). https://doi.org/10.1038/s41467-025-60759-5 -

+## Citation -# License -Please see the LICENSE.md file. +Lee, M., Schafer, J.W., Prabakaran, J. et al. Large-scale predictions of alternative protein conformations by AlphaFold2-based sequence association. *Nat Commun* **16**, 5622 (2025). https://doi.org/10.1038/s41467-025-60759-5 +## License +See [LICENSE.md](LICENSE.md). \ No newline at end of file diff --git a/cf_random/__init__.py b/cf_random/__init__.py new file mode 100644 index 0000000..a53bd4b --- /dev/null +++ b/cf_random/__init__.py @@ -0,0 +1,35 @@ +"""CF-Random: Protein fold-switching and alternative conformation prediction. + +A comprehensive package for identifying and analyzing protein fold-switching +events and alternative conformations using AlphaFold predictions and +structural analysis tools. + +Key Features: + - Predict fold-switching regions in proteins + - Analyze alternative conformations + - Assess prediction quality via TM-scores + - Visualize structural variations + - Blind structural screening without reference structures + +Main entry point: + Use cf_random.main() to execute the prediction workflow. +""" + +__version__ = "0.2.0" +__author__ = "Myeongsang (Samuel) Lee, Pramesh Sharma" +__all__ = ["main"] + +import logging + +# Configure package-level logging +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +# Import main modules for easier access +try: + from .core.main import ( + main, + ) +except ImportError as e: + import warnings + + warnings.warn(f"Failed to import main module: {e}", ImportWarning) diff --git a/cf_random/analysis/__init__.py b/cf_random/analysis/__init__.py new file mode 100644 index 0000000..8bb21c0 --- /dev/null +++ b/cf_random/analysis/__init__.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Analysis module for structural quality metrics. + +Provides classes for computing TM-scores and pLDDT metrics focused on +fold-switching regions and structural comparisons. 
logger = logging.getLogger(__name__)

# Fallback result returned when no predicted models can be located.
ZERO_TM_SCORES = [0.0, 0.0, 0.0, 0.0, 0.0]
# Model-type string that routes _resolve_models to the multimer branch.
MULTIMER_MODEL_TYPE = "alphafold2_multimer_v3"


class BaseTMScore:
    """Base class for computing TM-scores for predicted protein models.

    For each predicted model, TM-scores are computed in both the forward
    (model→reference) and reverse (reference→model) directions against
    both reference structures. Whichever direction yields the higher maximum
    score is kept and exposed as ``self.tmscores``.
    """

    def __init__(
        self,
        pred_dir: str,
        pdb1: str,
        pdb1_name: str,
        pdb2: str,
        pdb2_name: str,
        model_type: str,
    ) -> None:
        """Store the inputs and immediately compute the TM-scores.

        Args:
            pred_dir: Glob pattern or path to prediction directory.
            pdb1: Path to first reference PDB (without extension). Stored
                only; scoring loads the reference through ``pdb1_name``.
            pdb1_name: Name of first reference structure, resolved relative
                to the current working directory when scoring.
            pdb2: Path to second reference PDB (without extension). Stored
                only; scoring loads the reference through ``pdb2_name``.
            pdb2_name: Name of second reference structure, resolved relative
                to the current working directory when scoring.
            model_type: ColabFold model type string.
        """
        self.pred_dir = pred_dir
        self.pdb1 = pdb1
        self.pdb1_name = pdb1_name
        self.pdb2 = pdb2
        self.pdb2_name = pdb2_name
        self.model_type = model_type
        self.tmscores: List[float] = self._calculate_scores()

    def _resolve_models(self) -> List[str]:
        """Resolve paths to predicted model PDB files.

        Handles both plain directory paths and glob patterns. For multimer
        models, ``rmTER*`` single-chain files are used; when none exist yet
        the multimer predictions are converted first and the search repeated.

        Returns:
            Sorted list of absolute PDB file paths with the ``.pdb``
            extension removed.
        """
        if self.model_type != MULTIMER_MODEL_TYPE:
            files_list = glob.glob(str(self.pred_dir) + "/*_unrelaxed*pdb")
        else:
            check_pattern = str(self.pred_dir) + "/rmTER*_unrelaxed*pdb"
            files_list = glob.glob(check_pattern)
            if not files_list:
                self._convert_multimer_to_single()
                files_list = glob.glob(check_pattern)

        # abspath leaves already-absolute hits alone (blindly prefixing the
        # cwd produced doubled paths), and splitext removes only the trailing
        # extension instead of every ".pdb" substring in the path. Sorting
        # makes the score order reproducible; glob order is arbitrary.
        return sorted(os.path.splitext(os.path.abspath(f))[0] for f in files_list)

    def _convert_multimer_to_single(self) -> None:
        """Convert multimer predictions to single chains."""
        matched_dirs = glob.glob(str(self.pred_dir))
        if not matched_dirs:
            logger.warning("No directories matched pattern: %s", self.pred_dir)
            return
        for pred_dir in matched_dirs:
            ConvertM2S(pred_dir, self.pdb1_name, self.pdb2_name)

    def _calculate_scores(self) -> List[float]:
        """Calculate TM-scores against both reference structures.

        Computes scores in both forward (model→ref) and reverse (ref→model)
        directions. Whichever direction yields the higher max is returned.

        Returns:
            List of TM-scores, one per predicted model, for both references
            concatenated (pdb1 scores first, then pdb2 scores). When no
            predicted models are found, a copy of ``ZERO_TM_SCORES`` is
            returned instead.
        """
        pwd = os.getcwd() + "/"
        predicted_models = self._resolve_models()

        if not predicted_models:
            logger.warning("No predicted models found for %s", self.pred_dir)
            return ZERO_TM_SCORES.copy()

        # Load both reference structures once, in (pdb1, pdb2) order.
        # NOTE(review): get_pdb_path is handed extension-less paths here, as
        # in the original code — confirm against the tmtools helper.
        references = []
        for ref_name in (self.pdb1_name, self.pdb2_name):
            ref = get_structure(get_pdb_path(pwd + ref_name))
            references.append(get_residue_data(ref))

        # forward[i] / reverse[i] collect the scores against reference i.
        forward: List[List[float]] = [[], []]
        reverse: List[List[float]] = [[], []]

        # Each model is parsed once and aligned against both references.
        for model_path in predicted_models:
            coords, seq = get_residue_data(get_structure(get_pdb_path(model_path)))
            for idx, (ref_coords, ref_seq) in enumerate(references):
                res = tm_align(coords, ref_coords, seq, ref_seq)
                forward[idx].append(round(res.tm_norm_chain1, 5))

                res = tm_align(ref_coords, coords, ref_seq, seq)
                reverse[idx].append(round(res.tm_norm_chain1, 5))

        tmscores_ord = forward[0] + forward[1]
        tmscores_rev = reverse[0] + reverse[1]

        logger.debug("TM-scores forward: %s", tmscores_ord)
        logger.debug("TM-scores reverse: %s", tmscores_rev)

        # Return whichever direction gives the higher maximum
        if max(tmscores_ord) > max(tmscores_rev):
            return tmscores_ord
        return tmscores_rev
# Regex pattern for parsing AlphaFold JSON filenames. The named group "rank"
# captures the digits after "_scores_rank_" and is read back with
# match.group("rank") in PlddtCal._process_subdirs.
RANK_PATTERN = re.compile(r".*?_scores_rank_(?P<rank>\d+)_alphafold2.*")

MULTIMER_MODEL_TYPE = "alphafold2_multimer_v3"


def read_plddt(jsonfile: str) -> np.ndarray:
    """Reads pLDDT scores from an AlphaFold prediction JSON file.

    Args:
        jsonfile: Path to the JSON file containing prediction scores.

    Returns:
        Array of the values stored under the "plddt" key, as float64.
    """
    with open(jsonfile, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    return np.array(data["plddt"], dtype=np.float64)


def calculate_average_plddt(score: np.ndarray) -> float:
    """Calculates the average pLDDT score from an array of scores.

    Args:
        score: Array of pLDDT scores.

    Returns:
        Average pLDDT score rounded to 2 decimal places.
    """
    return float(round(np.average(score), 2))


class PlddtCal:
    """Calculates average pLDDT scores for protein models across different categories.

    This class processes AlphaFold prediction JSON files to extract pLDDT scores,
    averages them per file, and writes the averages as a 5-column table to
    ``plddt_{category}_{pdb_name}.csv`` in the current working directory.
    """

    def __init__(
        self,
        sub_list: List[str],
        category: str,
        pdb_name: str,
        num_msa: int,
        num_ens: int,
        model_type: str,
    ) -> None:
        """Initializes pLDDT calculation for given subdirectories and parameters.

        Args:
            sub_list: List of subdirectory paths to process.
            category: MSA category ('full-MSA', 'additional-MSA', 'random-MSA').
            pdb_name: Name of the PDB structure.
            num_msa: Number of MSA sequences.
            num_ens: Number of ensemble models.
            model_type: Type of AlphaFold model. It does not influence the
                table shape; it is only echoed in the error message.

        Raises:
            ValueError: If ``sub_list`` is empty, ``category`` is unknown, or
                the number of JSON files found does not fill the table
                expected for ``category``.
        """
        if not sub_list:
            raise ValueError("No subdirectories provided for pLDDT calculation")

        # Validate the category before doing any file I/O.
        rows = self._expected_rows(category, num_msa, num_ens, model_type)

        logger.info("Processing pLDDT scores for %d subdirectories", len(sub_list))
        logger.debug("Subdirectories: %s", sub_list)

        values_all, _, cnt = self._process_subdirs(sub_list)

        if category == "full-MSA":
            cnt = cnt // 5

        logger.debug("Processed %d files", cnt)

        # Fail with an explicit message instead of numpy's generic reshape error.
        if values_all.size != rows * 5:
            raise ValueError(
                f"Expected {rows * 5} pLDDT values for {category}, found {values_all.size}"
            )
        values_all_resh = values_all.reshape(rows, 5)

        logger.info("Calculated pLDDT scores")
        output_file = f"plddt_{category}_{pdb_name}.csv"
        np.savetxt(output_file, values_all_resh, fmt="%2.3f")
        logger.info("Saved pLDDT results to %s", output_file)

    @staticmethod
    def _expected_rows(category: str, num_msa: int, num_ens: int, model_type: str) -> int:
        """Returns the number of 5-value rows the output table has for a category."""
        if category == "full-MSA":
            return num_msa + 5
        if category == "additional-MSA":
            return num_ens + 20
        if category == "random-MSA":
            return (num_msa + 5) * 7
        raise ValueError(f"Unknown category/model_type combination: {category}/{model_type}")

    def _process_subdirs(self, sub_list: List[str]) -> tuple[np.ndarray, Dict[str, float], int]:
        """Processes subdirectories to extract pLDDT scores.

        Args:
            sub_list: List of subdirectory paths. Entries that are not
                directories are skipped with a warning.

        Returns:
            Tuple of (values_all, out_dict_all, cnt) where values_all is a numpy
            array with one average per JSON file, out_dict_all maps
            "<subdir name>:<rank>" to the first average seen for that key, and
            cnt is the number of JSON files read.
        """
        out_dict_all: Dict[str, float] = {}
        averages: List[float] = []

        for subdir in sub_list:
            subdir_path = Path(subdir)
            if not subdir_path.is_dir():
                logger.warning("Skipping non-directory: %s", subdir)
                continue

            subdir_name = subdir_path.name
            # Sorted so the value order is reproducible; glob order is arbitrary.
            for jsonfile in sorted(subdir_path.glob("*_scores*json")):
                values = calculate_average_plddt(read_plddt(str(jsonfile)))
                averages.append(values)

                match = RANK_PATTERN.match(jsonfile.stem)
                # "000" marks files whose name carries no rank.
                rank = match.group("rank") if match else "000"
                out_dict_all.setdefault(f"{subdir_name}:{rank}", values)

        values_all = np.array(averages, dtype=np.float64)
        return values_all, out_dict_all, len(averages)
def _get_coords(self, pdb_file: Union[str, Path], fs_range: str) -> Tuple[np.ndarray, str]:
    """Extract CA coordinates and sequence for a fold-switching region.

    The first CA atom met for each residue number inside ``fs_range`` is kept,
    and both outputs are emitted in ascending residue-number order. This keeps
    the coordinate array and the sequence the same length and order even when
    the file repeats a residue number (several chains or models, insertion
    codes) or lists residues out of order.

    Args:
        pdb_file (str or Path): Path to the PDB file.
        fs_range (str): Inclusive residue range "start-stop", e.g. "112-162".

    Returns:
        tuple: (coords_np, seq) where coords_np is a numpy array of CA
        coordinates (N x 3, empty when no residue matches) and seq is the
        one-letter amino acid sequence string of the same N residues.

    Raises:
        KeyError: If a selected residue name is not a key of AA3TO1.
    """
    struct = PDBParser(QUIET=True).get_structure("x", str(pdb_file))

    start, stop = fs_range.split("-")
    first, last = int(start), int(stop)

    # residue number -> ([x, y, z], one-letter code) of its first CA atom
    ca_by_res: Dict[int, Tuple[List[float], str]] = {}
    for atom in struct.get_atoms():
        if atom.get_name() != "CA":
            continue
        residue = atom.get_parent()
        res_id = residue.get_id()[1]
        if first <= res_id <= last and res_id not in ca_by_res:
            x, y, z = atom.get_coord()
            ca_by_res[res_id] = ([x, y, z], AA3TO1[residue.get_resname()])

    ordered = [ca_by_res[res_id] for res_id in sorted(ca_by_res)]
    coords_np = np.array([xyz for xyz, _ in ordered])
    seq = "".join(code for _, code in ordered)

    logger.debug("Extracted %d CA atoms from %s (range %s)", len(coords_np), pdb_file, fs_range)
    return coords_np, seq
+ """ + model_files = sorted(glob.glob(str(pred_dir) + f"/{self.model_glob}")) + + if not model_files: + logger.warning("No unrelaxed model files found in %s", pred_dir) + return [0.0, 0.0, 0.0, 0.0, 0.0] + + tmscores_fwd: List[float] = [] + tmscores_rev: List[float] = [] + + for model in model_files: + coords2, seq2 = self._get_coords(model, res_range) + tmscores_fwd.append(round(tm_align(coords1, coords2, seq1, seq2).tm_norm_chain1, 2)) + tmscores_rev.append(round(tm_align(coords2, coords1, seq2, seq1).tm_norm_chain1, 2)) + + tmscores = tmscores_fwd if np.max(tmscores_fwd) > np.max(tmscores_rev) else tmscores_rev + logger.debug("TM-scores for %s: %s", pred_dir, tmscores) + return tmscores + + def _run_for_models( + self, + pdb_file1: Union[str, Path], + pdb_file2: Union[str, Path], + pred_path: Union[str, Path], + pred_range: str, + res_range1: str, + res_range2: str, + ) -> None: + """Compares predicted models against both reference PDB structures. + + Args: + pdb_file1 (str or Path): Path to the first reference PDB (Fold1). + pdb_file2 (str or Path): Path to the second reference PDB (Fold2). + pred_path (str or Path): Path or glob pattern to predicted model directories. + pred_range (str): Residue range for fold-switching in predicted models. + res_range1 (str): Residue range for fold-switching in pdb_file1. + res_range2 (str): Residue range for fold-switching in pdb_file2. 
+ """ + all_subdirs = glob.glob(str(pred_path)) + + if not all_subdirs: + logger.warning("No prediction subdirectories matched pattern: %s", pred_path) + return + + logger.info( + "Found %d prediction director%s under %s", + len(all_subdirs), + "y" if len(all_subdirs) == 1 else "ies", + pred_path, + ) + + coords1, seq1 = self._get_coords(pdb_file1, res_range1) + coords2, seq2 = self._get_coords(pdb_file2, res_range2) + + tmscores_fs: List[List[float]] = [] + + for subdir in all_subdirs: + preddir = Path(subdir) + if not preddir.exists(): + logger.debug("Skipping missing directory: %s", preddir) + continue + tmscores_fs.append(self._get_tmscore(coords1, seq1, preddir, pred_range)) + + for subdir in all_subdirs: + preddir = Path(subdir) + if not preddir.exists(): + logger.debug("Skipping missing directory: %s", preddir) + continue + tmscores_fs.append(self._get_tmscore(coords2, seq2, preddir, pred_range)) + + self.tmscores_fs = np.array(tmscores_fs) + logger.info("TM-score array shape: %s", self.tmscores_fs.shape) diff --git a/cf_random/analysis/cal_tmscore_fs_multimer.py b/cf_random/analysis/cal_tmscore_fs_multimer.py new file mode 100644 index 0000000..c105a19 --- /dev/null +++ b/cf_random/analysis/cal_tmscore_fs_multimer.py @@ -0,0 +1,223 @@ +#!/bin/env python3 +# -*- coding: utf-8 -*- +"""Multimer-aware TM-score calculations for fold-switching regions. + +Provides utilities to compute TM-scores for multimeric predicted models +and compare fold-switching regions to reference structures. +""" + +import glob +import logging +import os +import sys +from pathlib import ( + Path, +) +from typing import ( + Dict, + List, + Tuple, + Union, +) + +import numpy as np +from Bio.PDB import ( + PDBParser, +) +from tmtools import ( + tm_align, +) + +from cf_random.utils.constants import ( + AA3TO1, +) + +logger = logging.getLogger(__name__) + + +class TMScoreFSMulti: + """Calculates TM-scores for fold-switching regions in multimeric protein structures. 
def get_coords(self, pdb_file: Union[str, Path], fs_range: str) -> Tuple[np.ndarray, str]:
    """Extract CA coordinates and sequence for the fold-switching region of a PDB file.

    Only the first CA atom met for each residue number in ``fs_range`` is
    used, and coordinates and sequence are both returned in ascending
    residue-number order. The two outputs therefore always describe the same
    residues, even if the file repeats residue numbers (several chains or
    models, insertion codes) or stores residues out of order.

    Args:
        pdb_file (str or Path): Path to the PDB file.
        fs_range (str): Inclusive residue range "start-stop", e.g. "112-162".

    Returns:
        tuple: (coords_np, seq) where coords_np is a numpy array of CA
        coordinates (N x 3, empty when nothing matches) and seq is the
        one-letter amino acid sequence of the same N residues.

    Raises:
        KeyError: If a selected residue name is not a key of AA3TO1.
    """
    pdb_parser = PDBParser(QUIET=True)
    struct = pdb_parser.get_structure("x", str(pdb_file))

    start, stop = fs_range.split("-")
    low, high = int(start), int(stop)

    # residue number -> ([x, y, z], one-letter code) taken from its first CA
    picked: Dict[int, Tuple[List[float], str]] = {}
    for atom in struct.get_atoms():
        if atom.get_name() != "CA":
            continue
        residue = atom.get_parent()
        res_id = residue.get_id()[1]
        if res_id < low or res_id > high or res_id in picked:
            continue
        x, y, z = atom.get_coord()
        picked[res_id] = ([x, y, z], AA3TO1[residue.get_resname()])

    in_order = [picked[res_id] for res_id in sorted(picked)]
    coords_np = np.array([xyz for xyz, _ in in_order])
    seq = "".join(letter for _, letter in in_order)

    logger.debug("Extracted %d CA atoms from %s (range %s)", len(coords_np), pdb_file, fs_range)
    return coords_np, seq
def run_for_models(
    self,
    pdb_file1: Union[str, Path],
    pdb_file2: Union[str, Path],
    data_dir: Union[str, Path],
    pred_range: str,
    res_range1: str,
    res_range2: str,
) -> None:
    """Compare predicted models against both original PDB structures.

    For every directory matched by ``data_dir`` two rows are appended to the
    result: the scores against ``pdb_file1`` followed by the scores against
    ``pdb_file2``. The rows are stored as a numpy array in
    ``self.tmscores_fs``.

    When ``data_dir`` matches nothing, a warning is logged and
    ``self.tmscores_fs`` keeps its previous value, or becomes ``None`` if it
    was never set, so that reading the attribute afterwards does not raise
    ``AttributeError``.

    Args:
        pdb_file1 (str or Path): Path to first PDB structure (Fold1).
        pdb_file2 (str or Path): Path to second PDB structure (Fold2).
        data_dir (str or Path): Path or glob pattern of predicted model directories.
        pred_range (str): Residue range for fold-switching in predicted models.
        res_range1 (str): Residue range for fold-switching in pdb_file1.
        res_range2 (str): Residue range for fold-switching in pdb_file2.

    Returns:
        None: Stores results in self.tmscores_fs attribute.
    """
    all_sub_dir_paths = glob.glob(str(data_dir))

    if not all_sub_dir_paths:
        logger.warning("No prediction subdirectories matched pattern: %s", data_dir)
        # Guarantee the documented attribute exists even without results.
        self.tmscores_fs = getattr(self, "tmscores_fs", None)
        return

    logger.info(
        "Found %d prediction director%s under %s",
        len(all_sub_dir_paths),
        "y" if len(all_sub_dir_paths) == 1 else "ies",
        data_dir,
    )

    coords1, seq1 = self.get_coords(pdb_file1, res_range1)
    coords2, seq2 = self.get_coords(pdb_file2, res_range2)

    tmscores_fs: List[List[float]] = []

    for subdir in all_sub_dir_paths:
        preddir = Path(subdir)
        if not preddir.exists():
            logger.debug("Skipping missing directory: %s", preddir)
            continue

        # Row order per directory: Fold1 scores first, then Fold2 scores.
        tmscores_fs.append(self.get_tmscore(coords1, seq1, preddir, pred_range))
        tmscores_fs.append(self.get_tmscore(coords2, seq2, preddir, pred_range))

    self.tmscores_fs = np.array(tmscores_fs)
    logger.info("TM-score array shape: %s", self.tmscores_fs.shape)
def get_coords(self, pdb_file: Union[str, Path], fs_range: str) -> Tuple[np.ndarray, str]:
    """Extract CA coordinates and sequence for the fold-switching region of a PDB.

    One CA atom is taken per residue number inside ``fs_range`` (the first
    one encountered), and coordinates and sequence are both ordered by
    ascending residue number. This keeps the coordinate rows and the sequence
    letters aligned one-to-one when a residue number occurs more than once in
    the file (several chains or models, insertion codes) or residues are
    stored out of order.

    Args:
        pdb_file (str or Path): Path to the PDB file.
        fs_range (str): Inclusive residue range "start-stop", e.g. "112-162".

    Returns:
        tuple: (coords_np, seq) where coords_np is numpy array of CA
        coordinates (N x 3, empty when nothing matches) and seq is the
        one-letter amino acid sequence string for the same N residues.

    Raises:
        KeyError: If a selected residue name is not a key of AA3TO1.
    """
    pdb_parser = PDBParser(QUIET=True)
    struct = pdb_parser.get_structure("x", str(pdb_file))

    start, stop = fs_range.split("-")
    lower, upper = int(start), int(stop)

    # residue number -> ([x, y, z], one-letter code) from the first CA seen
    first_ca: Dict[int, Tuple[List[float], str]] = {}
    for atom in struct.get_atoms():
        if atom.get_name() != "CA":
            continue
        residue = atom.get_parent()
        res_id = residue.get_id()[1]
        if lower <= res_id <= upper and res_id not in first_ca:
            x, y, z = atom.get_coord()
            first_ca[res_id] = ([x, y, z], AA3TO1[residue.get_resname()])

    by_number = [first_ca[res_id] for res_id in sorted(first_ca)]
    coords_np = np.array([xyz for xyz, _ in by_number])
    seq = "".join(letter for _, letter in by_number)

    logger.debug("Extracted %d CA atoms from %s (range %s)", len(coords_np), pdb_file, fs_range)
    return coords_np, seq
def run_for_models(
    self,
    pdb_file1: Union[str, Path],
    pdb_file2: Union[str, Path],
    data_dir: Union[str, Path],
    pred_range: str,
    res_range1: str,
    res_range2: str,
) -> None:
    """Compare predicted models against both original PDB structures.

    Each directory matched by ``data_dir`` contributes two consecutive rows
    to ``self.tmscores_fs``: scores against ``pdb_file1`` (Fold1), then
    scores against ``pdb_file2`` (Fold2).

    If ``data_dir`` matches no directory, a warning is logged and
    ``self.tmscores_fs`` keeps its previous value, or is set to ``None`` when
    it was never assigned, so the attribute always exists after this call.

    Args:
        pdb_file1 (str or Path): Path to first PDB structure (Fold1).
        pdb_file2 (str or Path): Path to second PDB structure (Fold2).
        data_dir (str or Path): Path or glob pattern of predicted model directories.
        pred_range (str): Residue range for fold-switching in predicted models.
        res_range1 (str): Residue range for fold-switching in pdb_file1.
        res_range2 (str): Residue range for fold-switching in pdb_file2.

    Returns:
        None: Stores results in self.tmscores_fs attribute.
    """
    all_sub_dir_paths = glob.glob(str(data_dir))

    if not all_sub_dir_paths:
        logger.warning("No prediction subdirectories matched pattern: %s", data_dir)
        # Without this the attribute would not exist at all on a fresh instance.
        self.tmscores_fs = getattr(self, "tmscores_fs", None)
        return

    logger.info(
        "Found %d prediction director%s under %s",
        len(all_sub_dir_paths),
        "y" if len(all_sub_dir_paths) == 1 else "ies",
        data_dir,
    )

    coords1, seq1 = self.get_coords(pdb_file1, res_range1)
    coords2, seq2 = self.get_coords(pdb_file2, res_range2)

    tmscores_fs: List[List[float]] = []

    for subdir in all_sub_dir_paths:
        preddir = Path(subdir)
        if not preddir.exists():
            logger.debug("Skipping missing directory: %s", preddir)
            continue

        # Fold1 row first, Fold2 row second for every prediction directory.
        tmscores_fs.append(self.get_tmscore(coords1, seq1, preddir, pred_range))
        tmscores_fs.append(self.get_tmscore(coords2, seq2, preddir, pred_range))

    self.tmscores_fs = np.array(tmscores_fs)
    logger.info("TM-score array shape: %s", self.tmscores_fs.shape)
def select_size(
    self,
    tmscores_random: np.ndarray,
    pdb1_name: str,
    pdb2_name: str,
    alt_name: str,
    num_seeds: int,
) -> None:
    """Select the optimal shallow MSA depth index.

    The flat score array is viewed as a (14, num_seeds * 5) matrix in which
    even rows belong to ``pdb1_name`` and odd rows to ``pdb2_name``, one pair
    of rows per shallow MSA depth. The depth whose alternative-structure row
    has the largest score sum is chosen (first one wins on ties) and stored
    in ``self.selection``, provided at least one score in that row reaches
    the 0.5 threshold.

    Args:
        tmscores_random: Flat array of all shallow-MSA TM-scores.
        pdb1_name: Name of first reference structure.
        pdb2_name: Name of second reference structure.
        alt_name: Name of the alternative conformation structure; must equal
            ``pdb1_name`` or ``pdb2_name``.
        num_seeds: Number of prediction seeds.

    Raises:
        RuntimeError: If ``alt_name`` matches neither reference name, or if
            no model of the selected depth reaches the 0.5 TM-score threshold.
        ValueError: If ``tmscores_random`` cannot be reshaped to
            (14, num_seeds * 5).
    """
    scores = tmscores_random.reshape(14, num_seeds * 5)

    # Row parity of the alternative structure inside each depth pair.
    if alt_name == pdb2_name:
        offset = 1
    elif alt_name == pdb1_name:
        offset = 0
    else:
        # Previously this case ended in the generic "Predictions are bad"
        # error; report the real cause with the same exception type.
        raise RuntimeError(
            f"Cannot select shallow MSA size: alternative name {alt_name!r} "
            f"matches neither {pdb1_name!r} nor {pdb2_name!r}"
        )

    alt_rows = scores[offset::2].astype(float)
    group_sums = np.array([row.sum() for row in alt_rows])
    location = int(np.argmax(group_sums))

    if not np.any(scores[2 * location + offset, :] >= 0.5):
        raise RuntimeError(
            "Predictions are bad: no alternative-conformation models exceed "
            "the 0.5 TM-score threshold"
        )

    self.selection = location
    logger.info("Selected shallow MSA index %s for %s", self.selection, pdb1_name)
self._determine_alternative(np.average(full_scores_array, axis=1)) + + logger.info( + "Reference: %s Alternative: %s", + self.pdb1_name if alt_name == self.pdb2_name else self.pdb2_name, + alt_name, + ) + np.savetxt(f"TMScore_full-MSA_{self.pdb1_name}.csv", full_scores_array, fmt="%2.3f") + + # Shallow random MSA TM-scores + max_msa = 1 + ext_msa = 2 + tmscores_random: List[float] = [] + last_shallow: Optional[TMScore] = None + + for multi in MSA_MULTIPLIERS: + max_msa *= multi + ext_msa *= multi + + pred_dir = ( + str(PREDICTIONS_ROOT / self.pdb1_name) + + f"/{self.pdb1_name}_predicted_models_rand_*" + + f"_max_{max_msa}_ext_{ext_msa}/" + ) + logger.debug("Shallow MSA dir pattern: %s", pred_dir) + + shallow = TMScore( + pred_dir, + self.pdb1, + self.pdb1_name, + self.pdb2, + self.pdb2_name, + self.model_type, + ) + tmscores_random = list(np.append(tmscores_random, shallow.tmscores)) + last_shallow = shallow + + random_array = np.asarray(tmscores_random, dtype=float) + tmscores_random_reshape = random_array.reshape(14, num_seeds * 5) + + # Extract alternative rows for quality check + tmscores_random_alter = self._extract_alternative_rows( + tmscores_random_reshape, alt_name, self.pdb1_name, self.pdb2_name + ) + if np.all(tmscores_random_alter < 0.5): + raise RuntimeError("All shallow predictions are failed") + + logger.info("Finding optimal size of random MSA...") + last_shallow.select_size(random_array, self.pdb1_name, self.pdb2_name, alt_name, num_seeds) + self.size_selection = [last_shallow.selection] + logger.info("Selected MSA size index: %s", self.size_selection) + + np.savetxt(f"TMScore_random-MSA_{self.pdb1_name}.csv", tmscores_random_reshape, fmt="%2.3f") + + def _determine_alternative(self, reference_scores: np.ndarray) -> str: + """Return the name of the alternative conformation structure.""" + if reference_scores[0] >= reference_scores[1]: + return self.pdb2_name + return self.pdb1_name + + @staticmethod + def _extract_alternative_rows( + matrix: 
np.ndarray, + alt_name: str, + pdb1_name: str, + pdb2_name: str, + ) -> np.ndarray: + """Extract rows corresponding to the alternative structure.""" + if alt_name == pdb2_name: + return matrix[1::2, :] + return matrix[0::2, :] + + def _move_failed_full_outputs(self) -> None: + """Move failed full-MSA prediction folders to failed_predictions/.""" + FAILED_ROOT.mkdir(parents=True, exist_ok=True) + pattern = ( + str(PREDICTIONS_ROOT / self.pdb1_name) + + f"/{self.pdb1_name}_predicted_models_full_rand_*" + ) + for candidate in glob.glob(pattern): + destination = FAILED_ROOT / self.pdb1_name / Path(candidate).name + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.move(candidate, str(destination)) + logger.info("Moved failed prediction %s -> %s", candidate, destination) + + def _evaluate_multimer(self) -> None: + """Run multimer TM-score evaluation pipeline.""" + num_seeds = 5 + self.num_msa + pdb1_basename = self.pdb1_name.split("/")[-1] + + # Full MSA whole-structure TM-scores + full_pred_dir = ( + str(PREDICTIONS_ROOT / self.pdb1_name) + + f"/{pdb1_basename}_predicted_models_full_rand_*" + ) + msa_full = TMScore( + full_pred_dir, + self.pdb1, + self.pdb1_name, + self.pdb2, + self.pdb2_name, + self.model_type, + ) + full_tmscore_flat = np.asarray(msa_full.tmscores, dtype=float) + n_cols = full_tmscore_flat.size // 2 + full_scores_array = full_tmscore_flat.reshape(2, n_cols) + + # Quality check + if np.all(full_scores_array[0, :] < 0.5) and np.all(full_scores_array[1, :] < 0.5): + self._move_failed_full_outputs() + raise RuntimeError("All predictions with deep MSA are failed") + + alt_name = self._determine_alternative(np.average(full_scores_array, axis=1)) + logger.info( + "Reference: %s Alternative: %s", + self.pdb1_name if alt_name == self.pdb2_name else self.pdb2_name, + alt_name, + ) + np.savetxt(f"TMScore_full-MSA_{self.pdb1_name}.csv", full_scores_array, fmt="%2.3f") + + # Shallow random MSA TM-scores + max_msa = 1 + ext_msa = 2 + tmscores_random: 
List[float] = [] + last_shallow: Optional[TMScore] = None + + for multi in MSA_MULTIPLIERS: + max_msa *= multi + ext_msa *= multi + + pred_dir = ( + str(PREDICTIONS_ROOT / self.pdb1_name) + + f"/{self.pdb1_name}_predicted_models_rand_*" + + f"_max_{max_msa}_ext_{ext_msa}/" + ) + logger.debug("Shallow MSA dir pattern: %s", pred_dir) + + shallow = TMScore( + pred_dir, + self.pdb1, + self.pdb1_name, + self.pdb2, + self.pdb2_name, + self.model_type, + ) + tmscores_random = list(np.append(tmscores_random, shallow.tmscores)) + last_shallow = shallow + + random_array = np.asarray(tmscores_random, dtype=float) + tmscores_random_reshape = random_array.reshape(14, n_cols) + + tmscores_random_alter = self._extract_alternative_rows( + tmscores_random_reshape, alt_name, self.pdb1_name, self.pdb2_name + ) + if np.all(tmscores_random_alter < 0.5): + raise RuntimeError("All shallow predictions are failed") + + logger.info("Finding optimal size of random MSA...") + last_shallow.select_size(random_array, self.pdb1_name, self.pdb2_name, alt_name, num_seeds) + self.size_selection = [last_shallow.selection] + logger.info("Selected MSA size index: %s", self.size_selection) + + np.savetxt(f"TMScore_random-MSA_{self.pdb1_name}.csv", tmscores_random_reshape, fmt="%2.3f") diff --git a/cf_random/analysis/tmscore_all_var_fs.py b/cf_random/analysis/tmscore_all_var_fs.py new file mode 100644 index 0000000..3d43e59 --- /dev/null +++ b/cf_random/analysis/tmscore_all_var_fs.py @@ -0,0 +1,475 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""TM-score analysis for fold-switching prediction workflows.""" + +import glob +import shutil +from pathlib import ( + Path, +) +from typing import ( + List, + Optional, +) + +import numpy as np + +from ..analysis.cal_tmscore_fs_multimer import ( + TMScoreFSMulti, +) +from .base import ( + MULTIMER_MODEL_TYPE, + BaseTMScore, + logger, +) +from .cal_tmscore_fs_flmsa import ( + TMScoreFS, +) + +PREDICTIONS_ROOT = Path("predictions_all") +FAILED_ROOT = 
class TMScore(BaseTMScore):
    """Compute whole-structure TM-scores for a prediction set.

    Inherits forward/reverse scoring, glob resolution, and multimer
    conversion from BaseTMScore. Adds fold-switching select_size logic.
    """

    def select_size(
        self,
        tmscores_random: np.ndarray,
        tmscores_fs_random: np.ndarray,
        pdb1_name: str,
        pdb2_name: str,
        alt_name: str,
        num_seeds: int,
    ) -> None:
        """Select the shallow MSA depth using whole-structure and FS scores.

        Both inputs are viewed as ``(2 * len(MSA_MULTIPLIERS), num_seeds * 5)``
        matrices in which every MSA depth owns two consecutive rows; the odd
        row of each pair is used when ``alt_name == pdb2_name``, the even row
        otherwise.

        1. Sum the alternative-structure rows of the whole-structure matrix
           and take the depth with the largest sum.
        2. Primary check: that row holds at least one whole-structure score
           >= 0.5 and at least one FS score >= 0.5.
        3. Fallback (only when the row holds some FS score < 0.5): take the
           first depth whose row pair contains a whole-structure score >= 0.4
           and an FS score >= 0.5.

        The chosen depth index is stored in ``self.selection``.

        Args:
            tmscores_random: Shallow whole-structure TM-scores (any shape that
                reshapes to the matrix described above).
            tmscores_fs_random: Shallow FS-region TM-scores, same layout.
            pdb1_name: Name of first reference structure.
            pdb2_name: Name of second reference structure.
            alt_name: Name of the alternative conformation structure.
            num_seeds: Number of prediction seeds.

        Raises:
            RuntimeError: If no viable shallow MSA selection can be found.
        """
        n_groups = len(MSA_MULTIPLIERS)
        n_cols = num_seeds * 5
        whole = tmscores_random.reshape(2 * n_groups, n_cols)
        fs = tmscores_fs_random.reshape(2 * n_groups, n_cols)

        # Row offset of the alternative structure inside every depth pair.
        alt_offset = 1 if alt_name == pdb2_name else 0

        # Depth whose alternative-structure scores have the largest sum.
        location = int(np.argmax(whole[alt_offset::2, :].sum(axis=1)))
        location_full = 2 * location + alt_offset

        primary_ok = bool(
            np.any(whole[location_full, :] >= 0.5) and np.any(fs[location_full, :] >= 0.5)
        )
        if primary_ok and alt_name in (pdb1_name, pdb2_name):
            self.selection = location
            logger.info("Selected shallow MSA index %s for %s", self.selection, pdb1_name)
            return

        # The fallback is only attempted when the FS scores at the primary
        # location are (partly) below threshold; otherwise the whole-structure
        # scores are what failed.
        if not np.any(fs[location_full, :] < 0.5):
            raise RuntimeError("Predictions are bad: whole-structure predictions are bad")

        # Collapse each depth's two rows into one so a single any() per depth
        # covers every same-row and cross-row combination.
        whole_ok = (whole >= 0.4).reshape(n_groups, -1).any(axis=1)
        fs_ok = (fs >= 0.5).reshape(n_groups, -1).any(axis=1)
        candidates = np.flatnonzero(whole_ok & fs_ok)
        if candidates.size == 0:
            raise RuntimeError("Predictions are bad: no viable shallow MSA selection found")

        self.selection = int(candidates[0])
        logger.info(
            "Fallback selection: shallow MSA index %s for %s", self.selection, pdb1_name
        )
class TMScoreCalAllVarFS:
    """Evaluate TM-scores for fold-switching prediction results.

    Construction runs the whole evaluation: full-MSA predictions are scored
    first to decide which structure is the reference, then the shallow
    random-MSA predictions are scored and the selected depth index is stored
    in ``size_selection``.
    """

    # Model-file glob handed to TMScoreFS when scoring multimer predictions.
    _MULTIMER_FS_MODEL_GLOB = "single_0_unrelaxed*pdb"

    def __init__(
        self,
        pdb1: str,
        pdb1_name: str,
        pdb2: str,
        pdb2_name: str,
        num_msa: int,
        option: str,
        model_type: str,
        search_dir: Optional[str] = None,
        search_multi_dir: Optional[str] = None,
    ) -> None:
        self.pdb1 = pdb1
        self.pdb2 = pdb2
        self.pdb1_name = pdb1_name
        self.pdb2_name = pdb2_name
        self.num_msa = num_msa
        self.option = option
        self.model_type = model_type
        self.search_dir = search_dir
        self.search_multi_dir = search_multi_dir
        self.size_selection: List[int] = []

        if self.model_type != MULTIMER_MODEL_TYPE:
            self._evaluate_monomer()
        else:
            self._evaluate_multimer()

    def _full_msa_pattern(self) -> str:
        """Return the glob pattern of the full-MSA prediction folders."""
        pdb1_basename = self.pdb1_name.split("/")[-1]
        return (
            str(PREDICTIONS_ROOT / self.pdb1_name)
            + f"/{pdb1_basename}_predicted_models_full_rand_*"
        )

    def _move_failed_full_outputs(self) -> None:
        """Move failed full-MSA prediction folders to failed_predictions/."""
        FAILED_ROOT.mkdir(parents=True, exist_ok=True)
        # Use the same pattern the evaluation scored, so the folders that
        # failed are exactly the ones that get moved.
        for candidate in glob.glob(self._full_msa_pattern()):
            destination = FAILED_ROOT / self.pdb1_name / Path(candidate).name
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(candidate, str(destination))
            logger.info("Moved failed prediction %s -> %s", candidate, destination)

    @staticmethod
    def _msa_pairs(multipliers=None) -> List[tuple]:
        """Generate cumulative (max_msa, ext_msa) pairs.

        Args:
            multipliers: Sequence of multipliers applied cumulatively to the
                starting pair (1, 2). Defaults to MSA_MULTIPLIERS.
        """
        if multipliers is None:
            multipliers = MSA_MULTIPLIERS
        pairs: List[tuple] = []
        max_msa = 1
        ext_msa = 2
        for multiplier in multipliers:
            max_msa *= multiplier
            ext_msa *= multiplier
            pairs.append((max_msa, ext_msa))
        return pairs

    @staticmethod
    def _pick_reference(full_tmscore: np.ndarray, fs_tmscore: np.ndarray) -> Optional[int]:
        """Return the row (0 or 1) of the structure the full-MSA run reproduces.

        The row with the higher mean whole-structure score is tried first,
        then the other one. A row qualifies when it holds at least one
        whole-structure score >= 0.5 and at least one FS score >= 0.5.
        Returns None when neither row qualifies.
        """
        preferred = 0 if np.average(full_tmscore[0, :]) > np.average(full_tmscore[1, :]) else 1
        for row in (preferred, 1 - preferred):
            if np.any(fs_tmscore[row, :] >= 0.5) and np.any(full_tmscore[row, :] >= 0.5):
                return row
        return None

    def _evaluate_monomer(self) -> None:
        """Run the full monomer FS TM-score evaluation pipeline."""
        self._evaluate({})

    def _evaluate_multimer(self) -> None:
        """Run the full multimer FS TM-score evaluation pipeline."""
        self._evaluate({"model_glob": self._MULTIMER_FS_MODEL_GLOB})

    def _evaluate(self, fs_kwargs: dict) -> None:
        """Shared evaluation pipeline for monomer and multimer runs.

        Args:
            fs_kwargs: Extra keyword arguments forwarded to every TMScoreFS
                call (empty for monomer, ``model_glob`` for multimer).

        Raises:
            RuntimeError: If the full-MSA prediction matches neither
                structure, or if the shallow predictions never exceed 0.5
                for the alternative structure.
        """
        num_seeds = 5 + self.num_msa
        n_cols = num_seeds * 5
        scorer_args = (self.pdb1, self.pdb1_name, self.pdb2, self.pdb2_name)

        # Full MSA scores: whole structure and fold-switching region.
        # The directory is passed as a glob pattern string.
        full_pred_dir = self._full_msa_pattern()
        msa_full_tmscore = TMScore(full_pred_dir, *scorer_args, self.model_type)
        full_tmscore = np.asarray(msa_full_tmscore.tmscores, dtype=float).reshape(2, n_cols)
        msa_fs_tmscore = TMScoreFS(full_pred_dir, *scorer_args, **fs_kwargs)
        fs_tmscore = np.asarray(msa_fs_tmscore.tmscores_fs, dtype=float).reshape(2, n_cols)

        ref_row = self._pick_reference(full_tmscore, fs_tmscore)
        if ref_row is None:
            self._move_failed_full_outputs()
            raise RuntimeError("Prediction with deep MSA was failed")
        names = (self.pdb1_name, self.pdb2_name)
        ref_name, alt_name = names[ref_row], names[1 - ref_row]

        logger.info("Reference structure: %s", ref_name)
        logger.info("Alternative structure: %s", alt_name)

        # Save full MSA scores for both whole structure and FS region
        np.savetxt(f"TMScore_full-MSA_{self.pdb1_name}.csv", full_tmscore, fmt="%2.3f")
        np.savetxt(f"TMScore_fs_full-MSA_{self.pdb1_name}.csv", fs_tmscore, fmt="%2.3f")

        # Shallow random MSA TM-scores (whole + FS), one directory set per depth
        msa_pairs = self._msa_pairs()
        tmscores_random: List[float] = []
        tmscores_fs_random: List[float] = []
        last_shallow: Optional[TMScore] = None

        for max_msa, ext_msa in msa_pairs:
            # NOTE(review): this pattern uses the full pdb1_name while the
            # full-MSA pattern uses its basename; the two only differ when
            # pdb1_name contains "/" -- confirm against prediction_all_var.py.
            pred_dir = (
                str(PREDICTIONS_ROOT / self.pdb1_name)
                + f"/{self.pdb1_name}_predicted_models_rand_*"
                + f"_max_{max_msa}_ext_{ext_msa}"
            )
            logger.debug("Shallow MSA dir pattern: %s", pred_dir)

            shallow = TMScore(pred_dir, *scorer_args, self.model_type)
            tmscores_random.extend(np.ravel(shallow.tmscores))
            last_shallow = shallow

            shallow_fs = TMScoreFS(pred_dir, *scorer_args, **fs_kwargs)
            tmscores_fs_random.extend(np.ravel(shallow_fs.tmscores_fs))

        n_rows = 2 * len(msa_pairs)
        tmscores_random_reshape = np.asarray(tmscores_random, dtype=float).reshape(n_rows, n_cols)
        tmscores_fs_random_reshape = np.asarray(tmscores_fs_random, dtype=float).reshape(
            n_rows, n_cols
        )

        # Rows alternate between the two structures; keep the alternative one.
        alt_offset = 1 if alt_name == self.pdb2_name else 0
        tmscores_random_alter = tmscores_random_reshape[alt_offset::2, :]
        tmscores_fs_random_alter = tmscores_fs_random_reshape[alt_offset::2, :]

        # Quality gate: both whole and FS must have at least one value > 0.5
        if not (np.any(tmscores_random_alter > 0.5) and np.any(tmscores_fs_random_alter > 0.5)):
            # NOTE(review): the wording mentions the full-MSA prediction, but
            # this gate tests the shallow random-MSA scores.
            raise RuntimeError(
                "Full-MSA prediction is not tightly aligned to crystal structure "
                "with additional seeds"
            )

        np.savetxt(
            f"TMScore_random-MSA_{self.pdb1_name}.csv",
            tmscores_random_reshape,
            fmt="%2.3f",
        )
        np.savetxt(
            f"TMScore_fs_random-MSA_{self.pdb1_name}.csv",
            tmscores_fs_random_reshape,
            fmt="%2.3f",
        )

        logger.info("Finding optimal size of random MSA...")
        last_shallow.select_size(
            tmscores_random_reshape,
            tmscores_fs_random_reshape,
            self.pdb1_name,
            self.pdb2_name,
            alt_name,
            num_seeds,
        )
        self.size_selection = [last_shallow.selection]
        logger.info("Selected MSA size index: %s", self.size_selection)
def parse_arguments(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        The populated namespace. ``num_msa`` and ``num_ens`` are kept as
        strings (or None); see resolve_num_msa_num_ens for the conversion.
    """
    parser = argparse.ArgumentParser(
        description="CF-random protein structure prediction pipeline",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--pdb1",
        type=str,
        help="PDB structure for the target crystal structure (target to be predicted)",
    )
    parser.add_argument(
        "--pdb2",
        type=str,
        help="PDB structure for the alternative crystal structure",
    )
    parser.add_argument("--fname", type=str, help="MSA folder name after colabsearch")
    parser.add_argument("--fmname", type=str, help="multimer MSA folder name after colabsearch")
    parser.add_argument("--pname", type=str, help="job name for predicting blind mode")
    parser.add_argument(
        "--num_msa",
        type=str,
        help="number of additional MSA seeds to run (added to default 5)",
    )
    parser.add_argument(
        "--num_ens",
        type=str,
        help="number of ensemble samples to generate (integer)",
    )
    parser.add_argument(
        "--option",
        type=str,
        required=True,
        help=(
            "select prediction mode: AC (alternative conformation), "
            "FS (fold-switching), inAC (increased AC sampling), "
            "or blind (no crystal structures)"
        ),
    )
    parser.add_argument(
        "--model-type",
        type=str,
        choices=["ptm", "monomer", "multimer"],
        help="select model-type of Colabfold: ptm, monomer, or multimer",
    )
    return parser.parse_args(argv)


def resolve_pdb_name(args: argparse.Namespace) -> str:
    """Resolve the working name for blind mode.

    ``--pname`` is used when no crystal structures were supplied; in every
    other case the name is ``--fname`` with all "/" characters removed.

    Raises:
        ValueError: If the name would have to come from ``--fname`` but it
            was not given.
    """
    no_structures = args.pdb1 is None and args.pdb2 is None
    if no_structures and args.pname is not None:
        return args.pname
    if args.fname is None:
        raise ValueError("--pname or --fname is required to name a blind-mode job")
    return args.fname.replace("/", "")


def resolve_num_msa_num_ens(args: argparse.Namespace) -> tuple:
    """Resolve num_msa and num_ens from optional string arguments.

    Returns:
        ``(num_msa, num_ens)`` as ints; an argument that was not given
        resolves to 0.

    Raises:
        ValueError: If a supplied value is not an integer literal.
    """
    num_msa = 0 if args.num_msa is None else int(args.num_msa)
    num_ens = 0 if args.num_ens is None else int(args.num_ens)
    return num_msa, num_ens
def resolve_search_dirs(args: argparse.Namespace):
    """Return the ``(search_dir, search_multi_dir)`` pair for this run.

    ``search_dir`` is ``--fname`` unchanged. ``search_multi_dir`` is None
    when ``--fmname`` was not given, otherwise ``--fmname`` prefixed with a
    single space.

    Raises:
        ValueError: If ``--fname`` is missing, with or without ``--fmname``.
    """
    monomer_dir = args.fname
    multimer_dir = args.fmname

    if monomer_dir is None:
        if multimer_dir is None:
            raise ValueError("--fname (MSA folder) is required for all modes")
        raise ValueError("--fname (monomer MSA folder) is required alongside --fmname")

    if multimer_dir is None:
        return monomer_dir, None
    return monomer_dir, " " + multimer_dir
def determine_model_type(args: argparse.Namespace, pdb1: Optional[str]) -> str:
    """Map the ``--model-type`` choice to its ColabFold model name.

    A missing model type is treated as "ptm". For a multimer run outside
    blind mode the number of chains in ``pdb1`` is counted and logged.

    Raises:
        ValueError: If the model type is not recognized, or a non-blind
            multimer run has no ``pdb1`` to inspect.
    """
    if args.model_type is None or args.model_type == "ptm":
        return MODEL_TYPES["ptm"]
    if args.model_type == "monomer":
        return MODEL_TYPES["monomer"]
    if args.model_type != "multimer":
        raise ValueError(
            f"Unrecognized model type: {args.model_type!r}. Choose from: ptm, monomer, multimer"
        )

    if args.option != BLIND_MODE:
        # Fail with a readable message instead of letting open(None) raise.
        if pdb1 is None:
            raise ValueError("--pdb1 is required for multimer predictions outside blind mode")
        ter_count = count_chains(pdb1)
        logger.info("%d chain(s) in this multimer file.", ter_count)
    return MODEL_TYPES["multimer"]


def count_chains(pdb_file: str) -> int:
    """Count the chains in a PDB file as its number of TER records.

    Only lines whose record name (the first six columns) is ``TER`` are
    counted, so the word "TER" appearing inside REMARK or other free text
    does not inflate the result.
    """
    with open(pdb_file, "r") as f:
        return sum(1 for line in f if line[:6].strip() == "TER")
def main() -> None:
    """Main entry point for the CF-random pipeline.

    Parses and validates the command line, downloads the AlphaFold
    parameters for the chosen model type, and dispatches to the AC, FS, or
    blind workflow.

    Raises:
        ValueError: If the option is not recognized, if AC/FS is requested
            without both ``--pdb1`` and ``--pdb2``, or if a resolver rejects
            the arguments.
    """
    args = parse_arguments()

    if args.model_type is None:
        args.model_type = "ptm"

    # Validate everything before the (large) parameter download so that a
    # bad command line fails immediately.
    if args.option not in (ALTERNATIVE_CONFORMATION, FOLD_SWITCHING, BLIND_MODE):
        raise ValueError(f"Unrecognized option: {args.option!r}. Choose from: AC, FS, inAC, blind")
    if args.option != BLIND_MODE and not (args.pdb1 and args.pdb2):
        raise ValueError("--pdb1 and --pdb2 are required for the AC and FS modes")

    num_msa, num_ens = resolve_num_msa_num_ens(args)
    search_dir, search_multi_dir = resolve_search_dirs(args)

    pwd = os.getcwd() + "/"
    pdb1: Optional[str] = None
    pdb2: Optional[str] = None
    pdb1_name: Optional[str] = None
    pdb2_name: Optional[str] = None

    # Resolve working names
    if args.option == BLIND_MODE:
        pdb1_name = resolve_pdb_name(args)
        logger.info("Work name: %s", pdb1_name)

    if args.pdb1 and args.pdb2:
        pdb1 = args.pdb1
        pdb2 = args.pdb2
        pdb1_name = pdb1.replace(".pdb", "")
        pdb2_name = pdb2.replace(".pdb", "")
        logger.info("PDB names: %s, %s", pdb1_name, pdb2_name)

    model_type = determine_model_type(args, pdb1)
    success_dir = f"{SUCCESS}/{pdb1_name}/"

    download_alphafold_params(MODEL_TYPES[args.model_type], Path("."))

    logger.info(
        "Running CF-random pipeline with updated options: %s",
        {
            "pdb1": pdb1,
            "pdb1_name": pdb1_name,
            "pdb2": pdb2,
            "pdb2_name": pdb2_name,
            "num_msa": num_msa,
            "num_ens": num_ens,
            "model_type": model_type,
            "search_dir": search_dir,
            "search_multi_dir": search_multi_dir,
            "success_dir": success_dir,
        },
    )

    if args.option == BLIND_MODE:
        run_blind_workflow(
            pdb1_name=pdb1_name,
            search_dir=search_dir,
            search_multi_dir=search_multi_dir,
            num_msa=num_msa,
            model_type=model_type,
        )
        return

    # AC and FS take the same keyword arguments.
    if args.option == ALTERNATIVE_CONFORMATION:
        workflow = run_alternative_conformation_workflow
    else:
        workflow = run_fold_switching_workflow
    workflow(
        pdb1=pdb1,
        pdb1_name=pdb1_name,
        pdb2=pdb2,
        pdb2_name=pdb2_name,
        num_msa=num_msa,
        num_ens=num_ens,
        model_type=model_type,
        search_dir=search_dir,
        search_multi_dir=search_multi_dir,
        success_dir=success_dir,
        pwd=pwd,
    )
def run_alternative_conformation_workflow(
    pdb1: str,
    pdb1_name: str,
    pdb2: str,
    pdb2_name: str,
    num_msa: int,
    num_ens: int,
    model_type: str,
    search_dir: str,
    search_multi_dir: Optional[str],
    success_dir: str,
    pwd: str,
) -> None:
    """Run the alternative conformation prediction workflow.

    Predictions are (re)generated unless ``success_dir`` already holds at
    least 8 sub-directories (counted recursively); a partially filled
    directory is deleted first. TM-scores, the selected shallow MSA size,
    pLDDT tables, and the scatter plot are then produced.
    """
    logger.info("Predicting alternative conformations")

    if not os.path.exists(success_dir):
        Path(success_dir).mkdir(parents=True, exist_ok=True)
        succ_dir_count = 0
    else:
        succ_dir_count = sum(len(dirs) for _, dirs, _ in os.walk(pwd + success_dir + "/"))

    if 0 < succ_dir_count < 8:
        logger.info("Folder exists but is incomplete — cleaning subfolders")
        shutil.rmtree(success_dir)

    if succ_dir_count >= 8:
        logger.info("Predictions including full and random-MSA were already completed.")
    else:
        PredictionAll(
            pdb1_name=pdb1_name,
            search_dir=search_dir,
            search_multi_dir=search_multi_dir,
            num_msa=num_msa,
            model_type=model_type,
        )

    # Scoring is identical whether the predictions are fresh or reused.
    calculate_tm_score = TMScoreCalAllVar(
        pdb1=pdb1,
        pdb1_name=pdb1_name,
        pdb2=pdb2,
        pdb2_name=pdb2_name,
        num_msa=num_msa,
        option=ALTERNATIVE_CONFORMATION,
        model_type=model_type,
        search_dir=search_dir,
        search_multi_dir=search_multi_dir,
    )

    shallow_msa_size = np.append([], calculate_tm_score.size_selection)
    logger.info("Specific size of shallow random MSA is similar to full-MSA: %s", shallow_msa_size)
    np.savetxt("selected_MSA-size_" + pdb1_name + ".csv", shallow_msa_size)

    base = os.path.join(pwd, success_dir)
    list_org_samplings = glob.glob(os.path.join(base, "*full_rand*"))
    list_ran_samplings = glob.glob(os.path.join(base, "*max*"))

    logger.info("Searching for pLDDT folders in: %s", base)
    logger.info(
        "Found %d folders for pLDDT calculation", len(list_org_samplings) + len(list_ran_samplings)
    )

    for sub_list, category in (
        (list_org_samplings, FULL_MSA),
        (list_ran_samplings, RANDOM_MSA),
    ):
        PlddtCal(
            sub_list=sub_list,
            category=category,
            pdb_name=pdb1_name,
            num_msa=num_msa,
            num_ens=num_ens,
            model_type=model_type,
        )

    Plot2DScatterAC(
        full_category=FULL_MSA,
        random_category=RANDOM_MSA,
        pdb1=pdb1,
        pdb1_name=pdb1_name,
        pdb2=pdb2,
        pdb2_name=pdb2_name,
        num_msa=num_msa,
        num_ens=num_ens,
        model_type=model_type,
    )
def run_fold_switching_workflow(
    pdb1: str,
    pdb1_name: str,
    pdb2: str,
    pdb2_name: str,
    num_msa: int,
    num_ens: int,
    model_type: str,
    search_dir: str,
    search_multi_dir: Optional[str],
    success_dir: str,
    pwd: str,
) -> None:
    """Run the fold-switching prediction workflow.

    Predictions are (re)generated unless ``success_dir`` already holds at
    least 8 sub-directories (counted recursively); a partially filled
    directory is deleted first. TM-scores, the selected shallow MSA size,
    and pLDDT tables are then produced, followed by the scatter plot
    (Plot2DScatterAC for the multimer model, Plot2DScatter otherwise).
    """
    logger.info("Predicting fold-switching models.")

    if not os.path.exists(success_dir):
        Path(success_dir).mkdir(parents=True, exist_ok=True)
        succ_dir_count = 0
    else:
        succ_dir_count = sum(len(dirs) for _, dirs, _ in os.walk(pwd + success_dir + "/"))

    if 0 < succ_dir_count < 8:
        logger.info("Folder exists but is incomplete — cleaning subfolders")
        shutil.rmtree(success_dir)

    # Module logger, so the record carries this module's name like the rest.
    logger.info("Success directory and count: %s, %d", success_dir, succ_dir_count)

    if succ_dir_count >= 8:
        logger.info("Predictions including full and random-MSA were already completed.")
    else:
        PredictionAll(
            pdb1_name=pdb1_name,
            search_dir=search_dir,
            search_multi_dir=search_multi_dir,
            num_msa=num_msa,
            model_type=model_type,
        )

    # Scoring is identical whether the predictions are fresh or reused.
    calculate_tm_score = TMScoreCalAllVarFS(
        pdb1=pdb1,
        pdb1_name=pdb1_name,
        pdb2=pdb2,
        pdb2_name=pdb2_name,
        num_msa=num_msa,
        option=FOLD_SWITCHING,
        model_type=model_type,
        search_dir=search_dir,
        search_multi_dir=search_multi_dir,
    )
    shallow_msa_size = np.append(np.array([]), calculate_tm_score.size_selection)

    logger.info(
        "Specific size of shallow random MSA is similar to full-MSA: %s", shallow_msa_size
    )
    np.savetxt("selected_MSA-size_" + pdb1_name + ".csv", shallow_msa_size)

    base = os.path.join(pwd, success_dir)
    list_org_samplings = glob.glob(os.path.join(base, "*full_rand*"))
    list_ran_samplings = glob.glob(os.path.join(base, "*max*"))

    logger.info("Searching for pLDDT folders in: %s", base)
    logger.info(
        "Found %d folders for pLDDT calculation", len(list_org_samplings) + len(list_ran_samplings)
    )

    for sub_list, category in (
        (list_org_samplings, FULL_MSA),
        (list_ran_samplings, RANDOM_MSA),
    ):
        PlddtCal(
            sub_list=sub_list,
            category=category,
            pdb_name=pdb1_name,
            num_msa=num_msa,
            num_ens=num_ens,
            model_type=model_type,
        )

    if model_type == MODEL_TYPES["multimer"]:
        Plot2DScatterAC(
            full_category=FULL_MSA,
            random_category=RANDOM_MSA,
            pdb1=pdb1,
            pdb1_name=pdb1_name,
            pdb2=pdb2,
            pdb2_name=pdb2_name,
            num_msa=num_msa,
            num_ens=num_ens,
            model_type=model_type,
        )
    else:
        Plot2DScatter(
            full_category=FULL_MSA,
            random_category=RANDOM_MSA,
            pdb1=pdb1,
            pdb1_name=pdb1_name,
            pdb2=pdb2,
            pdb2_name=pdb2_name,
            num_msa=num_msa,
            num_ens=num_ens,
        )
def run_blind_workflow(
    pdb1_name: str,
    search_dir: str,
    search_multi_dir: Optional[str],
    num_msa: int,
    model_type: str,
) -> None:
    """Run the blind prediction workflow.

    Predictions are (re)generated unless the job directory under ``BLIND``
    already holds at least 8 sub-directories (counted recursively); a
    partially filled directory is deleted first. BlindScreening is run in
    both cases.
    """
    logger.info("Predicting fold-switching proteins without crystal structures.")

    if not os.path.exists(BLIND):
        Path(BLIND).mkdir(parents=True, exist_ok=True)

    blind_pred_path = f"{BLIND}/{pdb1_name}"
    logger.info("Blind prediction path: %s", blind_pred_path)

    blind_dir_count = 0
    if os.path.exists(blind_pred_path):
        blind_dir_count = sum(len(dirs) for _, dirs, _ in os.walk(blind_pred_path + "/"))

    if 0 < blind_dir_count < 8:
        logger.info("Folder exists but is incomplete — cleaning subfolders")
        shutil.rmtree(blind_pred_path)

    if blind_dir_count >= 8:
        logger.info("Predictions including full and random-MSA were already completed.")

        # Count total files to determine whether Foldseek has already been run
        fseek_file_count = sum(len(files) for _, _, files in os.walk(blind_pred_path + "/"))
        logger.info("Number of files in blind prediction path: %d", fseek_file_count)

        if fseek_file_count >= FOLDSEEK_DONE_FILE_COUNT:
            logger.info("Foldseek search was already done")
    else:
        PredictionAll(
            pdb1_name=pdb1_name,
            search_dir=search_dir,
            search_multi_dir=search_multi_dir,
            num_msa=num_msa,
            model_type=model_type,
        )
        logger.info("Finished running predictions using full and shallow random-MSAs")
        logger.info("Running Foldseek to find related crystal structures")

    # Screening runs on both paths, including when the file count above
    # suggests Foldseek was already done.
    BlindScreening(pdb1_name=pdb1_name, blind_path=blind_pred_path)
class Plot2DScatterAC:
    """Create 2D scatter plots of TM-scores for alternative conformations.

    Construction loads the TM-score and pLDDT CSV files for ``pdb1_name``
    from the working directory and writes the plot to
    ``TMscore_<full_category>_<pdb1_name>.png``.
    """

    def __init__(
        self,
        full_category: str,
        random_category: str,
        pdb1: str,
        pdb1_name: str,
        pdb2: str,
        pdb2_name: str,
        num_msa: int,
        num_ens: int,
        model_type: str,
    ):
        # pdb1, pdb2, num_ens and model_type are accepted but not read here.

        # Load TM-scores both full- and random-MSA
        tmscores_full = genfromtxt(f"TMScore_{full_category}_{pdb1_name}.csv", delimiter=" ")
        tmscores_random = genfromtxt(f"TMScore_{random_category}_{pdb1_name}.csv", delimiter=" ")

        # Load pLDDT scores both full- and random-MSA
        plddt_full = genfromtxt(f"plddt_{full_category}_{pdb1_name}.csv", delimiter=" ")
        plddt_random = genfromtxt(f"plddt_{random_category}_{pdb1_name}.csv", delimiter=" ")

        logger.debug(
            "TM-score array shape: rows=%d, cols=%d, ndim=%d",
            tmscores_random.shape[0],
            tmscores_random.shape[-1],
            tmscores_random.ndim,
        )

        num_seeds = num_msa + 5
        plddt_random = np.reshape(plddt_random, (7, num_seeds * 5))
        tmscore_full_resh = np.reshape(tmscores_full, (num_seeds * 2, 5))

        self._plot(
            tmscores_random,
            tmscore_full_resh,
            plddt_random,
            plddt_full,
            num_msa,
            pdb1_name,
            pdb2_name,
            full_category,
        )

    def _plot(
        self,
        tmscores_random: np.ndarray,
        tmscore_full_resh: np.ndarray,
        plddt_random: np.ndarray,
        plddt_full: np.ndarray,
        num_msa: int,
        pdb1_name: str,
        pdb2_name: str,
        full_category: str,
    ) -> None:
        """Render and save the 2D TM-score scatter plot.

        Consecutive row pairs of ``tmscores_random`` are plotted as (x, y)
        and coloured by the matching row of ``plddt_random``; the first and
        second halves of ``tmscore_full_resh`` are plotted the same way and
        coloured by ``plddt_full``.
        """
        scatter_style = {
            # NOTE(review): "rocket_r" is presumably the colormap seaborn
            # registers on import -- keep the module-level seaborn import.
            "cmap": "rocket_r",
            "vmin": 50,
            "vmax": 100,
            "s": 35,
            "marker": "o",
        }
        num_seeds = num_msa + 5

        # A fresh figure per call: reusing figure 0 would draw on top of
        # whatever an earlier call in the same process left there.
        fig = plt.figure()
        try:
            for i in range(tmscores_random.shape[0] // 2):
                plt.scatter(
                    tmscores_random[i * 2, :],
                    tmscores_random[(i * 2 + 1), :],
                    c=plddt_random[i, :],
                    **scatter_style,
                )

            clb = plt.colorbar()
            clb.ax.tick_params(labelsize=15)

            plt.scatter(
                tmscore_full_resh[0:num_seeds, :],
                tmscore_full_resh[num_seeds : num_seeds * 2, :],
                c=plddt_full,
                **scatter_style,
            )

            plt.ylim(0, 1)
            plt.xlim(0, 1)
            plt.plot([0, 1], [0, 1], linestyle="dashed", color="black")
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=15)
            plt.xlabel(f"TM-Score similar to fold1({pdb1_name})", fontsize=15)
            plt.ylabel(f"TM-score similar to fold2({pdb2_name})", fontsize=15)

            output_file = f"TMscore_{full_category}_{pdb1_name}.png"
            plt.savefig(output_file, transparent=True)
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)
        logger.info("Saved AC scatter plot to %s", output_file)
logger = logging.getLogger(__name__)

# Visualization constants
DEFAULT_FIGURE_SIZE = (10, 8)
DEFAULT_DPI = 300
COLORBAR_LABELSIZE = 15
AXIS_LABELSIZE = 15
TICK_LABELSIZE = 15
PLDT_MIN = 50
PLDT_MAX = 100
SCATTER_SIZE = 35


class Plot2DScatter:
    """Create 2D scatter plots of TM-scores and pLDDT metrics.

    Plots the TM-score against one reference structure versus the TM-score
    against the other, colouring each point by its pLDDT value. Two figures
    are written: one for the whole structure and one for the fold-switching
    region.

    Attributes:
        pdb1_name (str): Name of first reference structure.
        pdb2_name (str): Name of second reference structure.
        num_msa (int): Number of additional MSA samples used.
    """

    def __init__(
        self,
        full_category: str,
        random_category: str,
        pdb1: str,
        pdb1_name: str,
        pdb2: str,
        pdb2_name: str,
        num_msa: int,
        num_ens: int,
    ) -> None:
        """Load the score tables and generate both scatter plots.

        Args:
            full_category: Category name for full MSA results.
            random_category: Category name for random/variable MSA results.
            pdb1: Unused, kept for API compatibility.
            pdb1_name: Name of first reference structure (used for filenames).
            pdb2: Unused, kept for API compatibility.
            pdb2_name: Name of second reference structure (for axis labels).
            num_msa: Number of additional MSA samples.
            num_ens: Unused, kept for API compatibility.

        Raises:
            FileNotFoundError: If required CSV files are not found.
            ValueError: If data shapes are incompatible.
        """
        self.pdb1_name = pdb1_name
        self.pdb2_name = pdb2_name
        self.num_msa = num_msa
        self.full_category = full_category
        self.random_category = random_category

        try:
            self._load_data()
            self._create_plots()
        except Exception as e:
            # Log at this boundary, then let the caller decide what to do.
            logger.error("Failed to create plots: %s", e)
            raise

    def _load_data(self) -> None:
        """Load TM-score and pLDDT data from CSV files in the working directory.

        Raises:
            FileNotFoundError: If a required CSV file cannot be found.
        """
        full_tag = f"{self.full_category}_{self.pdb1_name}"
        random_tag = f"{self.random_category}_{self.pdb1_name}"
        try:
            # Whole-structure TM-scores
            self.tmscore_full = genfromtxt(f"TMScore_{full_tag}.csv", delimiter=" ")
            self.tmscore_random = genfromtxt(f"TMScore_{random_tag}.csv", delimiter=" ")

            # pLDDT scores
            self.plddt_full = genfromtxt(f"plddt_{full_tag}.csv", delimiter=" ")
            self.plddt_random = genfromtxt(f"plddt_{random_tag}.csv", delimiter=" ")

            # Fold-switching region TM-scores
            self.tmscore_fs_full = genfromtxt(f"TMScore_fs_{full_tag}.csv", delimiter=" ")
            self.tmscore_fs_random = genfromtxt(f"TMScore_fs_{random_tag}.csv", delimiter=" ")

            logger.info("Successfully loaded all data files")
        except FileNotFoundError as e:
            logger.error("Required CSV file not found: %s", e)
            raise

    def _create_plots(self) -> None:
        """Reshape the pLDDT table and write both scatter plots."""
        logger.info("Creating scatter plots...")

        # One pLDDT row per MSA depth. The TM-score table holds two rows per
        # depth, so the depth count comes from the data instead of a fixed 7.
        n_depths = self.tmscore_random.shape[0] // 2
        self.plddt_random = np.reshape(self.plddt_random, (n_depths, (self.num_msa + 5) * 5))

        self._plot_whole_structure()
        self._plot_foldswitching_region()

    def _scatter_figure(
        self,
        random_scores: np.ndarray,
        full_scores: np.ndarray,
        random_cmap: str,
        full_cmap: str,
        full_marker: str,
        output_file: str,
        description: str,
    ) -> None:
        """Draw one TM-score scatter figure and save it to ``output_file``.

        Args:
            random_scores: Variable-MSA TM-scores, two rows per MSA depth.
            full_scores: Full-MSA TM-scores; row 0 is x, row 1 is y.
            random_cmap: Colormap name for the variable-MSA points.
            full_cmap: Colormap name for the full-MSA overlay.
            full_marker: Marker style for the full-MSA overlay.
            output_file: Destination PNG path.
            description: Human-readable plot name used in the log message.
        """
        plt.figure(figsize=DEFAULT_FIGURE_SIZE, dpi=DEFAULT_DPI)
        try:
            # Variable-MSA results, one scatter call per MSA depth
            for depth in range(random_scores.shape[0] // 2):
                plt.scatter(
                    random_scores[depth * 2, :],
                    random_scores[depth * 2 + 1, :],
                    c=self.plddt_random[depth, :],
                    cmap=random_cmap,
                    vmin=PLDT_MIN,
                    vmax=PLDT_MAX,
                    s=SCATTER_SIZE,
                    marker="o",
                    alpha=0.7,
                )

            # NOTE(review): the colorbar is created before the overlay, so it
            # is presumably tied to the last variable-MSA scatter; when
            # full_cmap differs from random_cmap the overlay colours cannot be
            # read off it -- confirm this is intended.
            cbar = plt.colorbar()
            cbar.ax.tick_params(labelsize=COLORBAR_LABELSIZE)
            cbar.set_label("pLDDT Score", fontsize=AXIS_LABELSIZE)

            # Full-MSA overlay
            plt.scatter(
                full_scores[0, :],
                full_scores[1, :],
                c=self.plddt_full,
                cmap=full_cmap,
                vmin=PLDT_MIN,
                vmax=PLDT_MAX,
                s=SCATTER_SIZE,
                marker=full_marker,
                alpha=0.8,
                label="Full MSA",
            )

            # Diagonal reference line
            plt.plot([0, 1], [0, 1], linestyle="dashed", color="black", linewidth=2)

            # Axes
            plt.xlim(0, 1)
            plt.ylim(0, 1)
            plt.xticks(fontsize=TICK_LABELSIZE)
            plt.yticks(fontsize=TICK_LABELSIZE)
            plt.xlabel(f"TM-Score similarity to {self.pdb1_name}", fontsize=AXIS_LABELSIZE)
            plt.ylabel(f"TM-score similarity to {self.pdb2_name}", fontsize=AXIS_LABELSIZE)
            plt.legend(fontsize=TICK_LABELSIZE)
            plt.grid(True, alpha=0.3)

            plt.savefig(output_file, dpi=DEFAULT_DPI, bbox_inches="tight", transparent=True)
            logger.info("Saved %s plot to %s", description, output_file)
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close()

    def _plot_whole_structure(self) -> None:
        """Create scatter plot for whole protein structure TM-scores."""
        self._scatter_figure(
            random_scores=self.tmscore_random,
            full_scores=self.tmscore_full,
            random_cmap="rocket_r",
            full_cmap="plasma",
            full_marker="o",
            output_file=f"TMscore_{self.full_category}_{self.pdb1_name}.png",
            description="whole structure",
        )

    def _plot_foldswitching_region(self) -> None:
        """Create scatter plot for fold-switching region TM-scores."""
        self._scatter_figure(
            random_scores=self.tmscore_fs_random,
            full_scores=self.tmscore_fs_full,
            random_cmap="plasma",
            full_cmap="plasma",
            full_marker="D",
            output_file=f"TMscore_fs-region_{self.full_category}_{self.pdb1_name}.png",
            description="fold-switching region",
        )
logger = logging.getLogger(__name__)

# Configuration constants
DEFAULT_NUM_MODELS = 5
MSA_DEPTH_MULTIPLIERS = (1, 2, 2, 2, 2, 2, 2)
INITIAL_MAX_MSA = 1
INITIAL_EXTRA_MSA = 2


class ColabFoldRunner(ABC):
    """Base class for orchestrating ColabFold predictions.

    Holds the settings shared by every run and provides helpers for logging
    setup, invoking ColabFold, and relocating finished result folders.
    """

    def __init__(
        self,
        search_dir: str,
        output_dir: str,
        pdb_name: str,
        num_seeds: int,
        model_type: str,
    ) -> None:
        """Store the run settings and configure logging.

        Args:
            search_dir: Directory containing MSA files or fasta sequences.
            output_dir: Directory where prediction results will be saved.
            pdb_name: Name of the target protein for file naming.
            num_seeds: Number of random seeds for predictions.
            model_type: ColabFold model type (e.g., 'ptm', 'monomer', 'multimer').
        """
        self.search_dir = search_dir
        self.output_dir = output_dir
        self.pdb_name = pdb_name
        self.num_seeds = num_seeds
        self.model_type = model_type
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Point ColabFold's logging at ``<output_dir>/log.txt``."""
        output_path = Path(self.output_dir)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        setup_logging(output_path / "log.txt")

    def _run_colabfold(
        self,
        random_seed: int,
        max_seq: Optional[int] = None,
        max_extra_seq: Optional[int] = None,
    ) -> None:
        """Execute one ColabFold prediction into ``self.output_dir``.

        Args:
            random_seed: Random seed for reproducibility.
            max_seq: Maximum sequence depth; omitted from the call when None.
            max_extra_seq: Maximum extra sequence depth; omitted when None.

        Raises:
            RuntimeError: If query extraction or prediction fails.
        """
        try:
            queries, is_complex = get_queries(self.search_dir)
        except Exception as e:
            raise RuntimeError(f"Query extraction failed: {e}") from e

        run_kwargs = {
            "queries": queries,
            "result_dir": self.output_dir,
            "num_models": DEFAULT_NUM_MODELS,
            "is_complex": is_complex,
            "model_type": self.model_type,
            "num_seeds": int(self.num_seeds),
            "random_seed": int(random_seed),
            "data_dir": Path("."),
        }

        # Only forward the depth limits that were explicitly requested.
        if max_seq is not None:
            run_kwargs["max_seq"] = int(max_seq)
        if max_extra_seq is not None:
            run_kwargs["max_extra_seq"] = int(max_extra_seq)

        try:
            run(**run_kwargs)
        except Exception as e:
            raise RuntimeError(f"Prediction failed: {e}") from e

    @staticmethod
    def _ensure_dir(dir_path: str) -> None:
        """Create directory (and parents) if it doesn't exist."""
        os.makedirs(dir_path, exist_ok=True)

    @staticmethod
    def _move_results(source_pattern: str, dest_dir: str) -> None:
        """Move everything matching a glob pattern into the destination directory.

        Args:
            source_pattern: Glob pattern for source directories/files.
            dest_dir: Destination directory path; created when missing.

        Raises:
            RuntimeError: If no files matched or a move operation fails. The
                underlying ``OSError`` is attached as ``__cause__``.
        """
        ColabFoldRunner._ensure_dir(dest_dir)
        matched = glob.glob(source_pattern)
        if not matched:
            raise RuntimeError(f"No files matched pattern: {source_pattern}")
        for source_path in matched:
            try:
                shutil.move(source_path, dest_dir)
            except OSError as err:  # shutil.Error is an OSError subclass
                raise RuntimeError(
                    f"Failed to move {source_path} -> {dest_dir}: {err}"
                ) from err
            logger.info("Moved %s -> %s", source_path, dest_dir)


class MSAMaxRunner(ColabFoldRunner):
    """Run ColabFold prediction with maximum (full) MSA depth."""

    def __init__(
        self,
        search_dir: str,
        output_dir: str,
        pdb_name: str,
        random_seed: int,
        num_seeds: int,
        model_type: str,
    ) -> None:
        """Run the full-MSA prediction and move its folder to ``predictions_all``.

        Args:
            search_dir: Directory containing MSA files.
            output_dir: Accepted for interface compatibility but not used;
                results are written to a local folder named after
                ``pdb_name`` and ``random_seed`` and then moved to
                ``predictions_all/<pdb_name>``.
            pdb_name: Protein name for file naming.
            random_seed: Random seed for prediction (string or int).
            num_seeds: Number of random seeds.
            model_type: ColabFold model type.

        Raises:
            RuntimeError: If the prediction or the final move fails.
        """
        # ColabFold writes locally; use a local output dir then move
        local_output_dir = f"{pdb_name}_predicted_models_full_rand_{random_seed}"
        super().__init__(search_dir, local_output_dir, pdb_name, num_seeds, model_type)

        logger.info("Running full MSA prediction for %s", pdb_name)
        self._run_colabfold(int(random_seed))

        # Move completed folder into predictions_all/<pdb_name>/
        dest_dir = str(Path("predictions_all") / pdb_name)
        self._move_results(local_output_dir + "/", dest_dir)


class MSAVariableRunner(ColabFoldRunner):
    """Run ColabFold predictions across varied MSA depths."""

    def __init__(
        self,
        search_dir: str,
        output_dir: str,
        pdb_name: str,
        random_seed: int,
        num_seeds: int,
        model_type: str,
        multipliers: tuple = MSA_DEPTH_MULTIPLIERS,
        initial_max_msa: int = INITIAL_MAX_MSA,
        initial_extra_msa: int = INITIAL_EXTRA_MSA,
    ) -> None:
        """Run one prediction per MSA depth and collect the result folders.

        Args:
            search_dir: Directory containing MSA files.
            output_dir: Base name for local output directories.
            pdb_name: Protein name for file naming.
            random_seed: Random seed for predictions. A list, tuple or ndarray
                has its elements concatenated into one seed string.
            num_seeds: Number of random seeds.
            model_type: ColabFold model type.
            multipliers: Tuple of multipliers for MSA depth variation.
            initial_max_msa: Starting maximum sequence depth.
            initial_extra_msa: Starting extra sequence depth.

        Raises:
            ValueError: If the normalised seed is not an integer string.
            RuntimeError: If a prediction or a result move fails.
        """
        # Normalise random seed to string (tuples are treated like lists)
        if isinstance(random_seed, (list, tuple, np.ndarray)):
            random_seed_str = "".join(map(str, random_seed))
        else:
            random_seed_str = str(random_seed)

        logger.info(
            "Running variable MSA predictions with random seed %s on search dir: %s with model type: %s under output dir: %s",
            random_seed_str,
            search_dir,
            model_type,
            output_dir,
        )

        # Base output_dir is just a prefix used to name local folders
        super().__init__(search_dir, output_dir, pdb_name, num_seeds, model_type)

        self.random_seed_str = random_seed_str
        self.multipliers = multipliers

        self._run_varied_predictions(initial_max_msa, initial_extra_msa)

    def _run_varied_predictions(self, initial_max_msa: int, initial_extra_msa: int) -> None:
        """Execute predictions across varied MSA depths and move results.

        Both depths are multiplied cumulatively by each entry of
        ``self.multipliers``; one prediction is run per entry.

        Args:
            initial_max_msa: Starting maximum sequence depth.
            initial_extra_msa: Starting extra sequence depth.
        """
        # Convert once, so a malformed seed fails before any run is started.
        seed = int(self.random_seed_str)
        max_msa = initial_max_msa
        extra_msa = initial_extra_msa
        dest_dir = str(Path("predictions_all") / self.pdb_name)

        for multiplier in self.multipliers:
            max_msa = int(max_msa * multiplier)
            extra_msa = int(extra_msa * multiplier)

            # Local folder name matches original pattern exactly
            local_output_dir = (
                f"{self.pdb_name}_predicted_models_rand_"
                f"{self.random_seed_str}_max_{max_msa}_ext_{extra_msa}"
            )

            logger.info(
                "Running prediction: max_seq=%d, max_extra_seq=%d -> %s",
                max_msa,
                extra_msa,
                local_output_dir,
            )

            # Write to local dir, with a log file of its own
            self.output_dir = local_output_dir
            self._setup_logging()

            self._run_colabfold(seed, max_seq=max_msa, max_extra_seq=extra_msa)

            # Move completed folder into predictions_all/<pdb_name>/
            self._move_results(local_output_dir + "/", dest_dir)
logger = logging.getLogger(__name__)

MULTIMER_MODEL_TYPE = "alphafold2_multimer_v3"


class PredictionAll:
    """High-level orchestration for full and variable MSA predictions."""

    def __init__(
        self,
        pdb1_name: str,
        search_dir: str,
        search_multi_dir: str,
        num_msa: int,
        model_type: str,
    ) -> None:
        """Run the full and varied MSA prediction pipeline.

        Args:
            pdb1_name: Name of the target protein.
            search_dir: Path to the single-chain MSA folder.
            search_multi_dir: Path to the multimer MSA folder. When empty or
                None, the single-chain folder is used instead.
            num_msa: Number of additional MSA seeds to add to the default 5.
            model_type: ColabFold model type.
        """
        self.pdb1_name = pdb1_name
        self.search_dir = search_dir
        # Prefer the dedicated multimer folder and only fall back to the
        # single-chain one when none was supplied. The operands used to be the
        # other way round, which made the multimer folder unreachable.
        self.search_multi_dir = search_multi_dir or search_dir
        self.model_type = model_type

        self.base_output_dir = Path("predictions_all") / pdb1_name
        self.base_output_dir.mkdir(parents=True, exist_ok=True)

        num_seeds = num_msa + 5

        # Full MSA seed: one integer drawn from 0-15, kept as a string
        full_random_seed = "".join(map(str, np.random.randint(0, 16, 1)))

        full_output_dir = (
            self.base_output_dir / f"{pdb1_name}_predicted_models_full_rand_{full_random_seed}"
        )
        logger.info(
            "Running full MSA prediction with parameters: %s",
            {
                "random_seed": full_random_seed,
                "search_dir": self.search_dir,
                "output_dir": full_output_dir,
                "model_type": self.model_type,
            },
        )
        MSAMaxRunner(
            search_dir=self.search_dir,
            output_dir=str(full_output_dir),
            pdb_name=self.pdb1_name,
            random_seed=full_random_seed,
            num_seeds=num_seeds,
            model_type=self.model_type,
        )

        # Variable MSA seed: independently sampled from range 0-99
        # (a one-element list, which the runner joins into a seed string)
        var_random_seed = random.sample(range(100), 1)

        variable_search_dir = self._select_variable_search_dir(
            self.model_type, self.search_dir, self.search_multi_dir
        )

        variable_output_dir = f"{pdb1_name}_predicted_models_rand_"
        logger.info(
            "Running variable MSA predictions under parameters: %s",
            {
                "search_dir": variable_search_dir,
                "output_dir": variable_output_dir,
                "pdb_name": self.pdb1_name,
                "random_seed": var_random_seed,
                "num_seeds": num_seeds,
                "model_type": self.model_type,
            },
        )
        MSAVariableRunner(
            search_dir=variable_search_dir,
            output_dir=str(variable_output_dir),
            pdb_name=self.pdb1_name,
            random_seed=var_random_seed,
            num_seeds=num_seeds,
            model_type=self.model_type,
        )

    @staticmethod
    def _select_variable_search_dir(
        model_type: str, search_dir: str, search_multi_dir: str
    ) -> str:
        """Return the MSA folder the variable-depth runs should read from.

        Multimer models use ``search_multi_dir`` (falling back to
        ``search_dir`` when it is empty); every other model type uses
        ``search_dir``.
        """
        if model_type == MULTIMER_MODEL_TYPE:
            return search_multi_dir or search_dir
        return search_dir
logger = logging.getLogger(__name__)


class ConvertM2S:
    """Convert multimer PDB structures to single-chain PDB files.

    For every unrelaxed prediction in a directory this writes two files next
    to it: ``rmTER_<name>.pdb`` (all TER records removed) and
    ``single_<name>.pdb`` (everything up to and including the first TER
    record).
    """

    # File-name prefixes of the files this class writes; they are excluded
    # from input discovery so that outputs are never processed as inputs.
    _GENERATED_PREFIXES = ("rmTER_", "single_")

    def __init__(self, pred_path: str, pdb1_name: str, pdb2_name: str) -> None:
        """Initialize and execute multimer to single-chain conversion.

        Args:
            pred_path: Path to directory containing multimer predictions.
            pdb1_name: Name of first reference structure (stored only).
            pdb2_name: Name of second reference structure; ``<pdb2_name>.pdb``
                is also stripped of TER records when it exists.

        Raises:
            FileNotFoundError: If the prediction directory does not exist.
        """
        self.pred_path = Path(pred_path)
        self.pdb1_name = pdb1_name
        self.pdb2_name = pdb2_name

        if not self.pred_path.exists():
            raise FileNotFoundError(f"Prediction directory not found: {pred_path}")

        try:
            self._remove_ter_records()
            self._extract_single_chains()
            logger.info("Successfully converted multimer predictions to single chains")
        except Exception as e:
            logger.error("Conversion failed: %s", e)
            raise

    def _find_unrelaxed_files(self) -> list:
        """Find unrelaxed PDB files in the prediction directory.

        Tries the ColabFold default prefix first, then falls back to a
        wildcard match to handle sequence-ID-prefixed filenames. Files whose
        name starts with ``rmTER_`` or ``single_`` are skipped: the wildcard
        would otherwise pick up this class's own outputs.

        Returns:
            Sorted list of matched file path strings.
        """

        def inputs(pattern: str) -> list:
            return sorted(
                path
                for path in glob.glob(str(self.pred_path / pattern))
                if not Path(path).name.startswith(self._GENERATED_PREFIXES)
            )

        files = inputs("0_unrelaxed*pdb")
        if not files:
            files = inputs("*_unrelaxed*pdb")
            if files:
                logger.debug(
                    "Default prefix not found; matched %d file(s) with wildcard in %s",
                    len(files),
                    self.pred_path,
                )
        return files

    @staticmethod
    def _is_ter_record(line: str) -> bool:
        """Return True when ``line`` is a PDB TER record.

        Only the record name at the start of the line is checked, so text
        that merely contains the letters "TER" (for example "INTERFACE" in a
        REMARK) is not mistaken for a chain terminator.
        """
        return line.startswith("TER")

    @classmethod
    def _copy_without_ter(cls, source, target) -> None:
        """Copy ``source`` to ``target`` leaving out every TER record."""
        with open(source, "r", encoding="utf-8") as infile:
            with open(target, "w", encoding="utf-8") as outfile:
                outfile.writelines(
                    line for line in infile if not cls._is_ter_record(line)
                )

    def _remove_ter_records(self) -> None:
        """Remove TER records from predicted multimer PDB files.

        Also writes ``<pdb2_name>_rmTER.pdb`` for the reference structure when
        ``<pdb2_name>.pdb`` exists. Failures are logged and skipped per file.
        """
        for pred_file in self._find_unrelaxed_files():
            output_path = self.pred_path / f"rmTER_{Path(pred_file).stem}.pdb"
            try:
                self._copy_without_ter(pred_file, output_path)
            except (OSError, UnicodeError) as e:
                logger.warning("Failed to process %s: %s", pred_file, e)
                continue
            logger.debug("Removed TER records: %s", output_path)

        # Process reference structure
        ref_file = Path(f"{self.pdb2_name}.pdb")
        if ref_file.exists():
            output_path = Path(f"{self.pdb2_name}_rmTER.pdb")
            try:
                self._copy_without_ter(ref_file, output_path)
            except (OSError, UnicodeError) as e:
                logger.warning("Failed to process reference %s: %s", self.pdb2_name, e)
            else:
                logger.debug("Removed TER records: %s", output_path)

    def _extract_single_chains(self) -> None:
        """Extract the first chain from each multimer PDB file.

        Copies each prediction up to and including its first TER record into
        ``single_<name>.pdb``. Failures are logged and skipped per file.
        """
        for pred_file in self._find_unrelaxed_files():
            output_path = self.pred_path / f"single_{Path(pred_file).stem}.pdb"
            try:
                with open(pred_file, "r", encoding="utf-8") as infile:
                    with open(output_path, "w", encoding="utf-8") as outfile:
                        for line in infile:
                            outfile.write(line)
                            if self._is_ter_record(line):
                                break
            except (OSError, UnicodeError) as e:
                logger.warning("Failed to extract chain from %s: %s", pred_file, e)
                continue
            logger.debug("Extracted single chain: %s", output_path)
logger = logging.getLogger(__name__)


class FSRange:
    """Compare fold-switching regions in predicted models against reference structures.

    Constructing an instance runs the comparison. For each predicted model the
    ``pydssp`` command is run on a reference/model pair and the
    fold-switching slices of the two resulting strings are compared with
    ``fuzz.ratio``. ``fs_compare_output_<pdb1_name>.log`` receives "success"
    when a model matches ``pdb1`` and "fail" when no model matches either
    reference.
    """

    # fuzz.ratio must exceed this value for two regions to count as a match
    _MATCH_THRESHOLD = 85

    def __init__(self, pdb1: str, pdb2: str, pdb1_name: str, pdb2_name: str, pred_dir: str):
        """Run the fold-switching region comparison.

        Args:
            pdb1: Path to the first reference structure.
            pdb2: Path to the second reference structure.
            pdb1_name: Name of the first reference; used to look up residue
                ranges and to name the output files.
            pdb2_name: Name of the second reference; used to look up ranges.
            pred_dir: Directory searched for ``*_unrelaxed*pdb`` models.

        Raises:
            ValueError: If no residue-range entry exists for the pair.
        """
        self.first_res_check(pdb1, pdb2)
        logger.debug(
            "First residue indices — pdb1: %s, pdb2: %s",
            self.pdb1_res_index_1,
            self.pdb2_res_index_1,
        )

        pred_files = glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")
        logger.debug("Prediction directory: %s (%d files found)", pred_dir, len(pred_files))

        self.res_check(pdb1, pdb2, pdb1_name, pdb2_name)
        logger.debug(
            "Crystal FS residues: %s, %s — Predicted FS residues: %s, %s",
            self.crys_fs_res_1_update,
            self.crys_fs_res_2_update,
            self.pred_fs_res_1_update,
            self.pred_fs_res_2_update,
        )

        # Shift the crystal ranges when a reference does not start at residue 1
        if int(self.pdb1_res_index_1) > 1:
            logger.debug(
                "pdb1 residue numbering does not start at 1 (starts at %s); adjusting FS range",
                self.pdb1_res_index_1,
            )
            offset_1 = int(self.pdb1_res_index_1)
            self.crys_fs_res_1_update[0] -= offset_1
            self.crys_fs_res_1_update[1] -= offset_1

        if int(self.pdb2_res_index_1) > 1:
            logger.debug(
                "pdb2 residue numbering does not start at 1 (starts at %s); adjusting FS range",
                self.pdb2_res_index_1,
            )
            offset_2 = int(self.pdb2_res_index_1)
            self.crys_fs_res_2_update[0] -= offset_2
            self.crys_fs_res_2_update[1] -= offset_2

        crys1_range = (self.crys_fs_res_1_update[0], self.crys_fs_res_1_update[1])
        crys2_range = (self.crys_fs_res_2_update[0], self.crys_fs_res_2_update[1])
        pred1_range = (self.pred_fs_res_1_update[0], self.pred_fs_res_1_update[1])
        pred2_range = (self.pred_fs_res_2_update[0], self.pred_fs_res_2_update[1])

        logger.debug(
            "Adjusted FS ranges — crystal: pdb1=[%s,%s] pdb2=[%s,%s]; predicted: pdb1=[%s,%s] pdb2=[%s,%s]",
            crys1_range[0],
            crys1_range[1],
            crys2_range[0],
            crys2_range[1],
            pred1_range[0],
            pred1_range[1],
            pred2_range[0],
            pred2_range[1],
        )

        logger.info(
            "Comparing FS region secondary structure against %s (%d models)",
            pdb1_name,
            len(pred_files),
        )
        if not pred_files:
            # Nothing to compare. No result file is written in this case,
            # matching the behaviour this code has always had.
            return

        result_file = f"fs_compare_output_{pdb1_name}.log"

        # Pass 1: compare every model against pdb1, stop at the first match.
        # NOTE(review): the predicted slice used here is the *second*
        # predicted range (pred2); pred1 is only logged -- confirm intended.
        for index, model in enumerate(pred_files):
            logger.debug("Processing model: %s", model)
            if self._region_matches(pdb1, model, index, pdb1_name, crys1_range, pred2_range):
                logger.info("FS region correctly predicted (matched pdb1: %s)", pdb1_name)
                with open(result_file, "w", encoding="utf-8") as f:
                    f.write("success")
                return

        # Pass 2: nothing matched pdb1, retry every model against pdb2.
        logger.info(
            "FS region not matched against %s; retrying against %s", pdb1_name, pdb2_name
        )
        for index, model in enumerate(pred_files):
            if self._region_matches(pdb2, model, index, pdb1_name, crys2_range, pred2_range):
                # NOTE(review): unlike a pdb1 match, no "success" file is
                # written for a pdb2 match -- confirm intended.
                logger.info("FS region correctly predicted (matched pdb2: %s)", pdb2_name)
                return

        logger.warning(
            "FS region not correctly predicted for %s against either reference",
            pdb1_name,
        )
        with open(result_file, "w", encoding="utf-8") as f:
            f.write("fail")

    def _region_matches(
        self,
        ref_pdb: str,
        model: str,
        index: int,
        pdb_name: str,
        ref_range: tuple,
        pred_range: tuple,
    ) -> bool:
        """Run pydssp on one reference/model pair and compare the FS slices.

        Reads ``output_<pdb_name>_<index>.log`` written by ``pydssp``; the
        first row is taken as the reference string and the second as the
        predicted one.
        """
        self.pydssp(ref_pdb, model, index, pdb_name)
        dssp_read_tmp = pd.read_csv(f"output_{pdb_name}_{index}.log", sep=" ", header=None)
        seq1 = dssp_read_tmp[0].iloc[0]
        seq2 = dssp_read_tmp[0].iloc[1]
        ref_region = seq1[ref_range[0] : ref_range[1]]
        pred_region = seq2[pred_range[0] : pred_range[1]]

        logger.debug(
            "Crystal FS region: %s | Predicted FS region: %s",
            ref_region,
            pred_region,
        )
        return fuzz.ratio(ref_region, pred_region) > self._MATCH_THRESHOLD

    def first_res_check(self, pdb1: str, pdb2: str) -> None:
        """Store the first residue number of each reference PDB file.

        Sets ``self.pdb1_res_index_1`` and ``self.pdb2_res_index_1`` from the
        first residue of the first model of each structure.
        """
        structure_1 = PDBParser().get_structure("pdb1", pdb1)
        structure_2 = PDBParser().get_structure("pdb2", pdb2)

        res_index_1 = [r.id[1] for c in structure_1[0] for r in c.get_residues()]
        res_index_2 = [r.id[1] for c in structure_2[0] for r in c.get_residues()]

        self.pdb1_res_index_1 = int(res_index_1[0])
        self.pdb2_res_index_1 = int(res_index_2[0])
        logger.debug(
            "Parsed first residue indices — pdb1: %d, pdb2: %d",
            self.pdb1_res_index_1,
            self.pdb2_res_index_1,
        )

    def pydssp(self, crys_pdb: str, pred_pdb: str, number: int, pdb_name: str) -> None:
        """Run ``pydssp`` on a reference/model pair.

        The result is written to ``output_<pdb_name>_<number>.log`` in the
        working directory. The command is passed as an argument list, so paths
        containing spaces or shell metacharacters are handled literally.
        Launch failures and non-zero exit codes are logged, not raised.
        """
        # Imported here so this method is self-contained with respect to the
        # module's import block.
        import subprocess

        command = [
            "pydssp",
            str(crys_pdb),
            str(pred_pdb),
            "-o",
            f"output_{pdb_name}_{number}.log",
        ]
        logger.debug("Executing: %s", " ".join(command))
        try:
            completed = subprocess.run(command, check=False)
        except OSError as err:
            logger.error("Could not launch pydssp: %s", err)
            return
        if completed.returncode != 0:
            logger.warning(
                "pydssp exited with status %d for %s", completed.returncode, pred_pdb
            )

    def res_check(self, pdb1: str, pdb2: str, pdb1_name: str, pdb2_name: str) -> None:
        """Read fold-switching residue ranges for the given PDB pair.

        Parses ``range_fs_pairs_all.txt`` in the current working directory.
        After a header line, each row has six comma-separated fields: two
        names followed by four ``start-end`` ranges. The pair matches in
        either name order; when several rows match, the last one wins.

        Sets ``crys_fs_res_1_update``, ``crys_fs_res_2_update``,
        ``pred_fs_res_1_update`` and ``pred_fs_res_2_update`` as integer lists.

        Raises:
            ValueError: If no row matches the pair, or a row is malformed.
        """
        range_file = os.path.join(os.getcwd(), "range_fs_pairs_all.txt")
        wanted = {(pdb1_name, pdb2_name), (pdb2_name, pdb1_name)}

        ranges = None
        with open(range_file, "r", encoding="utf-8") as file:
            next(file, None)  # skip header
            for line in file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                n1, n2, p1, p2, m1, m2 = line.split(",")
                if (n1, n2) in wanted:
                    # NOTE(review): the ranges are not swapped when the names
                    # appear in reverse order -- confirm this is intended.
                    ranges = (p1, p2, m1, m2)

        if ranges is None:
            raise ValueError(
                f"No fold-switching range entry for pair ({pdb1_name}, {pdb2_name}) "
                f"in {range_file}"
            )

        crys_fs_res_1, crys_fs_res_2, pred_fs_res_1, pred_fs_res_2 = ranges
        self.crys_fs_res_1_update = [int(i) for i in crys_fs_res_1.split("-")]
        self.crys_fs_res_2_update = [int(i) for i in crys_fs_res_2.split("-")]
        self.pred_fs_res_1_update = [int(i) for i in pred_fs_res_1.split("-")]
        self.pred_fs_res_2_update = [int(i) for i in pred_fs_res_2.split("-")]

        logger.debug(
            "Loaded FS ranges — crystal: %s / %s, predicted: %s / %s",
            self.crys_fs_res_1_update,
            self.crys_fs_res_2_update,
            self.pred_fs_res_1_update,
            self.pred_fs_res_2_update,
        )
+ +Requires: + - Foldseek: For structure similarity computation + - MDAnalysis: For structure analysis and DSSP filtering + - scikit-learn: For clustering algorithms + - PyMOL (optional): For .pse session file output +""" + +import csv +import logging +import re +import shutil +import subprocess +from pathlib import ( + Path, +) +from typing import ( + Dict, + List, + Tuple, +) + +import matplotlib.pyplot as plt +import MDAnalysis as mda +import numpy as np +from MDAnalysis.analysis.dssp import ( + DSSP, +) +from scipy import ( + stats, +) +from scipy.spatial import ( + distance, +) +from sklearn.cluster import ( + HDBSCAN, +) +from sklearn.decomposition import ( + PCA, +) +from sklearn.metrics import ( + silhouette_score, +) +from sklearn.preprocessing import ( + minmax_scale, +) + +logger = logging.getLogger(__name__) + +try: + import pymol +except ImportError: + pymol = None + logger.warning("PyMOL not available; .pse session file will not be saved") + +# Configuration constants +HDBSCAN_K_RANGE = range(2, 51) +HDBSCAN_MIN_SAMPLES = 1 +KMEDOIDS_DEFAULT_K = 3 +KMEDOIDS_MIN_CLUSTER_SIZE = 4 +KMEDOIDS_MAX_ITER = 100 +RANDOM_SEED = 42 +DISTANCE_METRIC = "euclidean" +ZSCORE_OUTLIER_THRESHOLD = 3.0 +FOLDSEEK_SENSITIVITY = "9.5" +FOLDSEEK_FORMAT_OUTPUT = "query,target,alntmscore,qaln,taln,alnlen,evalue,bits" +FOLDSEEK_BIT_SCORE_BUG_VALUE = -2147483648 +PCA_N_COMPONENTS = 4 + + +class BlindScreening: + """Perform blind structural screening and clustering analysis. + + 1. Stage PDB files into a flat directory and build a Foldseek DB. + 2. Run per-file exhaustive Foldseek easy-search against the DB. + 3. Parse bit scores into a pairwise correlation matrix. + 4. Remove unfolded outliers via DSSP z-score filtering. + 5. Normalise the matrix, run PCA, HDBSCAN, and k-medoids. + 6. Write CSV summaries, a cluster PNG, and (optionally) a PyMOL .pse. 
@staticmethod
def k_medoids(
    x: np.ndarray,
    cluster_label: int,
    labels: np.ndarray,
    k: int = KMEDOIDS_DEFAULT_K,
    max_iter: int = KMEDOIDS_MAX_ITER,
) -> Tuple[np.ndarray, float]:
    """PAM-style k-medoids to find representative structures in a cluster.

    Only the rows of ``x`` whose label equals ``cluster_label`` take part in
    the search, so every returned medoid is a member of that cluster.

    Args:
        x: Full PCA-reduced feature matrix (all samples).
        cluster_label: The HDBSCAN label whose members are to be processed.
        labels: Full label array from HDBSCAN (length = n_samples).
        k: Number of medoids to return; clamped to the cluster size.
        max_iter: Maximum PAM swap iterations.

    Returns:
        Tuple of (medoid_indices, total_cost). ``medoid_indices`` indexes
        into the *full* x array. ``total_cost`` is the sum, over cluster
        members, of the distance to the nearest medoid. When the cluster is
        empty, an empty index array and NaN are returned; when it has fewer
        than ``KMEDOIDS_MIN_CLUSTER_SIZE`` members, all member indices and
        NaN are returned.

    Raises:
        ValueError: If ``k`` is smaller than 1 for a cluster that is large
            enough to be searched.
    """
    members = np.flatnonzero(np.asarray(labels) == cluster_label)
    if members.size == 0:
        return np.array([], dtype=int), float("nan")

    if members.size < KMEDOIDS_MIN_CLUSTER_SIZE:
        logger.debug(
            "Cluster %s has only %s members; returning all indices",
            cluster_label,
            members.size,
        )
        return members, float("nan")

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    k = min(k, int(members.size))

    # Work on the cluster's own rows only: non-members are neither medoid
    # candidates nor part of the cost, so they can never be selected.
    # The pairwise matrix is computed once and reused for every swap trial.
    pairwise = distance.cdist(x[members], x[members], metric=DISTANCE_METRIC)

    # Local generator: reproducible without reseeding NumPy's global state.
    rng = np.random.default_rng(RANDOM_SEED)
    medoids = rng.choice(members.size, k, replace=False)
    tot_cost = float(pairwise[:, medoids].min(axis=1).sum())

    for _ in range(max_iter):
        improved = False
        for m_idx in range(k):
            for candidate in range(members.size):
                if candidate in medoids:
                    continue
                trial = medoids.copy()
                trial[m_idx] = candidate
                trial_cost = float(pairwise[:, trial].min(axis=1).sum())
                if trial_cost < tot_cost:
                    # First-improvement strategy: accept and restart the scan.
                    medoids = trial
                    tot_cost = trial_cost
                    improved = True
                    break
            if improved:
                break
        if not improved:
            break

    # Translate positions within the cluster back to indices into ``x``.
    return members[medoids], tot_cost
def _build_foldseek_database(self) -> None:
    """Ensure a Foldseek database named ``DB`` exists in the staging directory.

    Sets ``self.foldseek_db`` to ``self.db_directory / "DB"``. If that path
    already exists the build is skipped; otherwise ``foldseek createdb`` is
    run on the staging directory.

    Raises:
        RuntimeError: If ``foldseek createdb`` exits with a non-zero status;
            the message carries the command's stderr.
    """
    db_path = self.db_directory / "DB"
    self.foldseek_db = db_path

    if not db_path.exists():
        logger.info("Building Foldseek database...")
        completed = subprocess.run(
            ["foldseek", "createdb", str(self.db_directory), str(db_path)],
            capture_output=True,
            text=True,
        )
        if completed.returncode != 0:
            raise RuntimeError(f"foldseek createdb failed:\n{completed.stderr.strip()}")
        logger.info("Foldseek database created at %s", db_path)
    else:
        # A previous run already produced the database; reuse it.
        logger.info("Existing Foldseek database found at %s; skipping creation", db_path)
def _perform_clustering_analysis(self) -> None:
    """Cluster the Foldseek results and write every output artefact.

    Steps, in order: collect ``*-self.foldseek`` files under
    ``self.blind_path``; drop unfolded structures; build the bit-score
    matrix; min-max scale it row-wise and average it with its transpose;
    reduce with PCA; label with ``cluster_structures``; pick representatives
    per label with ``k_medoids``; then save the plot, both CSV files and the
    PyMOL session.

    Raises:
        FileNotFoundError: If no ``*-self.foldseek`` file is found.
    """
    discovered = sorted(self.blind_path.rglob("*-self.foldseek"))
    if not discovered:
        raise FileNotFoundError("No .foldseek result files found")

    # The filter returns the surviving files (sorted) and their flat labels.
    kept_files, flat_labels = self._filter_unfolded(discovered)

    scores = self._build_correlation_matrix(kept_files, flat_labels)

    scaled = minmax_scale(scores, axis=1)
    symmetric = (scaled + scaled.T) / 2.0

    coords = PCA(n_components=PCA_N_COMPONENTS).fit_transform(symmetric)
    assignments = self.cluster_structures(coords)

    files_of_interest: List[Tuple[Path, int]] = []
    pca_of_interest: List[np.ndarray] = []

    for group in np.unique(assignments):
        representatives, _cost = self.k_medoids(coords, group, assignments)
        for position in representatives:
            files_of_interest.append((kept_files[position], int(group)))
            pca_of_interest.append(coords[position])

    self._save_cluster_plot(coords, assignments)
    self._save_structures_of_interest(files_of_interest, pca_of_interest)
    self._save_all_structures(kept_files, assignments, coords)
    self._save_pymol_session(kept_files, files_of_interest)
+ pdb_labels = [ + str(Path(str(ff).replace("-self.foldseek", ".pdb"))) + .replace("/", "-")[17:] + .replace(".pdb", "") + for ff in filtered + ] + + return filtered, pdb_labels + + def _build_correlation_matrix( + self, foldseek_files: List[Path], pdb_labels: List[str] + ) -> np.ndarray: + """Parse bit scores from .foldseek files into a pairwise matrix.""" + corr_mtx: List[List[float]] = [] + + for ff in foldseek_files: + row_dict: Dict[str, float] = {label: 0.0 for label in pdb_labels} + + with ff.open("r") as fh: + for line in fh: + parts = line.rstrip().split("\t") + if len(parts) < 8: + continue + target = parts[1] + try: + bit_score = int(parts[-1]) + except ValueError: + continue + if bit_score == FOLDSEEK_BIT_SCORE_BUG_VALUE: + bit_score = 0 + if target in row_dict: + row_dict[target] = float(bit_score) + + corr_mtx.append([row_dict[label] for label in pdb_labels]) + + return np.array(corr_mtx) + + def _save_cluster_plot(self, pca_coords: np.ndarray, labels: np.ndarray) -> None: + """Save a 2D PCA scatter plot coloured by HDBSCAN cluster.""" + plot_path = self.blind_path / f"{self.pdb1_name}-cluster.png" + plt.figure(figsize=(8, 6)) + plt.scatter(pca_coords[:, 0], pca_coords[:, 1], c=labels, cmap="viridis", s=45) + plt.xlabel("PC 1") + plt.ylabel("PC 2") + plt.title(f"Blind screening cluster map — {self.pdb1_name}") + plt.tight_layout() + plt.savefig(plot_path) + plt.clf() + logger.info("Saved cluster plot to %s", plot_path) + + def _save_structures_of_interest( + self, + files_of_interest: List[Tuple[Path, int]], + pca_of_interest: List[np.ndarray], + ) -> None: + """Write the structures-of-interest CSV (group, file, pca_1, pca_2).""" + out_path = self.blind_path / f"{self.pdb1_name}-structures_of_interest.csv" + with out_path.open("w", newline="") as fh: + writer = csv.writer(fh) + writer.writerow(["group", "file", "pca_1", "pca_2"]) + for (ff, cluster_label), pca_pt in zip(files_of_interest, pca_of_interest): + writer.writerow([cluster_label, str(ff), 
def _save_all_structures(
    self,
    foldseek_files: List[Path],
    labels: np.ndarray,
    pca_coords: np.ndarray,
) -> None:
    """Write one CSV row per structure to ``structures_all.csv``.

    The file is created in ``self.blind_path`` with the columns
    ``group, file, pca_1, pca_2``: the cluster label as ``int``, the result
    file path as ``str``, and the first two PCA coordinates.

    Args:
        foldseek_files: Result files, in the same order as ``labels``.
        labels: Cluster label for each file.
        pca_coords: PCA coordinates; columns 0 and 1 are written.
    """
    header = ["group", "file", "pca_1", "pca_2"]
    destination = self.blind_path / "structures_all.csv"

    with destination.open("w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for row_no, result_path in enumerate(foldseek_files):
            group = int(labels[row_no])
            first_pc = pca_coords[row_no, 0]
            second_pc = pca_coords[row_no, 1]
            out.writerow([group, str(result_path), first_pc, second_pc])

    logger.info("Saved all-structures summary to %s", destination)
# Chain identifiers assigned to successive TER-terminated segments.
chain_char = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


def _is_ter_record(line: str) -> bool:
    """Return True when the first whitespace-separated token of *line* is ``TER``."""
    return line.split()[:1] == ["TER"]


def split_chains(pdb_path: str) -> list[str]:
    """Split a multi-chain PDB file into one file per TER-terminated segment.

    The input is read once. Every line up to and including a ``TER`` record
    belongs to the current segment, so the first segment also carries any
    header lines that precede the first chain. Lines after the last ``TER``
    record (for example ``END``) are not written anywhere.

    Segment *i* is written next to the input as ``<stem>_<letter>.pdb``,
    where ``<stem>`` is ``pdb_path`` without a trailing ``.pdb`` and the
    letter is ``chain_char[i]``.

    Args:
        pdb_path: Path to the multi-chain PDB file.

    Returns:
        The paths of the files written, in chain order.

    Raises:
        ValueError: If the file holds more segments than there are chain
            letters; nothing is written in that case.
    """
    segments: list[list[str]] = []
    current: list[str] = []
    with open(pdb_path, "r", encoding="utf-8") as infile:
        for line in infile:
            current.append(line)
            if _is_ter_record(line):
                segments.append(current)
                current = []

    if len(segments) > len(chain_char):
        raise ValueError(
            f"{pdb_path} has {len(segments)} TER-terminated chains; "
            f"at most {len(chain_char)} are supported"
        )

    # Strip only a trailing extension so directory names are left intact.
    stem = pdb_path[: -len(".pdb")] if pdb_path.endswith(".pdb") else pdb_path

    written: list[str] = []
    for letter, segment in zip(chain_char, segments):
        output_file_name = stem + "_" + letter + ".pdb"
        with open(output_file_name, "w", encoding="utf-8") as outfile:
            outfile.writelines(segment)
        written.append(output_file_name)
    return written


def main(argv=None) -> None:
    """Parse ``--pdb1`` from the command line and split that file into chains."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pdb1",
        type=str,
        required=True,
        help="PDB structure for the target crystal structure",
    )
    args = parser.parse_args(argv)
    split_chains(args.pdb1)


if __name__ == "__main__":
    main()
class SplitMultiToChains:
    """Splits multimer PDB files into single-chain PDB files.

    Constructing the object performs the split: every file matching
    ``<pred_path>/*_unrelaxed*pdb`` is cut at its ``TER`` records and each
    segment is written next to the input as ``<stem>_chain_<letter>.pdb``.

    Args:
        pred_path: Directory containing the multimer prediction files.

    Raises:
        ValueError: If a file holds more TER-terminated segments than there
            are chain letters (26); that file is not split.
    """

    # Chain identifiers assigned to successive TER-terminated segments.
    _CHAIN_LABELS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def __init__(self, pred_path: str):
        # Paths of every chain file written, in processing order.
        self.output_files: list[str] = []

        # NOTE(review): files written by a previous run also match this
        # pattern ("..._unrelaxed..._chain_A.pdb") and would be split again
        # -- confirm whether they should be excluded.
        for fl in sorted(glob.glob(str(pred_path) + "/*_unrelaxed*pdb")):
            self.output_files.extend(self._split_file(fl))

    @staticmethod
    def _read_segments(path: str) -> list[list[str]]:
        """Return the lines of *path* grouped into TER-terminated segments.

        A line whose first whitespace-separated token is ``TER`` closes the
        current segment; lines after the last such record are dropped.
        """
        segments: list[list[str]] = []
        current: list[str] = []
        with open(path, "r", encoding="utf-8") as infile:
            for line in infile:
                current.append(line)
                if line.split()[:1] == ["TER"]:
                    segments.append(current)
                    current = []
        return segments

    @classmethod
    def _split_file(cls, path: str) -> list[str]:
        """Write one ``_chain_<letter>.pdb`` file per segment of *path*."""
        segments = cls._read_segments(path)
        if len(segments) > len(cls._CHAIN_LABELS):
            raise ValueError(
                f"{path} has {len(segments)} TER-terminated chains; "
                f"at most {len(cls._CHAIN_LABELS)} are supported"
            )

        # Strip only a trailing extension so directory names are left intact.
        fl_name = path[: -len(".pdb")] if path.endswith(".pdb") else path

        written: list[str] = []
        for letter, segment in zip(cls._CHAIN_LABELS, segments):
            output_file_name = fl_name + "_chain_" + letter + ".pdb"
            with open(output_file_name, "w", encoding="utf-8") as outfile:
                outfile.writelines(segment)
            written.append(output_file_name)
        return written
(Samuel) Lee -""" -import os -import sys -import textalloc as ta -import seaborn as sns -from pathlib import Path -import numpy as np -from numpy import genfromtxt -from matplotlib import pyplot as plt -from adjustText import adjust_text -import glob - - -class plot_2D_scatter_AC(): - def __init__(self, full_cate, random_cate, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, nENS, model_type): - ##### load TM-scores both full- and ramdon-MSA - TMs_full = genfromtxt("TMScore_" + full_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - TMs_random = genfromtxt("TMScore_" + random_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - - ############ load pLDDT scores both full- and ramdon-MSA - plddt_full = genfromtxt("plddt_" + full_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - plddt_random = genfromtxt("plddt_" + random_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - - - ################################################################# - ########### getting the TM-score values of fold-switching region - - pwd = os.getcwd() + '/' - - - ######### plotting the TM-score values as 2D scatter plot - print(" ") - print("Size of column: ", TMs_random.shape[-1]) - print("Size of row: ", TMs_random.shape[0]) - print("Dimension: ", TMs_random.ndim) - - print(" ") - print(TMs_random) - print(" ") - print(TMs_full) - - - print("checking plddt") - print(plddt_full) - print(plddt_random) - - plddt_random = np.reshape(plddt_random, (7, (nMSA + 5) * 5)) - print(plddt_random) - - - if model_type != 'alphafold2_multimer_v3': - TMs_full_resh = np.reshape(TMs_full, ((((nMSA + 5) * 2), 5))) - - #f1 = np.concatenate((TMs_addition[0:(nENS + 20), :], TMs_full_resh[0:(nMSA + 5), :]), axis=0) - #print(f1) - #f2 = np.concatenate((TMs_addition[(nENS + 20):(nENS + 20) * 2, :], TMs_full_resh[(nMSA + 5):(nMSA + 5) * 2, :]), axis=0) - #print(f2) - else: - TMs_full_resh = np.reshape(TMs_full, (((nMSA + 5) * 2), 5)) - - #f1 = np.concatenate((TMs_addition[0:(nENS + 20), :], TMs_full_resh[0:(nMSA + 
5), :]), axis=0) - #print(f1) - #f2 = np.concatenate((TMs_addition[(nENS + 20):(nENS + 20) * 2, :], TMs_full_resh[(nMSA + 5):(nMSA + 5) * 2, :]), axis=0) - #print(f2) - - - - - - - if model_type != 'alphafold2_multimer_v3': - #if np.all(f1 > f2) or np.all(f1 < f2): - # print("Prediction is biased"); sys.exit() - #else: - # print("Prediction is not biased") - - plt.figure(0) - for ii in range(0, int(TMs_random.shape[0] / 2) ): - plt.scatter(TMs_random[ii * 2, :], TMs_random[(ii * 2 + 1), :], c = plddt_random[ii, :], cmap='rocket_r', vmin=50, vmax=100, s=35, marker="o") - - clb=plt.colorbar() - clb.ax.tick_params(labelsize=15) - - plt.scatter(TMs_full_resh[0 : (nMSA + 5), :], TMs_full_resh[(nMSA + 5):(nMSA + 5) * 2, :], c = plddt_full, cmap='rocket_r', vmin=50, vmax=100, s=35, marker="o") - - x = [ 0 , 1 ] - y = [ 0 , 1 ] - - plt.ylim(0, 1) - plt.xlim(0, 1) - - plt.plot(x, y, linestyle='dashed', color = 'black') - - plt.xticks(fontsize=15) - plt.yticks(fontsize=15) - - plt.xlabel('TM-Score similar to fold1(' + pdb1_name + ')', fontsize=15); plt.ylabel('TM-score similar to fold2(' + pdb2_name + ')', fontsize=15) - plt.savefig('TMscore_' + full_cate + '_' + pdb1_name + '.png', transparent = True) - - - else: - ##print("Not determine for the multimer mode") - #if np.all(f1 > f2) or np.all(f1 < f2): - # print("Prediction is biased"); sys.exit() - #else: - # print("Prediction is not biased") - - plt.figure(0) - for ii in range(0, int(TMs_random.shape[0] / 2) ): - plt.scatter(TMs_random[ii * 2, :], TMs_random[(ii * 2 + 1), :], c = plddt_random[ii, :], cmap='rocket_r', vmin=50, vmax=100, s=35, marker="o") - - - clb=plt.colorbar() - clb.ax.tick_params(labelsize=15) - - plt.scatter(TMs_full_resh[0 : (nMSA + 5), :], TMs_full_resh[(nMSA + 5):(nMSA + 5) * 2, :], c = plddt_full, cmap='rocket_r', vmin=50, vmax=100, s=35, marker="o") - - - - x = [ 0 , 1 ] - y = [ 0 , 1 ] - - plt.ylim(0, 1) - plt.xlim(0, 1) - - plt.plot(x, y, linestyle='dashed', color = 'black') - - 
plt.xticks(fontsize=15) - plt.yticks(fontsize=15) - - plt.xlabel('TM-Score similar to fold1(' + pdb1_name + ')', fontsize=15); plt.ylabel('TM-score similar to fold2(' + pdb2_name + ')', fontsize=15) - plt.savefig('TMscore_' + full_cate + '_' + pdb1_name + '.png', transparent = True) - diff --git a/code/PLOT_FS.py b/code/PLOT_FS.py deleted file mode 100644 index fb6b6f7..0000000 --- a/code/PLOT_FS.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Feb 22 13:40:00 2024 - -@author: Myeongsang (Samuel) Lee -""" -import os -import sys -import textalloc as ta -import seaborn as sns -from pathlib import Path -import numpy as np -from numpy import genfromtxt -from matplotlib import pyplot as plt -from adjustText import adjust_text -import glob - -from cal_tmscore_fs_flmsa import * -from fs_seq_compare import * - -class plot_2D_scatter(): - def __init__(self, full_cate, random_cate, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, nENS): - ##### load TM-scores both full- and ramdon-MSA - TMs_full = genfromtxt("TMScore_" + full_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - TMs_random = genfromtxt("TMScore_" + random_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - - ############ load pLDDT scores both full- and ramdon-MSA - plddt_full = genfromtxt("plddt_" + full_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - plddt_random = genfromtxt("plddt_" + random_cate + "_" + pdb1_name + ".csv", delimiter = ' ' ) - - - ################################################################# - ########### getting the TM-score values of fold-switching region - - pwd = os.getcwd() + '/' - - fs_full_TMs = genfromtxt("TMScore_fs_" + full_cate + "_" + pdb1_name + ".csv", delimiter = ' ') - TMs_fs_full = fs_full_TMs - fs_random_TMs = genfromtxt("TMScore_fs_" + random_cate + "_" + pdb1_name + ".csv", delimiter = ' ') - TMs_fs_random = fs_random_TMs - - - - ######### plotting the TM-score values as 2D scatter plot - print(" ") - print("Size of 
column: ", TMs_random.shape[-1]) - print("Size of row: ", TMs_random.shape[0]) - print("Dimension: ", TMs_random.ndim) - - print(" ") - print(TMs_random) - print(" ") - print(TMs_full) - - - print("checking plddt") - print(plddt_full) - print(plddt_random) - - plddt_random = np.reshape(plddt_random, (7, (nMSA + 5) * 5)) - TMs_fs_full_resh = np.reshape(TMs_fs_full, ((((nMSA + 5) * 2), 5))) - - - - - plt.figure(0) - - - for ii in range(0, int(TMs_random.shape[0] / 2) ): - plt.scatter(TMs_random[ii * 2, :], TMs_random[(ii * 2 + 1), :], c = plddt_random[ii, :], cmap='rocket_r', vmin=50, vmax=100, s=35, marker="o") - - - clb=plt.colorbar() - clb.ax.tick_params(labelsize=15) - - - plt.scatter(TMs_full[0, :], TMs_full[1, :], c = plddt_full, cmap='plasma', vmin=50, vmax=100, s=35, marker="o") - - - x = [ 0 , 1 ] - y = [ 0 , 1 ] - - plt.ylim(0, 1) - plt.xlim(0, 1) - - - plt.plot(x, y, linestyle='dashed', color = 'black') - - plt.xticks(fontsize=15) - plt.yticks(fontsize=15) - - plt.xlabel('TM-Score similar to fold1(' + pdb1_name + ')', fontsize=15); plt.ylabel('TM-score similar to fold2(' + pdb2_name + ')', fontsize=15) - plt.savefig('TMscore_' + full_cate + '_' + pdb1_name + '.png', transparent = True) - - - plt.figure(1) - for ii in range(0, int(TMs_random.shape[0] / 2) ): - plt.scatter(TMs_fs_random[ii * 2, :], TMs_fs_random[(ii * 2 + 1), :], c = plddt_random[ii, :], cmap='plasma', vmin=50, vmax=100, s=35, marker="o") - - - x = [ 0.0 , 1 ] ; y = [ 0.0 , 1 ] - plt.ylim(0.0, 1) - plt.xlim(0.0, 1) - - dlb=plt.colorbar() - dlb.ax.tick_params(labelsize=15) - - plt.scatter(TMs_fs_full[0, :], TMs_fs_full[1, :], c = plddt_full, cmap='plasma', vmin=50, vmax=100, s=35, marker="o") - - - plt.plot(x, y, linestyle='dashed', color = 'black') - - plt.xticks(fontsize=15) - plt.yticks(fontsize=15) - - plt.xlabel('TM-Score similar to fold1(' + pdb1_name + ')', fontsize=15); plt.ylabel('TM-score similar to fold2(' + pdb2_name + ')', fontsize=15) - plt.savefig('TMscore_fs-region_' + 
full_cate + '_' + pdb1_name + '.png', transparent = True) - diff --git a/code/TMscore_all_var.py b/code/TMscore_all_var.py deleted file mode 100644 index c68b94e..0000000 --- a/code/TMscore_all_var.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 21 14:51:00 2024 - -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path - -# call calculating TM-scores of fs region -from cal_tmscore_fs_only import * - -# call converting the multimer as a single chain structure -from convert_multi_single import * - -# call colabfold for multimer option -from pred_cal_tmscore_multimer import * - - -class TM_score(): - def __init__(self, pred_dir, pdb1, pdb1_name, pdb2, pdb2_name, model_type): - - ## loading reference pdb for TM-score - pwd = os.getcwd() + '/' - tmscores = [] - tmscores_ord = []; tmscores_rev = [] - - #files_list = sorted(glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - if model_type != "alphafold2_multimer_v3": - files_list = (glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(files_list) - else: - #### convert the multimer file as a single structure - check_files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*pdb")) - print(check_files_list) - if not check_files_list: - convert_m2s(pred_dir, pdb1_name, pdb2_name) - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*pdb")) - print(files_list) - else: - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*pdb")) - print(files_list) - - - ##### pdb1_name part - pdb1_dir = pwd + pdb1_name - r2 = 
get_structure(get_pdb_path(str(pdb1_dir))) - coords2, seq2 = get_residue_data(r2) - - if len(files_list) == 0: - tmscores = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - model = model.replace('.pdb','') - #model = model.replace('_converted.pdb','_converted') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_ord.append(tmscore) - - res = tm_align(coords2, coords1, seq2, seq1) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_rev.append(tmscore) - - - #print(tmscores[0:5]) - ##### pdb2_name part - pdb2_dir = pwd + pdb2_name - r3 = get_structure(get_pdb_path(str(pdb2_dir))) - coords2, seq2 = get_residue_data(r3) - - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - model = model.replace('.pdb','') - #model = model.replace('_converted.pdb','_converted') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_ord.append(tmscore) - - res = tm_align(coords2, coords1, seq2, seq1) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_rev.append(tmscore) - - - print("normal") - print(tmscores_ord) - print("reverse") - print(tmscores_rev) - if np.max(tmscores_ord) > np.max(tmscores_rev): - tmscores = tmscores_ord - else: - tmscores = tmscores_rev - - - - print(tmscores) - self.tmscores = tmscores - - - - - def select_size(self, TMscores_random_alter, pdb1_name, pdb2_name, alt_name, num_seeds): - - TMscores_random_reshape = TMscores_random_alter.reshape(14, num_seeds * 5) - TMscores_random_locat = np.zeros((7, num_seeds * 5)) - - #### finding locatnative pdb_name 
- - if alt_name == pdb2_name: - #for i in 1, 3, 5, 7, 9, 11, 13 in TM_scores: - tmp_cnt = 0 - for i in range(1, 14, 2): - print(TMscores_random_reshape[i, :]) - TMscores_random_locat[tmp_cnt, :] = TMscores_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - else: - #for i in 0, 2, 4, 6, 8, 10, 12 in TM_scores: - tmp_cnt = 0 - for i in range(0, 13, 2): - print(TMscores_random_reshape[i, :]) - TMscores_random_locat[tmp_cnt, :] = TMscores_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - - - TMscore_data = TMscores_random_locat - TMscore_data = TMscores_random_locat.reshape(7, num_seeds * 5) - TMscore_data_sum = np.zeros((7, 1)) - - - for ii in range(0, int(TMscore_data.shape[0])): - TMscore_data_sum[ii] = np.sum(TMscore_data[ii]) - - - location = np.argmax(np.max(TMscore_data_sum, axis=1)) - - print("Selecting...") - - TMscore_data = TMscores_random_alter - TMscore_data = TMscores_random_alter.reshape(14, num_seeds * 5) - - - location_org = location - - - if alt_name == pdb2_name: - location = (location * 2) + 1 - else: - location = (location * 2) - - - - if alt_name == pdb2_name and np.any(TMscore_data[location, :] >= 0.5): - print(TMscore_data[location, :]) - selection = int((location - 1) / 2) - self.selection = selection - - elif alt_name == pdb1_name and np.any(TMscore_data[location, :] >= 0.5): - print(TMscore_data[location, :]) - selection = int(location / 2); - self.selection = selection - - else: - print("Predictions are bad") - print("Predictions of whole structure are bad") - rm_folder_cmd = 'rm -rf successed_prediction/' + self.pdb1_name + '/' - print(rm_folder_cmd) - os.system(rm_folder_cmd) - sys.exit() - - - - -class TMscore_cal_all_var(): - def __init__(self, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, option, model_type): - num_seeds = 5 + nMSA - pwd = os.getcwd() + '/' - - if model_type != "alphafold2_multimer_v3": - #################################################################### - ##### check-out TM-scores of prediction with full-length-MSA (whole) - 
pred_dir = 'predictions_all/' + pdb1_name + "/" + pdb1_name + "_predicted_models_full_rand_*" - MSA_full_TMscore = TM_score(pred_dir, pdb1, pdb1_name, pdb2, pdb2_name, model_type) - full_TMscore = np.array(MSA_full_TMscore.tmscores) - full_TMscore = full_TMscore.reshape(2, num_seeds * 5) - - ##### check-out the 1st prediction results are good or not - if np.any(full_TMscore[0, :] > 0.5) or np.any(full_TMscore[1, :] > 0.5): - if np.average(full_TMscore[0, :]) > np.average(full_TMscore[1, :]): - ref_name = pdb1_name; alt_name = pdb2_name - else: - ref_name = pdb2_name; alt_name = pdb1_name - elif np.all(full_TMscore[0, :] < 0.5) and np.all(full_TMscore[1, :] < 0.5): - #If prediction is failed, move the folder to "failed_prediction"" - gen_dir = 'failed_prediction/' + pdb1_name - if not os.path.exists(gen_dir): - os.mkdir(gen_dir) - - mv_folder_cmd = 'mv ' + pdb1_name + '_predicted_models_full_rand_' + str(random_seed_full_MSA) + ' failed_prediction/' + pdb1_name - print(mv_folder_cmd); os.system(mv_folder_cmd) - print("All predictions with deep MSA are failed"); sys.exit() - else: - if np.average(full_TMscore[0, :]) > np.average(full_TMscore[1, :]): - ref_name = pdb1_name; alt_name = pdb2_name - else: - ref_name = pdb2_name; alt_name = pdb1_name - - - print("Reference structure: ", ref_name); print("Alternative structure: ", alt_name) - - # save TM-score from full-length MSA - np.savetxt('TMScore_full-MSA_' + pdb1_name + '.csv', full_TMscore, fmt='%2.3f') - print("Full-MSA prediction is tightly aligned to crystal structure"); print(" ") - - - - - ################################################################ - ##### chech-out TM-scores of prediction with shallow random MSAs - max_msa = 1; ext_msa = 2 - TMscores_random = [] - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = max_msa * multi - ext_msa = ext_msa * multi - - pred_dir = 'predictions_all/' + pdb1_name + "/" + pdb1_name + '_predicted_models_rand_*' + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) + '/'; 
print(pred_dir) - ##### TMscore of whole part - MSA_shallow_TMscore = TM_score(pred_dir, pdb1, pdb1_name, pdb2, pdb2_name, model_type) - TMscores_random = np.append(TMscores_random, MSA_shallow_TMscore.tmscores) - - - fin_pred_dir = 'predictions_all/' + pdb1_name + "/" + pdb1_name + '_predicted_models_rand_*' + '_max_*' - TMscores_random_reshape = TMscores_random.reshape(14, num_seeds *5) - TMscores_random_alter = np.zeros((7, num_seeds *5)) - - - #### finding alternative pdb_name - if alt_name == pdb2_name: - #for i in 1, 3, 5, 7, 9, 11, 13 in TM_scores: - tmp_cnt = 0 - for i in range(1, 14, 2): - print(TMscores_random_reshape[i, :]) - TMscores_random_alter[tmp_cnt, :] = TMscores_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - else: - #for i in 0, 2, 4, 6, 8, 10, 12 in TM_scores: - tmp_cnt = 0 - for i in range(0, 13, 2): - print(TMscores_random_reshape[i, :]) - TMscores_random_alter[tmp_cnt, :] = TMscores_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - - - - ##### check out varied-MSA with (msa-max: 1, 2, 4, 8, 16, 32, 64) (msa-extra: 2, 4, 8, 16, 32, 64, 128) - if np.all(TMscores_random_alter) < 0.5: - print("All predictions are failed") - sys.exit() - - else: - print(" "); print("Finding optimal size of ramdon MSA...") - MSA_shallow_TMscore.select_size(TMscores_random_reshape, pdb1_name, pdb2_name, alt_name, num_seeds) - - size_selection = MSA_shallow_TMscore.selection - print(size_selection) - self.size_selection = size_selection - ## save all TM-scores from random MSA (1-2, 2-4, 4-8.... 
in order) - np.savetxt('TMScore_random-MSA_' + pdb1_name + '.csv', TMscores_random_reshape, fmt='%2.3f') - - - - elif model_type == "alphafold2_multimer_v3": - print("Currently working on") - MSA_multi = prediction_all_multimer(pdb1_name, pdb2_name, search_dir, nMSA, model_type, search_multi_dir) - self.size_selection = MSA_multi.size_selection - #sys.exit() diff --git a/code/TMscore_all_var_FS.py b/code/TMscore_all_var_FS.py deleted file mode 100644 index e3d39ea..0000000 --- a/code/TMscore_all_var_FS.py +++ /dev/null @@ -1,359 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 21 14:51:00 2024 - -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path - -# call calculating TM-scores of fs region -from cal_tmscore_fs_only import * - -# call converting the multimer as a single chain structure -from convert_multi_single import * - -# call colabfold for multimer option -from pred_cal_tmscore_multimer_FS import * - - -class TM_score(): - def __init__(self, pred_dir, pdb1, pdb1_name, pdb2, pdb2_name, model_type): - - ## loading reference pdb for TM-score - pwd = os.getcwd() + '/' - tmscores = [] - tmscores_ord = []; tmscores_rev = [] - - #files_list = sorted(glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - if model_type != "alphafold2_multimer_v3": - files_list = (glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(files_list) - else: - #### convert the multimer file as a single structure - check_files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*pdb")) - print(check_files_list) - if not check_files_list: - 
convert_m2s(pred_dir, pdb1_name, pdb2_name) - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*pdb")) - print(files_list) - else: - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*pdb")) - print(files_list) - - - ##### pdb1_name part - pdb1_dir = pwd + pdb1_name - r2 = get_structure(get_pdb_path(str(pdb1_dir))) - coords2, seq2 = get_residue_data(r2) - - if len(files_list) == 0: - tmscores = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - model = model.replace('.pdb','') - #model = model.replace('_converted.pdb','_converted') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_ord.append(tmscore) - - res = tm_align(coords2, coords1, seq2, seq1) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_rev.append(tmscore) - - - #print(tmscores[0:5]) - ##### pdb2_name part - pdb2_dir = pwd + pdb2_name - r3 = get_structure(get_pdb_path(str(pdb2_dir))) - coords2, seq2 = get_residue_data(r3) - - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - model = model.replace('.pdb','') - #model = model.replace('_converted.pdb','_converted') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_ord.append(tmscore) - - res = tm_align(coords2, coords1, seq2, seq1) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_rev.append(tmscore) - - - print("normal") - print(tmscores_ord) - print("reverse") - print(tmscores_rev) - if np.max(tmscores_ord) > np.max(tmscores_rev): - tmscores = tmscores_ord - else: - tmscores = tmscores_rev - - - - print(tmscores) - 
self.tmscores = tmscores - - - - - def select_size(self, TMscores_random_alter, TMscores_fs_random_alter, pdb1_name, pdb2_name, alt_name, num_seeds): - - TMscores_random_reshape = TMscores_random_alter.reshape(14, num_seeds * 5) - TMscores_fs_random_reshape = TMscores_fs_random_alter.reshape(14, num_seeds * 5) - TMscores_random_locat = np.zeros((7, num_seeds * 5)) - TMscores_fs_random_locat = np.zeros((7, num_seeds * 5)) - - #### finding locatnative pdb_name - - if alt_name == pdb2_name: - #for i in 1, 3, 5, 7, 9, 11, 13 in TM_scores: - tmp_cnt = 0 - for i in range(1, 14, 2): - print(TMscores_random_reshape[i, :]) - print(TMscores_fs_random_reshape[i, :]) - TMscores_random_locat[tmp_cnt, :] = TMscores_random_reshape[i, :] - TMscores_fs_random_locat[tmp_cnt, :] = TMscores_fs_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - else: - #for i in 0, 2, 4, 6, 8, 10, 12 in TM_scores: - tmp_cnt = 0 - for i in range(0, 13, 2): - print(TMscores_random_reshape[i, :]) - print(TMscores_fs_random_reshape[i, :]) - TMscores_random_locat[tmp_cnt, :] = TMscores_random_reshape[i, :] - TMscores_fs_random_locat[tmp_cnt, :] = TMscores_fs_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - - - TMscore_data = TMscores_random_locat - TMscore_data = TMscores_random_locat.reshape(7, num_seeds * 5) - TMscore_data_sum = np.zeros((7, 1)) - - - TMscore_fs_data = TMscores_fs_random_locat - TMscore_fs_data = TMscores_fs_random_locat.reshape(7, num_seeds * 5) - - for ii in range(0, int(TMscore_data.shape[0])): - TMscore_data_sum[ii] = np.sum(TMscore_data[ii]) - - - location = np.argmax(np.max(TMscore_data_sum, axis=1)) - - print("Selecting...") - - TMscore_data = TMscores_random_alter - TMscore_data = TMscores_random_alter.reshape(14, num_seeds * 5) - - TMscore_fs_data = TMscores_fs_random_alter - TMscore_fs_data = TMscores_fs_random_alter.reshape(14, num_seeds * 5) - - location_org = location - - - if alt_name == pdb2_name: - location = (location * 2) + 1 - else: - location = (location * 2) - - - - if 
alt_name == pdb2_name and ((np.any(TMscore_data[location, :] >= 0.5) and np.any(TMscore_fs_data[location, :] >= 0.5))): - print(TMscore_data[location, :], TMscore_fs_data[location, :]) - selection = int((location - 1) / 2) - self.selection = selection - - elif alt_name == pdb1_name and ((np.any(TMscore_data[location, :] >= 0.5) and np.any(TMscore_fs_data[location, :] >= 0.5))): - print(TMscore_data[location, :], TMscore_fs_data[location, :]) - selection = int(location / 2); - self.selection = selection - - - #elif location == int(TMscore_data.shape[0]) and np.any(TMscore_fs_data[location, :] < 0.5): - elif np.any(TMscore_fs_data[location, :] < 0.5): - for jj in range(0, int(TMscore_data.shape[0] / 2)): - print(TMscore_data[(2 * jj), :], TMscore_fs_data[(jj * 2) + 1, :]) - print(TMscore_data[(jj * 2) + 1, :], TMscore_fs_data[(jj * 2), :]) - if (np.any(TMscore_data[(jj * 2), :] >= 0.4) and np.any(TMscore_fs_data[(jj * 2) + 1, :] >= 0.5)) or (np.any(TMscore_data[(jj * 2) + 1, :] >= 0.4) and np.any(TMscore_fs_data[(jj * 2), :] >= 0.5)): - selection = jj - self.selection = selection - break - elif (np.any(TMscore_data[(jj * 2), :] >= 0.4) and np.any(TMscore_fs_data[(jj * 2), :] >= 0.5)) or (np.any(TMscore_data[(jj * 2) + 1, :] >= 0.4) and np.any(TMscore_fs_data[(jj * 2) + 1, :] >= 0.5)): - - selection = jj - self.selection = selection - break - elif jj == (int(TMscore_data.shape[0])) and np.all(TMscore_data[jj, :] < 0.5): - print("Predictions are bad") - sys.exit() - else: - print("Predictions are bad") - else: - print("Predictions are bad") - print("Predictions of whole structure are bad") - sys.exit() - - - - -class TMscore_cal_all_var_FS(): - def __init__(self, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, option, model_type): - num_seeds = 5 + nMSA - pwd = os.getcwd() + '/' - - if model_type != "alphafold2_multimer_v3": - #################################################################### - ##### check-out TM-scores of prediction with full-length-MSA (whole) - pred_dir = 
'predictions_all/' + pdb1_name + "/" + pdb1_name + "_predicted_models_full_rand_*" - MSA_full_TMscore = TM_score(pred_dir, pdb1, pdb1_name, pdb2, pdb2_name, model_type) - full_TMscore = np.array(MSA_full_TMscore.tmscores) - full_TMscore = full_TMscore.reshape(2, num_seeds * 5) - - ##### check-out TM-scores of prediction with full-length-MSA (fs region) - pred_path = 'predictions_all/' + pdb1_name + "/" + pdb1_name + '_predicted_models_full_rand_*' - MSA_fs_TMscore = TM_score_fs(pred_path, pdb1, pdb1_name, pdb2, pdb2_name) - fs_TMscore = np.array(MSA_fs_TMscore.tmscores_fs) - fs_TMscore = fs_TMscore.reshape(2, num_seeds * 5) - - ##### check-out the 1st prediction results are good or not - if np.average(full_TMscore[0, :]) > np.average(full_TMscore[1, :]): - if np.any(fs_TMscore[0, :] >= 0.5) and np.any(full_TMscore[0, :] >= 0.5): - ref_name = pdb1_name; alt_name = pdb2_name - elif np.any(fs_TMscore[1, :] >= 0.5) and np.any(full_TMscore[1, :] >= 0.5): - ref_name = pdb2_name; alt_name = pdb1_name - else: - print("Prediction with deep MSA was failed"); - sys.exit() - else: - if np.any(fs_TMscore[1, :] >= 0.5) and np.any(full_TMscore[1, :] >= 0.5): - ref_name = pdb2_name; alt_name = pdb1_name - elif np.any(fs_TMscore[0, :] >= 0.5) and np.any(full_TMscore[0, :] >= 0.5): - ref_name = pdb1_name; alt_name = pdb2_name - else: - print("Prediction with deep MSA was failed"); - sys.exit() - - - print("Reference structure: ", ref_name); print("Alternative structure: ", alt_name) - - # save TM-score of whole structure from full-length MSA - np.savetxt('TMScore_full-MSA_' + pdb1_name + '.csv', full_TMscore, fmt='%2.3f') - # save TM-score of fold-switching region from full-length MSA - np.savetxt('TMScore_fs_full-MSA_' + pdb1_name + '.csv', fs_TMscore, fmt='%2.3f') - - # Directory section and save to successed_prediction folder - gen_dir = 'predictions_all/' + pdb1_name - - if not os.path.exists(gen_dir): - os.mkdir(gen_dir) - - mv_folder_cmd = 'mv ' + pred_dir + ' 
predictions_all/' + pdb1_name - print(mv_folder_cmd); os.system(mv_folder_cmd) - print("Full-MSA prediction is tightly aligned to crystal structure"); print(" ") - - - - - - ######################################################################## - ##### check-out TM-scores of prediction with shallow random MSAs (whole) - max_msa = 1; ext_msa = 2 - TMscores_random = []; TMscores_fs_random = [] - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = max_msa * multi - ext_msa = ext_msa * multi - - pred_dir = 'predictions_all/' + pdb1_name + "/" + pdb1_name + '_predicted_models_rand_*_max_' + str(max_msa) + '_ext_' + str(ext_msa) - ##### TMscore of whole part - MSA_shallow_TMscore = TM_score(pred_dir, pdb1, pdb1_name, pdb2, pdb2_name, model_type) - TMscores_random = np.append(TMscores_random, MSA_shallow_TMscore.tmscores); print(TMscores_random) - - ### TMscore fs part - MSA_shallow_fs_TMscore = TM_score_fs(pred_dir, pdb1, pdb1_name, pdb2, pdb2_name) - TMscores_fs_random = np.append(TMscores_fs_random, MSA_shallow_fs_TMscore.tmscores_fs); print(TMscores_fs_random) - - fin_pred_dir = 'predictions_all/' + pdb1_name + "/" + pdb1_name + '_predicted_models_rand_*_max_*' - - TMscores_random_reshape = TMscores_random.reshape(14, num_seeds * 5) - TMscores_fs_random_reshape = TMscores_fs_random.reshape(14, num_seeds * 5) - - TMscores_random_alter = np.zeros((7, num_seeds * 5)) - TMscores_fs_random_alter = np.zeros((7, num_seeds * 5)) - - - ##### finding the TMscores of alternative conformations for determining the length of shallow random MSAs - if alt_name == pdb2_name: - #for i in 1, 3, 5, 7, 9, 11, 13 in TM_scores: - tmp_cnt = 0 - for i in range(1, 14, 2): - print(TMscores_random_reshape[i, :]); print(TMscores_fs_random_reshape[i, :]) - TMscores_random_alter[tmp_cnt, :] = TMscores_random_reshape[i, :] - TMscores_fs_random_alter[tmp_cnt, :] = TMscores_fs_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - else: - #for i in 0, 2, 4, 6, 8, 10, 12 in TM_scores: - tmp_cnt = 0 - for i in 
range(0, 13, 2): - print(TMscores_random_reshape[i, :]); print(TMscores_fs_random_reshape[i, :]) - TMscores_random_alter[tmp_cnt, :] = TMscores_random_reshape[i, :] - TMscores_fs_random_alter[tmp_cnt, :] = TMscores_fs_random_reshape[i, :] - tmp_cnt = tmp_cnt + 1 - - print(" ") - print("Confirming the TM-score with alternative conformation is good or not") - print(TMscores_random_alter) - print("Confirming the TM-score with fs region of alternative conformation is good or not") - print(TMscores_fs_random_alter) - print(" ") - - - if np.any(TMscores_random_alter > 0.5) and np.any(TMscores_fs_random_alter > 0.5): - # save all TM-scores from random MSA (1-2, 2-4, 4-8.... in order) - #TMscores_random_reshape = TMscores_random.reshape(14, 5) - np.savetxt('TMScore_random-MSA_' + pdb1_name + '.csv', TMscores_random_reshape, fmt='%2.3f') - np.savetxt('TMScore_fs_random-MSA_' + pdb1_name + '.csv', TMscores_fs_random_reshape, fmt='%2.3f') - - MSA_shallow_TMscore.select_size(TMscores_random_reshape, TMscores_fs_random_reshape, pdb1_name, pdb2_name, alt_name, num_seeds) - size_selection = MSA_shallow_TMscore.selection; self.size_selection = size_selection - - else: - print("Full-MSA prediction is not tightly aligned to crystal structure with additional seeds") - print("Predcition is done") - sys.exit() - - - - elif model_type == "alphafold2_multimer_v3": - print("Currently working on") - MSA_multi = prediction_all_multimer_FS(pdb1_name, pdb2_name, search_dir, nMSA, model_type, search_multi_dir, pdb1, pdb2) - self.size_selection = MSA_multi.size_selection - #sys.exit() diff --git a/code/cal_plddt_ACFS.py b/code/cal_plddt_ACFS.py deleted file mode 100644 index fc4f5b8..0000000 --- a/code/cal_plddt_ACFS.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -find the average pLDDT score -""" -import sys,re -import glob -import json -import numpy as np -from pathlib import Path - -#define pattern for regular expression - -# 0_000_scores_rank_001_alphafold2_ptm_model_4_seed_000.json -pattrn = 
re.compile(r'.*?_scores_rank_(?P\d+)_alphafold2.*') - -# default if pattern doesn't work -rank = "000" - -def read_plddt (jsonfile): - """ - read the json file - return the plddt scores - as numpy array - """ - with open(jsonfile) as json_file: - data = json.load(json_file) - - plddt_scores = np.array(data['plddt'],dtype='float64') - - return plddt_scores - -def fract_good (score): - """ - return percentage - of residue with - plddt score > 70 - """ - vals_greater_70 = (score > 70).sum() - percent_good = round((vals_greater_70 / score.size)*100,2) - avg_plddt = round(np.average(score),2) - #return percent_good,avg_plddt - return avg_plddt - - - - -class plddt_cal(): - def __init__(self, sub_list, category, pdb_name, nMSA, nENS, model_type): - # if files found then continue - if len(sub_list) == 0: - sys.exit(1) - - # create a data dictionary - out_dict_all = {} - - values_all = [] - cnt = 0 - - if category =='full-MSA': - #if category == 'additional-MSA': - print("working...") - print(sub_list) - for subdir in sub_list: - print(subdir) - if Path(subdir).is_dir(): - subdir_name = Path(subdir).name - jsonfiles = glob.glob(str(subdir) + "/*_scores*json") - - for jsonfile in jsonfiles: - plddt_score = read_plddt(jsonfile) - values = fract_good(plddt_score) - values_all = np.append(values_all, values) - jsonfilepath = Path(jsonfile) - jsonfilename = jsonfilepath.stem - match = pattrn.match(jsonfilename) - if match: - rank = match.group('rank') - - key_pair = subdir_name + ":" + rank - # for all - if key_pair not in out_dict_all: - out_dict_all[key_pair]=values - - cnt = cnt + 1 - cnt = int(cnt / 5) - - - - elif category == 'additional-MSA': - print("working...") - print(sub_list) - for subdir in sub_list: - print(subdir) - if Path(subdir).is_dir(): - subdir_name = Path(subdir).name - jsonfiles = glob.glob(str(subdir) + "/*_scores*json") - - for jsonfile in jsonfiles: - plddt_score = read_plddt(jsonfile) - values = fract_good(plddt_score) - values_all = 
np.append(values_all, values) - jsonfilepath = Path(jsonfile) - jsonfilename = jsonfilepath.stem - match = pattrn.match(jsonfilename) - if match: - rank = match.group('rank') - - key_pair = subdir_name + ":" + rank - # for all - if key_pair not in out_dict_all: - out_dict_all[key_pair]=values - - cnt = cnt + 1 - - - - else: - for subdir in sub_list: - #for subdir in all_sub_dir_paths: - # make sure subdir exists - if Path(subdir).is_dir(): - subdir_name = Path(subdir).name - # get the list of json files - jsonfiles = glob.glob(str(subdir) + "/*_scores*json") - for jsonfile in jsonfiles: - plddt_score = read_plddt(jsonfile) - values = fract_good(plddt_score) - values_all = np.append(values_all, values) - jsonfilepath = Path(jsonfile) - jsonfilename = jsonfilepath.stem - match = pattrn.match(jsonfilename) - if match: - rank = match.group('rank') - - key_pair = subdir_name + ":" + rank - # for all - if key_pair not in out_dict_all: - out_dict_all[key_pair]=values - - cnt = cnt + 1 - - - print(cnt) - print(values_all) - - if category =='full-MSA': - values_all_resh = values_all.reshape(nMSA + 5, 5) - elif category == 'additional-MSA' and model_type == 'alphafold2_multimer_v3': - values_all_resh = values_all.reshape(((nENS + 20)), 5) - elif category == 'additional-MSA' and model_type != 'alphafold2_multimer_v3': - values_all_resh = values_all.reshape(((nENS + 20)), 5) - elif category == 'random-MSA' and model_type != 'alphafold2_multimer_v3': - values_all_resh = values_all.reshape(((nMSA + 5) * 7), 5) - elif category == 'random-MSA' and model_type == 'alphafold2_multimer_v3': - values_all_resh = values_all.reshape(((nMSA + 5) * 7), 5) - print(" ") - print("Calculated pLDDT") - print(values_all_resh) - np.savetxt('plddt_' + category + '_' + pdb_name +'.csv', values_all_resh, fmt='%2.3f') diff --git a/code/cal_tmscore_fs_flmsa.py b/code/cal_tmscore_fs_flmsa.py deleted file mode 100644 index c082a50..0000000 --- a/code/cal_tmscore_fs_flmsa.py +++ /dev/null @@ -1,283 +0,0 
@@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Compare the predicted models with original PDBs -report TM-scores for ranked 0 to 4 -input line is pdb1 pdb2 preds_of_pdb dirname - -This version requires tmtools 0.0.2 (Python bindings around the TM-align code for structural alignment of proteins) -check this for local installation -https://pypi.org/project/tmtools/ - -Usage: - -python3.8 compare_strs_fs.py 2k42_A 1cee_B 1cee_B 0_msas_models/ - -Created on Wed Feb 21 14:51:00 2024 -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path -import Bio.PDB -from Bio.PDB import PDBParser, Structure - - - - -pdbParser = PDBParser(QUIET=True) - -# convert three letter code to one letter code -aa3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', - 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', - 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', - 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'} - - -class TM_score_fs(): - def get_coords(self, pdbfile, fs_range): - """ - parameters: - pdbfile - path to pdbfile - fs_range - range of residues at the fold-switching region, given as string - "112-162" - returns: - numpy array of coords - string of seqs in 1-letter-code - """ - - seq = "" - struct = pdbParser.get_structure('x',str(pdbfile)) - coords = [] - seq_dict = {} - - # for residues within a certain range, using numpy to save the coords - # and save the sequence as a dict and then sorted list of tuples - # return the coords and the seq - - # convert str to residue range for the fs region - (start,stop) = 
fs_range.split("-") - res_range = range(int(start),int(stop)+1) - - for atom in struct.get_atoms(): - residue = atom.get_parent() # from atom we can get the parent residue - res_id = residue.get_id()[1] - resname = residue.get_resname() - if res_id in res_range and atom.get_name()=="CA": - x,y,z = atom.get_coord() - coords.append([x,y,z]) - if res_id not in seq_dict: - seq_dict[res_id]=aa3to1[resname] - - - #print(coords) - # convert to np array - coords_np = np.array(coords) - # sort the seq_dict by keys a.k.a res_ids - sorted_data = sorted(seq_dict.items()) - for i in sorted_data: - seq+=i[1] - - return coords_np,seq - - - - def get_tmscore(self, coords1, seq1, predfilepath, res_range): - """ - parameters: - coords1, seq1 - the numpy array of PDB coords and its seqs - predfilepath - path for predicted files - res_range - fs range in predicted models - - returns: - tmscore list - - """ - - tmscores = [] - tmscores_ord = []; tmscores_rev = [] - modelfiles = sorted(glob.glob(str(predfilepath) + "/*_unrelaxed*pdb")) - - if len(modelfiles)==0: - tmscores = [0.0,0.0,0.0,0.0,0.0] - return tmscores - - for model in modelfiles: - modelpath = Path(model) - coords2, seq2 = self.get_coords(modelpath,res_range) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,2) # wrt to model - tmscores_ord.append(tmscore) - - res = tm_align(coords2, coords1, seq2, seq1) - tmscore = round(res.tm_norm_chain1,2) - tmscores_rev.append(tmscore) - - if np.max(tmscores_ord) > tmscores_rev: - tmscores = tmscores_ord - else: - tmscores = tmscores_rev - - - - return tmscores - - - - #def run_for_models(self, FH, pdbfile1, pdbfile2, data_dir,pred_range,res_range1,res_range2): - def run_for_models(self, pdbfile1, pdbfile2, data_dir,pred_range,res_range1,res_range2): - """ - compare the original PDB - with the predicted models, 0 to 5 - - parameters: - FH - filehandle for writing - pdbfile1 - path to original PDB, Fold1 - pdbfile2 - path to alternate PDB, Fold2 - 
data_dir - path for the predicted strs - res_range1 - fs range in PDB1 and its models - res_range2 - fs range in PDB2 and its models - - returns: - nothing - - saves the TM-scores in a local file - """ - #print(res_range1,res_range2) - - # get list of subdirectories - all_sub_dir_paths = glob.glob(str(data_dir)) # returns list of sub directory paths - tmscores_fs = [] - - print(all_sub_dir_paths) - # files found then continue - if len(all_sub_dir_paths) == 0: - pass - - for subdir in all_sub_dir_paths: - preddir = Path(subdir) - - # predicted dir doesn't exist then continue - if not preddir.exists(): - pass - - # only comparing on one set of predicted models - # but with both PDBs/Folds - coords1,seq1 = self.get_coords(pdbfile1,res_range1) - print(preddir, pred_range) - tmscore_lst1 = self.get_tmscore(coords1,seq1,preddir,pred_range) # wrt pdb1 - tmp_tm_fs = tmscore_lst1 - print(tmp_tm_fs) - tmscores_fs.append(tmp_tm_fs) - #print(tmscore_lst1) - - - - for subdir in all_sub_dir_paths: - preddir = Path(subdir) - - # predicted dir doesn't exist then continue - if not preddir.exists(): - pass - - # only comparing on one set of predicted models - # but with both PDBs/Folds - coords2,seq2 = self.get_coords(pdbfile2,res_range2) - tmscore_lst2 = self.get_tmscore(coords2,seq2,preddir,pred_range) # wrt pdb2 - tmp_tm_fs = tmscore_lst2 - print(tmp_tm_fs) - tmscores_fs.append(tmp_tm_fs) - #print(tmscore_lst2) - - #print(" ") - tmscores_fs = np.array(tmscores_fs) - self.tmscores_fs = tmscores_fs - print(" ") - print(tmscores_fs) - - - - def __init__(self, pdb1, pdb1_name, pdb2, pdb2_name): - # get numpy arrays for coords at the fold-switching region - # also return the seq in 1-letter code for the same - - # input arguments: sys.argv[1] - pdb1, sys.argv[2] - pdb2 - # sys.argv[3] - preds, sys.argv[4] - current directory - - current_dir = os.getcwd() + '/' - pred_dir = pdb1_name + '_predicted_models_full_*' - pred_path = current_dir + pred_dir + '/' - data_dir = Path(pred_path) # 
Path to the predicted models - print(data_dir) - - # the range of the fold-switching region - range_file = current_dir + 'range_fs_pairs_all.txt' - - # convert this file into a dictionary for reference later - fs_res = {} - - # The range_file file has the fold-switching residue ranges - # for the original PDB/PDB1, alternate PDB/PDB2 - # Predicted model for PDB1, predicted model for PDB2 - with open(range_file,'r') as Infile: - next(Infile) # skip header line "# pdb1,pdb2,pred1,pred2" - for line in Infile: - line=line.strip() - (n1,n2,p1,p2,m1,m2)=line.split(",") - # the value of the dictionary is a tuple - # the first element of tuple is the fs range in the original PDB - # followed by the range in the predicted model - if n1 not in fs_res: - fs_res[n1]=(p1,m1) - if n2 not in fs_res: - fs_res[n2]=(p2,m2) - - - - print("Running for pair ",pdb1_name, pdb2_name, end="..") - print(" ") - print("comparing predictions of ", pdb1_name, end="...") - print(" ") - - - try: - range_pdb1 = fs_res[pdb1_name] # so if pdb1 is '1nqd_A', fs_res['1nqd_A']=('895-919', '1-33') - range_pdb2 = fs_res[pdb2_name] # and if pdb2 is '1nqj_B', fs_res['1nqj_B']=('894-919', '1-33') - except: - print("check PDBIDs ",pdb1_name, pdb2_name) - sys.exit(1) - - - range_pred = range_pdb1[1] - self.run_for_models(pdb1, pdb2, data_dir, range_pred, range_pdb1[0], range_pdb2[0]) - - -#if __name__ == "__main__": -# -# import warnings -# warnings.filterwarnings('ignore') -# -# parser = argparse.ArgumentParser() -# parser.add_argument("--pdb1", type=str, help='PDB structure for the target crystal structure (target to be predicted)') -# parser.add_argument("--pdb2", type=str, help='PDB structure for the alternative crystal structure') -# -# args = parser.parse_args() -# -# pdb1 = args.pdb1; pdb2 = args.pdb2 -# pdb1_name = pdb1.replace('.pdb',''); pdb2_name = pdb2.replace('.pdb','') -# -# TM_score_fs(pdb1, pdb1_name, pdb2, pdb2_name) -# diff --git a/code/cal_tmscore_fs_multimer.py 
b/code/cal_tmscore_fs_multimer.py deleted file mode 100644 index 4d36261..0000000 --- a/code/cal_tmscore_fs_multimer.py +++ /dev/null @@ -1,264 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Compare the predicted models with original PDBs -report TM-scores for ranked 0 to 4 -input line is pdb1 pdb2 preds_of_pdb dirname - -This version requires tmtools 0.0.2 (Python bindings around the TM-align code for structural alignment of proteins) -check this for local installation -https://pypi.org/project/tmtools/ - -Usage: - -python3.8 compare_strs_fs.py 2k42_A 1cee_B 1cee_B 0_msas_models/ - -Created on Wed Feb 21 14:51:00 2024 -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path -import Bio.PDB -from Bio.PDB import PDBParser, Structure - - - - -pdbParser = PDBParser(QUIET=True) - -# convert three letter code to one letter code -aa3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', - 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', - 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', - 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'} - - -class TM_score_fs_multi(): - def get_coords(self, pdbfile, fs_range): - """ - parameters: - pdbfile - path to pdbfile - fs_range - range of residues at the fold-switching region, given as string - "112-162" - returns: - numpy array of coords - string of seqs in 1-letter-code - """ - - seq = "" - struct = pdbParser.get_structure('x',str(pdbfile)) - coords = [] - seq_dict = {} - - # for residues within a certain range, using numpy to save the coords - # and save the 
sequence as a dict and then sorted list of tuples - # return the coords and the seq - - # convert str to residue range for the fs region - (start,stop) = fs_range.split("-") - res_range = range(int(start),int(stop)+1) - - for atom in struct.get_atoms(): - residue = atom.get_parent() # from atom we can get the parent residue - res_id = residue.get_id()[1] - resname = residue.get_resname() - if res_id in res_range and atom.get_name()=="CA": - x,y,z = atom.get_coord() - coords.append([x,y,z]) - if res_id not in seq_dict: - seq_dict[res_id]=aa3to1[resname] - - - #print(coords) - # convert to np array - coords_np = np.array(coords) - # sort the seq_dict by keys a.k.a res_ids - sorted_data = sorted(seq_dict.items()) - for i in sorted_data: - seq+=i[1] - - return coords_np,seq - - - - def get_tmscore(self, coords1, seq1, predfilepath, res_range): - """ - parameters: - coords1, seq1 - the numpy array of PDB coords and its seqs - predfilepath - path for predicted files - res_range - fs range in predicted models - - returns: - tmscore list - - """ - - tmscores = [] - #modelfiles = sorted(glob.glob(str(predfilepath) + "/*_unrelaxed*pdb")) - modelfiles = (glob.glob(str(predfilepath) + "/single*_unrelaxed*pdb")) - - if len(modelfiles)==0: - tmscores = [0.0,0.0,0.0,0.0,0.0] - return tmscores - - for model in modelfiles: - modelpath = Path(model) - coords2, seq2 = self.get_coords(modelpath,res_range) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,2) # wrt to model - tmscores.append(tmscore) - - return tmscores - - - - #def run_for_models(self, FH, pdbfile1, pdbfile2, data_dir,pred_range,res_range1,res_range2): - def run_for_models(self, pdbfile1, pdbfile2, data_dir,pred_range,res_range1,res_range2): - """ - compare the original PDB - with the predicted models, 0 to 5 - - parameters: - FH - filehandle for writing - pdbfile1 - path to original PDB, Fold1 - pdbfile2 - path to alternate PDB, Fold2 - data_dir - path for the predicted strs - 
res_range1 - fs range in PDB1 and its models - res_range2 - fs range in PDB2 and its models - - returns: - nothing - - saves the TM-scores in a local file - """ - #print(res_range1,res_range2) - - # get list of subdirectories - all_sub_dir_paths = glob.glob(str(data_dir)) - tmscores_fs = [] - - - ## files found then continue - if len(all_sub_dir_paths) == 0: - pass - - for subdir in all_sub_dir_paths: - preddir = Path(subdir) - # predicted dir doesn't exist then continue - if not preddir.exists(): - pass - - # only comparing on one set of predicted models - # but with both PDBs/Folds - coords1,seq1 = self.get_coords(pdbfile1,res_range1) - tmscore_lst1 = self.get_tmscore(coords1,seq1,preddir,pred_range) # wrt pdb1 - tmp_tm_fs = tmscore_lst1 - tmscores_fs.append(tmp_tm_fs) - - - for subdir in all_sub_dir_paths: - preddir = Path(subdir) - - # predicted dir doesn't exist then continue - if not preddir.exists(): - pass - - # only comparing on one set of predicted models - # but with both PDBs/Folds - coords2,seq2 = self.get_coords(pdbfile2,res_range2) - tmscore_lst2 = self.get_tmscore(coords2,seq2,preddir,pred_range) # wrt pdb2 - tmp_tm_fs = tmscore_lst2 - tmscores_fs.append(tmp_tm_fs) - - print(" ") - tmscores_fs = np.array(tmscores_fs) - print("tmscores_fs") - self.tmscores_fs = tmscores_fs - - - - def __init__(self, pred_path, pdb1, pdb1_name, pdb2, pdb2_name): - # get numpy arrays for coords at the fold-switching region - # also return the seq in 1-letter code for the same - - # input arguments: sys.argv[1] - pdb1, sys.argv[2] - pdb2 - # sys.argv[3] - preds, sys.argv[4] - current directory - - current_dir = os.getcwd() + '/' - #pred_dir = 'additional_sampling/' + pdb1_name - #pred_path = current_dir + pred_dir + '/' - data_dir = Path(pred_path) # Path to the predicted models - - - # the range of the fold-switching region - range_file = current_dir + 'range_fs_pairs_all.txt' - - # convert this file into a dictionary for reference later - fs_res = {} - - # The 
range_file file has the fold-switching residue ranges - # for the original PDB/PDB1, alternate PDB/PDB2 - # Predicted model for PDB1, predicted model for PDB2 - with open(range_file,'r') as Infile: - next(Infile) # skip header line "# pdb1,pdb2,pred1,pred2" - for line in Infile: - line=line.strip() - (n1,n2,p1,p2,m1,m2)=line.split(",") - # the value of the dictionary is a tuple - # the first element of tuple is the fs range in the original PDB - # followed by the range in the predicted model - if n1 not in fs_res: - fs_res[n1]=(p1,m1) - if n2 not in fs_res: - fs_res[n2]=(p2,m2) - - - - print("Running for pair ",pdb1_name, pdb2_name, end="..") - print(" ") - print("comparing predictions of ", pdb1_name, end="...") - print(" ") - - - try: - range_pdb1 = fs_res[pdb1_name] # so if pdb1 is '1nqd_A', fs_res['1nqd_A']=('895-919', '1-33') - range_pdb2 = fs_res[pdb2_name] # and if pdb2 is '1nqj_B', fs_res['1nqj_B']=('894-919', '1-33') - except: - print("check PDBIDs ",pdb1_name, pdb2_name) - sys.exit(1) - - - range_pred = range_pdb1[1] - self.run_for_models(pdb1, pdb2, data_dir, range_pred, range_pdb1[0], range_pdb2[0]) - - -#if __name__ == "__main__": -# -# import warnings -# warnings.filterwarnings('ignore') -# -# parser = argparse.ArgumentParser() -# parser.add_argument("--pdb1", type=str, help='PDB structure for the target crystal structure (target to be predicted)') -# parser.add_argument("--pdb2", type=str, help='PDB structure for the alternative crystal structure') -# -# args = parser.parse_args() -# -# pdb1 = args.pdb1; pdb2 = args.pdb2 -# pdb1_name = pdb1.replace('.pdb',''); pdb2_name = pdb2.replace('.pdb','') -# -# TM_score_fs(pdb1, pdb1_name, pdb2, pdb2_name) -# diff --git a/code/cal_tmscore_fs_only.py b/code/cal_tmscore_fs_only.py deleted file mode 100644 index 624df98..0000000 --- a/code/cal_tmscore_fs_only.py +++ /dev/null @@ -1,276 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Compare the predicted models with original PDBs -report TM-scores for 
ranked 0 to 4 -input line is pdb1 pdb2 preds_of_pdb dirname - -This version requires tmtools 0.0.2 (Python bindings around the TM-align code for structural alignment of proteins) -check this for local installation -https://pypi.org/project/tmtools/ - -Usage: - -python3.8 compare_strs_fs.py 2k42_A 1cee_B 1cee_B 0_msas_models/ - -Created on Wed Feb 21 14:51:00 2024 -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path -import Bio.PDB -from Bio.PDB import PDBParser, Structure - - - - -pdbParser = PDBParser(QUIET=True) - -# convert three letter code to one letter code -aa3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', - 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', - 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', - 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'} - - -class TM_score_fs(): - def get_coords(self, pdbfile, fs_range): - """ - parameters: - pdbfile - path to pdbfile - fs_range - range of residues at the fold-switching region, given as string - "112-162" - returns: - numpy array of coords - string of seqs in 1-letter-code - """ - - seq = "" - struct = pdbParser.get_structure('x',str(pdbfile)) - coords = [] - seq_dict = {} - - # for residues within a certain range, using numpy to save the coords - # and save the sequence as a dict and then sorted list of tuples - # return the coords and the seq - - # convert str to residue range for the fs region - (start,stop) = fs_range.split("-") - res_range = range(int(start),int(stop)+1) - - for atom in struct.get_atoms(): - residue = 
atom.get_parent() # from atom we can get the parent residue - res_id = residue.get_id()[1] - resname = residue.get_resname() - if res_id in res_range and atom.get_name()=="CA": - x,y,z = atom.get_coord() - coords.append([x,y,z]) - if res_id not in seq_dict: - seq_dict[res_id]=aa3to1[resname] - - - #print(coords) - # convert to np array - coords_np = np.array(coords) - # sort the seq_dict by keys a.k.a res_ids - sorted_data = sorted(seq_dict.items()) - for i in sorted_data: - seq+=i[1] - - return coords_np,seq - - - - def get_tmscore(self, coords1, seq1, predfilepath, res_range): - """ - parameters: - coords1, seq1 - the numpy array of PDB coords and its seqs - predfilepath - path for predicted files - res_range - fs range in predicted models - - returns: - tmscore list - - """ - - tmscores = [] - tmscores_ord = []; tmscores_rev = [] - #modelfiles = sorted(glob.glob(str(predfilepath) + "/*_unrelaxed*pdb")) - modelfiles = (glob.glob(str(predfilepath) + "/*_unrelaxed*pdb")) - - if len(modelfiles)==0: - tmscores = [0.0,0.0,0.0,0.0,0.0] - return tmscores - - for model in modelfiles: - modelpath = Path(model) - coords2, seq2 = self.get_coords(modelpath,res_range) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,2) # wrt to model - tmscores_ord.append(tmscore) - - res = tm_align(coords2, coords1, seq2, seq1) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_rev.append(tmscore) - - if np.max(tmscores_ord) > np.max(tmscores_rev): - tmscores = tmscores_ord - else: - tmscores = tmscores_rev - - - return tmscores - - - - #def run_for_models(self, FH, pdbfile1, pdbfile2, data_dir,pred_range,res_range1,res_range2): - def run_for_models(self, pdbfile1, pdbfile2, data_dir,pred_range,res_range1,res_range2): - """ - compare the original PDB - with the predicted models, 0 to 5 - - parameters: - FH - filehandle for writing - pdbfile1 - path to original PDB, Fold1 - pdbfile2 - path to alternate PDB, Fold2 - data_dir - path for the 
predicted strs - res_range1 - fs range in PDB1 and its models - res_range2 - fs range in PDB2 and its models - - returns: - nothing - - saves the TM-scores in a local file - """ - #print(res_range1,res_range2) - - # get list of subdirectories - all_sub_dir_paths = glob.glob(str(data_dir)) # returns list of sub directory paths - tmscores_fs = [] - - - # files found then continue - if len(all_sub_dir_paths) == 0: - pass - - for subdir in all_sub_dir_paths: - preddir = Path(subdir) - # predicted dir doesn't exist then continue - if not preddir.exists(): - pass - - # only comparing on one set of predicted models - # but with both PDBs/Folds - coords1,seq1 = self.get_coords(pdbfile1,res_range1) - tmscore_lst1 = self.get_tmscore(coords1,seq1,preddir,pred_range) # wrt pdb1 - tmp_tm_fs = tmscore_lst1 - tmscores_fs.append(tmp_tm_fs) - - - for subdir in all_sub_dir_paths: - preddir = Path(subdir) - - # predicted dir doesn't exist then continue - if not preddir.exists(): - pass - - # only comparing on one set of predicted models - # but with both PDBs/Folds - coords2,seq2 = self.get_coords(pdbfile2,res_range2) - tmscore_lst2 = self.get_tmscore(coords2,seq2,preddir,pred_range) # wrt pdb2 - tmp_tm_fs = tmscore_lst2 - tmscores_fs.append(tmp_tm_fs) - - print(" ") - tmscores_fs = np.array(tmscores_fs) - print("tmscores_fs") - self.tmscores_fs = tmscores_fs - - - - def __init__(self, pred_path, pdb1, pdb1_name, pdb2, pdb2_name): - # get numpy arrays for coords at the fold-switching region - # also return the seq in 1-letter code for the same - - # input arguments: sys.argv[1] - pdb1, sys.argv[2] - pdb2 - # sys.argv[3] - preds, sys.argv[4] - current directory - - current_dir = os.getcwd() + '/' - #pred_dir = 'additional_sampling/' + pdb1_name - #pred_path = current_dir + pred_dir + '/' - #print(pred_path) - data_dir = Path(pred_path) # Path to the predicted models - - - # the range of the fold-switching region - range_file = current_dir + 'range_fs_pairs_all.txt' - - # convert this 
file into a dictionary for reference later - fs_res = {} - - # The range_file file has the fold-switching residue ranges - # for the original PDB/PDB1, alternate PDB/PDB2 - # Predicted model for PDB1, predicted model for PDB2 - with open(range_file,'r') as Infile: - next(Infile) # skip header line "# pdb1,pdb2,pred1,pred2" - for line in Infile: - line=line.strip() - (n1,n2,p1,p2,m1,m2)=line.split(",") - # the value of the dictionary is a tuple - # the first element of tuple is the fs range in the original PDB - # followed by the range in the predicted model - if n1 not in fs_res: - fs_res[n1]=(p1,m1) - if n2 not in fs_res: - fs_res[n2]=(p2,m2) - - - - print("Running for pair ",pdb1_name, pdb2_name, end="..") - print(" ") - print("comparing predictions of ", pdb1_name, end="...") - print(" ") - - - try: - range_pdb1 = fs_res[pdb1_name] # so if pdb1 is '1nqd_A', fs_res['1nqd_A']=('895-919', '1-33') - range_pdb2 = fs_res[pdb2_name] # and if pdb2 is '1nqj_B', fs_res['1nqj_B']=('894-919', '1-33') - except: - print("check PDBIDs ",pdb1_name, pdb2_name) - sys.exit(1) - - - range_pred = range_pdb1[1] - self.run_for_models(pdb1, pdb2, data_dir, range_pred, range_pdb1[0], range_pdb2[0]) - - -#if __name__ == "__main__": -# -# import warnings -# warnings.filterwarnings('ignore') -# -# parser = argparse.ArgumentParser() -# parser.add_argument("--pdb1", type=str, help='PDB structure for the target crystal structure (target to be predicted)') -# parser.add_argument("--pdb2", type=str, help='PDB structure for the alternative crystal structure') -# -# args = parser.parse_args() -# -# pdb1 = args.pdb1; pdb2 = args.pdb2 -# pdb1_name = pdb1.replace('.pdb',''); pdb2_name = pdb2.replace('.pdb','') -# -# TM_score_fs(pdb1, pdb1_name, pdb2, pdb2_name) -# diff --git a/code/convert_multi_single.py b/code/convert_multi_single.py deleted file mode 100644 index a7d5b82..0000000 --- a/code/convert_multi_single.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" - 
-Converting the multimer PDB to a single PDB file - -Created on Tue Dec 24 14:51:00 2025 -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import glob -import random -import argparse - - - - - -class convert_m2s(): - def __init__(self, pred_path, pdb1_name, pdb2_name): - current_dir = os.getcwd() + '/' - data_dir = Path(pred_path) # Path to the predicted models - - files_list = (glob.glob(str(pred_path) + "/*_unrelaxed*pdb")) - print(files_list) - for fl in files_list: - fl_name = fl.replace('.pdb','') - predicted_name = fl_name.split('/')[1] - #convert = "awk '!/TER/' " + fl + " > " + fl_name + "_converted.pdb" - convert = "awk '!/TER/' " + fl + " > " + fl_name.split('/')[0] + '/' + "rmTER_" + predicted_name + ".pdb" - print(convert) - os.system(convert) - - - convert_pdb2 = "awk '!/TER/' " + pdb2_name + ".pdb > " + pdb2_name + "_rmTER.pdb" - print(convert_pdb2); os.system(convert_pdb2) - - - ##### extract a single chain from multimer - TER_count = 0 - - for fl in files_list: - fl_name = fl.replace('.pdb','') - predicted_name = fl_name.split('/')[1] - - with open(fl, 'r') as file: - for line in file: - TER = line.split() - TER_count += TER.count("TER") - - - line_cnt = 0 - for i in range(0, 2): - output_file_name = fl_name.split('/')[0] + '/' + "single_" + predicted_name + ".pdb" - - if line_cnt == 0: - with open(fl, 'r') as infile, open(output_file_name, 'w') as outfile: - for line in infile: - outfile.write(line) - line_cnt = line_cnt + 1 - if "TER " in line: - line_cnt = line_cnt + 1 - break - - #line_cnt = 0 - ##for i in range(0, TER_count): - #for i in range(0, 2): - # output_file_name = pdb2_name[0:4] + '_multi.pdb' - - # if line_cnt == 0: - # with open(pdb2, 'r') as infile, open(output_file_name, 'w') as outfile: - # for line in infile: - # outfile.write(line) - # line_cnt = line_cnt + 1 - 
# if "TER " in line: - # line_cnt = line_cnt + 1 - # break - - #pdb2_name_multi = output_file_name.replace('.pdb','') - - diff --git a/code/fs_seq_compare.py b/code/fs_seq_compare.py deleted file mode 100644 index f3730c6..0000000 --- a/code/fs_seq_compare.py +++ /dev/null @@ -1,278 +0,0 @@ -import sys -import os -import re -from os import listdir -from os.path import isfile, join -import pandas as pd -import numpy as np -import Bio.PDB -import matplotlib.pyplot as plt -import glob -import random -import argparse -from Bio import SeqIO -from Bio.PDB.PDBParser import PDBParser -from Bio import * -from Bio.SeqRecord import SeqRecord - -from thefuzz import fuzz -from thefuzz import process - - - -class fs_range(): - def first_res_check(self, pdb1, pdb2): - #self.pdb1 = pdb1; self.pdb2 = pdb2 - - ## first residue index check - structure_1 = PDBParser().get_structure('pdb1', pdb1) - model_1 = structure_1[0] - print(model_1) - - structure_2 = PDBParser().get_structure('pdb2', pdb2) - model_2 = structure_2[0] - print(model_2) - - res_index_1 = [] - res_index_2 = [] - - for chain_1 in model_1: - for i, residue in enumerate(chain_1.get_residues()): - #res_id = list(residue.id) - res_index_1.append(residue.id[1]) - #print(residue.id[1]) - - for chain_2 in model_2: - for i, residue in enumerate(chain_2.get_residues()): - res_index_2.append(residue.id[1]) - - #print(int(res_index_1[0])) - #print(int(res_index_2[0])) - - self.pdb1_res_index_1 = int(res_index_1[0]) - self.pdb2_res_index_1 = int(res_index_2[0]) - - - - - def pydssp(self, crys_pdb, pred_pdb, number, pdb_name): - - ##### generating the command for pydssp - number = str(number) - command = 'pydssp ' + crys_pdb + ' ' + pred_pdb + ' -o output_' + pdb_name + '_' + number + '.log' - print(command) - os.system(command) - - - - - def res_check(self, pdb1, pdb2, pdb1_name, pdb2_name): - current_dir = os.getcwd() + '/' - range_file = current_dir + 'range_fs_pairs_all.txt' - - crys_fs_res_1 = {}; crys_fs_res_2 = {} - 
pred_fs_res_1 = {}; pred_fs_res_2 = {} - - with open(range_file,'r') as Infile: - next(Infile) # skip header line "# pdb1,pdb2,pred1,pred2" - for line in Infile: - line=line.strip() - (n1,n2,p1,p2,m1,m2)=line.split(",") - # the value of the dictionary is a tuple - # the first element of tuple is the fs range in the original PDB - # followed by the range in the predicted model - #if n1 == pdb1_name and n2 == pdb2_name: - if (n1 == pdb1_name and n2 == pdb2_name) or (n2 == pdb1_name and n1 == pdb2_name): - #fs_res_1 = (m1); fs_res_2 = (m2) - crys_fs_res_1 = (p1); crys_fs_res_2 = (p2); - pred_fs_res_1 = (m1); pred_fs_res_2 = (m2); - - #fs_res_1_update = fs_res_1.split("-"); fs_res_2_update = fs_res_2.split("-"); - #print(fs_res_1_update, fs_res_2_update) - - - crys_fs_res_1_update = crys_fs_res_1.split("-"); crys_fs_res_2_update = crys_fs_res_2.split("-"); - print(crys_fs_res_1_update, crys_fs_res_2_update) - pred_fs_res_1_update = pred_fs_res_1.split("-"); pred_fs_res_2_update = pred_fs_res_2.split("-"); - print(pred_fs_res_1_update, pred_fs_res_2_update) - - ##### convert list data to int - self.crys_fs_res_1_update = [int(i) for i in crys_fs_res_1_update] - self.crys_fs_res_2_update = [int(i) for i in crys_fs_res_2_update] - - self.pred_fs_res_1_update = [int(i) for i in pred_fs_res_1_update] - self.pred_fs_res_2_update = [int(i) for i in pred_fs_res_2_update] - - - - - - - - def __init__(self, pdb1, pdb2, pdb1_name, pdb2_name, pred_dir): - ##### check first residue index of query proteins - #fs_check = fs_range(pdb1, pdb2) - self.first_res_check(pdb1, pdb2) - print(" "); print("checking first residue index") - print(self.pdb1_res_index_1) - print(self.pdb2_res_index_1) - - - - pred_folder = pred_dir - #pred_folder = '3hdf_A_predicted_models_full_rand_12' - pred_path = pred_folder - print(pred_path) - - - pred_files = (glob.glob(str(pred_path) + "/*_unrelaxed*pdb")) - - - ##### read range file information - self.res_check(pdb1, pdb2, pdb1_name, pdb2_name) - 
print(self.crys_fs_res_1_update, self.pred_fs_res_1_update) - print(self.crys_fs_res_2_update, self.pred_fs_res_2_update) - - crys1_fs_res_st = self.crys_fs_res_1_update[0]; crys1_fs_res_ed = self.crys_fs_res_1_update[1] - crys2_fs_res_st = self.crys_fs_res_2_update[0]; crys2_fs_res_ed = self.crys_fs_res_2_update[1] - pred1_fs_res_st = self.pred_fs_res_1_update[0]; pred1_fs_res_ed = self.pred_fs_res_1_update[1] - pred2_fs_res_st = self.pred_fs_res_2_update[0]; pred2_fs_res_ed = self.pred_fs_res_2_update[1] - - - - if int(self.pdb1_res_index_1) > 1: - print("Initial residue is not starting from 1") - self.crys_fs_res_1_update[0] = self.crys_fs_res_1_update[0] - int(self.pdb1_res_index_1) - self.crys_fs_res_1_update[1] = self.crys_fs_res_1_update[1] - int(self.pdb1_res_index_1) - crys1_fs_res_st = self.crys_fs_res_1_update[0]; - crys1_fs_res_ed = self.crys_fs_res_1_update[1] - - if int(self.pdb2_res_index_1) > 1: - print("Initial residue is not starting from 1") - self.crys_fs_res_2_update[0] = self.crys_fs_res_2_update[0] - int(self.pdb2_res_index_1) - self.crys_fs_res_2_update[1] = self.crys_fs_res_2_update[1] - int(self.pdb2_res_index_1) - crys2_fs_res_st = self.crys_fs_res_2_update[0] - crys2_fs_res_ed = self.crys_fs_res_2_update[1] - - print("checking starting and ending residue number") - print(""); print("crystal structure") - print(crys1_fs_res_st, crys1_fs_res_ed) - print(crys2_fs_res_st, crys2_fs_res_ed) - print(""); print("predicted structure") - print(pred1_fs_res_st, pred1_fs_res_ed) - print(pred2_fs_res_st, pred2_fs_res_ed) - - - pred_dir_add = 'additional_sampling/' + pdb1_name + '/' - pred_dir_suc = 'successed_prediction/' + pdb1_name + '/*/' - pred_dir_fal = ' failed_prediction/' - - - ##### perform pydssp and calculate secondary structure similarity - index = 0 - print(np.size(pred_files)) - print(" "); print("calculating with pdb1 ", pdb1_name) - for model in pred_files: - print(model) - self.pydssp(pdb1, model, index, pdb1_name) - dssp_read_tmp = 
pd.read_csv('output_' + pdb1_name + '_' + str(index) + '.log', sep=' ', header = None) - ## seq1 = crystal structure, seq2 = predicted structure - print(dssp_read_tmp) - print(dssp_read_tmp[0].iloc[0]); seq1 = dssp_read_tmp[0].iloc[0] - print(dssp_read_tmp[0].iloc[1]); seq2 = dssp_read_tmp[0].iloc[1] - - # crystal protein 1 and predictions - print(" ") - print(seq1[crys1_fs_res_st:crys1_fs_res_ed]) - print(seq2[pred2_fs_res_st:pred2_fs_res_ed]) - if fuzz.ratio(seq1[crys1_fs_res_st:crys1_fs_res_ed], seq2[pred2_fs_res_st:pred2_fs_res_ed]) > 85: - print("fs region is correctly predicted") - f = open("fs_compare_output_" + pdb1_name + ".log", "w") - f.write("success") - f.close() - break - elif index == (int(np.size(pred_files)) - 1): - print("fs region is not correctly predicted") - - #command = 'mv ' + pred_dir_add + pred_dir_fal - #print(command); os.system(command) - #command = 'mv ' + pred_dir_suc + pred_dir_fal + pdb1_name + '/' - #print(command); os.system(command) - - #command = 'rm *' + pdb1_name + '*csv' - #print(command); os.system(command) - print("calculating TM-score of fs with alternative pdb") - - index = 0 - print(" "); print("calculating with pdb2 ", pdb2_name) - - for model in pred_files: - self.pydssp(pdb2, model, index, pdb1_name) - dssp_read_tmp = pd.read_csv('output_' + pdb1_name + '_' + str(index) + '.log', sep=' ', header = None) - ## seq1 = crystal structure, seq2 = predicted structure - print(dssp_read_tmp[0].iloc[0]); seq1 = dssp_read_tmp[0].iloc[0] - print(dssp_read_tmp[0].iloc[1]); seq2 = dssp_read_tmp[0].iloc[1] - - - # crystal protein 1 and predictions - print(" ") - print(seq1[crys2_fs_res_st:crys2_fs_res_ed]) - print(seq2[pred2_fs_res_st:pred2_fs_res_ed]) - if fuzz.ratio(seq1[crys2_fs_res_st:crys2_fs_res_ed], seq2[pred2_fs_res_st:pred2_fs_res_ed]) > 85: - print("fs region is correctly predicted") - break - elif index == (int(np.size(pred_files)) - 1): - print("fs region is not correctly predicted") - - f = open("fs_compare_output_" + 
pdb1_name + ".log", "w") - f.write("fail") - f.close() - - #command = 'mv ' + pred_dir_add + pred_dir_fal - #print(command); os.system(command) - #command = 'mv ' + pred_dir_suc + pred_dir_fal + pdb1_name + '/' - #print(command); os.system(command) - - else: - index += 1 - - - else: - index += 1 - - # index += 1 - - - #index = 0 - #print(" "); print("calculating with pdb2 ", pdb2_name) - #for model in pred_files: - # self.pydssp(pdb2, model, index) - # dssp_read_tmp = pd.read_csv('output_' + str(index) + '.log', sep=' ', header = None) - # ## seq1 = crystal structure, seq2 = predicted structure - # print(dssp_read_tmp[0].iloc[0]); seq1 = dssp_read_tmp[0].iloc[0] - # print(dssp_read_tmp[0].iloc[1]); seq2 = dssp_read_tmp[0].iloc[1] - - # # crystal protein 1 and predictions - # print(" ") - # print(seq1[crys2_fs_res_st:crys2_fs_res_ed]) - # print(seq2[pred2_fs_res_st:pred2_fs_res_ed]) - # if fuzz.ratio(seq1[crys2_fs_res_st:crys2_fs_res_ed], seq2[pred2_fs_res_st:pred2_fs_res_ed]) > 85: - # print("fs region is correctly predicted") - # break - # elif index == (int(np.size(pred_files)) - 1): - # print("fs region is not correctly predicted") - - # command = 'mv ' + pred_dir_add + pred_dir_fal - # print(command); os.system(command) - # command = 'mv ' + pred_dir_suc + pred_dir_fal + pdb1_name + '/' - # print(command); os.system(command) - - # #command = 'rm *' + pdb1_name + '*csv' - # #print(command); os.system(command) - - - - # else: - # index += 1 - - diff --git a/code/main.py b/code/main.py deleted file mode 100644 index 73213a1..0000000 --- a/code/main.py +++ /dev/null @@ -1,327 +0,0 @@ -from pathlib import Path -import glob -import argparse - - -from prediction_all_var import * -from search_w_foldseek_cluster import * -from TMscore_all_var import * -from TMscore_all_var_FS import * - -#from pred_cal_tmscore_AC import * -from PLOT_AC import * -from PLOT_FS import * -from cal_plddt_ACFS import * - - - - -if __name__ == "__main__": - - import warnings - 
warnings.filterwarnings('ignore') - - ###### initiallization and input - parser = argparse.ArgumentParser() - parser.add_argument("--pdb1", type=str, help='PDB structure for the target crystal structure (target to be predicted)') - parser.add_argument("--pdb2", type=str, help='PDB structure for the alternative crystal structure') - parser.add_argument("--fname", type=str, help='put MSA folder name after colabsearch' ) - parser.add_argument("--fmname", type=str, help='put multimer MSA folder name after colabsearch') - parser.add_argument("--pname", type=str, help='job name for predicting blind mode' ) - parser.add_argument("--nMSA", type=str, help='number of samples for predicting the structure with MSA') - parser.add_argument("--nENS", type=str, help='number of samples for predicting the structure for ensemble generation') - parser.add_argument("--option", type=str, help='select prediction mode inAC, AC and FS e.g. AC = alterantive conformation or FS = fold-switching or inAC = increased sampling for predicting alternative conformation') - parser.add_argument("--type", type=str, help='select model-type of Colabfold e.g. 
ptm, monomer, and, multimer') - args = parser.parse_args() - - - - - download_alphafold_params("alphafold2_ptm", Path(".")) - - - blind = 'predictions_all' - success = 'predictions_all' - fail = 'failed_prediction' - multi = 'multimer_prediction' - pwd = os.getcwd() + '/' - - - - ##### output folder name designation - if args.option == "blind": - if args.pdb1 is None and args.pdb2 is None: - pdb1_name = args.pname - print("work name:", pdb1_name) - elif args.pdb1 is None and args.pname is None: - pdb1_name = args.fname; pdb1_name = pdb1_name.replace('/','') - print("work name:", pdb1_name) - else: - pdb1_name = args.fname; pdb1_name = pdb1_name.replace('/','') - print("work name:", pdb1_name) - elif args.pdb1 is not None and args.pdb2 is not None: - pdb1 = args.pdb1; pdb2 = args.pdb2 - pdb1_name = pdb1.replace('.pdb',''); pdb2_name = pdb2.replace('.pdb','') - print(pdb1_name, pdb2_name) - - - ##### The number of ensembles - if args.nMSA is None and args.nENS is None: - nMSA = 0; nENS = 0; - elif args.nMSA is not None and args.nENS is not None: - nMSA = int(args.nMSA); nENS = int(args.nENS) - elif args.nMSA is None and args.nENS is not None: - nMSA = 0; nENS = int(args.nENS) - elif args.nMSA is not None and args.nENS is None: - nMSA = int(args.nMSA); nENS = 0 - else: - print("Please put correct option of nMSA or nENS") - exit() - - - ##### Checking input MSA folder - if args.fname is None and args.fmname is None: - print("Please put MSA folder and file for prediction") - sys.exit() - elif args.fname is None and args.fmname is not None: - print("Please put MSA folder and file for monomer prediction") - sys.exit() - elif args.fname is not None and args.fmname is None: - search_dir = args.fname; search_multi_dir = 0 - elif args.fname is not None and args.fmname is not None: - search_dir = args.fname; search_multi_dir = ' ' + args.fmname; - - - ##### Model-type identification - model_type = [] - if args.type is None or args.type == "ptm": - model_type = "alphafold2_ptm" 
- elif args.type == "monomer": - model_type = "alphafold2" - elif args.type == "multimer" and args.option == "blind": - model_type = "alphafold2_multimer_v3" - if not os.path.exists(multi): - os.mkdir(multi) - elif args.type == "multimer": - ### check how many chains in a multimer - TER_count = 0 - with open(pdb1, 'r') as file: - for line in file: - TER = line.split() - TER_count += TER.count("TER") - - print(TER_count, " of chains in this multimer file.") - model_type = "alphafold2_multimer_v3" - - if not os.path.exists(multi): - os.mkdir(multi) - else: - print("Please put correct model-type option") - exit() - - - - search_dir = args.fname - success = 'predictions_all/' + pdb1_name + "/" - - - - if args.option == "AC": - print("Predicting alternative conformations") - ###################################################################################################### - ###### running prediction using full- and shallow random-MSA - if not os.path.exists(success): - os.mkdir(success) - succ_dir_count = 0 - else: - succ_dir_count = 0 - for root_dir, cur_dir, files in os.walk(pwd + success + '/'): - succ_dir_count += len(cur_dir) - - if os.path.exists(success): - if succ_dir_count >= 8: - print("Prediction was already done") - else: - print("Folder is already created and cleaning existed subfolders") - rm_pre_folders = 'rm -rf ' + success + '/' - os.system(rm_pre_folders) - else: - pass - - - prediction_option = args.option - if os.path.exists(success) and succ_dir_count >= 8: - print("Predictions including full- and random-MSA were already done") - cal_TMscore = TMscore_cal_all_var(pdb1, pdb1_name, pdb2, pdb2_name, nMSA, prediction_option, model_type) - shallow_MSA_size = [] - shallow_MSA_size = np.append(shallow_MSA_size, cal_TMscore.size_selection) - np.savetxt('selected_MSA-size_' + pdb1_name + '.csv', shallow_MSA_size) - else: - prediction_all(pdb1_name, search_dir, search_multi_dir, nMSA, model_type) - shallow_MSA_size = [] - cal_TMscore = 
TMscore_cal_all_var(pdb1, pdb1_name, pdb2, pdb2_name, nMSA, prediction_option, model_type) - shallow_MSA_size = np.append(shallow_MSA_size, cal_TMscore.size_selection) - print(" ") - print("Specific size of shallow random MSA is similar to full-MSA") - print(shallow_MSA_size) - np.savetxt('selected_MSA-size_' + pdb1_name + '.csv', shallow_MSA_size) - - - ###################################################################################################### - ##### calculate plddt of initial predictions - if model_type == "alphafold2_multimer_v3": - list_org_samplings = glob.glob( str(pwd) + str(multi) + '/' + str(pdb1_name) + '/*full_rand*/') - list_ran_samplings = glob.glob( str(pwd) + str(multi) + '/' + str(pdb1_name) + '/*max*/') - - full = 'full-MSA'; random = 'random-MSA' ; - plddt_cal(list_org_samplings, full, pdb1_name, nMSA, nENS, model_type) - plddt_cal(list_ran_samplings, random, pdb1_name, nMSA, nENS, model_type) - - else: - list_org_samplings = glob.glob( str(pwd) + str(success) + '*full_rand*/') - list_ran_samplings = glob.glob( str(pwd) + str(success) + '*max*/') - - full = 'full-MSA'; random = 'random-MSA' ; - plddt_cal(list_org_samplings, full, pdb1_name, nMSA, nENS, model_type) - plddt_cal(list_ran_samplings, random, pdb1_name, nMSA, nENS, model_type) - - ###################################################################################################### - ##### plot the 2D-scatter plot of TM-scores with pLDDT - plot_2D_scatter_AC(full, random, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, nENS, model_type) - - - - - - - elif args.option == "FS": - if not os.path.exists(success): - os.mkdir(success) - succ_dir_count = 0 - else: - succ_dir_count = 0 - for root_dir, cur_dir, files in os.walk(pwd + success + '/'): - succ_dir_count += len(cur_dir) - - if os.path.exists(success + '/'): - if succ_dir_count >= 8: - print("Prediction was already done") - else: - print("Folder is already created and cleaning existed subfolders") - rm_pre_folders = 'rm -rf ' + 
success + '/' - os.system(rm_pre_folders) - else: - pass - - - print("Predicting fold-swithcing models") - prediction_option = args.option - ###################################################################################################### - ###### running prediction using full- and shallow random-MSA - if os.path.exists(success) and succ_dir_count >= 8: - print("Predictions including full- and random-MSA were already done") - cal_TMscore = TMscore_cal_all_var_FS(pdb1, pdb1_name, pdb2, pdb2_name, nMSA, prediction_option, model_type) - shallow_MSA_size = [] - shallow_MSA_size = np.append(shallow_MSA_size, cal_TMscore.size_selection) - np.savetxt('selected_MSA-size_' + pdb1_name + '.csv', shallow_MSA_size) - #elif os.path.exists(multi + '/' + pdb1_name) and succ_dir_count >= 8: - # print("Predictions including full- and random-MSA were already done") - else: - shallow_MSA_size = [] - if args.type != "multimer": - prediction_all(pdb1_name, search_dir, search_multi_dir, nMSA, model_type) - cal_TMscore = TMscore_cal_all_var_FS(pdb1, pdb1_name, pdb2, pdb2_name, nMSA, prediction_option, model_type) - shallow_MSA_size = np.append(shallow_MSA_size, cal_TMscore.size_selection) - else: - prediction_all(pdb1_name, search_dir, search_multi_dir, nMSA, model_type) - shallow_MSA_size = np.append(shallow_MSA_size, cal_TMscore.size_selection) - print(" ") - print("Specific size of shallow random MSA is similar to full-MSA") - print(shallow_MSA_size) - np.savetxt('selected_MSA-size_' + pdb1_name + '.csv', shallow_MSA_size) - - ###################################################################################################### - ##### calculate plddt of initial predictions - if model_type == "alphafold2_multimer_v3": - list_org_samplings = glob.glob( str(pwd) + str(multi) + '/' + str(pdb1_name) + '/*full_rand*/') - list_ran_samplings = glob.glob( str(pwd) + str(multi) + '/' + str(pdb1_name) + '/*max*/') - - full = 'full-MSA'; random = 'random-MSA' ; - 
plddt_cal(list_org_samplings, full, pdb1_name, nMSA, nENS, model_type) - plddt_cal(list_ran_samplings, random, pdb1_name, nMSA, nENS, model_type) - - else: - list_org_samplings = glob.glob( str(pwd) + str(success) + '/*full_rand*/') - list_ran_samplings = glob.glob( str(pwd) + str(success) + '/*max*/') - - full = 'full-MSA'; random = 'random-MSA' ; - plddt_cal(list_org_samplings, full, pdb1_name, nMSA, nENS, model_type) - plddt_cal(list_ran_samplings, random, pdb1_name, nMSA, nENS, model_type) - - - ###################################################################################################### - ##### plot the 2D-scatter plot of TM-scores with pLDDT - if model_type == "alphafold2_multimer_v3": - plot_2D_scatter_AC(full, random, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, nENS, model_type) - else: - plot_2D_scatter(full, random, pdb1, pdb1_name, pdb2, pdb2_name, nMSA, nENS) - - - - - elif args.option == "blind": - print("Predicting fold-swithcing proteins without crystal structures of pdbs") - ###################################################################################################### - ###### check previous predictions were performed or not - if not os.path.exists(blind): - os.mkdir(blind) - else: - blind_dir_count = 0 - for root_dir, cur_dir, files in os.walk(pwd + blind + '/' + pdb1_name + '/'): - blind_dir_count += len(cur_dir) - - if os.path.exists(blind + '/' + pdb1_name): - if blind_dir_count >= 8: - print("Prediction was already done") - else: - print("Folder is already created and cleaning existed subfolders") - rm_pre_folders = 'rm -rf ' + blind + '/' + pdb1_name + '/' - os.system(rm_pre_folders) - else: - pass - - ###### running prediction using full- and shallow random-MSA - blind_pred_path = 'predictions_all/' + pdb1_name; print(blind_pred_path) - - if os.path.exists(blind + '/' + pdb1_name) and blind_dir_count >= 8: - print("Predictions including full- and random-MSA were already done") - - fseek_file_count = 0 - for root_dir, cur_dir, 
files in os.walk(pwd + blind + '/' + pdb1_name + '/'): - fseek_file_count += len(files) - - print(fseek_file_count) - #if fseek_file_count == 856: ##(107 * 8) 107 includes foldseek file and 8 means the numbers of prediction folders - if fseek_file_count >= 640: ##672 - print(" "); print("Foldseek search was done") - #### performing the PCA calculation with RMSD - blind_screening(pdb1_name, blind_pred_path) - else: - #running_foldseek_all(pdb1_name) - - #### performing the PCA calculation with RMSD - blind_screening(pdb1_name, blind_pred_path) - - else: - prediction_all(pdb1_name, search_dir, search_multi_dir, nMSA, model_type) - print(" ") - print("Finished running for prediction using full- and shallow random-MSAs") - - print(" ") - print("Running Foldseek to find the relatedcrystal structures") - #running_foldseek_all(pdb1_name) - - #### performing the PCA calculation with RMSD - blind_screening(pdb1_name, blind_pred_path) - - diff --git a/code/pred_cal_tmscore_multimer.py b/code/pred_cal_tmscore_multimer.py deleted file mode 100644 index ef42070..0000000 --- a/code/pred_cal_tmscore_multimer.py +++ /dev/null @@ -1,297 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 21 14:51:00 2024 - -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse - -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path - -# call converting the multimer as a single chain structure -from convert_multi_single import * - -# call converting the multimer as a separated chains -from split_multi_single import * - - -class TM_score_monomer(): - def __init__(self, pred_dir, pdb1_name, pdb2_name): - - ## 
loading reference pdb for TM-score - pwd = os.getcwd() + '/' - tmscores_monomer = [] - - files_list = (glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(files_list) - - ##### pdb1_name part - pdb1_dir = pwd + pdb1_name - r2 = get_structure(get_pdb_path(str(pdb1_dir))) - coords2, seq2 = get_residue_data(r2) - - if len(files_list) == 0: - tmscores_monomer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_monomer - - for model in files_list: - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_monomer.append(tmscore) - - - ##### pdb2_name part - pdb2_dir = pwd + pdb2_name - r3 = get_structure(get_pdb_path(str(pdb2_dir))) - coords2, seq2 = get_residue_data(r3) - - if len(files_list) == 0: - tmscores_monomer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_monomer - - for model in files_list: - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_monomer.append(tmscore) - - print(tmscores_monomer) - self.tmscores_monomer = tmscores_monomer - - -class TM_score_multimer(): - def __init__(self, pred_dir, pdb1_name, pdb2_name): - - ## loading reference pdb for TM-score - pwd = os.getcwd() + '/' - tmscores_multimer = [] - - ##### convert the multimer file as a single structure - check_files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*.pdb")) - print(check_files_list) - if not check_files_list: - convert_m2s(pred_dir, pdb1_name, pdb2_name) - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*.pdb")) - print(files_list) - else: - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*.pdb")) - print(files_list) - - - ##### pdb2_name part - pdb2_dir = pwd + pdb2_name + 
'_rmTER' - r3 = get_structure(get_pdb_path(str(pdb2_dir))) - coords2, seq2 = get_residue_data(r3) - - if len(files_list) == 0: - tmscores_multimer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_multimer - - for model in files_list: - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_multimer.append(tmscore) - - print(tmscores_multimer) - - - ##### pdb1_name part - pdb1_dir = pwd + pdb1_name - r2 = get_structure(get_pdb_path(str(pdb1_dir))) - coords2, seq2 = get_residue_data(r2) - - if len(files_list) == 0: - tmscores_multimer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_multimer - - for model in files_list: - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_multimer.append(tmscore) - - self.tmscores_multimer = tmscores_multimer - - - - - - -class CF_MSA_max(): - def __init__(self, search_dir, output_dir, pdb_name, rseed, num_seeds, model_type): - - command = 'colabfold_batch --num-seeds ' + str(num_seeds) + ' --model-type alphafold2_ptm --random-seed ' + str(rseed) + search_dir + output_dir - print(command) - os.system(command) - - - - -class CF_MSA_var(): - def __init__(self, pdb1_name, pdb2_name, search_dir, output_dir, rseed, num_seeds, model_type): - #### shallow MSA section - #### Global viarlable - max_msa = 1; ext_msa = 2 - random_seed = rseed - self.pdb1_name = pdb1_name; self.pdb2_name = pdb2_name - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = int(max_msa * multi) - ext_msa = int(ext_msa * multi) - - #### Colabfold part - command = 'colabfold_batch --num-seeds ' + str(num_seeds) + ' --model-type ' + str(model_type) + ' --max-seq ' + str(max_msa) + ' 
--max-extra-seq ' + str(ext_msa) + search_dir + output_dir + str(random_seed) + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) - print(command); os.system(command) - - - - def cal_TM_score_multi(self, pdb1_name, pdb2_name, num_seeds, search_dir, output_dir, rseed): - - max_msa = 1; ext_msa = 2 - multi_size = 0; random_seed = rseed - TMscore_multi = []; TMscore_multi_average = np.zeros((7, 1)) - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = int(max_msa * multi) - ext_msa = int(ext_msa * multi) - - fin_pred_dir = pdb1_name + '_predicted_models_rand_' + str(rseed) + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) - fin_pred_dir_all = pdb1_name + '_predicted_models_rand_' + str(rseed) + '_max_*' - pred_files_list = (glob.glob(str(fin_pred_dir) + "/*_unrelaxed*pdb")) - - if len(pred_files_list) == 0: - print("The TMscore list is empty") - tmp = np.zeros((1, 25)) - TMscore_multi = np.append(TMscore_multi, tmp) - else: - run_TMscore_multi = TM_score_multimer(fin_pred_dir, pdb1_name, pdb2_name) - TMscore_multi = np.append(TMscore_multi, run_TMscore_multi.tmscores_multimer); print(TMscore_multi) - - - TMscore_multi = TMscore_multi.reshape(7 * 2, num_seeds * 5) - np.savetxt('TMScore_random-MSA_' + pdb1_name + '.csv', TMscore_multi, fmt='%2.3f') - - - print("TMscore multimer:"); print(TMscore_multi) - - if np.any(TMscore_multi > 0.4): - tmp_cnt = 0 - for i in range(0, 13, 2): - TMscore_multi_average[tmp_cnt] = np.average(TMscore_multi[i]) - tmp_cnt = tmp_cnt + 1 - - - location = np.argmax(np.max(TMscore_multi_average, axis=1)) - print("The selected size of shallow random MSA is: ", np.argmax(np.max(TMscore_multi_average, axis=1))) - self.size_selection = int(location) - - mv_command = 'mv ' + fin_pred_dir_all + ' multimer_prediction/' + pdb1_name - print(mv_command); os.system(mv_command) - - - else: - print("All calculated TMscores are not satisfying the creteria") - print("All process is done.") - mv_command = 'mv ' + fin_pred_dir + ' failed_prediction/'; 
os.system(mv_command) - sys.exit() - - - - - -class prediction_all_multimer(): - def __init__(self, pdb1_name, pdb2_name, search_dir, nMSA, model_type, search_multi_dir): - ### note: pdb1_name should be nomomer and pdb2_name should be multimer - num_seeds = 5 + nMSA - TER_count = 0 - pwd = os.getcwd() + '/' - rm_converted_pdb = 'rm ' + pdb2_name + '_rmTER.pdb'; os.system(rm_converted_pdb) - - - ############################################################## - ##### Predicting all CF-random runs before calculate TM-scores - ##### Predicting the monomer with deep MSA - #pre_random_seed = np.arange(0, 10, 1) - pre_random_seed = random.sample(range(10), 1) - random_seed_full_MSA = ''.join(map(str, pre_random_seed)) - output_dir = ' ' + pdb1_name + '_predicted_models_full_rand_' + str(random_seed_full_MSA) - - ##### Perform predction with full-length MSA - MSA_full = CF_MSA_max(search_dir, output_dir, pdb1_name, random_seed_full_MSA, num_seeds, model_type) - - ##### Predicting the multimer with shallow random MSAs - ##### check out varied-MSA with (msa-max: 1, 2, 4, 8, 16, 32, 64) (msa-extra: 2, 4, 8, 16, 32, 64, 128) - output_dir = ' ' + pdb1_name + '_predicted_models_rand_' - random_seed = random.sample(range(100), 1) - random_seed = ''.join(map(str, random_seed)) - search_dir_update = ' ' + search_multi_dir.replace(' ','') + ' ' - - MSA_var = CF_MSA_var(pdb1_name, pdb2_name, search_dir_update, output_dir, random_seed, num_seeds, model_type) - - - ################################################################ - ##### Calculating all TM-scores including monomer and multimer - ##### TM-score calculation for monoemr - TMscore_monomer = [] - - # Directory section - gen_dir = 'multimer_prediction/' + pdb1_name - - if not os.path.exists(gen_dir): - os.mkdir(gen_dir) - - pred_dir = pdb1_name + '*predicted_models_full*' - - ##### Calculate TM-score of monomer - run_TMscore = TM_score_monomer(pred_dir, pdb1_name, pdb2_name) - TMscore_monomer = 
np.array(run_TMscore.tmscores_monomer) - TMscore_monomer = TMscore_monomer.reshape(2, num_seeds * 5); print(TMscore_monomer) - - - ##### TM-score calculation for multimer - if np.any(TMscore_monomer > 0.5): - pred_dir = pdb1_name + '_predicted_models_full_rand_' + str(random_seed_full_MSA) + '/' - mv_folder_cmd = 'mv ' + pred_dir + ' multimer_prediction/' + pdb1_name - print(mv_folder_cmd); os.system(mv_folder_cmd) - np.savetxt('TMScore_full-MSA_' + pdb1_name + '.csv', TMscore_monomer, fmt='%2.3f') - - MSA_var.cal_TM_score_multi(pdb1_name, pdb2_name, num_seeds, search_dir_update, output_dir, random_seed) - print(MSA_var.size_selection); self.size_selection = MSA_var.size_selection - - else: - pred_dir = pdb1_name + '_predicted_models*_rand_*/' - mv_command = 'mv ' + pred_dir + ' failed_prediction/'; - print(mv_command); os.system(mv_command) - print("Deep MSA cannot find the monomer") - sys.exit() - diff --git a/code/pred_cal_tmscore_multimer_FS.py b/code/pred_cal_tmscore_multimer_FS.py deleted file mode 100644 index 9fb047b..0000000 --- a/code/pred_cal_tmscore_multimer_FS.py +++ /dev/null @@ -1,356 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 21 14:51:00 2024 - -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse - -# call related modules of tmtools after installation -from tmtools import tm_align -from tmtools.io import get_structure, get_residue_data -from tmtools.testing import get_pdb_path - -# call calculating TM-scores of fs region -from cal_tmscore_fs_only import * -from cal_tmscore_fs_multimer import * - -# call converting the multimer as a single chain structure -from convert_multi_single import * - -# call converting the multimer as a separated chains -from split_multi_single import 
* - - -class TM_score_monomer(): - def __init__(self, pred_dir, pdb1_name, pdb2_name): - - ## loading reference pdb for TM-score - pwd = os.getcwd() + '/' - tmscores_monomer = [] - - files_list = (glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(files_list) - - ##### pdb1_name part - pdb1_dir = pwd + pdb1_name - r2 = get_structure(get_pdb_path(str(pdb1_dir))) - coords2, seq2 = get_residue_data(r2) - - if len(files_list) == 0: - tmscores_monomer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_monomer - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_monomer.append(tmscore) - - - ##### pdb2_name part - pdb2_dir = pwd + pdb2_name - r3 = get_structure(get_pdb_path(str(pdb2_dir))) - coords2, seq2 = get_residue_data(r3) - - if len(files_list) == 0: - tmscores_monomer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_monomer - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - #model = model.replace('_converted.pdb','_converted') - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_monomer.append(tmscore) - - print(tmscores_monomer) - self.tmscores_monomer = tmscores_monomer - - -class TM_score_multimer(): - def __init__(self, pred_dir, pdb1_name, pdb2_name): - - ## loading reference pdb for TM-score - pwd = os.getcwd() + '/' - tmscores_multimer = [] - - ##### convert the multimer file as a single structure - check_files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*.pdb")) - #check_files_list = 
(glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(check_files_list) - if not check_files_list: - convert_m2s(pred_dir, pdb1_name, pdb2_name) - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*.pdb")) - #files_list = (glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(files_list) - else: - files_list = (glob.glob(str(pred_dir) + "/rmTER*_unrelaxed*.pdb")) - #files_list = (glob.glob(str(pred_dir) + "/*_unrelaxed*pdb")) - print(files_list) - - - ##### pdb2_name part - pdb2_dir = pwd + pdb2_name + '_rmTER' - r3 = get_structure(get_pdb_path(str(pdb2_dir))) - coords2, seq2 = get_residue_data(r3) - - if len(files_list) == 0: - tmscores_multimer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_multimer - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - #model = model.replace('_converted.pdb','_converted') - model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_multimer.append(tmscore) - - print(tmscores_multimer) - - - ##### pdb1_name part - pdb1_dir = pwd + pdb1_name - r2 = get_structure(get_pdb_path(str(pdb1_dir))) - coords2, seq2 = get_residue_data(r2) - - if len(files_list) == 0: - tmscores_multimer = [0.0, 0.0, 0.0, 0.0, 0.0] - return tmscores_multimer - - for model in files_list: - #modelpath = Path(model) - #model = str(modelpath.parent) + "/" + modelpath.stem - model = model.replace('.pdb','') - #model = model.replace('.pdb','') - model = pwd + model - s = get_structure(get_pdb_path(model)) - coords1, seq1 = get_residue_data(s) - res = tm_align(coords1, coords2, seq1, seq2) - tmscore = round(res.tm_norm_chain1,5) # wrt to model - tmscores_multimer.append(tmscore) - - self.tmscores_multimer = tmscores_multimer - - - - - - -class CF_MSA_max(): - def __init__(self, search_dir, output_dir, pdb_name, 
rseed, num_seeds, model_type): - - command = 'colabfold_batch --num-seeds ' + str(num_seeds) + ' --model-type alphafold2_ptm --random-seed ' + str(rseed) + search_dir + output_dir - print(command) - os.system(command) - - - - -class CF_MSA_var(): - def __init__(self, pdb1_name, pdb2_name, search_dir, output_dir, rseed, num_seeds, model_type): - #### shallow MSA section - #### Global viarlable - max_msa = 1; ext_msa = 2 - random_seed = rseed - self.pdb1_name = pdb1_name; self.pdb2_name = pdb2_name - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = int(max_msa * multi) - ext_msa = int(ext_msa * multi) - - #### Colabfold part - command = 'colabfold_batch --num-seeds ' + str(num_seeds) + ' --model-type ' + str(model_type) + ' --max-seq ' + str(max_msa) + ' --max-extra-seq ' + str(ext_msa) + search_dir + output_dir + str(random_seed) + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) - print(command); os.system(command) - - - - def cal_TM_score_multi(self, pdb1_name, pdb2_name, num_seeds, search_dir, output_dir, rseed, pdb1, pdb2): - - max_msa = 1; ext_msa = 2 - multi_size = 0; random_seed = rseed - TMscore_multi = []; TMscore_multi_average = np.zeros((7, 1)) - TMscore_multi_fs = []; TMscore_multi_fs_average = np.zeros((7, 1)) - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = int(max_msa * multi) - ext_msa = int(ext_msa * multi) - - fin_pred_dir = pdb1_name + '_predicted_models_rand_' + str(rseed) + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) + '/' - fin_pred_dir_all = pdb1_name + '_predicted_models_rand_' + str(rseed) + '_max_*' - pred_files_list = (glob.glob(str(fin_pred_dir) + "/*_unrelaxed*pdb")) - - if len(pred_files_list) == 0: - print("The TMscore list is empty") - tmp = np.zeros((1, 25)) - TMscore_multi = np.append(TMscore_multi, tmp) - TMscore_multi_fs = np.append(TMscore_multi_fs, tmp) - else: - run_TMscore_multi = TM_score_multimer(fin_pred_dir, pdb1_name, pdb2_name) - TMscore_multi = np.append(TMscore_multi, run_TMscore_multi.tmscores_multimer); 
print(TMscore_multi) - ##### for measuring the fold-switching region in multimer, just measure the TM-score of - ##### the first chain in between predicted and reference file - pdb2 = pdb2_name + '_rmTER.pdb' - fin_fs_pred_dir = pdb1_name + '_predicted_models_rand_' + str(rseed) + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) + '/' - print(fin_fs_pred_dir) - run_TMscore_multi_fs = TM_score_fs_multi(fin_fs_pred_dir, pdb1, pdb1_name, pdb2, pdb2_name) - TMscore_multi_fs = np.append(TMscore_multi_fs, run_TMscore_multi_fs.tmscores_fs ); print(TMscore_multi_fs) - - TMscore_multi = TMscore_multi.reshape(7 * 2, num_seeds * 5) - np.savetxt('TMScore_random-MSA_' + pdb1_name + '.csv', TMscore_multi, fmt='%2.3f') - TMscore_multi = TMscore_multi[::2] - TMscore_multi_fs = TMscore_multi_fs.reshape(7 * 2, num_seeds * 5) - np.savetxt('TMScore_fs_random-MSA_' + pdb1_name + '.csv', TMscore_multi_fs, fmt='%2.3f') - TMscore_multi_fs = TMscore_multi_fs[1::2] - - print("TMscore multimer:"); print(TMscore_multi) - print("TMscore fold-switching region in multimer:"); print(TMscore_multi_fs) - - if np.any(TMscore_multi > 0.4) and np.any(TMscore_multi_fs > 0.4): - tmp_cnt = 0 - for i in range(0, int(TMscore_multi.shape[0] - 1)): - TMscore_multi_average[tmp_cnt] = np.average(TMscore_multi[i]) - TMscore_multi_fs_average[tmp_cnt] = np.average(TMscore_multi_fs[i]) - tmp_cnt = tmp_cnt + 1 - - location = np.argmax(np.max(TMscore_multi_average, axis=1)) - print("The selected size of shallow random MSA is: ", np.argmax(np.max(TMscore_multi_fs_average, axis=1))) - self.size_selection = int(location) - - mv_command = 'mv ' + fin_pred_dir_all + ' multimer_prediction/' + pdb1_name - print(mv_command); os.system(mv_command) - - - else: - print("All calculated TMscores are not satisfying the creteria") - print("All process is done.") - mv_command = 'mv ' + fin_pred_dir_all + ' failed_prediction/'; os.system(mv_command) - sys.exit() - - - -class prediction_all_multimer_FS(): - def __init__(self, 
pdb1_name, pdb2_name, search_dir, nMSA, model_type, search_multi_dir, pdb1, pdb2): - ### note: pdb1_name should be nomomer and pdb2_name should be multimer - num_seeds = 5 + nMSA - TER_count = 0 - pwd = os.getcwd() + '/' - rm_converted_pdb = 'rm ' + pdb2_name + '_rmTER.pdb'; os.system(rm_converted_pdb) - - - ############################################################## - ##### Predicting all CF-random runs before calculate TM-scores - ##### Predicting the monomer with deep MSA - #pre_random_seed = np.arange(0, 10, 1) - pre_random_seed = random.sample(range(10), 1) - random_seed_full_MSA = ''.join(map(str, pre_random_seed)) - output_dir = ' ' + pdb1_name + '_predicted_models_full_rand_' + str(random_seed_full_MSA) - - ##### Perform predction with full-length MSA - MSA_full = CF_MSA_max(search_dir, output_dir, pdb1_name, random_seed_full_MSA, num_seeds, model_type) - - ##### Predicting the multimer with shallow random MSAs - ##### check out varied-MSA with (msa-max: 1, 2, 4, 8, 16, 32, 64) (msa-extra: 2, 4, 8, 16, 32, 64, 128) - output_dir = ' ' + pdb1_name + '_predicted_models_rand_' - random_seed = random.sample(range(100), 1) - random_seed = ''.join(map(str, random_seed)) - search_dir_update = ' ' + search_multi_dir.replace(' ','') + ' ' - - MSA_var = CF_MSA_var(pdb1_name, pdb2_name, search_dir_update, output_dir, random_seed, num_seeds, model_type) - - - ################################################################ - ##### Calculating all TM-scores including monomer and multimer - ##### TM-score calculation for monoemr - TMscore_monomer = [] - - # Directory section - gen_dir = 'multimer_prediction/' + pdb1_name - - if not os.path.exists(gen_dir): - os.mkdir(gen_dir) - - pred_dir = pdb1_name + '*predicted_models_full*' - - - - ##### Calculating the TM-score of fold-switching region - ##### Extracting a signle chain from a multimer - TER_count = 0 - with open(pdb2, 'r') as file: - for line in file: - TER = line.split() - TER_count += TER.count("TER") - - 
line_cnt = 0 - #for i in range(0, TER_count): - for i in range(0, 2): - output_file_name = pdb2_name.split('_')[0] + '_multi.pdb' - - if line_cnt == 0: - with open(pdb2, 'r') as infile, open(output_file_name, 'w') as outfile: - for line in infile: - outfile.write(line) - line_cnt = line_cnt + 1 - if "TER " in line: - line_cnt = line_cnt + 1 - break - - pdb2_name_multi = output_file_name.replace('.pdb','') - - - ##### Calculate TM-score of monomer - run_TMscore = TM_score_monomer(pred_dir, pdb1_name, pdb2_name) - TMscore_monomer = np.array(run_TMscore.tmscores_monomer) - TMscore_monomer = TMscore_monomer.reshape(2, num_seeds * 5); print(TMscore_monomer) - ##### Calculate TM-score of fold-switching region - run_fs_TMscore = TM_score_fs(pred_dir, pdb1, pdb1_name, output_file_name, pdb2_name_multi) - TMscore_monomer_fs = np.array(run_fs_TMscore.tmscores_fs) - TMscore_monomer_fs = TMscore_monomer_fs.reshape(2, num_seeds * 5); print(TMscore_monomer_fs) - - - ##### TM-score calculation for multimer - if np.any(TMscore_monomer[0, :] >= 0.5) and np.any(TMscore_monomer_fs[0, :] >= 0.4): - pred_dir = pdb1_name + '_predicted_models_full_rand_' + str(random_seed_full_MSA) + '/' - mv_folder_cmd = 'mv ' + pred_dir + ' multimer_prediction/' + pdb1_name - print(mv_folder_cmd); os.system(mv_folder_cmd) - np.savetxt('TMScore_full-MSA_' + pdb1_name + '.csv', TMscore_monomer, fmt='%2.3f') - - MSA_var.cal_TM_score_multi(pdb1_name, pdb2_name_multi, num_seeds, search_dir_update, output_dir, random_seed, pdb1, output_file_name) - print(MSA_var.size_selection); self.size_selection = MSA_var.size_selection - - - else: - pred_dir = pdb1_name + '_predicted_models*_rand_*/' - mv_command = 'mv ' + pred_dir + ' failed_prediction/'; - print(mv_command); os.system(mv_command) - print("Deep MSA cannot find the monomer") - sys.exit() diff --git a/code/prediction_all_var.py b/code/prediction_all_var.py deleted file mode 100644 index 166f76c..0000000 --- a/code/prediction_all_var.py +++ /dev/null @@ 
-1,156 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 21 14:51:00 2024 - -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import matplotlib.pyplot as plt -import glob -import random -import argparse - - -from colabfold.download import download_alphafold_params -from colabfold.batch import run, get_queries -from colabfold.utils import setup_logging -import logging - - - - -class CF_MSA_max(): - def __init__(self, search_dir, output_dir, pdb_name, rseed, num_seeds, model_type) -> str: - print(search_dir) - - setup_logging(Path(output_dir) / "log.txt") - logger = logging.getLogger(__name__) - - queries, is_complex = get_queries(search_dir) - - run( - queries = queries, - result_dir = output_dir, - num_models= 5, - is_complex= is_complex, - model_type= model_type, - - num_seeds = int(num_seeds), - random_seed = int(rseed), - - data_dir=Path("."), - ) - - - - -class CF_MSA_var(): - def __init__(self, pdb1_name, search_dir, output_dir, rseed, num_seeds, model_type): - #### shallow MSA section - #### Global viarlable - max_msa = 1 - ext_msa = 2 - pre_random_seed = np.array(rseed) ## needed to remove future - random_seed = ''.join(map(str, pre_random_seed)) - - self.pdb1_name = pdb1_name - - - - max_msa = 1 - ext_msa = 2 - - TMscores_random = [] - - for multi in (1, 2, 2, 2, 2, 2, 2): - max_msa = max_msa * multi - ext_msa = ext_msa * multi - - output_dir_var = output_dir + str(random_seed) + '_max_' + str(max_msa) + '_ext_' + str(ext_msa) - - ##### Colabfold_batch part with max-seq and max-extra-seq - setup_logging(Path(output_dir_var) / "log.txt") - logger = logging.getLogger(__name__) - - queries, is_complex = get_queries(search_dir) - - - run( - queries = queries, - result_dir = output_dir_var, - num_models= 5, - is_complex= is_complex, - model_type= model_type, - - 
num_seeds = int(num_seeds), - random_seed = int(random_seed), - - max_seq = int(max_msa), - max_extra_seq = int(ext_msa), - - data_dir=Path("."), - ) - - - fin_pred_dir = pdb1_name + '_predicted_models_rand_' + str(random_seed) + '_max_*' - gen_dir = 'predictions_all/' + pdb1_name - - if not os.path.exists(gen_dir): - os.makedirs(gen_dir) - mv_command = 'mv ' + fin_pred_dir + ' predictions_all/' + pdb1_name - print(mv_command); os.system(mv_command) - else: - mv_command = 'mv ' + fin_pred_dir + ' predictions_all/' + pdb1_name - print(mv_command); os.system(mv_command) - - - - - -class prediction_all(): - def __init__(self, pdb1_name, search_dir, search_multi_dir, nMSA, model_type): - - num_seeds = 5 + nMSA - - pre_random_seed = np.random.randint(0, 16, 1) - random_seed = ''.join(map(str, pre_random_seed)) - output_dir = pdb1_name + '_predicted_models_full_rand_' + str(random_seed) - - - ##### Perform predction with full-length MSA - MSA_full = CF_MSA_max(search_dir, output_dir, pdb1_name, random_seed, num_seeds, model_type) - pwd = os.getcwd() + '/' - - - # Directory section - gen_dir = 'predictions_all/' + pdb1_name - - if not os.path.exists(gen_dir): - os.mkdir(gen_dir) - - - pred_dir = pdb1_name + '_predicted_models_full_rand_' + str(random_seed) + '/' - mv_folder_cmd = 'mv ' + pred_dir + ' predictions_all/' + pdb1_name - print(mv_folder_cmd); os.system(mv_folder_cmd) - - - - ##### check out varied-MSA with (msa-max: 1, 2, 4, 8, 16, 32, 64) (msa-extra: 2, 4, 8, 16, 32, 64, 128) - output_dir = pdb1_name + '_predicted_models_rand_' - random_seed = random.sample(range(100), 1) - if model_type != "alphafold2_multimer_v3": - MSA_var = CF_MSA_var(pdb1_name, search_dir, output_dir, random_seed, num_seeds, model_type) - else: - MSA_var = CF_MSA_var(pdb1_name, search_multi_dir, output_dir, random_seed, num_seeds, model_type) - - - diff --git a/code/range_fs_pairs_all.txt b/code/range_fs_pairs_all.txt deleted file mode 100644 index d78441d..0000000 --- 
a/code/range_fs_pairs_all.txt +++ /dev/null @@ -1,5 +0,0 @@ -# pdb1,pdb2,pred1,pred2 -5jyt_A,2qke_E,50-99,50-99,50-99,50-99 -1nqd_A,1nqj_B,895-919,894-919,1-33,1-33 -6c6s_D,2oug_C,112-162,115-162,112-162,112-162 -2vfx_L,3gmh_L,165-202,165-202,167-204,167-204 diff --git a/code/search_w_foldseek_cluster.py b/code/search_w_foldseek_cluster.py deleted file mode 100644 index cfc3a84..0000000 --- a/code/search_w_foldseek_cluster.py +++ /dev/null @@ -1,308 +0,0 @@ -import glob -import shutil, os, sys, re -import subprocess -import numpy as np -import matplotlib.pyplot as plt - -from scipy import stats -from scipy.spatial import distance - -from sklearn.decomposition import PCA -from sklearn.metrics import silhouette_score -from sklearn.cluster import HDBSCAN -from sklearn.preprocessing import minmax_scale - -import MDAnalysis as mda -from MDAnalysis.analysis.dssp import DSSP - -import pymol - - -class blind_screening(): - def cluster_structures(X): - """ - loop through values of k and define best value of k with silhouette_score - - Input: - X : np.ndarray (n, m) | result of PCA - - Output: - cluster_labels : (n, 1) | list of optimal clusters for X - """ - - k_range = range(2,51) - sil_score = [] - for k in k_range: - clustering = HDBSCAN(min_cluster_size=k,min_samples=1) - clustering.fit(X) - if len(set(clustering.labels_)) > 1 and len(set(clustering.labels_)) < len(X): - score = silhouette_score(X, clustering.labels_, metric='euclidean') - sil_score.append(score) - else: - sil_score.append(-1) - - opt_k = k_range[np.argmax(sil_score)] - clustering = HDBSCAN(min_cluster_size=opt_k) - clustering.fit(X) - return clustering.labels_ - - def k_medoids(X, l, labels, k=3, max_iter=100): - """ - K-Medoid algorithm to find suitable representative structures from each cluster defined by HDBSCAN. 
- - Input: - X: np.ndarray (n, m) | all points from one HDBSCAN cluster - k: number of medoids | - max_iter: maximum number of iterations allowed to minimize the distance - l: current HDBSAN label - labels: full list of HDBSCAN labels - - Output: - medoids: indices of the K medoids - total_cost: sum of distances of each point to its medoid - """ - np.random.seed(42) - - #start with random k points - temp = X.copy() - mask = np.zeros(X.shape, dtype=bool) - mask[np.argwhere(labels == l)] = True - - #check the number of points in a cluster - #if less than 4 just return those indices - _, cluster_count = np.unique(mask[:,0], return_counts=True) # count = False, True - cluster_count = cluster_count[[idx for idx, val in enumerate(_) if val == True][0]]#<-account for the case of one cluster - - if cluster_count < 4: - return np.ravel(np.argwhere(mask[:,0] == True)), np.nan - # block out values that are not within the current HDBSCAN group - temp[~mask] = 9999 - - number_samples = temp.shape[0] - medoids = np.random.choice(number_samples, k, replace=False) - - #distance matrix of randomly chosen points - D = distance.cdist(temp, temp[medoids], metric='euclidean') - tot_cost = np.sum(np.min(D, axis=1)) - - itr = 0 - while itr < max_iter: - reduced = False - - #loop through all possibilities - for m_idx in range(k): - for current_idx in range(number_samples): - if current_idx in medoids: - continue - - new_medoids = medoids.copy() - new_medoids[m_idx] = current_idx - - #new distance matrix - D_new = distance.cdist(temp, temp[new_medoids], metric='euclidean') - new_cost = np.sum(np.min(D_new, axis=1)) - - #if the cost has been reduced move onto the the next sample - if new_cost < tot_cost: - medoids = new_medoids - tot_cost = new_cost - reduced = True - break - if reduced: - break - - if not reduced: - #If there was no improvement we should be converged - break - itr+=1 - return medoids, tot_cost - - - def __init__(self, pdb1_name, blind_path): - # def main(): - """ - 
requires Foldseek and Pymol - - Find all pdb files from CF-Random generated directories. - This script will automatically generate a Foldseek database of these structures - then calculate a similarity matrix of all structures based on bit-score. - similarity matrix -> PCA -> HDBSCAN -> K-medoids -> structures of interest. - - The final output is then a png file showing the result of PCA and HDBSCAN - a text file containing the coordinates of the structures of interest, file name, and group ID - finally this script will automatically generate a pse file of the structures_of_interest - """ - - - #_______________collect all pdb files that CF-Random generated_____________________________ - db_directory = blind_path + "/pdbs_for_db/" - #db_directory = "/pdbs_for_db/" - if not os.path.isdir(db_directory): - os.mkdir(db_directory) - #pdb_files = glob.glob("./**/*.pdb", recursive=True) - pdb_files = glob.glob(blind_path + "/**/*.pdb", recursive=True) - pdb_files = [file for file in pdb_files if db_directory not in file] - print("Gathering pdb pdb files for self-search") - for file in pdb_files: - dest_name = file.replace('/','-') - if not os.path.isfile(db_directory + dest_name[17:]): - shutil.copyfile(file, db_directory + dest_name[17:]) - #__________________________________________________________________________________________ - - - print("Creating database...") - create_db = ["foldseek", "createdb", db_directory, db_directory + "DB"] - if not os.path.isfile(db_directory + "DB"): - try: - response = subprocess.run(create_db, capture_output=True, text=True, check=True ) - except subprocess.CalledProcessError as e: - print("ERROR:\n", e.stderr) - - print('Succes database is up!') - else: - print("found an existing DB") - - #________________Calculate foldseek self comparison of all predicted structures____________ - - for file in pdb_files: - foldseek_run = ["foldseek", "easy-search", file, db_directory + "DB", file.replace(".pdb","-self.foldseek"), blind_path + "/tmp", 
"--format-mode", "0", "--format-output", "query,target,alntmscore,qaln,taln,alnlen,evalue,bits", "--exhaustive-search", "1", "-s", "9.5"] - if not os.path.isfile(file.replace(".pdb","-self.foldseek")): - response = subprocess.run(foldseek_run, capture_output=True, text=True, check=True) - try: - response = subprocess.run(foldseek_run, capture_output=True, text=True, check=True) - print(response.check_returncode()) - except subprocess.CalledProcessError as e: - print("foldseek failed to run {:}".format(file)) - print("Error:", e.stderr) - print('{:} succeeded!!!'.format(file)) - else: - print("{:} already exists".format(file.replace(".pdb","-self.foldseek"))) - - #__________________________________________________________________________________________ - - - #__________Populate a correlation matrix with bit scores_______________________________________________ - - #everything will be sorted by the text of the file name - files = glob.glob(blind_path + "/**/*-self.foldseek") - - #first remove any outliers from the dssp loop distribution, they tend to be unfolded predictions - files_dssp = [];files_count = []; - for file in files: - u = mda.Universe(file.replace("-self.foldseek",".pdb")) - s = DSSP(u).run().results.dssp[0] - dssp, count = np.unique(s, return_counts=True) - # ['-' 'E' 'H'] - if len(dssp) < 3: - if '-' not in dssp: - dssp = np.insert(dssp, 0 ,'-') - count = np.insert(count,0, 0) - if 'E' not in dssp: - dssp = np.insert(dssp, 1 ,'E') - count = np.insert(count,1, 0) - if 'H' not in dssp: - dssp = np.insert(dssp, 2 ,'H') - count = np.insert(count,2, 0) - files_dssp.append(dssp) - files_count.append(count) - files_dssp = np.array(files_dssp); files_count = np.array(files_count); - z_scores = stats.zscore(files_count[:, 0]) - outlier_idx = np.argwhere(z_scores > 3) - - # remove unfolded proteins from file list - files = np.array(files) - mask = np.zeros(files.shape, dtype=bool) - mask[outlier_idx] = True - for file in files[mask]: - print("removed from 
analysis: ",file.replace("-self.foldseek",".pdb")) - files = files[~mask] - files = sorted(files) - files_pdb = [file.replace('/','-')[17:].replace("-self.foldseek","") for file in files] - #files_pdb = [file.replace("-self.foldseek",".pdb") for file in files] - corr_mtx = [] - - df = {} - for file in files: - # it is possible for predictions to be so different that it isn't returned with a bit_score - # in that case we return a zero - dict_with_all = {file:[0] for file in files_pdb} - with open(file, 'r') as _: - data = [l.rstrip().split('\t') for l in _] - for d in data: - dict_with_all[d[1]] = d - print(dict_with_all[d[1]]) - #bug in foldseek occasionally returns -2,147,483,648 - _temp = [] - for pdb in files_pdb: - print("testing", pdb) - x = int(dict_with_all.get(pdb, 0)[-1]) - print(x) - if x == -2147483648: - _temp.append(0) - else: - _temp.append(x) - - corr_mtx.append(_temp) - - corr_mtx = np.array(corr_mtx) - - #normalize each row and subtract top model from full MSA depth to give more - #specific meaning to variance - norm_corr_mtx = minmax_scale(corr_mtx, axis=1) - norm_corr_mtx = (norm_corr_mtx + norm_corr_mtx.T) /2 - - sklearn_pca = PCA(n_components=4) - pca = sklearn_pca.fit_transform(norm_corr_mtx) - labels = blind_screening.cluster_structures(pca) - - plt.figure(figsize=(8,6)) - plt.scatter(pca[:,0], pca[:,1], c=labels, cmap='viridis', s=45) - plt.savefig(blind_path + '/' + pdb1_name + '-cluster.png') - plt.clf() - - - #find the structures_of_interest - files_of_interest = [] - pca_of_interest = [] - for l in np.unique(labels): - kmed_idx, tot_cost = blind_screening.k_medoids(pca, l, labels) - for idx in kmed_idx: - files_of_interest.append([files[idx], l]) - pca_of_interest.append(pca[idx]) - - #create pse file with colors that match viridis colors in cluster.png - viridis = plt.get_cmap('viridis',len(files_of_interest)) - largest_group_num = max(files_of_interest, key=lambda x: x[1]) - pymol.cmd.load(files[0].replace('-self.foldseek','.pdb'), 
'Dominant') - with open(blind_path + '/' + pdb1_name + "-structures_of_interest.csv", "w") as file: - file.write("group, file, pca_1, pca_2\n") - - with open(blind_path + '/' + pdb1_name + "-structures_of_interest.csv", "a") as file: - for idx, foi in enumerate(files_of_interest): - if largest_group_num[1] == -1: - color = 0 - else: - color = (foi[1] + 1) / (largest_group_num[1]+1) - color = viridis(color)[:3] - new_name = re.findall(r'(full)|(max\w+)|(rank_\d+)', foi[0]) - new_name = str(idx)+ '_' + '_'.join([i for n in new_name for i in n if i != '']) - pymol.cmd.load(foi[0].replace('-self.foldseek','.pdb'), new_name) - pymol.cmd.align(new_name,'Dominant') - color_name = 'col_'+str(foi[1]) - pymol.cmd.set_color(color_name, color) - pymol.cmd.color(color_name,new_name) - file.write(f"{foi[1]}, {foi[0]}, {pca_of_interest[idx][0]}, {pca_of_interest[idx][1]}\n") - - pymol.cmd.save(blind_path + '/' + pdb1_name + '-structures_of_interest.pse', 'pse') - pymol.cmd.delete('all') - pymol.cmd.reinitialize() - - #save all data with clusters - with open("structures_all.csv", 'w') as file: - file.write("group, file, pca_1, pca_2\n") - for idx, f in enumerate(files): - file.write(f"{labels[idx]},{f},{pca[idx, 0]},{pca[idx, 1]}\n") - - sys.exit() - - diff --git a/code/split_chains.py b/code/split_chains.py deleted file mode 100644 index 8c19a34..0000000 --- a/code/split_chains.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Dec 17 09:29:00 2024 - -Splitting protein chain as each single file - -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import glob -import linecache -import argparse - - -parser = argparse.ArgumentParser() -parser.add_argument("--pdb1", type=str, help='PDB structure for the target crystal structure') -args = parser.parse_args() - - 
-chain_char = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] - -pdb1 = args.pdb1; pdb1_name = pdb1.replace('.pdb','') - - -TER_count = 0 -with open(pdb1, 'r') as file: - for line in file: - TER = line.split() - TER_count += TER.count("TER") - - - - - -line_cnt = 0 -for i in range(0, TER_count): - output_file_name = pdb1_name + '_' + chain_char[i] + '.pdb' - - if line_cnt == 0: - with open(pdb1, 'r') as infile, open(output_file_name, 'w') as outfile: - for line in infile: - outfile.write(line) - line_cnt = line_cnt + 1 - if "TER " in line: - line_cnt = line_cnt + 1 - break - - else: - with open(pdb1, 'r') as infile, open(output_file_name, 'w') as outfile: - for line in infile: - linecache.getline(pdb1, line_cnt) - outfile.write(linecache.getline(pdb1, line_cnt)) - line_cnt = line_cnt + 1 - if linecache.getline(pdb1, line_cnt) == "TER ": - line_cnt = line_cnt + 1 - break - - - - - diff --git a/code/split_multi_single.py b/code/split_multi_single.py deleted file mode 100644 index 35e948f..0000000 --- a/code/split_multi_single.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/env python3 -# -*- coding: utf-8 -*- -""" - -Converting the multimer PDB to a single PDB file - -Created on Tue Dec 24 14:51:00 2025 -@author: Myeongsang (Samuel) Lee -""" -import re -import Bio -import os -from os import listdir -from os.path import isfile, join -import sys -from pathlib import Path -import numpy as np -from numpy import genfromtxt -import glob -import random -import linecache -import argparse - - - -class split_multi_to_chains(): - def __init__(self, pred_path): - - - chain_char = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] - - - current_dir = os.getcwd() + '/' - data_dir = Path(pred_path) # Path to the predicted models - - files_list = (glob.glob(str(pred_path) + "/*_unrelaxed*pdb")) - - for fl in files_list: - 
TER_count = 0 - with open(fl, 'r') as file: - for line in file: - TER = line.split() - TER_count += TER.count("TER") - - line_cnt = 0 - - fl_name = fl.replace('.pdb','') - for i in range(0, TER_count): - output_file_name = fl_name + '_chain_' + chain_char[i] + '.pdb' - - if line_cnt == 0: - with open(fl, 'r') as infile, open(output_file_name, 'w') as outfile: - for line in infile: - outfile.write(line) - line_cnt = line_cnt + 1 - if "TER " in line: - line_cnt = line_cnt + 1 - break - - else: - with open(fl, 'r') as infile, open(output_file_name, 'w') as outfile: - for line in infile: - linecache.getline(fl, line_cnt) - outfile.write(linecache.getline(fl, line_cnt)) - line_cnt = line_cnt + 1 - if linecache.getline(fl, line_cnt) == "TER ": - line_cnt = line_cnt + 1 - break - - - - - - diff --git a/Data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/.keep b/data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/.keep similarity index 100% rename from Data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/.keep rename to data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/.keep diff --git a/Data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/AFsample2_benchmark_pse_files.zip b/data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/AFsample2_benchmark_pse_files.zip similarity index 100% rename from Data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/AFsample2_benchmark_pse_files.zip rename to data/AFsample2_benchmark/AFsample2_pse_files /AFsample2_pse_files/AFsample2_benchmark_pse_files.zip diff --git a/Data/AFsample2_benchmark/OC23_heatmap-nsamples.png b/data/AFsample2_benchmark/OC23_heatmap-nsamples.png similarity index 100% rename from Data/AFsample2_benchmark/OC23_heatmap-nsamples.png rename to data/AFsample2_benchmark/OC23_heatmap-nsamples.png diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_1si1_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_1si1_A.csv similarity 
index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_1si1_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_1si1_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_2ktv_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_2ktv_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_2ktv_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_2ktv_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_2olo_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_2olo_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_2olo_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_2olo_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_2rqm_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_2rqm_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_2rqm_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_2rqm_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_2wfa_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_2wfa_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_2wfa_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_2wfa_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_2xe6_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_2xe6_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_2xe6_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_2xe6_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_3fto_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_3fto_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_3fto_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_3fto_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_3iuj_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_3iuj_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_3iuj_A.csv rename to 
data/AFsample2_benchmark/TMScore_full-MSA_3iuj_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_3l6g_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_3l6g_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_3l6g_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_3l6g_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_3o6w_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_3o6w_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_3o6w_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_3o6w_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_3tee_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_3tee_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_3tee_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_3tee_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_3zsf_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_3zsf_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_3zsf_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_3zsf_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_4bp8_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_4bp8_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_4bp8_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_4bp8_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_5ho2_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_5ho2_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_5ho2_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_5ho2_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_6hac_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_6hac_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_6hac_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_6hac_A.csv diff --git 
a/Data/AFsample2_benchmark/TMScore_full-MSA_6hnj_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_6hnj_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_6hnj_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_6hnj_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_6k8b_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_6k8b_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_6k8b_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_6k8b_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_6mka_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_6mka_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_6mka_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_6mka_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_6nc7_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_6nc7_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_6nc7_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_6nc7_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_6yed_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_6yed_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_6yed_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_6yed_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_7c63_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_7c63_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_7c63_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_7c63_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_7cy2_A.csv b/data/AFsample2_benchmark/TMScore_full-MSA_7cy2_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_7cy2_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_7cy2_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_full-MSA_7qga_A.csv 
b/data/AFsample2_benchmark/TMScore_full-MSA_7qga_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_full-MSA_7qga_A.csv rename to data/AFsample2_benchmark/TMScore_full-MSA_7qga_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_1si1_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_1si1_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_1si1_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_1si1_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_2ktv_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_2ktv_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_2ktv_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_2ktv_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_2olo_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_2olo_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_2olo_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_2olo_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_2rqm_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_2rqm_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_2rqm_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_2rqm_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_2wfa_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_2wfa_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_2wfa_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_2wfa_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_2xe6_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_2xe6_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_2xe6_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_2xe6_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_3fto_A.csv 
b/data/AFsample2_benchmark/TMScore_random-MSA_3fto_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_3fto_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_3fto_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_3iuj_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_3iuj_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_3iuj_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_3iuj_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_3l6g_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_3l6g_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_3l6g_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_3l6g_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_3o6w_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_3o6w_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_3o6w_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_3o6w_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_3tee_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_3tee_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_3tee_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_3tee_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_3zsf_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_3zsf_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_3zsf_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_3zsf_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_4bp8_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_4bp8_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_4bp8_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_4bp8_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_5ho2_A.csv 
b/data/AFsample2_benchmark/TMScore_random-MSA_5ho2_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_5ho2_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_5ho2_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_6hac_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_6hac_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_6hac_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_6hac_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_6hnj_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_6hnj_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_6hnj_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_6hnj_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_6k8b_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_6k8b_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_6k8b_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_6k8b_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_6mka_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_6mka_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_6mka_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_6mka_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_6nc7_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_6nc7_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_6nc7_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_6nc7_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_6yed_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_6yed_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_6yed_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_6yed_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_7c63_A.csv 
b/data/AFsample2_benchmark/TMScore_random-MSA_7c63_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_7c63_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_7c63_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_7cy2_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_7cy2_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_7cy2_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_7cy2_A.csv diff --git a/Data/AFsample2_benchmark/TMScore_random-MSA_7qga_A.csv b/data/AFsample2_benchmark/TMScore_random-MSA_7qga_A.csv similarity index 100% rename from Data/AFsample2_benchmark/TMScore_random-MSA_7qga_A.csv rename to data/AFsample2_benchmark/TMScore_random-MSA_7qga_A.csv diff --git a/Data/AFsample2_benchmark/heatmap-max TMscore comparison.png b/data/AFsample2_benchmark/heatmap-max TMscore comparison.png similarity index 100% rename from Data/AFsample2_benchmark/heatmap-max TMscore comparison.png rename to data/AFsample2_benchmark/heatmap-max TMscore comparison.png diff --git a/Data/AFsample2_benchmark/list_of_OC23-uniprot_ID-PDB_ID.csv b/data/AFsample2_benchmark/list_of_OC23-uniprot_ID-PDB_ID.csv similarity index 100% rename from Data/AFsample2_benchmark/list_of_OC23-uniprot_ID-PDB_ID.csv rename to data/AFsample2_benchmark/list_of_OC23-uniprot_ID-PDB_ID.csv diff --git a/Data/AFsample2_benchmark/max_TM_heatmap_v4.py b/data/AFsample2_benchmark/max_TM_heatmap_v4.py similarity index 97% rename from Data/AFsample2_benchmark/max_TM_heatmap_v4.py rename to data/AFsample2_benchmark/max_TM_heatmap_v4.py index 1eafdb2..9a75712 100644 --- a/Data/AFsample2_benchmark/max_TM_heatmap_v4.py +++ b/data/AFsample2_benchmark/max_TM_heatmap_v4.py @@ -5,18 +5,12 @@ @author: Myeongsang (Samuel) Lee """ -import os -import sys import csv -import textalloc as ta -from pathlib import Path import numpy as np import seaborn as sns import matplotlib.pyplot as plt -import matplotlib.font_manager from numpy 
import genfromtxt from matplotlib import pyplot as plt -from adjustText import adjust_text import glob CF_add = sorted(glob.glob("TMScore_random*csv")) diff --git a/Data/AFsample2_benchmark/nsample_heatmap.py b/data/AFsample2_benchmark/nsample_heatmap.py similarity index 92% rename from Data/AFsample2_benchmark/nsample_heatmap.py rename to data/AFsample2_benchmark/nsample_heatmap.py index 7b751c6..aa99c34 100644 --- a/Data/AFsample2_benchmark/nsample_heatmap.py +++ b/data/AFsample2_benchmark/nsample_heatmap.py @@ -5,19 +5,11 @@ @author: Myeongsang (Samuel) Lee """ -import os -import sys import csv -import textalloc as ta -from pathlib import Path import numpy as np import seaborn as sns import matplotlib.pyplot as plt -import matplotlib.font_manager -from numpy import genfromtxt from matplotlib import pyplot as plt -from adjustText import adjust_text -import glob diff --git a/Data/E_coli/.keep b/data/E_coli/.keep similarity index 100% rename from Data/E_coli/.keep rename to data/E_coli/.keep diff --git a/Data/E_coli/WP_000015473.pse b/data/E_coli/WP_000015473.pse similarity index 100% rename from Data/E_coli/WP_000015473.pse rename to data/E_coli/WP_000015473.pse diff --git a/Data/E_coli/WP_000024392.pse b/data/E_coli/WP_000024392.pse similarity index 100% rename from Data/E_coli/WP_000024392.pse rename to data/E_coli/WP_000024392.pse diff --git a/Data/E_coli/WP_000064148.pse b/data/E_coli/WP_000064148.pse similarity index 100% rename from Data/E_coli/WP_000064148.pse rename to data/E_coli/WP_000064148.pse diff --git a/Data/E_coli/WP_000134927.pse b/data/E_coli/WP_000134927.pse similarity index 100% rename from Data/E_coli/WP_000134927.pse rename to data/E_coli/WP_000134927.pse diff --git a/Data/E_coli/WP_000190655.pse b/data/E_coli/WP_000190655.pse similarity index 100% rename from Data/E_coli/WP_000190655.pse rename to data/E_coli/WP_000190655.pse diff --git a/Data/E_coli/WP_000323025.pse b/data/E_coli/WP_000323025.pse similarity index 100% rename from 
Data/E_coli/WP_000323025.pse rename to data/E_coli/WP_000323025.pse diff --git a/Data/E_coli/WP_000581937.pse b/data/E_coli/WP_000581937.pse similarity index 100% rename from Data/E_coli/WP_000581937.pse rename to data/E_coli/WP_000581937.pse diff --git a/Data/E_coli/WP_000617148.pse b/data/E_coli/WP_000617148.pse similarity index 100% rename from Data/E_coli/WP_000617148.pse rename to data/E_coli/WP_000617148.pse diff --git a/Data/E_coli/WP_000648420.pse b/data/E_coli/WP_000648420.pse similarity index 100% rename from Data/E_coli/WP_000648420.pse rename to data/E_coli/WP_000648420.pse diff --git a/Data/E_coli/WP_000675390.pse b/data/E_coli/WP_000675390.pse similarity index 100% rename from Data/E_coli/WP_000675390.pse rename to data/E_coli/WP_000675390.pse diff --git a/Data/E_coli/WP_000699809.pse b/data/E_coli/WP_000699809.pse similarity index 100% rename from Data/E_coli/WP_000699809.pse rename to data/E_coli/WP_000699809.pse diff --git a/Data/E_coli/WP_000705622.pse b/data/E_coli/WP_000705622.pse similarity index 100% rename from Data/E_coli/WP_000705622.pse rename to data/E_coli/WP_000705622.pse diff --git a/Data/E_coli/WP_000763330.pse b/data/E_coli/WP_000763330.pse similarity index 100% rename from Data/E_coli/WP_000763330.pse rename to data/E_coli/WP_000763330.pse diff --git a/Data/E_coli/WP_000803992.pse b/data/E_coli/WP_000803992.pse similarity index 100% rename from Data/E_coli/WP_000803992.pse rename to data/E_coli/WP_000803992.pse diff --git a/Data/E_coli/WP_000807125.pse b/data/E_coli/WP_000807125.pse similarity index 100% rename from Data/E_coli/WP_000807125.pse rename to data/E_coli/WP_000807125.pse diff --git a/Data/E_coli/WP_000841554.pse b/data/E_coli/WP_000841554.pse similarity index 100% rename from Data/E_coli/WP_000841554.pse rename to data/E_coli/WP_000841554.pse diff --git a/Data/E_coli/WP_000847304.pse b/data/E_coli/WP_000847304.pse similarity index 100% rename from Data/E_coli/WP_000847304.pse rename to data/E_coli/WP_000847304.pse diff 
--git a/Data/E_coli/WP_000881326.pse b/data/E_coli/WP_000881326.pse similarity index 100% rename from Data/E_coli/WP_000881326.pse rename to data/E_coli/WP_000881326.pse diff --git a/Data/E_coli/WP_000920571.pse b/data/E_coli/WP_000920571.pse similarity index 100% rename from Data/E_coli/WP_000920571.pse rename to data/E_coli/WP_000920571.pse diff --git a/Data/E_coli/WP_000951334.pse b/data/E_coli/WP_000951334.pse similarity index 100% rename from Data/E_coli/WP_000951334.pse rename to data/E_coli/WP_000951334.pse diff --git a/Data/E_coli/WP_000955366.pse b/data/E_coli/WP_000955366.pse similarity index 100% rename from Data/E_coli/WP_000955366.pse rename to data/E_coli/WP_000955366.pse diff --git a/Data/E_coli/WP_000956458.pse b/data/E_coli/WP_000956458.pse similarity index 100% rename from Data/E_coli/WP_000956458.pse rename to data/E_coli/WP_000956458.pse diff --git a/Data/E_coli/WP_000976004.pse b/data/E_coli/WP_000976004.pse similarity index 100% rename from Data/E_coli/WP_000976004.pse rename to data/E_coli/WP_000976004.pse diff --git a/Data/E_coli/WP_000994516.pse b/data/E_coli/WP_000994516.pse similarity index 100% rename from Data/E_coli/WP_000994516.pse rename to data/E_coli/WP_000994516.pse diff --git a/Data/E_coli/WP_001002059.pse b/data/E_coli/WP_001002059.pse similarity index 100% rename from Data/E_coli/WP_001002059.pse rename to data/E_coli/WP_001002059.pse diff --git a/Data/E_coli/WP_001023459.pse b/data/E_coli/WP_001023459.pse similarity index 100% rename from Data/E_coli/WP_001023459.pse rename to data/E_coli/WP_001023459.pse diff --git a/Data/E_coli/WP_001070563.pse b/data/E_coli/WP_001070563.pse similarity index 100% rename from Data/E_coli/WP_001070563.pse rename to data/E_coli/WP_001070563.pse diff --git a/Data/E_coli/WP_001119863.pse b/data/E_coli/WP_001119863.pse similarity index 100% rename from Data/E_coli/WP_001119863.pse rename to data/E_coli/WP_001119863.pse diff --git a/Data/E_coli/WP_001129553.pse b/data/E_coli/WP_001129553.pse 
similarity index 100% rename from Data/E_coli/WP_001129553.pse rename to data/E_coli/WP_001129553.pse diff --git a/Data/E_coli/WP_001151233.pse b/data/E_coli/WP_001151233.pse similarity index 100% rename from Data/E_coli/WP_001151233.pse rename to data/E_coli/WP_001151233.pse diff --git a/Data/E_coli/WP_001185665.pse b/data/E_coli/WP_001185665.pse similarity index 100% rename from Data/E_coli/WP_001185665.pse rename to data/E_coli/WP_001185665.pse diff --git a/Data/E_coli/WP_001192396.pse b/data/E_coli/WP_001192396.pse similarity index 100% rename from Data/E_coli/WP_001192396.pse rename to data/E_coli/WP_001192396.pse diff --git a/Data/E_coli/WP_001204859.pse b/data/E_coli/WP_001204859.pse similarity index 100% rename from Data/E_coli/WP_001204859.pse rename to data/E_coli/WP_001204859.pse diff --git a/Data/E_coli/WP_001217394.pse b/data/E_coli/WP_001217394.pse similarity index 100% rename from Data/E_coli/WP_001217394.pse rename to data/E_coli/WP_001217394.pse diff --git a/Data/E_coli/WP_001241339.pse b/data/E_coli/WP_001241339.pse similarity index 100% rename from Data/E_coli/WP_001241339.pse rename to data/E_coli/WP_001241339.pse diff --git a/Data/E_coli/WP_001260507.pse b/data/E_coli/WP_001260507.pse similarity index 100% rename from Data/E_coli/WP_001260507.pse rename to data/E_coli/WP_001260507.pse diff --git a/Data/E_coli/WP_001262174.pse b/data/E_coli/WP_001262174.pse similarity index 100% rename from Data/E_coli/WP_001262174.pse rename to data/E_coli/WP_001262174.pse diff --git a/Data/E_coli/WP_001264088.pse b/data/E_coli/WP_001264088.pse similarity index 100% rename from Data/E_coli/WP_001264088.pse rename to data/E_coli/WP_001264088.pse diff --git a/Data/E_coli/WP_001270286.pse b/data/E_coli/WP_001270286.pse similarity index 100% rename from Data/E_coli/WP_001270286.pse rename to data/E_coli/WP_001270286.pse diff --git a/Data/E_coli/WP_001270809.pse b/data/E_coli/WP_001270809.pse similarity index 100% rename from Data/E_coli/WP_001270809.pse rename to 
data/E_coli/WP_001270809.pse diff --git a/Data/E_coli/WP_001272149.pse b/data/E_coli/WP_001272149.pse similarity index 100% rename from Data/E_coli/WP_001272149.pse rename to data/E_coli/WP_001272149.pse diff --git a/Data/E_coli/WP_001272856.pse b/data/E_coli/WP_001272856.pse similarity index 100% rename from Data/E_coli/WP_001272856.pse rename to data/E_coli/WP_001272856.pse diff --git a/Data/E_coli/WP_001279084.pse b/data/E_coli/WP_001279084.pse similarity index 100% rename from Data/E_coli/WP_001279084.pse rename to data/E_coli/WP_001279084.pse diff --git a/Data/E_coli/WP_001280953.pse b/data/E_coli/WP_001280953.pse similarity index 100% rename from Data/E_coli/WP_001280953.pse rename to data/E_coli/WP_001280953.pse diff --git a/Data/E_coli/WP_001281772.pse b/data/E_coli/WP_001281772.pse similarity index 100% rename from Data/E_coli/WP_001281772.pse rename to data/E_coli/WP_001281772.pse diff --git a/Data/E_coli/WP_001282181.pse b/data/E_coli/WP_001282181.pse similarity index 100% rename from Data/E_coli/WP_001282181.pse rename to data/E_coli/WP_001282181.pse diff --git a/Data/E_coli/WP_001295442.pse b/data/E_coli/WP_001295442.pse similarity index 100% rename from Data/E_coli/WP_001295442.pse rename to data/E_coli/WP_001295442.pse diff --git a/Data/E_coli/WP_001296140.pse b/data/E_coli/WP_001296140.pse similarity index 100% rename from Data/E_coli/WP_001296140.pse rename to data/E_coli/WP_001296140.pse diff --git a/Data/E_coli/WP_001296901.pse b/data/E_coli/WP_001296901.pse similarity index 100% rename from Data/E_coli/WP_001296901.pse rename to data/E_coli/WP_001296901.pse diff --git a/Data/E_coli/WP_001300163.pse b/data/E_coli/WP_001300163.pse similarity index 100% rename from Data/E_coli/WP_001300163.pse rename to data/E_coli/WP_001300163.pse diff --git a/Data/E_coli/WP_001303590.pse b/data/E_coli/WP_001303590.pse similarity index 100% rename from Data/E_coli/WP_001303590.pse rename to data/E_coli/WP_001303590.pse diff --git a/Data/E_coli/WP_001316982.pse 
b/data/E_coli/WP_001316982.pse similarity index 100% rename from Data/E_coli/WP_001316982.pse rename to data/E_coli/WP_001316982.pse diff --git a/Data/Fold-switch_hits-AFcluster/AIAT/1kct_A.zip b/data/Fold-switch_hits-AFcluster/AIAT/1kct_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/AIAT/1kct_A.zip rename to data/Fold-switch_hits-AFcluster/AIAT/1kct_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/AIAT/1kct_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/AIAT/1kct_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/AIAT/1kct_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/AIAT/1kct_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/AIAT/3t1p_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/AIAT/3t1p_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/AIAT/3t1p_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/AIAT/3t1p_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/COMT/4pyi_A.zip b/data/Fold-switch_hits-AFcluster/COMT/4pyi_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/COMT/4pyi_A.zip rename to data/Fold-switch_hits-AFcluster/COMT/4pyi_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/COMT/4pyi_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/COMT/4pyi_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/COMT/4pyi_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/COMT/4pyi_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/COMT/4pyj_A.zip b/data/Fold-switch_hits-AFcluster/COMT/4pyj_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/COMT/4pyj_A.zip rename to data/Fold-switch_hits-AFcluster/COMT/4pyj_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/COMT/4pyj_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/COMT/4pyj_A_tmscores_fs_all.csv similarity index 100% 
rename from Data/Fold-switch_hits-AFcluster/COMT/4pyj_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/COMT/4pyj_A_tmscores_fs_all.csv diff --git a/data/Fold-switch_hits-AFcluster/CRKL/2bzy_B_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/CRKL/2bzy_B_tmscores_fs_all.csv new file mode 100644 index 0000000..e69de29 diff --git a/Data/Fold-switch_hits-AFcluster/CRKL/2lqw_A.zip b/data/Fold-switch_hits-AFcluster/CRKL/2lqw_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/CRKL/2lqw_A.zip rename to data/Fold-switch_hits-AFcluster/CRKL/2lqw_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/CRKL/2lqw_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/CRKL/2lqw_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/CRKL/2lqw_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/CRKL/2lqw_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/CaBP/1jfk_A.zip b/data/Fold-switch_hits-AFcluster/CaBP/1jfk_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/CaBP/1jfk_A.zip rename to data/Fold-switch_hits-AFcluster/CaBP/1jfk_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/CaBP/1jfk_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/CaBP/1jfk_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/CaBP/1jfk_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/CaBP/1jfk_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/CaBP/2nxq_B_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/CaBP/2nxq_B_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/CaBP/2nxq_B_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/CaBP/2nxq_B_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Cas9/4cmq_B_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Cas9/4cmq_B_tmscores_fs_all.csv similarity index 100% rename from 
Data/Fold-switch_hits-AFcluster/Cas9/4cmq_B_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Cas9/4cmq_B_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Cas9/4zt0_C.zip b/data/Fold-switch_hits-AFcluster/Cas9/4zt0_C.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Cas9/4zt0_C.zip rename to data/Fold-switch_hits-AFcluster/Cas9/4zt0_C.zip diff --git a/Data/Fold-switch_hits-AFcluster/Cas9/4zt0_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Cas9/4zt0_C_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Cas9/4zt0_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Cas9/4zt0_C_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A.zip b/data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A.zip rename to data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Cwc2/3tp2_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Cwc2/5lj3_M_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Cwc2/5lj3_M_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Cwc2/5lj3_M_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Cwc2/5lj3_M_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/FUS_HENDH/1wp8_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/FUS_HENDH/1wp8_C_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/FUS_HENDH/1wp8_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/FUS_HENDH/1wp8_C_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C.zip 
b/data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C.zip rename to data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C.zip diff --git a/Data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/FUS_HENDH/5ejb_C_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Fab/3ztj_E_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Fab/3ztj_E_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Fab/3ztj_E_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Fab/3ztj_E_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Fab/5hmg_A.zip b/data/Fold-switch_hits-AFcluster/Fab/5hmg_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Fab/5hmg_A.zip rename to data/Fold-switch_hits-AFcluster/Fab/5hmg_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/Fab/5hmg_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Fab/5hmg_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Fab/5hmg_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Fab/5hmg_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/FraC/3zwg_N_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/FraC/3zwg_N_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/FraC/3zwg_N_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/FraC/3zwg_N_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/FraC/4tsyD.zip b/data/Fold-switch_hits-AFcluster/FraC/4tsyD.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/FraC/4tsyD.zip rename to data/Fold-switch_hits-AFcluster/FraC/4tsyD.zip diff --git 
a/Data/Fold-switch_hits-AFcluster/FraC/4tsy_D_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/FraC/4tsy_D_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/FraC/4tsy_D_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/FraC/4tsy_D_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/GP2/1ebo_E.zip b/data/Fold-switch_hits-AFcluster/GP2/1ebo_E.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/GP2/1ebo_E.zip rename to data/Fold-switch_hits-AFcluster/GP2/1ebo_E.zip diff --git a/Data/Fold-switch_hits-AFcluster/GP2/1ebo_E_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/GP2/1ebo_E_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/GP2/1ebo_E_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/GP2/1ebo_E_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/GP2/5fhc_J.zip b/data/Fold-switch_hits-AFcluster/GP2/5fhc_J.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/GP2/5fhc_J.zip rename to data/Fold-switch_hits-AFcluster/GP2/5fhc_J.zip diff --git a/Data/Fold-switch_hits-AFcluster/GP2/5fhc_J_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/GP2/5fhc_J_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/GP2/5fhc_J_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/GP2/5fhc_J_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/IscA/1x0g_A.zip b/data/Fold-switch_hits-AFcluster/IscA/1x0g_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/IscA/1x0g_A.zip rename to data/Fold-switch_hits-AFcluster/IscA/1x0g_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/IscA/1x0g_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/IscA/1x0g_C_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/IscA/1x0g_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/IscA/1x0g_C_tmscores_fs_all.csv diff --git 
a/Data/Fold-switch_hits-AFcluster/KaiB/2qkeE.zip b/data/Fold-switch_hits-AFcluster/KaiB/2qkeE.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/KaiB/2qkeE.zip rename to data/Fold-switch_hits-AFcluster/KaiB/2qkeE.zip diff --git a/Data/Fold-switch_hits-AFcluster/KaiB/2qke_E_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/KaiB/2qke_E_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/KaiB/2qke_E_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/KaiB/2qke_E_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/KaiB/5jytA.zip b/data/Fold-switch_hits-AFcluster/KaiB/5jytA.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/KaiB/5jytA.zip rename to data/Fold-switch_hits-AFcluster/KaiB/5jytA.zip diff --git a/Data/Fold-switch_hits-AFcluster/KaiB/5jyt_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/KaiB/5jyt_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/KaiB/5jyt_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/KaiB/5jyt_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Mad2/2vfx_L.zip b/data/Fold-switch_hits-AFcluster/Mad2/2vfx_L.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Mad2/2vfx_L.zip rename to data/Fold-switch_hits-AFcluster/Mad2/2vfx_L.zip diff --git a/Data/Fold-switch_hits-AFcluster/Mad2/2vfx_L_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Mad2/2vfx_L_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Mad2/2vfx_L_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Mad2/2vfx_L_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Mad2/3gmh_L_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Mad2/3gmh_L_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Mad2/3gmh_L_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Mad2/3gmh_L_tmscores_fs_all.csv diff 
--git a/Data/Fold-switch_hits-AFcluster/MinE/2kxo_A.zip b/data/Fold-switch_hits-AFcluster/MinE/2kxo_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/MinE/2kxo_A.zip rename to data/Fold-switch_hits-AFcluster/MinE/2kxo_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/MinE/2kxo_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/MinE/2kxo_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/MinE/2kxo_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/MinE/2kxo_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/MinE/3r9j_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/MinE/3r9j_C_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/MinE/3r9j_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/MinE/3r9j_C_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Nrp2/2qqj_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Nrp2/2qqj_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Nrp2/2qqj_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Nrp2/2qqj_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/Nrp2/4qds_A.zip b/data/Fold-switch_hits-AFcluster/Nrp2/4qds_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Nrp2/4qds_A.zip rename to data/Fold-switch_hits-AFcluster/Nrp2/4qds_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/Nrp2/4qds_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/Nrp2/4qds_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/Nrp2/4qds_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/Nrp2/4qds_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/OxyR/4xws_D_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/OxyR/4xws_D_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/OxyR/4xws_D_tmscores_fs_all.csv 
rename to data/Fold-switch_hits-AFcluster/OxyR/4xws_D_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/OxyR/4y0m_J.zip b/data/Fold-switch_hits-AFcluster/OxyR/4y0m_J.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/OxyR/4y0m_J.zip rename to data/Fold-switch_hits-AFcluster/OxyR/4y0m_J.zip diff --git a/Data/Fold-switch_hits-AFcluster/OxyR/4y0m_J_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/OxyR/4y0m_J_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/OxyR/4y0m_J_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/OxyR/4y0m_J_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/PimA/4n9w_A.zip b/data/Fold-switch_hits-AFcluster/PimA/4n9w_A.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/PimA/4n9w_A.zip rename to data/Fold-switch_hits-AFcluster/PimA/4n9w_A.zip diff --git a/Data/Fold-switch_hits-AFcluster/PimA/4n9w_A_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/PimA/4n9w_A_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/PimA/4n9w_A_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/PimA/4n9w_A_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/PimA/4nc9_C.zip b/data/Fold-switch_hits-AFcluster/PimA/4nc9_C.zip similarity index 100% rename from Data/Fold-switch_hits-AFcluster/PimA/4nc9_C.zip rename to data/Fold-switch_hits-AFcluster/PimA/4nc9_C.zip diff --git a/Data/Fold-switch_hits-AFcluster/PimA/4nc9_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/PimA/4nc9_C_tmscores_fs_all.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/PimA/4nc9_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/PimA/4nc9_C_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/RfaH/2oug_C_tmscores_fs_all.csv b/data/Fold-switch_hits-AFcluster/RfaH/2oug_C_tmscores_fs_all.csv similarity index 100% rename from 
Data/Fold-switch_hits-AFcluster/RfaH/2oug_C_tmscores_fs_all.csv rename to data/Fold-switch_hits-AFcluster/RfaH/2oug_C_tmscores_fs_all.csv diff --git a/Data/Fold-switch_hits-AFcluster/RfaH/RfaH_2ougC_both_folds.pse b/data/Fold-switch_hits-AFcluster/RfaH/RfaH_2ougC_both_folds.pse similarity index 100% rename from Data/Fold-switch_hits-AFcluster/RfaH/RfaH_2ougC_both_folds.pse rename to data/Fold-switch_hits-AFcluster/RfaH/RfaH_2ougC_both_folds.pse diff --git a/Data/Fold-switch_hits-AFcluster/details.csv b/data/Fold-switch_hits-AFcluster/details.csv similarity index 100% rename from Data/Fold-switch_hits-AFcluster/details.csv rename to data/Fold-switch_hits-AFcluster/details.csv diff --git a/Data/Fold-switch_hits-SPEACH_AF/A1AT.zip b/data/Fold-switch_hits-SPEACH_AF/A1AT.zip similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/A1AT.zip rename to data/Fold-switch_hits-SPEACH_AF/A1AT.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/FUS_HENDH.zip b/data/Fold-switch_hits-SPEACH_AF/FUS_HENDH.zip similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/FUS_HENDH.zip rename to data/Fold-switch_hits-SPEACH_AF/FUS_HENDH.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/KSHV_protease.zip b/data/Fold-switch_hits-SPEACH_AF/KSHV_protease.zip similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/KSHV_protease.zip rename to data/Fold-switch_hits-SPEACH_AF/KSHV_protease.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/OxyR.zip b/data/Fold-switch_hits-SPEACH_AF/OxyR.zip similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/OxyR.zip rename to data/Fold-switch_hits-SPEACH_AF/OxyR.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/RfAH.zip b/data/Fold-switch_hits-SPEACH_AF/RfAH.zip similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/RfAH.zip rename to data/Fold-switch_hits-SPEACH_AF/RfAH.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/capsid_protein.zip b/data/Fold-switch_hits-SPEACH_AF/capsid_protein.zip similarity index 100% 
rename from Data/Fold-switch_hits-SPEACH_AF/capsid_protein.zip rename to data/Fold-switch_hits-SPEACH_AF/capsid_protein.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/componentC3.zip b/data/Fold-switch_hits-SPEACH_AF/componentC3.zip similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/componentC3.zip rename to data/Fold-switch_hits-SPEACH_AF/componentC3.zip diff --git a/Data/Fold-switch_hits-SPEACH_AF/details.dat b/data/Fold-switch_hits-SPEACH_AF/details.dat similarity index 100% rename from Data/Fold-switch_hits-SPEACH_AF/details.dat rename to data/Fold-switch_hits-SPEACH_AF/details.dat diff --git a/Data/Fold-switch_hits/1iyt_plDDT.png b/data/Fold-switch_hits/1iyt_plDDT.png similarity index 100% rename from Data/Fold-switch_hits/1iyt_plDDT.png rename to data/Fold-switch_hits/1iyt_plDDT.png diff --git a/Data/Fold-switch_hits/1kct_plDDT.png b/data/Fold-switch_hits/1kct_plDDT.png similarity index 100% rename from Data/Fold-switch_hits/1kct_plDDT.png rename to data/Fold-switch_hits/1kct_plDDT.png diff --git a/Data/Fold-switch_hits/2jmr_A_plDDT.png b/data/Fold-switch_hits/2jmr_A_plDDT.png similarity index 100% rename from Data/Fold-switch_hits/2jmr_A_plDDT.png rename to data/Fold-switch_hits/2jmr_A_plDDT.png diff --git a/Data/Fold-switch_hits/2n54_A_plDDT.png b/data/Fold-switch_hits/2n54_A_plDDT.png similarity index 100% rename from Data/Fold-switch_hits/2n54_A_plDDT.png rename to data/Fold-switch_hits/2n54_A_plDDT.png diff --git a/Data/Fold-switch_hits/4phq_A_plDDT.png b/data/Fold-switch_hits/4phq_A_plDDT.png similarity index 100% rename from Data/Fold-switch_hits/4phq_A_plDDT.png rename to data/Fold-switch_hits/4phq_A_plDDT.png diff --git a/Data/Fold-switch_hits/4zrb_C_plDDT.png b/data/Fold-switch_hits/4zrb_C_plDDT.png similarity index 100% rename from Data/Fold-switch_hits/4zrb_C_plDDT.png rename to data/Fold-switch_hits/4zrb_C_plDDT.png diff --git a/Data/Fold-switch_hits/Figure1a.py b/data/Fold-switch_hits/Figure1a.py similarity index 90% rename from 
Data/Fold-switch_hits/Figure1a.py rename to data/Fold-switch_hits/Figure1a.py index 63b9ecd..d398d23 100644 --- a/Data/Fold-switch_hits/Figure1a.py +++ b/data/Fold-switch_hits/Figure1a.py @@ -1,10 +1,6 @@ #! /Users/porterll/miniconda3/bin/python -import sys -import pandas as pd -import numpy as np from matplotlib import pyplot as plt -import seaborn as sns import matplotlib as mpl if __name__ == '__main__': diff --git a/Data/Fold-switch_hits/Figure1b.py b/data/Fold-switch_hits/Figure1b.py similarity index 100% rename from Data/Fold-switch_hits/Figure1b.py rename to data/Fold-switch_hits/Figure1b.py diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1QB3_CFR.pse b/data/Fold-switch_hits/Fold_switch_pse/1QB3_CFR.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1QB3_CFR.pse rename to data/Fold-switch_hits/Fold_switch_pse/1QB3_CFR.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1j9o.pse b/data/Fold-switch_hits/Fold_switch_pse/1j9o.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1j9o.pse rename to data/Fold-switch_hits/Fold_switch_pse/1j9o.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1kct.pse b/data/Fold-switch_hits/Fold_switch_pse/1kct.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1kct.pse rename to data/Fold-switch_hits/Fold_switch_pse/1kct.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1miq_B.pse b/data/Fold-switch_hits/Fold_switch_pse/1miq_B.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1miq_B.pse rename to data/Fold-switch_hits/Fold_switch_pse/1miq_B.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1nqd.pse b/data/Fold-switch_hits/Fold_switch_pse/1nqd.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1nqd.pse rename to data/Fold-switch_hits/Fold_switch_pse/1nqd.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1nqj.pse b/data/Fold-switch_hits/Fold_switch_pse/1nqj.pse similarity index 100% 
rename from Data/Fold-switch_hits/Fold_switch_pse/1nqj.pse rename to data/Fold-switch_hits/Fold_switch_pse/1nqj.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1qb3.pse b/data/Fold-switch_hits/Fold_switch_pse/1qb3.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1qb3.pse rename to data/Fold-switch_hits/Fold_switch_pse/1qb3.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1qs8_best_preds.pse b/data/Fold-switch_hits/Fold_switch_pse/1qs8_best_preds.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1qs8_best_preds.pse rename to data/Fold-switch_hits/Fold_switch_pse/1qs8_best_preds.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1rep.pse b/data/Fold-switch_hits/Fold_switch_pse/1rep.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1rep.pse rename to data/Fold-switch_hits/Fold_switch_pse/1rep.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1xjt.pse b/data/Fold-switch_hits/Fold_switch_pse/1xjt.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1xjt.pse rename to data/Fold-switch_hits/Fold_switch_pse/1xjt.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/1xju.pse b/data/Fold-switch_hits/Fold_switch_pse/1xju.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/1xju.pse rename to data/Fold-switch_hits/Fold_switch_pse/1xju.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2k0q.pse b/data/Fold-switch_hits/Fold_switch_pse/2k0q.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2k0q.pse rename to data/Fold-switch_hits/Fold_switch_pse/2k0q.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2kxo.pse b/data/Fold-switch_hits/Fold_switch_pse/2kxo.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2kxo.pse rename to data/Fold-switch_hits/Fold_switch_pse/2kxo.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2lel.pse b/data/Fold-switch_hits/Fold_switch_pse/2lel.pse 
similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2lel.pse rename to data/Fold-switch_hits/Fold_switch_pse/2lel.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2n54.pse b/data/Fold-switch_hits/Fold_switch_pse/2n54.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2n54.pse rename to data/Fold-switch_hits/Fold_switch_pse/2n54.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2p3v_A.pse b/data/Fold-switch_hits/Fold_switch_pse/2p3v_A.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2p3v_A.pse rename to data/Fold-switch_hits/Fold_switch_pse/2p3v_A.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2p3v_D.pse b/data/Fold-switch_hits/Fold_switch_pse/2p3v_D.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2p3v_D.pse rename to data/Fold-switch_hits/Fold_switch_pse/2p3v_D.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2pbk.pse b/data/Fold-switch_hits/Fold_switch_pse/2pbk.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2pbk.pse rename to data/Fold-switch_hits/Fold_switch_pse/2pbk.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2qke.pse b/data/Fold-switch_hits/Fold_switch_pse/2qke.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2qke.pse rename to data/Fold-switch_hits/Fold_switch_pse/2qke.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2qqj.pse b/data/Fold-switch_hits/Fold_switch_pse/2qqj.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2qqj.pse rename to data/Fold-switch_hits/Fold_switch_pse/2qqj.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2vfx_A.pse b/data/Fold-switch_hits/Fold_switch_pse/2vfx_A.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2vfx_A.pse rename to data/Fold-switch_hits/Fold_switch_pse/2vfx_A.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2wcd_X.pse 
b/data/Fold-switch_hits/Fold_switch_pse/2wcd_X.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2wcd_X.pse rename to data/Fold-switch_hits/Fold_switch_pse/2wcd_X.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/2z9o.pse b/data/Fold-switch_hits/Fold_switch_pse/2z9o.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/2z9o.pse rename to data/Fold-switch_hits/Fold_switch_pse/2z9o.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3gmh_L.pse b/data/Fold-switch_hits/Fold_switch_pse/3gmh_L.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3gmh_L.pse rename to data/Fold-switch_hits/Fold_switch_pse/3gmh_L.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3hde.pse b/data/Fold-switch_hits/Fold_switch_pse/3hde.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3hde.pse rename to data/Fold-switch_hits/Fold_switch_pse/3hde.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3hdf.pse b/data/Fold-switch_hits/Fold_switch_pse/3hdf.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3hdf.pse rename to data/Fold-switch_hits/Fold_switch_pse/3hdf.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3nqj.pse b/data/Fold-switch_hits/Fold_switch_pse/3nqj.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3nqj.pse rename to data/Fold-switch_hits/Fold_switch_pse/3nqj.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3qy2.pse b/data/Fold-switch_hits/Fold_switch_pse/3qy2.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3qy2.pse rename to data/Fold-switch_hits/Fold_switch_pse/3qy2.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3r9j.pse b/data/Fold-switch_hits/Fold_switch_pse/3r9j.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3r9j.pse rename to data/Fold-switch_hits/Fold_switch_pse/3r9j.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3t1p_alignment.pse 
b/data/Fold-switch_hits/Fold_switch_pse/3t1p_alignment.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3t1p_alignment.pse rename to data/Fold-switch_hits/Fold_switch_pse/3t1p_alignment.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3tp2.pse b/data/Fold-switch_hits/Fold_switch_pse/3tp2.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3tp2.pse rename to data/Fold-switch_hits/Fold_switch_pse/3tp2.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/3zwg.pse b/data/Fold-switch_hits/Fold_switch_pse/3zwg.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/3zwg.pse rename to data/Fold-switch_hits/Fold_switch_pse/3zwg.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4hdd.pse b/data/Fold-switch_hits/Fold_switch_pse/4hdd.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4hdd.pse rename to data/Fold-switch_hits/Fold_switch_pse/4hdd.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4o01_preds.pse b/data/Fold-switch_hits/Fold_switch_pse/4o01_preds.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4o01_preds.pse rename to data/Fold-switch_hits/Fold_switch_pse/4o01_preds.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4o0p_preds.pse b/data/Fold-switch_hits/Fold_switch_pse/4o0p_preds.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4o0p_preds.pse rename to data/Fold-switch_hits/Fold_switch_pse/4o0p_preds.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4phq.pse b/data/Fold-switch_hits/Fold_switch_pse/4phq.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4phq.pse rename to data/Fold-switch_hits/Fold_switch_pse/4phq.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4qds.pse b/data/Fold-switch_hits/Fold_switch_pse/4qds.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4qds.pse rename to data/Fold-switch_hits/Fold_switch_pse/4qds.pse diff 
--git a/Data/Fold-switch_hits/Fold_switch_pse/4rwn.pse b/data/Fold-switch_hits/Fold_switch_pse/4rwn.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4rwn.pse rename to data/Fold-switch_hits/Fold_switch_pse/4rwn.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4rwq_best_pred.pse b/data/Fold-switch_hits/Fold_switch_pse/4rwq_best_pred.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4rwq_best_pred.pse rename to data/Fold-switch_hits/Fold_switch_pse/4rwq_best_pred.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4tsy.pse b/data/Fold-switch_hits/Fold_switch_pse/4tsy.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4tsy.pse rename to data/Fold-switch_hits/Fold_switch_pse/4tsy.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4yhd.pse b/data/Fold-switch_hits/Fold_switch_pse/4yhd.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4yhd.pse rename to data/Fold-switch_hits/Fold_switch_pse/4yhd.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4yhd_G_7ahl_E_pyMol_confirm.pse b/data/Fold-switch_hits/Fold_switch_pse/4yhd_G_7ahl_E_pyMol_confirm.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4yhd_G_7ahl_E_pyMol_confirm.pse rename to data/Fold-switch_hits/Fold_switch_pse/4yhd_G_7ahl_E_pyMol_confirm.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4zrb_A.pse b/data/Fold-switch_hits/Fold_switch_pse/4zrb_A.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4zrb_A.pse rename to data/Fold-switch_hits/Fold_switch_pse/4zrb_A.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/4zrb_H.pse b/data/Fold-switch_hits/Fold_switch_pse/4zrb_H.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/4zrb_H.pse rename to data/Fold-switch_hits/Fold_switch_pse/4zrb_H.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5ejb_1wp8_best_hits.pse 
b/data/Fold-switch_hits/Fold_switch_pse/5ejb_1wp8_best_hits.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5ejb_1wp8_best_hits.pse rename to data/Fold-switch_hits/Fold_switch_pse/5ejb_1wp8_best_hits.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5f3k.pse b/data/Fold-switch_hits/Fold_switch_pse/5f3k.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5f3k.pse rename to data/Fold-switch_hits/Fold_switch_pse/5f3k.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5f5r.pse b/data/Fold-switch_hits/Fold_switch_pse/5f5r.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5f5r.pse rename to data/Fold-switch_hits/Fold_switch_pse/5f5r.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5fhc_1ebo.pse b/data/Fold-switch_hits/Fold_switch_pse/5fhc_1ebo.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5fhc_1ebo.pse rename to data/Fold-switch_hits/Fold_switch_pse/5fhc_1ebo.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5i2m_5i2s_best_preds.pse b/data/Fold-switch_hits/Fold_switch_pse/5i2m_5i2s_best_preds.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5i2m_5i2s_best_preds.pse rename to data/Fold-switch_hits/Fold_switch_pse/5i2m_5i2s_best_preds.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5jyt.pse b/data/Fold-switch_hits/Fold_switch_pse/5jyt.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5jyt.pse rename to data/Fold-switch_hits/Fold_switch_pse/5jyt.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5lj3.pse b/data/Fold-switch_hits/Fold_switch_pse/5lj3.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5lj3.pse rename to data/Fold-switch_hits/Fold_switch_pse/5lj3.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/5ond.pse b/data/Fold-switch_hits/Fold_switch_pse/5ond.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/5ond.pse rename to 
data/Fold-switch_hits/Fold_switch_pse/5ond.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/6c6s.pse b/data/Fold-switch_hits/Fold_switch_pse/6c6s.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/6c6s.pse rename to data/Fold-switch_hits/Fold_switch_pse/6c6s.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/7ahl.pse b/data/Fold-switch_hits/Fold_switch_pse/7ahl.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/7ahl.pse rename to data/Fold-switch_hits/Fold_switch_pse/7ahl.pse diff --git a/Data/Fold-switch_hits/Fold_switch_pse/amyloid_fibrils.pse b/data/Fold-switch_hits/Fold_switch_pse/amyloid_fibrils.pse similarity index 100% rename from Data/Fold-switch_hits/Fold_switch_pse/amyloid_fibrils.pse rename to data/Fold-switch_hits/Fold_switch_pse/amyloid_fibrils.pse diff --git a/Data/Fold-switch_hits/Hits_CF_parameters_fold2.csv b/data/Fold-switch_hits/Hits_CF_parameters_fold2.csv similarity index 100% rename from Data/Fold-switch_hits/Hits_CF_parameters_fold2.csv rename to data/Fold-switch_hits/Hits_CF_parameters_fold2.csv diff --git a/Data/Fold-switch_hits/Single-sequence_confirm/1qb3.png b/data/Fold-switch_hits/Single-sequence_confirm/1qb3.png similarity index 100% rename from Data/Fold-switch_hits/Single-sequence_confirm/1qb3.png rename to data/Fold-switch_hits/Single-sequence_confirm/1qb3.png diff --git a/Data/Fold-switch_hits/Single-sequence_confirm/1qb3_model.pdb b/data/Fold-switch_hits/Single-sequence_confirm/1qb3_model.pdb similarity index 100% rename from Data/Fold-switch_hits/Single-sequence_confirm/1qb3_model.pdb rename to data/Fold-switch_hits/Single-sequence_confirm/1qb3_model.pdb diff --git a/Data/Fold-switch_hits/Single-sequence_confirm/2bzy.pdb b/data/Fold-switch_hits/Single-sequence_confirm/2bzy.pdb similarity index 100% rename from Data/Fold-switch_hits/Single-sequence_confirm/2bzy.pdb rename to data/Fold-switch_hits/Single-sequence_confirm/2bzy.pdb diff --git 
a/Data/Fold-switch_hits/Single-sequence_confirm/2bzy_2lqw_confirm_manual.pse b/data/Fold-switch_hits/Single-sequence_confirm/2bzy_2lqw_confirm_manual.pse similarity index 100% rename from Data/Fold-switch_hits/Single-sequence_confirm/2bzy_2lqw_confirm_manual.pse rename to data/Fold-switch_hits/Single-sequence_confirm/2bzy_2lqw_confirm_manual.pse diff --git a/Data/Fold-switch_hits/Single-sequence_confirm/3qy2_1qb3_prediction_confirm_maual.pse b/data/Fold-switch_hits/Single-sequence_confirm/3qy2_1qb3_prediction_confirm_maual.pse similarity index 100% rename from Data/Fold-switch_hits/Single-sequence_confirm/3qy2_1qb3_prediction_confirm_maual.pse rename to data/Fold-switch_hits/Single-sequence_confirm/3qy2_1qb3_prediction_confirm_maual.pse diff --git a/Data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_2lep_A.png b/data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_2lep_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_2lep_A.png rename to data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_2lep_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4hdd_A.png b/data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4hdd_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4hdd_A.png rename to data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4hdd_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4uv2_D.png b/data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4uv2_D.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4uv2_D.png rename to data/Fold-switch_hits/sample-TMscore_fs-region_full-MSA_4uv2_D.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_1miq_B.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_1miq_B.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_1miq_B.png rename to 
data/Fold-switch_hits/sample-TMscore_full-MSA_1miq_B.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_1nqj_B.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_1nqj_B.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_1nqj_B.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_1nqj_B.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_1xju_B.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_1xju_B.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_1xju_B.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_1xju_B.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2a73_B.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2a73_B.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2a73_B.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2a73_B.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2c1u_C.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2c1u_C.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2c1u_C.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2c1u_C.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2kxo_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2kxo_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2kxo_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2kxo_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2oug_C.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2oug_C.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2oug_C.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2oug_C.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2p3v_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2p3v_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2p3v_A.png 
rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2p3v_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2qke_E.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2qke_E.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2qke_E.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2qke_E.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_2vfx_L.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_2vfx_L.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_2vfx_L.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_2vfx_L.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_3hdf_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_3hdf_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_3hdf_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_3hdf_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_3njq_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_3njq_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_3njq_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_3njq_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_3tp2_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_3tp2_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_3tp2_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_3tp2_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_3zwg_N.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_3zwg_N.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_3zwg_N.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_3zwg_N.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_4o0p_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_4o0p_A.png similarity index 100% rename from 
Data/Fold-switch_hits/sample-TMscore_full-MSA_4o0p_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_4o0p_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_4qds_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_4qds_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_4qds_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_4qds_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_4tsy_D.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_4tsy_D.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_4tsy_D.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_4tsy_D.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_5f3k_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_5f3k_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_5f3k_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_5f3k_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_5fhc_J.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_5fhc_J.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_5fhc_J.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_5fhc_J.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_5i2m_A.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_5i2m_A.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_5i2m_A.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_5i2m_A.png diff --git a/Data/Fold-switch_hits/sample-TMscore_full-MSA_7ahl_E.png b/data/Fold-switch_hits/sample-TMscore_full-MSA_7ahl_E.png similarity index 100% rename from Data/Fold-switch_hits/sample-TMscore_full-MSA_7ahl_E.png rename to data/Fold-switch_hits/sample-TMscore_full-MSA_7ahl_E.png diff --git a/Data/SPEACH_AF_benchmark/SPEACH_AF-heatmap-nsamples.png b/data/SPEACH_AF_benchmark/SPEACH_AF-heatmap-nsamples.png similarity index 100% 
rename from Data/SPEACH_AF_benchmark/SPEACH_AF-heatmap-nsamples.png rename to data/SPEACH_AF_benchmark/SPEACH_AF-heatmap-nsamples.png diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_AK.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_AK.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_AK.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_AK.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_ASCT2.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_ASCT2.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_ASCT2.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_ASCT2.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_CCR5.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_CCR5.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_CCR5.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_CCR5.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_CGRPR.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_CGRPR.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_CGRPR.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_CGRPR.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_FZD7.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_FZD7.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_FZD7.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_FZD7.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_LAT1.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_LAT1.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_LAT1.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_LAT1.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_MCT1.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_MCT1.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_MCT1.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_MCT1.csv diff --git 
a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_MurJ.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_MurJ.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_MurJ.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_MurJ.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_PTH1R.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_PTH1R.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_PTH1R.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_PTH1R.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_PfMATE.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_PfMATE.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_PfMATE.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_PfMATE.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_RBP.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_RBP.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_RBP.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_RBP.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_SERT.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_SERT.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_SERT.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_SERT.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_STP10.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_STP10.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_STP10.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_STP10.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_full-MSA_ZnT8.csv b/data/SPEACH_AF_benchmark/TMScore_full-MSA_ZnT8.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_full-MSA_ZnT8.csv rename to data/SPEACH_AF_benchmark/TMScore_full-MSA_ZnT8.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_AK.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_AK.csv similarity index 100% rename from 
Data/SPEACH_AF_benchmark/TMScore_random-MSA_AK.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_AK.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_ASCT2.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_ASCT2.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_ASCT2.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_ASCT2.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_CCR5.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_CCR5.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_CCR5.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_CCR5.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_CGRPR.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_CGRPR.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_CGRPR.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_CGRPR.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_FZD7.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_FZD7.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_FZD7.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_FZD7.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_LAT1.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_LAT1.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_LAT1.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_LAT1.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_MCT1.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_MCT1.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_MCT1.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_MCT1.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_MurJ.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_MurJ.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_MurJ.csv rename to 
data/SPEACH_AF_benchmark/TMScore_random-MSA_MurJ.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_PTH1R.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_PTH1R.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_PTH1R.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_PTH1R.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_PfMATE.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_PfMATE.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_PfMATE.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_PfMATE.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_RBP.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_RBP.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_RBP.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_RBP.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_SERT.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_SERT.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_SERT.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_SERT.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_STP10.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_STP10.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_STP10.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_STP10.csv diff --git a/Data/SPEACH_AF_benchmark/TMScore_random-MSA_ZnT8.csv b/data/SPEACH_AF_benchmark/TMScore_random-MSA_ZnT8.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/TMScore_random-MSA_ZnT8.csv rename to data/SPEACH_AF_benchmark/TMScore_random-MSA_ZnT8.csv diff --git a/Data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.png b/data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.png similarity index 100% rename from Data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.png rename to data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.png diff 
--git a/Data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.svg b/data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.svg similarity index 100% rename from Data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.svg rename to data/SPEACH_AF_benchmark/heatmap-max TMscore comparison.svg diff --git a/Data/SPEACH_AF_benchmark/list_of_SPEACH_AF-PDB_ID.csv b/data/SPEACH_AF_benchmark/list_of_SPEACH_AF-PDB_ID.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/list_of_SPEACH_AF-PDB_ID.csv rename to data/SPEACH_AF_benchmark/list_of_SPEACH_AF-PDB_ID.csv diff --git a/Data/SPEACH_AF_benchmark/max_TM_heatmap.py b/data/SPEACH_AF_benchmark/max_TM_heatmap.py similarity index 97% rename from Data/SPEACH_AF_benchmark/max_TM_heatmap.py rename to data/SPEACH_AF_benchmark/max_TM_heatmap.py index 32da5b1..9534b96 100644 --- a/Data/SPEACH_AF_benchmark/max_TM_heatmap.py +++ b/data/SPEACH_AF_benchmark/max_TM_heatmap.py @@ -5,16 +5,11 @@ @author: Myeongsang (Samuel) Lee """ -import os -import sys -import textalloc as ta -from pathlib import Path import numpy as np import seaborn as sns import matplotlib.pyplot as plt from numpy import genfromtxt from matplotlib import pyplot as plt -from adjustText import adjust_text import glob diff --git a/Data/SPEACH_AF_benchmark/nsample_heatmap.py b/data/SPEACH_AF_benchmark/nsample_heatmap.py similarity index 91% rename from Data/SPEACH_AF_benchmark/nsample_heatmap.py rename to data/SPEACH_AF_benchmark/nsample_heatmap.py index 3c52e70..640a58f 100644 --- a/Data/SPEACH_AF_benchmark/nsample_heatmap.py +++ b/data/SPEACH_AF_benchmark/nsample_heatmap.py @@ -5,17 +5,11 @@ @author: Myeongsang (Samuel) Lee """ -import os -import sys -import textalloc as ta -from pathlib import Path import numpy as np import seaborn as sns import matplotlib.pyplot as plt -import matplotlib.font_manager from numpy import genfromtxt from matplotlib import pyplot as plt -from adjustText import adjust_text import glob diff --git 
a/Data/SPEACH_AF_benchmark/number_of_predictions.csv b/data/SPEACH_AF_benchmark/number_of_predictions.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/number_of_predictions.csv rename to data/SPEACH_AF_benchmark/number_of_predictions.csv diff --git a/Data/SPEACH_AF_benchmark/pse_files/.keep b/data/SPEACH_AF_benchmark/pse_files/.keep similarity index 100% rename from Data/SPEACH_AF_benchmark/pse_files/.keep rename to data/SPEACH_AF_benchmark/pse_files/.keep diff --git a/Data/SPEACH_AF_benchmark/pse_files/pse_files.zip b/data/SPEACH_AF_benchmark/pse_files/pse_files.zip similarity index 100% rename from Data/SPEACH_AF_benchmark/pse_files/pse_files.zip rename to data/SPEACH_AF_benchmark/pse_files/pse_files.zip diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_AK.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_AK.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_AK.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_AK.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_ASCT2.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_ASCT2.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_ASCT2.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_ASCT2.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_CCR5.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_CCR5.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_CCR5.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_CCR5.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_CGRPR.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_CGRPR.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_CGRPR.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_CGRPR.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_FZD7.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_FZD7.csv similarity index 100% rename from 
Data/SPEACH_AF_benchmark/ref_data_Mchaourab_FZD7.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_FZD7.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_LAT1.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_LAT1.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_LAT1.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_LAT1.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_MCT1.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_MCT1.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_MCT1.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_MCT1.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_MurJ.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_MurJ.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_MurJ.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_MurJ.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_PTH1R.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_PTH1R.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_PTH1R.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_PTH1R.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_PfMATE.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_PfMATE.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_PfMATE.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_PfMATE.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_RBP.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_RBP.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_RBP.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_RBP.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_SERT.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_SERT.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_SERT.csv rename to 
data/SPEACH_AF_benchmark/ref_data_Mchaourab_SERT.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_STP10.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_STP10.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_STP10.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_STP10.csv diff --git a/Data/SPEACH_AF_benchmark/ref_data_Mchaourab_ZnT8.csv b/data/SPEACH_AF_benchmark/ref_data_Mchaourab_ZnT8.csv similarity index 100% rename from Data/SPEACH_AF_benchmark/ref_data_Mchaourab_ZnT8.csv rename to data/SPEACH_AF_benchmark/ref_data_Mchaourab_ZnT8.csv diff --git a/Data/Sa1/8E6Y_GA_CFR_2_4.pse b/data/Sa1/8E6Y_GA_CFR_2_4.pse similarity index 100% rename from Data/Sa1/8E6Y_GA_CFR_2_4.pse rename to data/Sa1/8E6Y_GA_CFR_2_4.pse diff --git a/Data/Sa1/8e5y_rebalanced.a3m b/data/Sa1/8e5y_rebalanced.a3m similarity index 100% rename from Data/Sa1/8e5y_rebalanced.a3m rename to data/Sa1/8e5y_rebalanced.a3m diff --git a/Data/Sa1/8e6y_1gjs.a3m b/data/Sa1/8e6y_1gjs.a3m similarity index 100% rename from Data/Sa1/8e6y_1gjs.a3m rename to data/Sa1/8e6y_1gjs.a3m diff --git a/Data/Sa1/8e6y_1gjs.pse b/data/Sa1/8e6y_1gjs.pse similarity index 100% rename from Data/Sa1/8e6y_1gjs.pse rename to data/Sa1/8e6y_1gjs.pse diff --git a/Data/Sa1/8e6y_2fs1.a3m b/data/Sa1/8e6y_2fs1.a3m similarity index 100% rename from Data/Sa1/8e6y_2fs1.a3m rename to data/Sa1/8e6y_2fs1.a3m diff --git a/Data/Sa1/8e6y_2fs1.pse b/data/Sa1/8e6y_2fs1.pse similarity index 100% rename from Data/Sa1/8e6y_2fs1.pse rename to data/Sa1/8e6y_2fs1.pse diff --git a/Data/Sa1/8e6y_2mh8.a3m b/data/Sa1/8e6y_2mh8.a3m similarity index 100% rename from Data/Sa1/8e6y_2mh8.a3m rename to data/Sa1/8e6y_2mh8.a3m diff --git a/Data/Sa1/8e6y_2mh8.pse b/data/Sa1/8e6y_2mh8.pse similarity index 100% rename from Data/Sa1/8e6y_2mh8.pse rename to data/Sa1/8e6y_2mh8.pse diff --git a/Data/Sa1/8e6y_SA1_CFR_Full.pse b/data/Sa1/8e6y_SA1_CFR_Full.pse similarity index 100% rename from Data/Sa1/8e6y_SA1_CFR_Full.pse 
rename to data/Sa1/8e6y_SA1_CFR_Full.pse diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..6297c5d --- /dev/null +++ b/environment.yml @@ -0,0 +1,15 @@ +name: cf-random # conda activate cf-random +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.10 + - pandas=1.5.3 + - biopython=1.79 + - colabfold + - foldseek + - pymol-open-source + - pip + - pip: + - -e . \ No newline at end of file diff --git a/examples/1_FS_mode-RfaH/input_files/2oug_C-search/0.a3m b/examples/1_FS_mode-RfaH/input_files/2oug_C-search/0.a3m index 31d5b18..55cfc87 100644 --- a/examples/1_FS_mode-RfaH/input_files/2oug_C-search/0.a3m +++ b/examples/1_FS_mode-RfaH/input_files/2oug_C-search/0.a3m @@ -11477,4 +11477,4 @@ ME-WYLVTCSPGREEECKSRIEQRirfsgtddIPQVVIPTQEEITVKGGQRISSVKKIFPGYLMIEMEM----------- >SRR4051794_15176281 40 0.333 6.892E+00 108 159 162 36 89 113 ------------------------------------------------------------------------------------------------------------WFRRGDVVRITAGALEGIEGTVTEVDvRQGRAGLFLHLLCRQSElAWVDFKDLR-- >MGYP001480553237 39 0.174 9.305E+00 2 54 162 40 102 104 ---RWYAIQVASSCEKKVKATLEQRSItlgvsakilEIEIPQTPgVKLKKDGSRQTIEEKVFPGYV----------------------------------------------------------------------------------------------------------- +--RWYAIQVASSCEKKVKATLEQRSItlgvsakilEIEIPQTPgVKLKKDGSRQTIEEKVFPGYV----------------------------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/examples/2_Beta-phosphoglucomutase/input_files/P71447-search/P71447_converted.a3m b/examples/2_Beta-phosphoglucomutase/input_files/P71447-search/P71447_converted.a3m index 8825c28..baa4639 100644 --- a/examples/2_Beta-phosphoglucomutase/input_files/P71447-search/P71447_converted.a3m +++ b/examples/2_Beta-phosphoglucomutase/input_files/P71447-search/P71447_converted.a3m @@ -23993,4 +23993,4 @@ MIEACIFDLDGVVTDTAEYHYLGWKKMADEANFH-FDRLINEQLRGVSRRQSLQIILRYNDEKLSEQKMKEMMDKKNEYY >name 
-KKGFIFDLDGVIVDTAKYHFLAWKKLANSLGIDF-TKEENEQLKGVSRVRSLQKILTWGNKSLSSEEFTDLMAKKNDDYLGYIDKMDASEVLPDVHRVLNYLKQKHQPIALGSASKNAKPILQKVNLLSDFDAIVDGNDVSRAKPDPEVFVTAANLLKIEPKECIVFEDSVAGVEAANVAGMVSIGIGSETVLGHADHVFQDFTQISEEFLNQ------- >name ---QACLFDLDGVLVDTAKYHFIAWKELADDLGFPFT-EQDNERLKGVSRVASLNILLEIGGLSFSEAEKVKLAEKKNNRYVEYITTMDSSEILPGAIEFLKECNAQGIKVALGSASKNAMMILDNTGLTPYFDAIIDGTHTSVAKPDPEVFLLGASALGIAPDHCVVFEDAEAGIEAAIRAGMANVGIGSPETLSAANMVVPSLQQMNVALLRES------ +--QACLFDLDGVLVDTAKYHFIAWKELADDLGFPFT-EQDNERLKGVSRVASLNILLEIGGLSFSEAEKVKLAEKKNNRYVEYITTMDSSEILPGAIEFLKECNAQGIKVALGSASKNAMMILDNTGLTPYFDAIIDGTHTSVAKPDPEVFLLGASALGIAPDHCVVFEDAEAGIEAAIRAGMANVGIGSPETLSAANMVVPSLQQMNVALLRES------ \ No newline at end of file diff --git a/examples/3_blind_mode-Mad2/input_files/2vfx_L-search/0.a3m b/examples/3_blind_mode-Mad2/input_files/2vfx_L-search/0.a3m index e8615ec..6c481c8 100644 --- a/examples/3_blind_mode-Mad2/input_files/2vfx_L-search/0.a3m +++ b/examples/3_blind_mode-Mad2/input_files/2vfx_L-search/0.a3m @@ -6527,4 +6527,4 @@ GMASTSQRRSQVDLR--LDVITEFLEAATHHTLHARGVYPKDLLESPAFYGASV-------------------------- >MGYP001490664935 41 0.692 5.418E+00 11 36 206 1 26 47 -----------ITLKGSVDIVSEFFFTAINSILYQRG------------------------------------------------------------------------------------------------------------------------------------------------------------------------- >ERR1700675_1634616 40 0.290 9.714E+00 54 108 206 21 75 104 -------------------------------------------------------PVSSDDQVKAYIKRIMSQLNKWMLGGKISNLVVVIPSKERVENVERGSFHVDILG------------------------------------------------------------------------------------------------- +------------------------------------------------------PVSSDDQVKAYIKRIMSQLNKWMLGGKISNLVVVIPSKERVENVERGSFHVDILG------------------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/examples/README b/examples/README index 99a742f..c429927 100644 --- a/examples/README +++ b/examples/README @@ -17,8 +17,8 
@@ python main.py --fname 2oug_C-search/ --pdb1 2oug_C.pdb --pdb2 6c6s_D.pdb --opti *This takes <30 Minutes to run on an A100 GPU (generates 300 structures total).*
### Generated output files:
-_Predicted files from deep and random MSAs are deposited in 'successed_prediction' directory, and ensembles were in 'additional_sampling' folder._
-_If CF-random fails to find the selected random MSA, all generated files will be in 'failed_prediction' directory._
+_Predicted files from deep and random MSAs are deposited in the 'successful_predictions' directory, and ensembles are in the 'additional_sampling' folder._
+_If CF-random fails to find the selected random MSA, all generated files will be in the 'failed_predictions' directory._
* TM-score plot of whole structure: TMscore_fs-region_full-MSA_2oug_C.png
* TM-score plot of fold-switching region: TMscore_full-MSA_2oug_C.png
* TM-score plot of fold-switching region with label of prediction rank: TMscore_fs-region_full-MSA_2oug_C_label.png
@@ -33,7 +33,7 @@ _If CF-random fails to find the selected random MSA, all generated files will be ## 2. For CF-random with alternative conformation mode.
For this mode, Beta-phosphoglucomutase would be predicted with two reference structures (i.e., 2wfa.pdb and 2wf5.pdb) and an MSA file.
``` -python main.py --fname P71447-search --pdb1 2wfa_A.pdb --pdb2 2wf5_A.pdb --option AC --nMSA 15 +python main.py --fname P71447-search --pdb1 2wfa_A.pdb --pdb2 2wf5_A.pdb --option AC --num_msa 15 ``` ### Used input files:
* PDB1: 2wfa_A.pdb
@@ -43,8 +43,8 @@ python main.py --fname P71447-search --pdb1 2wfa_A.pdb --pdb2 2wf5_A.pdb --optio *This takes <90 Minutes to run on an A100 GPU (generates 800 structures total; protein is large: ~250 residues).*
### Generated output files:
-_Predicted files from deep and random MSAs are deposited in 'successed_prediction' directory, and ensembles were in 'additional_sampling' folder._
-_If CF-random fails to find the selected random MSA, all generated files will be in 'failed_prediction' directory._
+_Predicted files from deep and random MSAs are deposited in the 'successful_predictions' directory, and ensembles are in the 'additional_sampling' folder._
+_If CF-random fails to find the selected random MSA, all generated files will be in the 'failed_predictions' directory._
* TM-score plot of whole structure: TMscore_full-MSA_5olw_A.png
* TM-scores and plDDT scores of predictions with deep MSA: TMs_plDDT_full_all_5olw_A.csv
* TM-scores and plDDT scores of predictions with random MSAs: TMs_plDDT_rand_all_5olw_A.csv
diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..8f8f379 --- /dev/null +++ b/install.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -e +echo "=== CF-random Installation ===" +echo "[1/2] Ensuring conda environment 'cf-random' exists..." +eval "$(conda shell.bash hook)" +if conda env list | grep -q "cf-random"; then + echo " Conda environment 'cf-random' already exists; skipping creation." +else + echo " Creating conda environment from environment.yml..." + conda env create -f environment.yml -n cf-random +fi +conda activate cf-random + +# Install JAX with GPU or CPU depending on hardware +JAX_VERSION=0.4.28 +if command -v nvidia-smi &> /dev/null; then + CUDA_VERSION=$(nvidia-smi | grep -oP "CUDA Version: \K[0-9]+" | head -1) + echo " GPU detected (CUDA $CUDA_VERSION), installing GPU-enabled JAX..." + if [ "$CUDA_VERSION" -ge 12 ]; then + pip install "jax[cuda12_pip]==${JAX_VERSION}" \ + -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + else + pip install "jax[cuda11_pip]==${JAX_VERSION}" \ + -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + fi +else + echo " No GPU detected, installing CPU-only JAX..." + pip install "jax[cpu]==${JAX_VERSION}" \ + -f https://storage.googleapis.com/jax-releases/jax_releases.html +fi + +echo "[2/2] Verifying installation (best-effort checks)..." 
+python -c "import importlib.util; print(' biopython:', importlib.util.find_spec('Bio') is not None)" +python -c "import importlib.util; print(' numpy:', importlib.util.find_spec('numpy') is not None)" +python -c "import importlib.util; print(' jax:', importlib.util.find_spec('jax') is not None)" +if command -v colabfold_batch &> /dev/null; then + echo " colabfold ok" +fi +if command -v cf-random &> /dev/null; then + echo " cf-random ok" +fi + +echo "" +echo "=== Installation complete ===" +echo "Activate with: conda activate cf-random" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e4cd9e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,73 @@ +[build-system] +requires = ["setuptools>=65.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "cf-random" +version = "0.2.4" +description = "CF-random: Predicting alternative conformations and fold-switching proteins" +readme = "README.md" +requires-python = ">=3.10" +license = {text = "Public Domain (NCBI)"} +authors = [ + {name = "Myeongsang (Samuel) Lee"}, + {name = "Pramesh Sharma"}, +] +keywords = ["protein-structure", "fold-switching", "alternative-conformation", "colabfold"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: Public Domain", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "matplotlib", + "seaborn", + "scikit-learn", + "mdtraj", + "MDAnalysis", + "textalloc", + "tmtools", + "adjustText", + "thefuzz", + "numpy>=1.23.5,<2.0", + "biopython>=1.79", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-cov", + "black", + "isort", + "flake8", + "autoflake", + "twine", + "build", +] + +[project.urls] +Homepage = "https://github.com/ncbi/CF-random_software" + +[project.scripts] +cf-random = "cf_random.cli:main" + 
+[tool.setuptools.packages.find] +where = ["."] +include = ["cf_random*"] + +[tool.setuptools.package-data] +cf_random = ["data/*"] + +[tool.black] +line-length = 100 +target-version = ["py310"] + +[tool.isort] +profile = "black" +line_length = 10 + +[tool.pytest.ini_options] +testpaths = ["tests"] \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_alternative_conformation.py b/tests/test_alternative_conformation.py new file mode 100644 index 0000000..63c07bd --- /dev/null +++ b/tests/test_alternative_conformation.py @@ -0,0 +1,461 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Tests for alternative conformation TM-score pipelines. + +Unit tests cover TMScore, TMScoreCalAllVar._evaluate_monomer / +_evaluate_multimer, and select_size logic. + +Run with: + pytest tests/test_alternative_conformation.py -v +""" + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest + +from cf_random.utils.convert_multi_single import ConvertM2S +from cf_random.analysis.tmscore_all_var import TMScore, TMScoreCalAllVar, MSA_MULTIPLIERS +from cf_random.analysis.base import BaseTMScore + +PDB1_NAME = "5olw_A" +PDB2_NAME = "5olx_A" +MONOMER_MODEL_TYPE = "alphafold2_ptm" +MULTIMER_MODEL_TYPE = "alphafold2_multimer_v3" +NUM_SEEDS = 5 +NUM_MODELS = 5 +NUM_PREDICTIONS = NUM_SEEDS * NUM_MODELS # 25 + +RNG = np.random.default_rng(42) + +RESNAMES = ["ALA", "GLY", "VAL", "LEU", "ILE", "PRO", "PHE", "TRP", "MET", "SER"] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _write_minimal_pdb(path: Path, n_residues: int = 10, n_chains: int = 1) -> None: + with path.open("w") as fh: + atom_num = 1 + res_num = 1 + for chain_idx in range(n_chains): + chain_id = chr(ord("A") + chain_idx) + for i in range(n_residues): + 
resname = RESNAMES[i % len(RESNAMES)] + x, y, z = RNG.uniform(-10, 10, 3) + fh.write( + f"ATOM {atom_num:5d} CA {resname} {chain_id}" + f"{res_num:4d} " + f"{x:8.3f}{y:8.3f}{z:8.3f} 1.00 10.00 C\n" + ) + atom_num += 1 + res_num += 1 + fh.write("TER\n") + fh.write("END\n") + + +def _make_monomer_prediction_dir( + base: Path, + name: str, + n_models: int = NUM_PREDICTIONS, + n_residues: int = 10, +) -> Path: + pred_dir = base / name + pred_dir.mkdir(parents=True, exist_ok=True) + for i in range(1, n_models + 1): + pdb = pred_dir / f"model_{i}_unrelaxed_rank_{i:03d}_alphafold2_ptm.pdb" + _write_minimal_pdb(pdb, n_residues=n_residues, n_chains=1) + return pred_dir + + +def _make_multimer_prediction_dir( + base: Path, + name: str, + n_models: int = NUM_PREDICTIONS, + n_chains: int = 2, + n_residues: int = 10, +) -> Path: + pred_dir = base / name + pred_dir.mkdir(parents=True, exist_ok=True) + for i in range(1, n_models + 1): + pdb = pred_dir / (f"0_unrelaxed_rank_{i:03d}_alphafold2_multimer_v3_model_1_seed_000.pdb") + _write_minimal_pdb(pdb, n_residues=n_residues, n_chains=n_chains) + return pred_dir + + +def _make_converted_multimer_dir( + base: Path, + name: str, + n_models: int = NUM_PREDICTIONS, + n_residues: int = 10, +) -> Path: + pred_dir = _make_multimer_prediction_dir(base, name, n_models=n_models, n_residues=n_residues) + pdb2_file = base / f"{PDB2_NAME}.pdb" + if not pdb2_file.exists(): + _write_minimal_pdb(pdb2_file, n_residues=n_residues, n_chains=1) + ConvertM2S(str(pred_dir), PDB1_NAME, PDB2_NAME) + return pred_dir + + +def _good_tm_result(value: float = 0.6) -> MagicMock: + result = MagicMock() + result.tm_norm_chain1 = value + return result + + +def _make_pipeline_dirs(tmp_path, multimer: bool = False) -> Path: + """Create full prediction directory structure for integration tests.""" + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + + 
pred_root = tmp_path / "predictions_all" / PDB1_NAME + pred_root.mkdir(parents=True) + + if multimer: + full_name = f"{PDB1_NAME}_predicted_models_full_rand_0" + _make_multimer_prediction_dir(pred_root, full_name, n_residues=10) + ConvertM2S(str(pred_root / full_name), PDB1_NAME, PDB2_NAME) + max_msa, ext_msa = 1, 2 + for mult in MSA_MULTIPLIERS: + max_msa *= mult + ext_msa *= mult + rand_name = f"{PDB1_NAME}_predicted_models_rand_0_max_{max_msa}_ext_{ext_msa}" + _make_multimer_prediction_dir(pred_root, rand_name, n_residues=10) + ConvertM2S(str(pred_root / rand_name), PDB1_NAME, PDB2_NAME) + else: + _make_monomer_prediction_dir( + pred_root, f"{PDB1_NAME}_predicted_models_full_rand_0", n_residues=10 + ) + max_msa, ext_msa = 1, 2 + for mult in MSA_MULTIPLIERS: + max_msa *= mult + ext_msa *= mult + _make_monomer_prediction_dir( + pred_root, + f"{PDB1_NAME}_predicted_models_rand_0_max_{max_msa}_ext_{ext_msa}", + n_residues=10, + ) + return tmp_path + + +# --------------------------------------------------------------------------- +# TMScore.select_size +# --------------------------------------------------------------------------- + + +class TestSelectSize: + def _make_scorer(self) -> TMScore: + scorer = object.__new__(TMScore) + scorer.pdb1_name = PDB1_NAME + scorer.pdb2_name = PDB2_NAME + return scorer + + def test_selects_pdb2_as_alternative(self): + scorer = self._make_scorer() + num_seeds = 5 + # Row 1 (pdb2) has high scores at MSA depth 0 → selection should be 0 + data = np.zeros((14, num_seeds * 5)) + data[1, :] = 0.8 + scorer.select_size(data.flatten(), PDB1_NAME, PDB2_NAME, PDB2_NAME, num_seeds) + assert scorer.selection == 0 + + def test_selects_pdb1_as_alternative(self): + scorer = self._make_scorer() + num_seeds = 5 + # Row 0 (pdb1) has high scores at MSA depth 0 → selection should be 0 + data = np.zeros((14, num_seeds * 5)) + data[0, :] = 0.8 + scorer.select_size(data.flatten(), PDB1_NAME, PDB2_NAME, PDB1_NAME, num_seeds) + assert scorer.selection == 0 + 
+ def test_raises_when_no_scores_above_threshold(self): + scorer = self._make_scorer() + num_seeds = 5 + data = np.full((14, num_seeds * 5), 0.3) + with pytest.raises(RuntimeError): + scorer.select_size(data.flatten(), PDB1_NAME, PDB2_NAME, PDB2_NAME, num_seeds) + + def test_selection_is_int(self): + scorer = self._make_scorer() + num_seeds = 5 + data = np.zeros((14, num_seeds * 5)) + data[1, :] = 0.8 + scorer.select_size(data.flatten(), PDB1_NAME, PDB2_NAME, PDB2_NAME, num_seeds) + assert isinstance(scorer.selection, int) + + def test_picks_highest_sum_depth(self): + """Row with highest sum across seeds should be selected.""" + scorer = self._make_scorer() + num_seeds = 5 + data = np.zeros((14, num_seeds * 5)) + # MSA depth 2 (row index 5 for pdb2) has highest scores + data[5, :] = 0.9 + data[1, :] = 0.6 + scorer.select_size(data.flatten(), PDB1_NAME, PDB2_NAME, PDB2_NAME, num_seeds) + assert scorer.selection == 2 + + +# --------------------------------------------------------------------------- +# TMScoreCalAllVar._determine_alternative / _extract_alternative_rows +# --------------------------------------------------------------------------- + + +class TestHelperMethods: + def _make_cal(self) -> TMScoreCalAllVar: + cal = object.__new__(TMScoreCalAllVar) + cal.pdb1_name = PDB1_NAME + cal.pdb2_name = PDB2_NAME + return cal + + def test_determine_alternative_row0_higher(self): + cal = self._make_cal() + scores = np.array([0.8, 0.3]) + assert cal._determine_alternative(scores) == PDB2_NAME + + def test_determine_alternative_row1_higher(self): + cal = self._make_cal() + scores = np.array([0.3, 0.8]) + assert cal._determine_alternative(scores) == PDB1_NAME + + def test_determine_alternative_equal_scores(self): + """Equal scores → pdb2 is alternative (row0 >= row1 condition).""" + cal = self._make_cal() + scores = np.array([0.5, 0.5]) + assert cal._determine_alternative(scores) == PDB2_NAME + + def test_extract_alternative_rows_pdb2(self): + matrix = np.arange(14 * 
5).reshape(14, 5).astype(float) + result = TMScoreCalAllVar._extract_alternative_rows(matrix, PDB2_NAME, PDB1_NAME, PDB2_NAME) + expected = matrix[1::2, :] + np.testing.assert_array_equal(result, expected) + + def test_extract_alternative_rows_pdb1(self): + matrix = np.arange(14 * 5).reshape(14, 5).astype(float) + result = TMScoreCalAllVar._extract_alternative_rows(matrix, PDB1_NAME, PDB1_NAME, PDB2_NAME) + expected = matrix[0::2, :] + np.testing.assert_array_equal(result, expected) + + +# --------------------------------------------------------------------------- +# Monomer integration +# --------------------------------------------------------------------------- + + +class TestEvaluateMonomerIntegration: + @pytest.fixture() + def monomer_dir(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + return _make_pipeline_dirs(tmp_path, multimer=False) + + @patch("cf_random.analysis.base.tm_align") + def test_completes(self, mock_align, monomer_dir): + mock_align.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + assert len(scorer.size_selection) == 1 + + @patch("cf_random.analysis.base.tm_align") + def test_csv_files_written(self, mock_align, monomer_dir): + mock_align.return_value = _good_tm_result(0.6) + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + assert (monomer_dir / f"TMScore_full-MSA_{PDB1_NAME}.csv").exists() + assert (monomer_dir / f"TMScore_random-MSA_{PDB1_NAME}.csv").exists() + + @patch("cf_random.analysis.base.tm_align") + def test_no_fs_csv_written(self, mock_align, monomer_dir): + """AC mode must not write FS-region CSV files.""" + mock_align.return_value = _good_tm_result(0.6) + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + 
pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + assert not (monomer_dir / f"TMScore_fs_full-MSA_{PDB1_NAME}.csv").exists() + assert not (monomer_dir / f"TMScore_fs_random-MSA_{PDB1_NAME}.csv").exists() + + @patch("cf_random.analysis.base.tm_align") + def test_bad_full_msa_raises(self, mock_align, monomer_dir): + mock_align.return_value = _good_tm_result(0.1) + with pytest.raises(RuntimeError): + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + + @patch("cf_random.analysis.base.tm_align") + def test_failed_predictions_moved(self, mock_align, monomer_dir): + mock_align.return_value = _good_tm_result(0.1) + with pytest.raises(RuntimeError): + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + failed_root = monomer_dir / "failed_predictions" / PDB1_NAME + assert failed_root.exists() + assert any(failed_root.iterdir()) + + @patch("cf_random.analysis.base.tm_align") + def test_size_selection_is_int(self, mock_align, monomer_dir): + mock_align.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + assert isinstance(scorer.size_selection[0], int) + + @patch("cf_random.analysis.base.tm_align") + def test_ref_alt_determination(self, mock_align, monomer_dir): + """Row 0 averaging higher → pdb2 is alternative.""" + high = _good_tm_result(0.8) + low = _good_tm_result(0.2) + + def side_effect(*args, **kwargs): + side_effect.count = getattr(side_effect, "count", 0) + 1 + return high if side_effect.count % 2 == 1 else low + + mock_align.side_effect = 
side_effect + scorer = TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MONOMER_MODEL_TYPE, + ) + assert len(scorer.size_selection) == 1 + + +# --------------------------------------------------------------------------- +# Multimer integration +# --------------------------------------------------------------------------- + + +class TestEvaluateMultimerIntegration: + @pytest.fixture() + def multimer_dir(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + return _make_pipeline_dirs(tmp_path, multimer=True) + + @patch("cf_random.analysis.base.tm_align") + def test_completes(self, mock_align, multimer_dir): + mock_align.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MULTIMER_MODEL_TYPE, + ) + assert len(scorer.size_selection) == 1 + + @patch("cf_random.analysis.base.tm_align") + def test_csv_files_written(self, mock_align, multimer_dir): + mock_align.return_value = _good_tm_result(0.6) + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MULTIMER_MODEL_TYPE, + ) + assert (multimer_dir / f"TMScore_full-MSA_{PDB1_NAME}.csv").exists() + assert (multimer_dir / f"TMScore_random-MSA_{PDB1_NAME}.csv").exists() + + @patch("cf_random.analysis.base.tm_align") + def test_bad_full_msa_raises(self, mock_align, multimer_dir): + mock_align.return_value = _good_tm_result(0.1) + with pytest.raises(RuntimeError): + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MULTIMER_MODEL_TYPE, + ) + + @patch("cf_random.analysis.base.tm_align") + def test_size_selection_is_int(self, mock_align, multimer_dir): 
+ mock_align.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MULTIMER_MODEL_TYPE, + ) + assert isinstance(scorer.size_selection[0], int) + + @patch("cf_random.analysis.base.tm_align") + def test_multimer_uses_rmter_files(self, mock_align, multimer_dir): + """BaseTMScore must resolve rmTER files for multimer whole-structure scoring.""" + seen_files = [] + original_resolve = BaseTMScore._resolve_models + + def capturing_resolve(self_inner): + files = original_resolve(self_inner) + seen_files.extend(files) + return files + + mock_align.return_value = _good_tm_result(0.6) + + with patch.object(BaseTMScore, "_resolve_models", capturing_resolve): + TMScoreCalAllVar( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="AC", + model_type=MULTIMER_MODEL_TYPE, + ) + + assert seen_files + assert all("rmTER" in f for f in seen_files) diff --git a/tests/test_blind_screening.py b/tests/test_blind_screening.py new file mode 100644 index 0000000..4ddf80f --- /dev/null +++ b/tests/test_blind_screening.py @@ -0,0 +1,640 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Tests for BlindScreening. + +Unit tests cover the pure static methods and each private helper in isolation. +Integration tests wire the full __init__ pipeline together with Foldseek and +MDAnalysis mocked out, so no external binaries or real PDB files are required. 
+ +Run with: + pytest tests/test_blind_screening.py -v +""" + +import csv +import shutil +import subprocess +from pathlib import Path +from typing import List +from unittest.mock import MagicMock, call, patch + +import numpy as np +import pytest + +from cf_random.utils.search_foldseek_cluster import ( + FOLDSEEK_BIT_SCORE_BUG_VALUE, + KMEDOIDS_MIN_CLUSTER_SIZE, + ZSCORE_OUTLIER_THRESHOLD, + BlindScreening, +) + +RNG = np.random.default_rng(0) + + +def _make_two_cluster_data(n_per_cluster: int = 30, n_components: int = 4) -> np.ndarray: + """Return a well-separated two-cluster dataset for clustering tests.""" + c1 = RNG.normal(loc=[-5, 0, 0, 0], scale=0.3, size=(n_per_cluster, n_components)) + c2 = RNG.normal(loc=[5, 0, 0, 0], scale=0.3, size=(n_per_cluster, n_components)) + return np.vstack([c1, c2]) + + +def _make_labels(n_per_cluster: int = 30) -> np.ndarray: + """Return ground-truth labels matching _make_two_cluster_data.""" + return np.array([0] * n_per_cluster + [1] * n_per_cluster) + + +@pytest.fixture() +def tmp_blind_path(tmp_path: Path) -> Path: + """Return a temporary directory that looks like a blind_path.""" + return tmp_path + + +class TestClusterStructures: + def test_returns_array(self): + X = _make_two_cluster_data() + labels = BlindScreening.cluster_structures(X) + assert isinstance(labels, np.ndarray) + assert labels.shape == (len(X),) + + def test_finds_two_clusters_on_separable_data(self): + X = _make_two_cluster_data(n_per_cluster=40) + labels = BlindScreening.cluster_structures(X) + # Noise points (-1) are allowed; meaningful labels should be 0 and 1 + meaningful = labels[labels >= 0] + assert len(np.unique(meaningful)) == 2 + + def test_all_noise_does_not_raise(self): + # Single tight ball — HDBSCAN may label everything as one cluster or noise + X = RNG.normal(size=(10, 4)) + labels = BlindScreening.cluster_structures(X) + assert labels.shape == (10,) + + def test_labels_length_matches_input(self): + for n in [10, 25, 60]: + X = 
_make_two_cluster_data(n_per_cluster=n) + labels = BlindScreening.cluster_structures(X) + assert len(labels) == 2 * n + + +class TestKMedoids: + def _make_input(self, n_per_cluster=20, n_components=4): + X = _make_two_cluster_data(n_per_cluster, n_components) + labels = _make_labels(n_per_cluster) + return X, labels + + def test_returns_tuple(self): + X, labels = self._make_input() + result = BlindScreening.k_medoids(X, 0, labels) + assert isinstance(result, tuple) and len(result) == 2 + + def test_medoid_indices_within_bounds(self): + X, labels = self._make_input() + medoids, _ = BlindScreening.k_medoids(X, 0, labels, k=3) + assert all(0 <= idx < len(X) for idx in medoids) + + def test_medoids_reduce_cost_vs_random(self): + """Converged medoids should have cost <= initial random assignment. + + The implementation pads non-cluster points with 9999 so they can still + be selected as medoid indices — checking cluster membership is therefore + not a valid assertion. Cost reduction is the correct invariant. 
+ """ + X, labels = self._make_input(n_per_cluster=20) + medoids_init, cost_init = BlindScreening.k_medoids(X, 0, labels, k=3, max_iter=0) + medoids_conv, cost_conv = BlindScreening.k_medoids(X, 0, labels, k=3) + assert cost_conv <= cost_init + + def test_cost_is_finite_for_large_cluster(self): + X, labels = self._make_input(n_per_cluster=20) + _, cost = BlindScreening.k_medoids(X, 0, labels, k=3) + assert np.isfinite(cost) + + def test_small_cluster_returns_all_indices(self): + """Cluster smaller than KMEDOIDS_MIN_CLUSTER_SIZE returns all members.""" + n_small = KMEDOIDS_MIN_CLUSTER_SIZE - 1 + X = RNG.normal(size=(10, 4)) + labels = np.array([0] * n_small + [1] * (10 - n_small)) + medoids, cost = BlindScreening.k_medoids(X, 0, labels, k=3) + assert len(medoids) == n_small + assert np.isnan(cost) + + def test_unknown_cluster_label_returns_empty(self): + X, labels = self._make_input() + medoids, cost = BlindScreening.k_medoids(X, 99, labels) + assert len(medoids) == 0 + assert np.isnan(cost) + + def test_deterministic_with_fixed_seed(self): + X, labels = self._make_input(n_per_cluster=20) + m1, c1 = BlindScreening.k_medoids(X, 0, labels, k=3) + m2, c2 = BlindScreening.k_medoids(X, 0, labels, k=3) + np.testing.assert_array_equal(m1, m2) + assert c1 == c2 + + def test_k1_returns_single_medoid(self): + X, labels = self._make_input(n_per_cluster=20) + medoids, _ = BlindScreening.k_medoids(X, 0, labels, k=1) + assert len(medoids) == 1 + + +class TestBuildCorrelationMatrix: + """Test _build_correlation_matrix via a minimal BlindScreening instance + constructed without running __init__ (object.__new__).""" + + def _make_instance(self): + obj = object.__new__(BlindScreening) + return obj + + def _write_foldseek_file(self, path: Path, rows: List[tuple]) -> None: + """Write a mock .foldseek TSV file.""" + with path.open("w") as fh: + for row in rows: + fh.write("\t".join(str(x) for x in row) + "\n") + + def test_basic_matrix_shape(self, tmp_path): + labels = ["struct_a", 
"struct_b"] + f1 = tmp_path / "a-self.foldseek" + f2 = tmp_path / "b-self.foldseek" + # query, target, alntmscore, qaln, taln, alnlen, evalue, bits + self._write_foldseek_file( + f1, + [ + ("a", "struct_a", 0.9, "A", "A", 10, 0.001, 100), + ("a", "struct_b", 0.5, "A", "B", 10, 0.01, 50), + ], + ) + self._write_foldseek_file( + f2, + [ + ("b", "struct_a", 0.5, "B", "A", 10, 0.01, 50), + ("b", "struct_b", 0.9, "B", "B", 10, 0.001, 100), + ], + ) + obj = self._make_instance() + mtx = obj._build_correlation_matrix([f1, f2], labels) + assert mtx.shape == (2, 2) + + def test_known_values(self, tmp_path): + labels = ["struct_a", "struct_b"] + f1 = tmp_path / "a-self.foldseek" + f2 = tmp_path / "b-self.foldseek" + self._write_foldseek_file( + f1, + [ + ("a", "struct_a", 0.9, "A", "A", 10, 0.001, 200), + ("a", "struct_b", 0.5, "A", "B", 10, 0.01, 75), + ], + ) + self._write_foldseek_file( + f2, + [ + ("b", "struct_a", 0.5, "B", "A", 10, 0.01, 75), + ("b", "struct_b", 0.9, "B", "B", 10, 0.001, 300), + ], + ) + obj = self._make_instance() + mtx = obj._build_correlation_matrix([f1, f2], labels) + assert mtx[0, 0] == 200.0 + assert mtx[0, 1] == 75.0 + assert mtx[1, 0] == 75.0 + assert mtx[1, 1] == 300.0 + + def test_foldseek_bug_value_replaced_with_zero(self, tmp_path): + labels = ["struct_a"] + f1 = tmp_path / "a-self.foldseek" + self._write_foldseek_file( + f1, + [ + ("a", "struct_a", 0.9, "A", "A", 10, 0.001, FOLDSEEK_BIT_SCORE_BUG_VALUE), + ], + ) + obj = self._make_instance() + mtx = obj._build_correlation_matrix([f1], labels) + assert mtx[0, 0] == 0.0 + + def test_missing_target_defaults_to_zero(self, tmp_path): + labels = ["struct_a", "struct_b"] + f1 = tmp_path / "a-self.foldseek" + # Only struct_a present; struct_b missing → should default to 0 + self._write_foldseek_file( + f1, + [ + ("a", "struct_a", 0.9, "A", "A", 10, 0.001, 100), + ], + ) + obj = self._make_instance() + mtx = obj._build_correlation_matrix([f1], labels) + assert mtx[0, 1] == 0.0 + + def 
test_short_lines_are_skipped(self, tmp_path): + labels = ["struct_a"] + f1 = tmp_path / "a-self.foldseek" + with f1.open("w") as fh: + fh.write("only\ttwo\tcolumns\n") + fh.write("a\tstruct_a\t0.9\tA\tA\t10\t0.001\t100\n") + obj = self._make_instance() + mtx = obj._build_correlation_matrix([f1], labels) + assert mtx[0, 0] == 100.0 + + def test_non_integer_bit_score_skipped(self, tmp_path): + labels = ["struct_a"] + f1 = tmp_path / "a-self.foldseek" + self._write_foldseek_file( + f1, + [ + ("a", "struct_a", 0.9, "A", "A", 10, 0.001, "nan"), + ], + ) + obj = self._make_instance() + mtx = obj._build_correlation_matrix([f1], labels) + assert mtx[0, 0] == 0.0 + + +class TestFilterUnfolded: + """Mock MDAnalysis/DSSP so we never need real PDB files.""" + + def _make_foldseek_files(self, tmp_path: Path, n: int) -> List[Path]: + files = [] + for i in range(n): + ff = tmp_path / f"struct_{i:03d}-self.foldseek" + ff.touch() + # Create matching .pdb stub + pdb = tmp_path / f"struct_{i:03d}.pdb" + pdb.touch() + files.append(ff) + return sorted(files) + + def _dssp_mock(self, loop_counts: List[int]): + """Return a side_effect list for DSSP that yields the given loop counts.""" + mocks = [] + for lc in loop_counts: + dssp_array = np.array(["-"] * lc + ["H"] * 10 + ["E"] * 5) + run_mock = MagicMock() + run_mock.results.dssp = [dssp_array] + instance = MagicMock() + instance.run.return_value = run_mock + mocks.append(instance) + return mocks + + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + def test_no_outliers_returns_all(self, mock_dssp_cls, mock_universe, tmp_path): + files = self._make_foldseek_files(tmp_path, 5) + # All similar loop counts — no outlier + loop_counts = [10, 11, 10, 9, 10] + mock_dssp_cls.side_effect = self._dssp_mock(loop_counts) + + obj = object.__new__(BlindScreening) + filtered, labels = obj._filter_unfolded(files) + + assert len(filtered) == 5 + assert len(labels) == 5 + + 
@patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + def test_outlier_removed(self, mock_dssp_cls, mock_universe, tmp_path): + # 5 samples with value 1000 only reaches z=2.0; use 20 normal + 1 extreme. + n_normal = 20 + files = self._make_foldseek_files(tmp_path, n_normal + 1) + loop_counts = [10] * n_normal + [10000] + mock_dssp_cls.side_effect = self._dssp_mock(loop_counts) + + obj = object.__new__(BlindScreening) + filtered, labels = obj._filter_unfolded(files) + + assert len(filtered) == n_normal + assert len(labels) == n_normal + + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + def test_labels_match_slash_dash_rule(self, mock_dssp_cls, mock_universe, tmp_path): + """Labels must equal str(pdb_path).replace('/', '-')[17:].replace('.pdb', '').""" + files = self._make_foldseek_files(tmp_path, 3) + mock_dssp_cls.side_effect = self._dssp_mock([10, 10, 10]) + + obj = object.__new__(BlindScreening) + filtered, labels = obj._filter_unfolded(files) + + for ff, label in zip(filtered, labels): + pdb_path = str(ff).replace("-self.foldseek", ".pdb") + expected = pdb_path.replace("/", "-")[17:].replace(".pdb", "") + assert label == expected + + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + def test_output_is_sorted(self, mock_dssp_cls, mock_universe, tmp_path): + files = self._make_foldseek_files(tmp_path, 5) + mock_dssp_cls.side_effect = self._dssp_mock([10] * 5) + + obj = object.__new__(BlindScreening) + filtered, _ = obj._filter_unfolded(files) + + assert filtered == sorted(filtered) + + +class TestStagePdbFiles: + def _make_pdb_tree(self, base: Path, names: List[str]) -> List[Path]: + paths = [] + for name in names: + p = base / name + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text("ATOM dummy\n") + paths.append(p) + return paths + + def 
test_staging_directory_created(self, tmp_path): + self._make_pdb_tree(tmp_path, ["subdir/a.pdb", "subdir/b.pdb"]) + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj._stage_pdb_files() + assert (tmp_path / "pdbs_for_db").is_dir() + + def test_all_pdbs_staged(self, tmp_path): + self._make_pdb_tree(tmp_path, ["subdir/a.pdb", "subdir/b.pdb"]) + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj._stage_pdb_files() + staged = list((tmp_path / "pdbs_for_db").iterdir()) + assert len(staged) == 2 + + def test_pdbs_inside_db_directory_excluded(self, tmp_path): + self._make_pdb_tree(tmp_path, ["subdir/a.pdb"]) + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj._stage_pdb_files() + # Manually add a .pdb inside pdbs_for_db and re-stage + (tmp_path / "pdbs_for_db" / "extra.pdb").write_text("ATOM extra\n") + obj2 = object.__new__(BlindScreening) + obj2.blind_path = tmp_path + obj2._stage_pdb_files() + staged = [ + f + for f in (tmp_path / "pdbs_for_db").iterdir() + if f.suffix == ".pdb" and f.name != "extra.pdb" + ] + assert len(staged) == 1 + + def test_label_map_populated(self, tmp_path): + self._make_pdb_tree(tmp_path, ["subdir/a.pdb", "subdir/b.pdb"]) + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj._stage_pdb_files() + assert len(obj.pdb_label_map) == 2 + + def test_label_map_values_match_slash_dash_rule(self, tmp_path): + self._make_pdb_tree(tmp_path, ["subdir/a.pdb"]) + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj._stage_pdb_files() + for src, label in obj.pdb_label_map.items(): + expected = str(src).replace("/", "-")[17:].replace(".pdb", "") + assert label == expected + + def test_raises_if_no_pdbs(self, tmp_path): + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + with pytest.raises(FileNotFoundError): + obj._stage_pdb_files() + + def test_existing_staged_file_not_overwritten(self, tmp_path): + self._make_pdb_tree(tmp_path, ["subdir/a.pdb"]) 
+ obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj._stage_pdb_files() + # Corrupt the staged file + staged = list((tmp_path / "pdbs_for_db").glob("*.pdb"))[0] + staged.write_text("CORRUPTED") + # Re-stage should not overwrite + obj2 = object.__new__(BlindScreening) + obj2.blind_path = tmp_path + obj2._stage_pdb_files() + assert staged.read_text() == "CORRUPTED" + + +class TestSaveClusterPlot: + def test_file_created(self, tmp_path): + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj.pdb1_name = "myprotein" + coords = RNG.normal(size=(20, 4)) + labels = np.array([0] * 10 + [1] * 10) + obj._save_cluster_plot(coords, labels) + assert (tmp_path / "myprotein-cluster.png").exists() + + +class TestSaveStructuresOfInterest: + def test_csv_created_with_header(self, tmp_path): + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj.pdb1_name = "myprotein" + files_of_interest = [(Path("a.foldseek"), 0), (Path("b.foldseek"), 1)] + pca_of_interest = [np.array([1.0, 2.0, 0, 0]), np.array([3.0, 4.0, 0, 0])] + obj._save_structures_of_interest(files_of_interest, pca_of_interest) + out = tmp_path / "myprotein-structures_of_interest.csv" + assert out.exists() + with out.open() as fh: + reader = csv.reader(fh) + header = next(reader) + assert header == ["group", "file", "pca_1", "pca_2"] + + def test_csv_row_count(self, tmp_path): + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + obj.pdb1_name = "myprotein" + n = 5 + files_of_interest = [(Path(f"f{i}.foldseek"), i % 2) for i in range(n)] + pca_of_interest = [np.array([float(i), 0.0, 0, 0]) for i in range(n)] + obj._save_structures_of_interest(files_of_interest, pca_of_interest) + out = tmp_path / "myprotein-structures_of_interest.csv" + with out.open() as fh: + rows = list(csv.reader(fh)) + assert len(rows) == n + 1 # header + n data rows + + +class TestSaveAllStructures: + def test_csv_created(self, tmp_path): + obj = object.__new__(BlindScreening) + 
obj.blind_path = tmp_path + files = [Path(f"s{i}-self.foldseek") for i in range(4)] + labels = np.array([0, 0, 1, 1]) + coords = RNG.normal(size=(4, 4)) + obj._save_all_structures(files, labels, coords) + assert (tmp_path / "structures_all.csv").exists() + + def test_row_count(self, tmp_path): + obj = object.__new__(BlindScreening) + obj.blind_path = tmp_path + n = 6 + files = [Path(f"s{i}-self.foldseek") for i in range(n)] + labels = np.zeros(n, dtype=int) + coords = RNG.normal(size=(n, 4)) + obj._save_all_structures(files, labels, coords) + with (tmp_path / "structures_all.csv").open() as fh: + rows = list(csv.reader(fh)) + assert len(rows) == n + 1 + + +class TestBlindScreeningIntegration: + """Run the complete __init__ pipeline without Foldseek, MDAnalysis, or PyMOL.""" + + N_STRUCTS = 11 # enough for HDBSCAN to find structure + + @pytest.fixture() + def pipeline_dir(self, tmp_path: Path) -> Path: + """Build a minimal directory tree with fake PDB and .foldseek files.""" + for i in range(self.N_STRUCTS): + sub = tmp_path / f"run_{i:03d}" + sub.mkdir() + pdb = sub / f"struct_{i:03d}_rank_001_model.pdb" + pdb.write_text(f"ATOM {i}\n") + + # Pre-create .foldseek result files so _run_foldseek_searches skips them + # Labels must match the slash->dash[17:] rule for this tmp_path + pdb_files = sorted(tmp_path.rglob("*.pdb")) + labels = [str(p).replace("/", "-")[17:].replace(".pdb", "") for p in pdb_files] + + for i, pdb in enumerate(pdb_files): + ff = pdb.with_name(pdb.stem + "-self.foldseek") + with ff.open("w") as fh: + for j, label in enumerate(labels): + # Give high scores on diagonal, lower elsewhere + score = 500 if i == j else (200 if abs(i - j) <= 2 else 50) + fh.write(f"query\t{label}\t0.9\tA\tA\t10\t0.001\t{score}\n") + + return tmp_path + + def _make_dssp_side_effect(self, n: int): + """Return DSSP constructor side effects for n structures (no outliers).""" + mocks = [] + for _ in range(n): + dssp_array = np.array(["-"] * 10 + ["H"] * 20 + ["E"] * 10) + 
run_mock = MagicMock() + run_mock.results.dssp = [dssp_array] + instance = MagicMock() + instance.run.return_value = run_mock + mocks.append(instance) + return mocks + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_pipeline_completes(self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir): + mock_run.return_value = MagicMock(returncode=0, stderr="") + mock_dssp_cls.side_effect = self._make_dssp_side_effect(self.N_STRUCTS) + + bs = BlindScreening("testprot", str(pipeline_dir)) + + assert isinstance(bs, BlindScreening) + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_output_files_created(self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir): + mock_run.return_value = MagicMock(returncode=0, stderr="") + mock_dssp_cls.side_effect = self._make_dssp_side_effect(self.N_STRUCTS) + + BlindScreening("testprot", str(pipeline_dir)) + + assert (pipeline_dir / "testprot-cluster.png").exists() + assert (pipeline_dir / "testprot-structures_of_interest.csv").exists() + assert (pipeline_dir / "structures_all.csv").exists() + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_structures_of_interest_csv_has_header( + self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir + ): + mock_run.return_value = MagicMock(returncode=0, stderr="") + mock_dssp_cls.side_effect = self._make_dssp_side_effect(self.N_STRUCTS) + + 
BlindScreening("testprot", str(pipeline_dir)) + + with (pipeline_dir / "testprot-structures_of_interest.csv").open() as fh: + header = next(csv.reader(fh)) + assert header == ["group", "file", "pca_1", "pca_2"] + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_all_structures_csv_row_count( + self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir + ): + mock_run.return_value = MagicMock(returncode=0, stderr="") + mock_dssp_cls.side_effect = self._make_dssp_side_effect(self.N_STRUCTS) + + BlindScreening("testprot", str(pipeline_dir)) + + with (pipeline_dir / "structures_all.csv").open() as fh: + rows = list(csv.reader(fh)) + # header + one row per structure + assert len(rows) == self.N_STRUCTS + 1 + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_outlier_removed_from_all_structures_csv( + self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir + ): + """One structure with extreme loop count should be excluded from outputs.""" + mock_run.return_value = MagicMock(returncode=0, stderr="") + # Last structure is an outlier + normal = [10] * (self.N_STRUCTS - 1) + outlier_dssp = self._make_dssp_side_effect(self.N_STRUCTS - 1) + # Build outlier mock with extreme loop count + # 5000 loops against ~9 normals gives z < 3; use 10000 to reliably exceed threshold. 
+ extreme_array = np.array(["-"] * 10000 + ["H"] * 5) + run_mock = MagicMock() + run_mock.results.dssp = [extreme_array] + outlier_instance = MagicMock() + outlier_instance.run.return_value = run_mock + mock_dssp_cls.side_effect = outlier_dssp + [outlier_instance] + + BlindScreening("testprot", str(pipeline_dir)) + + with (pipeline_dir / "structures_all.csv").open() as fh: + rows = list(csv.reader(fh)) + assert len(rows) == self.N_STRUCTS # header + (N-1) structures + + def test_raises_on_missing_path(self, tmp_path): + with pytest.raises(FileNotFoundError): + BlindScreening("test", str(tmp_path / "does_not_exist")) + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_foldseek_createdb_called_once( + self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir + ): + mock_run.return_value = MagicMock(returncode=0, stderr="") + mock_dssp_cls.side_effect = self._make_dssp_side_effect(self.N_STRUCTS) + + BlindScreening("testprot", str(pipeline_dir)) + + createdb_calls = [c for c in mock_run.call_args_list if c.args and "createdb" in c.args[0]] + # .foldseek files already exist so easy-search is skipped; + # createdb should be called exactly once + assert len(createdb_calls) == 1 + + @patch("cf_random.utils.search_foldseek_cluster.pymol", None) + @patch("cf_random.utils.search_foldseek_cluster.DSSP") + @patch("cf_random.utils.search_foldseek_cluster.mda.Universe") + @patch("cf_random.utils.search_foldseek_cluster.subprocess.run") + def test_existing_db_skips_createdb(self, mock_run, mock_universe, mock_dssp_cls, pipeline_dir): + mock_run.return_value = MagicMock(returncode=0, stderr="") + mock_dssp_cls.side_effect = self._make_dssp_side_effect(self.N_STRUCTS) + + # Pre-create the DB file so the code skips createdb + db_dir = pipeline_dir / "pdbs_for_db" + 
db_dir.mkdir(exist_ok=True) + (db_dir / "DB").touch() + + BlindScreening("testprot", str(pipeline_dir)) + + createdb_calls = [c for c in mock_run.call_args_list if c.args and "createdb" in c.args[0]] + assert len(createdb_calls) == 0 diff --git a/tests/test_fold_switching.py b/tests/test_fold_switching.py new file mode 100644 index 0000000..82b3da4 --- /dev/null +++ b/tests/test_fold_switching.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Tests for fold-switching TM-score pipelines. + +Unit tests cover TMScoreFS (unified monomer + multimer), +BaseTMScore model resolution, and TMScoreCalAllVarFS._evaluate_monomer / +_evaluate_multimer in isolation. + +Run with: + pytest tests/test_fold_switching.py -v +""" + +import glob +from pathlib import Path +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest + +from cf_random.utils.convert_multi_single import ConvertM2S +from cf_random.analysis.cal_tmscore_fs_flmsa import TMScoreFS +from cf_random.analysis.tmscore_all_var_fs import TMScoreCalAllVarFS, MSA_MULTIPLIERS +from cf_random.analysis.base import BaseTMScore + +PDB1_NAME = "2oug_C" +PDB2_NAME = "6c6s_D" +MONOMER_MODEL_TYPE = "alphafold2_ptm" +MULTIMER_MODEL_TYPE = "alphafold2_multimer_v3" +NUM_SEEDS = 5 +NUM_MODELS = 5 +NUM_PREDICTIONS = NUM_SEEDS * NUM_MODELS # 25 + +RNG = np.random.default_rng(42) + +RESNAMES = ["ALA", "GLY", "VAL", "LEU", "ILE", "PRO", "PHE", "TRP", "MET", "SER"] + + +def _write_minimal_pdb(path: Path, n_residues: int = 10, n_chains: int = 1) -> None: + """Write a minimal PDB with CA atoms and TER records per chain.""" + with path.open("w") as fh: + atom_num = 1 + res_num = 1 + for chain_idx in range(n_chains): + chain_id = chr(ord("A") + chain_idx) + for i in range(n_residues): + resname = RESNAMES[i % len(RESNAMES)] + x, y, z = RNG.uniform(-10, 10, 3) + fh.write( + f"ATOM {atom_num:5d} CA {resname} {chain_id}" + f"{res_num:4d} " + f"{x:8.3f}{y:8.3f}{z:8.3f} 1.00 10.00 C\n" + ) + atom_num += 1 
+ res_num += 1 + fh.write("TER\n") + fh.write("END\n") + + +def _make_monomer_prediction_dir( + base: Path, + name: str, + n_models: int = NUM_PREDICTIONS, + n_residues: int = 10, +) -> Path: + """Create a fake ColabFold monomer output directory.""" + pred_dir = base / name + pred_dir.mkdir(parents=True, exist_ok=True) + for i in range(1, n_models + 1): + pdb = pred_dir / f"model_{i}_unrelaxed_rank_{i:03d}_alphafold2_ptm.pdb" + _write_minimal_pdb(pdb, n_residues=n_residues, n_chains=1) + return pred_dir + + +def _make_multimer_prediction_dir( + base: Path, + name: str, + n_models: int = NUM_PREDICTIONS, + n_chains: int = 2, + n_residues: int = 10, +) -> Path: + """Create a fake ColabFold multimer output directory.""" + pred_dir = base / name + pred_dir.mkdir(parents=True, exist_ok=True) + for i in range(1, n_models + 1): + pdb = pred_dir / (f"0_unrelaxed_rank_{i:03d}_alphafold2_multimer_v3_model_1_seed_000.pdb") + _write_minimal_pdb(pdb, n_residues=n_residues, n_chains=n_chains) + return pred_dir + + +def _make_converted_multimer_dir( + base: Path, + name: str, + n_models: int = NUM_PREDICTIONS, + n_residues: int = 10, +) -> Path: + """Create a multimer prediction dir and run ConvertM2S on it.""" + pred_dir = _make_multimer_prediction_dir(base, name, n_models=n_models, n_residues=n_residues) + pdb2_file = base / f"{PDB2_NAME}.pdb" + if not pdb2_file.exists(): + _write_minimal_pdb(pdb2_file, n_residues=n_residues, n_chains=1) + ConvertM2S(str(pred_dir), PDB1_NAME, PDB2_NAME) + return pred_dir + + +def _make_range_file(path: Path, fs_range: str = "1-5") -> None: + """Write a minimal range_fs_pairs_all.txt.""" + with path.open("w") as fh: + fh.write("# pdb1,pdb2,pred1,pred2,m1,m2\n") + fh.write(f"{PDB1_NAME},{PDB2_NAME},{fs_range},{fs_range},{fs_range},{fs_range}\n") + + +def _good_tm_result(value: float = 0.6) -> MagicMock: + result = MagicMock() + result.tm_norm_chain1 = value + return result + + +def _make_fs_scorer(model_glob: str = "*_unrelaxed*pdb") -> 
TMScoreFS: + """Create a TMScoreFS instance without running __init__.""" + scorer = object.__new__(TMScoreFS) + scorer.model_glob = model_glob + return scorer + + +class TestTMScoreFS: + def test_get_coords_correct_range(self, tmp_path): + pdb = tmp_path / "test.pdb" + _write_minimal_pdb(pdb, n_residues=10, n_chains=1) + scorer = _make_fs_scorer() + coords, seq = scorer._get_coords(pdb, "1-5") + assert len(coords) == 5 + assert len(seq) == 5 + + def test_get_coords_full_range(self, tmp_path): + pdb = tmp_path / "test.pdb" + _write_minimal_pdb(pdb, n_residues=10, n_chains=1) + scorer = _make_fs_scorer() + coords, seq = scorer._get_coords(pdb, "1-10") + assert len(coords) == 10 + assert len(seq) == 10 + + def test_get_tmscore_returns_zeros_on_empty_dir(self, tmp_path): + empty = tmp_path / "empty" + empty.mkdir() + scorer = _make_fs_scorer() + coords = RNG.uniform(size=(5, 3)) + scores = scorer._get_tmscore(coords, "AGVLI", empty, "1-5") + assert scores == [0.0, 0.0, 0.0, 0.0, 0.0] + + def test_get_tmscore_monomer_uses_unrelaxed_glob(self, tmp_path): + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred", n_models=3) + matched = glob.glob(str(pred_dir) + "/*_unrelaxed*pdb") + assert len(matched) == 3 + assert all("rmTER" not in f for f in matched) + + def test_get_tmscore_multimer_uses_single_files(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + pred_dir = _make_converted_multimer_dir(tmp_path, "pred", n_models=3) + matched = glob.glob(str(pred_dir) + "/single_*_unrelaxed*pdb") + assert len(matched) == 3 + assert all("rmTER" not in f for f in matched) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + def test_get_tmscore_picks_best_orientation(self, mock_align, tmp_path): + """Forward scores > reverse → forward selected.""" + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred", n_models=2, n_residues=5) + scorer = _make_fs_scorer() + + fwd = MagicMock() + fwd.tm_norm_chain1 = 0.8 + rev = MagicMock() + rev.tm_norm_chain1 = 0.3 + 
mock_align.side_effect = [fwd, rev, fwd, rev] + + coords = RNG.uniform(size=(5, 3)) + scores = scorer._get_tmscore(coords, "AGVLI", pred_dir, "1-5") + assert all(s == 0.8 for s in scores) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + def test_get_tmscore_picks_reverse_when_higher(self, mock_align, tmp_path): + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred", n_models=2, n_residues=5) + scorer = _make_fs_scorer() + + fwd = MagicMock() + fwd.tm_norm_chain1 = 0.3 + rev = MagicMock() + rev.tm_norm_chain1 = 0.8 + mock_align.side_effect = [fwd, rev, fwd, rev] + + coords = RNG.uniform(size=(5, 3)) + scores = scorer._get_tmscore(coords, "AGVLI", pred_dir, "1-5") + assert all(s == 0.8 for s in scores) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + def test_run_for_models_shape(self, mock_align, tmp_path, monkeypatch): + """Two loops (pdb1 then pdb2) → shape (2, n_models).""" + monkeypatch.chdir(tmp_path) + _make_range_file(tmp_path / "range_fs_pairs_all.txt") + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred", n_models=5, n_residues=10) + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + mock_align.return_value = _good_tm_result(0.6) + + scorer = _make_fs_scorer() + scorer._run_for_models(pdb1, pdb2, str(pred_dir), "1-5", "1-5", "1-5") + assert scorer.tmscores_fs.shape == (2, 5) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + def test_run_for_models_multimer_shape(self, mock_align, tmp_path, monkeypatch): + """Multimer single_ glob → shape (2, n_models).""" + monkeypatch.chdir(tmp_path) + _make_range_file(tmp_path / "range_fs_pairs_all.txt") + pred_dir = _make_converted_multimer_dir(tmp_path, "pred", n_models=5, n_residues=10) + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + 
mock_align.return_value = _good_tm_result(0.6) + + scorer = _make_fs_scorer(model_glob="single_*_unrelaxed*pdb") + scorer._run_for_models(pdb1, pdb2, str(pred_dir), "1-5", "1-5", "1-5") + assert scorer.tmscores_fs.shape == (2, 5) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + def test_full_init_completes(self, mock_align, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + _make_range_file(tmp_path / "range_fs_pairs_all.txt") + pred_root = tmp_path / "predictions_all" / PDB1_NAME + pred_root.mkdir(parents=True) + pred_dir = _make_monomer_prediction_dir( + pred_root, f"{PDB1_NAME}_predicted_models_full_rand_0", n_residues=10 + ) + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + mock_align.return_value = _good_tm_result(0.6) + + scorer = TMScoreFS(str(pred_dir), pdb1, PDB1_NAME, pdb2, PDB2_NAME) + assert scorer.tmscores_fs is not None + assert scorer.tmscores_fs.shape[0] == 2 + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + def test_full_init_multimer_completes(self, mock_align, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + _make_range_file(tmp_path / "range_fs_pairs_all.txt") + pred_root = tmp_path / "predictions_all" / PDB1_NAME + pred_root.mkdir(parents=True) + pred_dir = _make_converted_multimer_dir( + pred_root, f"{PDB1_NAME}_predicted_models_full_rand_0", n_residues=10 + ) + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + mock_align.return_value = _good_tm_result(0.6) + + scorer = TMScoreFS( + str(pred_dir), + pdb1, + PDB1_NAME, + pdb2, + PDB2_NAME, + model_glob="single_*_unrelaxed*pdb", + ) + assert scorer.tmscores_fs is not None + assert scorer.tmscores_fs.shape[0] == 2 + + +class TestTMScoreGetPredictedFiles: + def _make_scorer(self, pred_dir, model_type): + scorer = 
object.__new__(BaseTMScore) + scorer.pred_dir = str(pred_dir) + scorer.pdb1_name = PDB1_NAME + scorer.pdb2_name = PDB2_NAME + scorer.model_type = model_type + return scorer + + def test_monomer_returns_unrelaxed_files(self, tmp_path): + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred", n_models=5) + scorer = self._make_scorer(pred_dir, MONOMER_MODEL_TYPE) + files = scorer._resolve_models() + assert len(files) == 5 + assert all("unrelaxed" in f for f in files) + assert all("rmTER" not in f for f in files) + + def test_multimer_returns_single_files(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + pred_dir = _make_converted_multimer_dir(tmp_path, "pred") + scorer = self._make_scorer(pred_dir, MULTIMER_MODEL_TYPE) + files = scorer._resolve_models() + assert len(files) == NUM_PREDICTIONS + assert all("single_" in f for f in files) + + def test_multimer_triggers_conversion_when_missing(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + pred_dir = _make_multimer_prediction_dir(tmp_path, "pred") + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb2) + scorer = self._make_scorer(pred_dir, MULTIMER_MODEL_TYPE) + files = scorer._resolve_models() + assert len(files) == NUM_PREDICTIONS + assert list(pred_dir.glob("single_*_unrelaxed*pdb")) + + def test_multimer_does_not_reconvert_if_single_exists(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + pred_dir = _make_converted_multimer_dir(tmp_path, "pred", n_models=3) + single_files = list(pred_dir.glob("single_*_unrelaxed*pdb")) + single_files[0].write_text("SENTINEL") + scorer = self._make_scorer(pred_dir, MULTIMER_MODEL_TYPE) + scorer._resolve_models() + assert single_files[0].read_text() == "SENTINEL" + + def test_monomer_does_not_create_rmter_files(self, tmp_path): + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred") + scorer = self._make_scorer(pred_dir, MONOMER_MODEL_TYPE) + scorer._resolve_models() + assert not list(pred_dir.glob("rmTER*")) + + def 
test_resolve_models_strips_extension_and_prepends_pwd(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + pred_dir = _make_monomer_prediction_dir(tmp_path, "pred", n_models=2) + scorer = self._make_scorer(pred_dir, MONOMER_MODEL_TYPE) + files = scorer._resolve_models() + assert all(not f.endswith(".pdb") for f in files) + assert all(f.startswith(str(tmp_path)) for f in files) + + +class TestEvaluateMonomerIntegration: + @pytest.fixture() + def monomer_pipeline_dir(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + _make_range_file(tmp_path / "range_fs_pairs_all.txt", fs_range="1-5") + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + + pred_root = tmp_path / "predictions_all" / PDB1_NAME + pred_root.mkdir(parents=True) + + _make_monomer_prediction_dir( + pred_root, f"{PDB1_NAME}_predicted_models_full_rand_0", n_residues=10 + ) + max_msa, ext_msa = 1, 2 + for mult in MSA_MULTIPLIERS: + max_msa *= mult + ext_msa *= mult + _make_monomer_prediction_dir( + pred_root, + f"{PDB1_NAME}_predicted_models_rand_0_max_{max_msa}_ext_{ext_msa}", + n_residues=10, + ) + return tmp_path + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_completes(self, mock_whole, mock_fs, monomer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MONOMER_MODEL_TYPE, + ) + assert len(scorer.size_selection) == 1 + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_csv_files_written(self, mock_whole, mock_fs, monomer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = 
_good_tm_result(0.6) + TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MONOMER_MODEL_TYPE, + ) + base = monomer_pipeline_dir + assert (base / f"TMScore_full-MSA_{PDB1_NAME}.csv").exists() + assert (base / f"TMScore_fs_full-MSA_{PDB1_NAME}.csv").exists() + assert (base / f"TMScore_random-MSA_{PDB1_NAME}.csv").exists() + assert (base / f"TMScore_fs_random-MSA_{PDB1_NAME}.csv").exists() + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_bad_predictions_raise(self, mock_whole, mock_fs, monomer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.1) + mock_fs.return_value = _good_tm_result(0.1) + with pytest.raises(RuntimeError): + TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MONOMER_MODEL_TYPE, + ) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_ref_alt_determination(self, mock_whole, mock_fs, monomer_pipeline_dir): + """Row 0 scores higher → pdb1 is reference, pdb2 is alternative.""" + high = _good_tm_result(0.8) + low = _good_tm_result(0.2) + + def whole_side_effect(*args, **kwargs): + whole_side_effect.count = getattr(whole_side_effect, "count", 0) + 1 + return high if whole_side_effect.count % 2 == 1 else low + + mock_whole.side_effect = whole_side_effect + mock_fs.return_value = _good_tm_result(0.6) + + scorer = TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MONOMER_MODEL_TYPE, + ) + assert len(scorer.size_selection) == 1 + + +class TestEvaluateMultimerIntegration: + @pytest.fixture() + def multimer_pipeline_dir(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + 
_make_range_file(tmp_path / "range_fs_pairs_all.txt", fs_range="1-5") + pdb1 = tmp_path / f"{PDB1_NAME}.pdb" + pdb2 = tmp_path / f"{PDB2_NAME}.pdb" + _write_minimal_pdb(pdb1, n_residues=10) + _write_minimal_pdb(pdb2, n_residues=10) + + pred_root = tmp_path / "predictions_all" / PDB1_NAME + pred_root.mkdir(parents=True) + + full_name = f"{PDB1_NAME}_predicted_models_full_rand_0" + _make_multimer_prediction_dir(pred_root, full_name, n_residues=10) + ConvertM2S(str(pred_root / full_name), PDB1_NAME, PDB2_NAME) + + max_msa, ext_msa = 1, 2 + for mult in MSA_MULTIPLIERS: + max_msa *= mult + ext_msa *= mult + rand_name = f"{PDB1_NAME}_predicted_models_rand_0_max_{max_msa}_ext_{ext_msa}" + _make_multimer_prediction_dir(pred_root, rand_name, n_residues=10) + ConvertM2S(str(pred_root / rand_name), PDB1_NAME, PDB2_NAME) + + return tmp_path + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_completes(self, mock_whole, mock_fs, multimer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MULTIMER_MODEL_TYPE, + ) + assert len(scorer.size_selection) == 1 + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_csv_files_written(self, mock_whole, mock_fs, multimer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = _good_tm_result(0.6) + TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MULTIMER_MODEL_TYPE, + ) + base = multimer_pipeline_dir + assert (base / f"TMScore_full-MSA_{PDB1_NAME}.csv").exists() + assert (base / f"TMScore_fs_full-MSA_{PDB1_NAME}.csv").exists() + assert (base 
/ f"TMScore_random-MSA_{PDB1_NAME}.csv").exists() + assert (base / f"TMScore_fs_random-MSA_{PDB1_NAME}.csv").exists() + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_size_selection_is_int(self, mock_whole, mock_fs, multimer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = _good_tm_result(0.6) + scorer = TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MULTIMER_MODEL_TYPE, + ) + assert isinstance(scorer.size_selection[0], int) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_bad_predictions_raise(self, mock_whole, mock_fs, multimer_pipeline_dir): + mock_whole.return_value = _good_tm_result(0.1) + mock_fs.return_value = _good_tm_result(0.1) + with pytest.raises(RuntimeError): + TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MULTIMER_MODEL_TYPE, + ) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_multimer_whole_structure_uses_single_files( + self, mock_whole, mock_fs, multimer_pipeline_dir + ): + """BaseTMScore must resolve single_ files for multimer whole-structure scoring.""" + seen_files = [] + original_resolve = BaseTMScore._resolve_models + + def capturing_resolve(self_inner): + files = original_resolve(self_inner) + seen_files.extend(files) + return files + + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = _good_tm_result(0.6) + + with patch.object(BaseTMScore, "_resolve_models", capturing_resolve): + TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + 
model_type=MULTIMER_MODEL_TYPE, + ) + + assert seen_files + assert all("rmTER" in f for f in seen_files) + + @patch("cf_random.analysis.cal_tmscore_fs_flmsa.tm_align") + @patch("cf_random.analysis.base.tm_align") + def test_multimer_fs_region_uses_single_files(self, mock_whole, mock_fs, multimer_pipeline_dir): + """TMScoreFS must use single_ files for multimer FS-region scoring.""" + seen_files = [] + original_get = TMScoreFS._get_tmscore + + def capturing_get(self_inner, coords1, seq1, predfilepath, res_range): + matched = glob.glob(str(predfilepath) + "/single_*_unrelaxed*pdb") + seen_files.extend(matched) + return original_get(self_inner, coords1, seq1, predfilepath, res_range) + + mock_whole.return_value = _good_tm_result(0.6) + mock_fs.return_value = _good_tm_result(0.6) + + with patch.object(TMScoreFS, "_get_tmscore", capturing_get): + TMScoreCalAllVarFS( + pdb1=f"{PDB1_NAME}.pdb", + pdb1_name=PDB1_NAME, + pdb2=f"{PDB2_NAME}.pdb", + pdb2_name=PDB2_NAME, + num_msa=0, + option="FS", + model_type=MULTIMER_MODEL_TYPE, + ) + + assert seen_files + assert all("single_" in f for f in seen_files) + assert all("rmTER" not in f for f in seen_files)