# AIDO.Cell

A standalone repository for the AIDO.Cell models, loaded via their HuggingFace handles.

## Installation

```bash
# Barebones installation
pip install -e .

# FlashAttention2 support
pip install -e ".[flash_attn]"

# PEFT/LoRA support
pip install -e ".[peft]"

# All optional dependencies
pip install -e ".[flash_attn,peft]"
```

## Quickstart

1. **Edit the configuration** in `embed.py`:

```python
# CONFIGURATION - Set these variables
MODEL_NAME = "genbio-ai/AIDO.Cell-3M"  # Or "genbio-ai/AIDO.Cell-100M"
INPUT_FILE = "temp_adata.h5ad"         # Path to your input file
OUTPUT_FILE = None                     # None auto-generates <input>_embeddings.h5ad
DEVICE = "cuda"                        # "cuda" or "cpu"
BATCH_SIZE = 32
EMBEDDING_KEY = "X_aido_cell"
```

2. **Run the script**:

```bash
python embed.py
```
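Leaving `OUTPUT_FILE = None` makes the script derive the output name from `INPUT_FILE`. A minimal sketch of that derivation, assuming the `<stem>_embeddings.<suffix>` pattern described in the config comment (the helper name here is hypothetical; the actual logic lives in `embed.py`):

```python
from pathlib import Path

def derive_output_file(input_file: str) -> str:
    """Append `_embeddings` to the input file's stem, keeping its suffix."""
    p = Path(input_file)
    return str(p.with_name(f"{p.stem}_embeddings{p.suffix}"))

print(derive_output_file("temp_adata.h5ad"))  # temp_adata_embeddings.h5ad
```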

## Fine-tune with LoRA

> **Note**: Fine-tuning requires the `peft` optional dependency. Install it with `pip install -e ".[peft]"`.

1. **Edit the configuration** in `finetune.py`:

```python
# CONFIGURATION - Set these variables
MODEL_NAME = "genbio-ai/AIDO.Cell-3M"  # HuggingFace model handle
NUM_CLASSES = 5                        # Number of classification classes
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 8
LEARNING_RATE = 1e-4
NUM_EPOCHS = 5
FREEZE_BACKBONE = False  # Set True to freeze AIDO.Cell weights (ignored if USE_LORA=True)

# LoRA/PEFT configuration
USE_LORA = True      # Set True for parameter-efficient fine-tuning with LoRA
LORA_R = 8           # LoRA rank (higher = more trainable parameters; default: 8)
LORA_ALPHA = 16      # LoRA alpha (scaling factor; default: 16)
LORA_DROPOUT = 0.1   # LoRA dropout
LORA_TARGET_MODULES = ["query", "value"]  # Modules to apply LoRA to (query, key, value, dense)
```

2. **Run the fine-tuning script**:

```bash
python finetune.py
```

3. **Load your model**

After training, load your fine-tuned model (the configuration variables and `lora_config` are the ones defined in `finetune.py`):

```python
import torch

from finetune import CellFoundationClassifier

model = CellFoundationClassifier(MODEL_NAME, NUM_CLASSES, FREEZE_BACKBONE, USE_LORA, lora_config)
checkpoint = torch.load("best_model.pt")
model.load_state_dict(checkpoint["model_state_dict"])
```
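To see what `LORA_R` and `LORA_ALPHA` buy you: a rank-`r` adapter on a `(d_out, d_in)` weight adds `r * (d_in + d_out)` trainable parameters, and its output is scaled by `alpha / r`. A back-of-the-envelope sketch (the hidden size of 512 is illustrative, not taken from the model config):

```python
def lora_extra_params(d_in: int, d_out: int, r: int) -> int:
    # Down-projection A (r x d_in) plus up-projection B (d_out x r)
    return r * d_in + d_out * r

d = 512                 # hypothetical hidden size
full = d * d            # parameters in one frozen projection matrix
extra = lora_extra_params(d, d, r=8)
scaling = 16 / 8        # LORA_ALPHA / LORA_R, applied to the adapter output

print(extra, extra / full, scaling)  # 8192 0.03125 2.0
```

Doubling `LORA_R` doubles the adapter's parameter count; with `LORA_ALPHA` fixed, it also halves the scaling, which is why the two are usually tuned together.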

## Quirks

AIDO.Cell was pre-trained on a fixed set of 19,264 genes with a read-depth-aware objective, so all inputs should be processed with the `aido_cell.utils.gene_alignment` and `aido_cell.utils.preprocessing` tools.

1. Gene alignment
   1. Removes genes in your data that aren't in AIDO.Cell's gene set
   2. Adds zero-filled entries for genes in AIDO.Cell's set that are missing from your data
   3. Reorders genes to match AIDO.Cell's expected order
   4. Creates attention masks so the model knows which genes are actually present
2. Preprocessing
   1. Calculates log10 of total counts per cell (minimum 5) for the depth tokens
   2. Normalizes counts to log1p(CPM), where CPM here is counts per 10,000
   3. Appends two depth tokens (rawcountsidx, inputcountidx) to the sequence.
      In pretraining these indicated the input and desired output depth, but in this script they are fixed to be equal.
   4. Clips values at 20
   5. Converts to bfloat16
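The pipeline above can be sketched for a single cell with NumPy (a simplified illustration using a toy three-gene reference; the real utilities in `aido_cell.utils` operate on AnnData objects and the full 19,264-gene list):

```python
import numpy as np

# Toy reference gene order standing in for the 19,264-gene list
REF_GENES = ["GENE_A", "GENE_B", "GENE_C"]

def align_cell(counts: dict) -> tuple:
    """Zero-fill missing genes, drop unknown ones, reorder, and build a mask."""
    values = np.array([counts.get(g, 0.0) for g in REF_GENES])
    mask = np.array([g in counts for g in REF_GENES])  # which genes were observed
    return values, mask

def preprocess_cell(values: np.ndarray) -> np.ndarray:
    total = values.sum()
    depth = np.log10(max(total, 5.0))       # depth token: log10 of total counts, floored at 5
    x = np.log1p(values / total * 1e4)      # log1p of counts scaled to 10,000 per cell
    x = np.append(x, [depth, depth])        # rawcountsidx == inputcountidx in this script
    return np.clip(x, None, 20.0)           # clip at 20 (the model then casts to bfloat16)

values, mask = align_cell({"GENE_B": 30.0, "GENE_X": 5.0})  # GENE_X is not in the set: dropped
x = preprocess_cell(values)
```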

## Package Structure

```
aido.cell/
├── embed.py                    # Embedding generation script
├── finetune.py                 # Fine-tuning script with LoRA
├── pyproject.toml              # Package configuration
└── aido_cell/                  # Python package
    ├── __init__.py
    ├── models/                 # CellFoundation model implementations
    │   ├── __init__.py
    │   ├── configuration_cellfoundation.py
    │   ├── modeling_cellfoundation.py
    │   └── gene_lists/         # Reference gene set (19,264 genes)
    │       └── OS_scRNA_gene_index.19264.tsv
    └── utils/                  # Utility functions
        ├── __init__.py
        ├── gene_alignment.py   # Gene alignment utilities
        └── preprocessing.py    # Data normalization (log1p CPM + depth tokens)
```

## Available Models

AIDO.Cell models on HuggingFace:
- `genbio-ai/AIDO.Cell-3M`
- `genbio-ai/AIDO.Cell-10M`
- `genbio-ai/AIDO.Cell-100M`

Check the [AIDO.Cell HuggingFace page](https://huggingface.co/genbio-ai) for the latest models.