From bd0491a98c2bb755dac85d7c191df0993f46afe4 Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Thu, 19 Mar 2026 15:56:02 +0100
Subject: [PATCH 1/8] Handle NaN cell-group labels in validate_adata

---
 cytetype/preprocessing/validation.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
index bbede86..ae8113e 100644
--- a/cytetype/preprocessing/validation.py
+++ b/cytetype/preprocessing/validation.py
@@ -275,6 +275,21 @@ def validate_adata(
 ) -> str | None:
     if cell_group_key not in adata.obs:
         raise KeyError(f"Cell group key '{cell_group_key}' not found in `adata.obs`.")
+
+    nan_mask = adata.obs[cell_group_key].isna()
+    n_nan = int(nan_mask.sum())
+    if n_nan > 0:
+        if n_nan == adata.n_obs:
+            raise ValueError(
+                f"All {n_nan} cells have NaN values in '{cell_group_key}'. "
+                f"Cannot proceed with annotation."
+            )
+        pct = round(100 * n_nan / adata.n_obs, 1)
+        logger.warning(
+            f"Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'."
+        )
+        adata._inplace_subset_obs(~nan_mask)
+
     if adata.X is None:
         raise ValueError(
             "`adata.X` is required for ranking genes. Please ensure it contains log1p normalized data."

From 0649d874b3b18463341111d03d09bcafd550f700 Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Thu, 19 Mar 2026 16:28:00 +0100
Subject: [PATCH 2/8] raise error and drop_na_cells arg

---
 cytetype/main.py                     | 4 +++-
 cytetype/preprocessing/validation.py | 8 +++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/cytetype/main.py b/cytetype/main.py
index 55b577a..256b09d 100644
--- a/cytetype/main.py
+++ b/cytetype/main.py
@@ -87,6 +87,7 @@ def __init__(
         max_metadata_categories: int = 500,
         api_url: str = "https://prod.cytetype.nygen.io",
         auth_token: str | None = None,
+        drop_na_cells: bool = False,
     ) -> None:
         """Initialize CyteType with AnnData object and perform data preparation.
 
@@ -152,7 +153,8 @@ def __init__(
             self._original_gene_symbols_column = self.gene_symbols_column
 
             self.coordinates_key = validate_adata(
-                adata, group_key, rank_key, self.gene_symbols_column, coordinates_key
+                adata, group_key, rank_key, self.gene_symbols_column, coordinates_key,
+                drop_na_cells=drop_na_cells,
             )
             (
                 self.gene_symbols_column,
diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
index ae8113e..ba87b7e 100644
--- a/cytetype/preprocessing/validation.py
+++ b/cytetype/preprocessing/validation.py
@@ -272,6 +272,7 @@ def validate_adata(
     rank_genes_key: str,
     gene_symbols_col: str | None,
     coordinates_key: str,
+    drop_na_cells: bool = False,
 ) -> str | None:
     if cell_group_key not in adata.obs:
         raise KeyError(f"Cell group key '{cell_group_key}' not found in `adata.obs`.")
@@ -279,12 +280,17 @@ def validate_adata(
     nan_mask = adata.obs[cell_group_key].isna()
     n_nan = int(nan_mask.sum())
     if n_nan > 0:
+        pct = round(100 * n_nan / adata.n_obs, 1)
         if n_nan == adata.n_obs:
             raise ValueError(
                 f"All {n_nan} cells have NaN values in '{cell_group_key}'. "
                 f"Cannot proceed with annotation."
             )
-        pct = round(100 * n_nan / adata.n_obs, 1)
+        if not drop_na_cells:
+            raise ValueError(
+                f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. "
+                f"Either fix the data or set drop_na_cells=True to exclude these cells."
+            )
         logger.warning(
             f"Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'."
         )

From ab7741902c8ea707294d85b95034af4d502146b5 Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Thu, 19 Mar 2026 16:38:09 +0100
Subject: [PATCH 3/8] Report remaining cell count in NaN-drop warning

---
 cytetype/preprocessing/validation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
index ba87b7e..a7d6647 100644
--- a/cytetype/preprocessing/validation.py
+++ b/cytetype/preprocessing/validation.py
@@ -292,7 +292,8 @@ def validate_adata(
                 f"Either fix the data or set drop_na_cells=True to exclude these cells."
             )
         logger.warning(
-            f"Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'."
+            f"⚠️  Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. "
+            f"{adata.n_obs - n_nan} cells remaining."
         )
         adata._inplace_subset_obs(~nan_mask)
 

From 96c670b51bf963cac66066b6766f07856f32731b Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Fri, 20 Mar 2026 09:05:05 +0100
Subject: [PATCH 4/8] Defer NaN cell drop until the end of validate_adata

---
 cytetype/preprocessing/validation.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
index a7d6647..2b2126d 100644
--- a/cytetype/preprocessing/validation.py
+++ b/cytetype/preprocessing/validation.py
@@ -279,6 +279,7 @@ def validate_adata(
 
     nan_mask = adata.obs[cell_group_key].isna()
     n_nan = int(nan_mask.sum())
+    _pending_nan_drop = False
     if n_nan > 0:
         pct = round(100 * n_nan / adata.n_obs, 1)
         if n_nan == adata.n_obs:
@@ -291,11 +292,7 @@ def validate_adata(
                 f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. "
                 f"Either fix the data or set drop_na_cells=True to exclude these cells."
             )
-        logger.warning(
-            f"⚠️  Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. "
-            f"{adata.n_obs - n_nan} cells remaining."
-        )
-        adata._inplace_subset_obs(~nan_mask)
+        _pending_nan_drop = True
 
     if adata.X is None:
         raise ValueError(
@@ -361,4 +358,11 @@ def validate_adata(
             f"Visualization will be disabled."
         )
 
+    if _pending_nan_drop:
+        logger.warning(
+            f"⚠️  Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. "
+            f"{adata.n_obs - n_nan} cells remaining."
+        )
+        adata._inplace_subset_obs(~nan_mask)
+
     return found_coordinates_key

From d36c831de9c6a1833a4e5537e1c5a2a477dc40ef Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Fri, 20 Mar 2026 09:15:23 +0100
Subject: [PATCH 5/8] docstring for drop na flag

---
 cytetype/main.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cytetype/main.py b/cytetype/main.py
index 256b09d..288c42f 100644
--- a/cytetype/main.py
+++ b/cytetype/main.py
@@ -126,6 +126,9 @@ def __init__(
                 deployment. Defaults to "https://prod.cytetype.nygen.io".
             auth_token (str | None, optional): Bearer token for API authentication. If provided,
                 will be included in the Authorization header as "Bearer {auth_token}". Defaults to None.
+            drop_na_cells (bool, optional): If True, cells with NaN values in the
+                ``group_key`` column are dropped with a warning. If False (default),
+                a ``ValueError`` is raised instead.
 
         Raises:
             KeyError: If the required keys are missing in `adata.obs` or `adata.uns`

From 4124401e441b786e930d7e3f49b9d9f5e0a913d3 Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Fri, 20 Mar 2026 10:14:23 +0100
Subject: [PATCH 6/8] Drop NaN cells on a copy instead of modifying adata in place

---
 cytetype/main.py                     | 13 +++++++++++++
 cytetype/preprocessing/validation.py |  9 ---------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cytetype/main.py b/cytetype/main.py
index 288c42f..38758e3 100644
--- a/cytetype/main.py
+++ b/cytetype/main.py
@@ -159,6 +159,19 @@ def __init__(
                 adata, group_key, rank_key, self.gene_symbols_column, coordinates_key,
                 drop_na_cells=drop_na_cells,
             )
+
+            if drop_na_cells:
+                nan_mask = adata.obs[group_key].isna()
+                if nan_mask.any():
+                    n_nan = int(nan_mask.sum())
+                    pct = round(100 * n_nan / adata.n_obs, 1)
+                    logger.warning(
+                        f"⚠️  Dropping {n_nan} cells ({pct}%) with NaN values in '{group_key}'. "
+                        f"{adata.n_obs - n_nan} cells remaining."
+                    )
+                    adata = adata[~nan_mask].copy()
+                    self.adata = adata
+
             (
                 self.gene_symbols_column,
                 self._original_gene_symbols_column,
diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
index 2b2126d..71fb9e0 100644
--- a/cytetype/preprocessing/validation.py
+++ b/cytetype/preprocessing/validation.py
@@ -279,7 +279,6 @@ def validate_adata(
 
     nan_mask = adata.obs[cell_group_key].isna()
     n_nan = int(nan_mask.sum())
-    _pending_nan_drop = False
     if n_nan > 0:
         pct = round(100 * n_nan / adata.n_obs, 1)
         if n_nan == adata.n_obs:
@@ -292,7 +291,6 @@ def validate_adata(
                 f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. "
                 f"Either fix the data or set drop_na_cells=True to exclude these cells."
             )
-        _pending_nan_drop = True
 
     if adata.X is None:
         raise ValueError(
@@ -358,11 +356,4 @@ def validate_adata(
             f"Visualization will be disabled."
         )
 
-    if _pending_nan_drop:
-        logger.warning(
-            f"⚠️  Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. "
-            f"{adata.n_obs - n_nan} cells remaining."
-        )
-        adata._inplace_subset_obs(~nan_mask)
-
     return found_coordinates_key

From 8dd88b1e1094a0509f0bcf7b9911b5af345d0a0e Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Fri, 20 Mar 2026 15:53:01 +0100
Subject: [PATCH 7/8] label_na as unknown

---
 cytetype/main.py                     | 42 +++++++++++++++++++++-------
 cytetype/preprocessing/validation.py | 17 +++++++++--
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/cytetype/main.py b/cytetype/main.py
index 38758e3..e5f28be 100644
--- a/cytetype/main.py
+++ b/cytetype/main.py
@@ -21,7 +21,10 @@
     aggregate_cluster_metadata,
     extract_visualization_coordinates,
 )
-from .preprocessing.validation import materialize_canonical_gene_symbols_column
+from .preprocessing.validation import (
+    materialize_canonical_gene_symbols_column,
+    _generate_unique_na_label,
+)
 from .core.payload import build_annotation_payload, save_query_to_file
 from .core.artifacts import (
     _is_integer_valued,
@@ -87,7 +90,7 @@ def __init__(
         max_metadata_categories: int = 500,
         api_url: str = "https://prod.cytetype.nygen.io",
         auth_token: str | None = None,
-        drop_na_cells: bool = False,
+        label_na: bool = False,
     ) -> None:
         """Initialize CyteType with AnnData object and perform data preparation.
 
@@ -126,9 +129,11 @@ def __init__(
                 deployment. Defaults to "https://prod.cytetype.nygen.io".
             auth_token (str | None, optional): Bearer token for API authentication. If provided,
                 will be included in the Authorization header as "Bearer {auth_token}". Defaults to None.
-            drop_na_cells (bool, optional): If True, cells with NaN values in the
-                ``group_key`` column are dropped with a warning. If False (default),
-                a ``ValueError`` is raised instead.
+            label_na (bool, optional): If True, cells with NaN values in the
+                ``group_key`` column are assigned an ``'Unknown'`` cluster label
+                (or ``'Unknown 2'``, etc. if that label already exists). The original
+                AnnData object is not modified. If False (default), a ``ValueError``
+                is raised instead.
 
         Raises:
             KeyError: If the required keys are missing in `adata.obs` or `adata.uns`
@@ -157,19 +162,36 @@ def __init__(
 
             self.coordinates_key = validate_adata(
                 adata, group_key, rank_key, self.gene_symbols_column, coordinates_key,
-                drop_na_cells=drop_na_cells,
+                label_na=label_na,
             )
 
-            if drop_na_cells:
+            if label_na:
                 nan_mask = adata.obs[group_key].isna()
                 if nan_mask.any():
                     n_nan = int(nan_mask.sum())
                     pct = round(100 * n_nan / adata.n_obs, 1)
+                    existing_labels = set(
+                        str(v) for v in adata.obs[group_key].dropna().unique()
+                    )
+                    na_label = _generate_unique_na_label(existing_labels)
                     logger.warning(
-                        f"⚠️  Dropping {n_nan} cells ({pct}%) with NaN values in '{group_key}'. "
-                        f"{adata.n_obs - n_nan} cells remaining."
+                        f"⚠️  Relabeling {n_nan} cells ({pct}%) with NaN values "
+                        f"in '{group_key}' as '{na_label}'."
+                    )
+                    adata = anndata.AnnData(
+                        X=adata.X,
+                        obs=adata.obs.copy(),
+                        var=adata.var,
+                        uns=adata.uns,
+                        obsm=adata.obsm,
+                        varm=adata.varm,
+                        layers=adata.layers,
+                        obsp=adata.obsp,
+                        varp=adata.varp,
+                    )
+                    adata.obs[group_key] = (
+                        adata.obs[group_key].fillna(na_label)
                     )
-                    adata = adata[~nan_mask].copy()
                     self.adata = adata
 
             (
diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py
index 71fb9e0..c785daf 100644
--- a/cytetype/preprocessing/validation.py
+++ b/cytetype/preprocessing/validation.py
@@ -266,13 +266,23 @@ def _ur_sort_key(ur: float) -> float:
     return None
 
 
+def _generate_unique_na_label(existing_labels: set[str]) -> str:
+    label = "Unknown"
+    if label not in existing_labels:
+        return label
+    n = 2
+    while f"{label} {n}" in existing_labels:
+        n += 1
+    return f"{label} {n}"
+
+
 def validate_adata(
     adata: anndata.AnnData,
     cell_group_key: str,
     rank_genes_key: str,
     gene_symbols_col: str | None,
     coordinates_key: str,
-    drop_na_cells: bool = False,
+    label_na: bool = False,
 ) -> str | None:
     if cell_group_key not in adata.obs:
         raise KeyError(f"Cell group key '{cell_group_key}' not found in `adata.obs`.")
@@ -286,10 +296,11 @@ def validate_adata(
                 f"All {n_nan} cells have NaN values in '{cell_group_key}'. "
                 f"Cannot proceed with annotation."
             )
-        if not drop_na_cells:
+        if not label_na:
             raise ValueError(
                 f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. "
-                f"Either fix the data or set drop_na_cells=True to exclude these cells."
+                f"Either fix the data or set label_na=True to assign these cells "
+                f"an 'Unknown' cluster label."
             )
 
     if adata.X is None:

From b7c0fed091d1a17debae7e86c8a22a3250a086d9 Mon Sep 17 00:00:00 2001
From: Yi Su <90744702+suu-yi@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:03:50 +0100
Subject: [PATCH 8/8] Add NaN label to categories before fillna on categorical group column

---
 cytetype/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cytetype/main.py b/cytetype/main.py
index e5f28be..408f556 100644
--- a/cytetype/main.py
+++ b/cytetype/main.py
@@ -189,9 +189,10 @@ def __init__(
                         obsp=adata.obsp,
                         varp=adata.varp,
                     )
-                    adata.obs[group_key] = (
-                        adata.obs[group_key].fillna(na_label)
-                    )
+                    col = adata.obs[group_key]
+                    if hasattr(col, "cat"):
+                        col = col.cat.add_categories(na_label)
+                    adata.obs[group_key] = col.fillna(na_label)
                     self.adata = adata
 
             (