From bd0491a98c2bb755dac85d7c191df0993f46afe4 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:56:02 +0100 Subject: [PATCH 1/8] Update validation.py --- cytetype/preprocessing/validation.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index bbede86..ae8113e 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -275,6 +275,21 @@ def validate_adata( ) -> str | None: if cell_group_key not in adata.obs: raise KeyError(f"Cell group key '{cell_group_key}' not found in `adata.obs`.") + + nan_mask = adata.obs[cell_group_key].isna() + n_nan = int(nan_mask.sum()) + if n_nan > 0: + if n_nan == adata.n_obs: + raise ValueError( + f"All {n_nan} cells have NaN values in '{cell_group_key}'. " + f"Cannot proceed with annotation." + ) + pct = round(100 * n_nan / adata.n_obs, 1) + logger.warning( + f"Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'." + ) + adata._inplace_subset_obs(~nan_mask) + if adata.X is None: raise ValueError( "`adata.X` is required for ranking genes. Please ensure it contains log1p normalized data." From 0649d874b3b18463341111d03d09bcafd550f700 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:28:00 +0100 Subject: [PATCH 2/8] raise error and drop_na_cells arg --- cytetype/main.py | 4 +++- cytetype/preprocessing/validation.py | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cytetype/main.py b/cytetype/main.py index 55b577a..256b09d 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -87,6 +87,7 @@ def __init__( max_metadata_categories: int = 500, api_url: str = "https://prod.cytetype.nygen.io", auth_token: str | None = None, + drop_na_cells: bool = False, ) -> None: """Initialize CyteType with AnnData object and perform data preparation. @@ -152,7 +153,8 @@ def __init__( self._original_gene_symbols_column = self.gene_symbols_column self.coordinates_key = validate_adata( - adata, group_key, rank_key, self.gene_symbols_column, coordinates_key + adata, group_key, rank_key, self.gene_symbols_column, coordinates_key, + drop_na_cells=drop_na_cells, ) ( self.gene_symbols_column, diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index ae8113e..ba87b7e 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -272,6 +272,7 @@ def validate_adata( rank_genes_key: str, gene_symbols_col: str | None, coordinates_key: str, + drop_na_cells: bool = False, ) -> str | None: if cell_group_key not in adata.obs: raise KeyError(f"Cell group key '{cell_group_key}' not found in `adata.obs`.") @@ -279,12 +280,17 @@ def validate_adata( nan_mask = adata.obs[cell_group_key].isna() n_nan = int(nan_mask.sum()) if n_nan > 0: + pct = round(100 * n_nan / adata.n_obs, 1) if n_nan == adata.n_obs: raise ValueError( f"All {n_nan} cells have NaN values in '{cell_group_key}'. " f"Cannot proceed with annotation." ) - pct = round(100 * n_nan / adata.n_obs, 1) + if not drop_na_cells: + raise ValueError( + f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. " + f"Either fix the data or set drop_na_cells=True to exclude these cells." + ) logger.warning( f"Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'." ) From ab7741902c8ea707294d85b95034af4d502146b5 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:38:09 +0100 Subject: [PATCH 3/8] warning message --- cytetype/preprocessing/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index ba87b7e..a7d6647 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -292,7 +292,8 @@ def validate_adata( f"Either fix the data or set drop_na_cells=True to exclude these cells." ) logger.warning( - f"Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'." + f"⚠️ Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. " + f"{adata.n_obs - n_nan} cells remaining." ) adata._inplace_subset_obs(~nan_mask) From 96c670b51bf963cac66066b6766f07856f32731b Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:05:05 +0100 Subject: [PATCH 4/8] Update validation.py --- cytetype/preprocessing/validation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index a7d6647..2b2126d 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -279,6 +279,7 @@ def validate_adata( nan_mask = adata.obs[cell_group_key].isna() n_nan = int(nan_mask.sum()) + _pending_nan_drop = False if n_nan > 0: pct = round(100 * n_nan / adata.n_obs, 1) if n_nan == adata.n_obs: @@ -291,11 +292,7 @@ def validate_adata( f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. " f"Either fix the data or set drop_na_cells=True to exclude these cells." ) - logger.warning( - f"⚠️ Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. " - f"{adata.n_obs - n_nan} cells remaining." - ) - adata._inplace_subset_obs(~nan_mask) + _pending_nan_drop = True if adata.X is None: raise ValueError( @@ -361,4 +358,11 @@ def validate_adata( f"Visualization will be disabled." ) + if _pending_nan_drop: + logger.warning( + f"⚠️ Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. " + f"{adata.n_obs - n_nan} cells remaining." + ) + adata._inplace_subset_obs(~nan_mask) + return found_coordinates_key From d36c831de9c6a1833a4e5537e1c5a2a477dc40ef Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:15:23 +0100 Subject: [PATCH 5/8] docstring for drop na flag --- cytetype/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cytetype/main.py b/cytetype/main.py index 256b09d..288c42f 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -126,6 +126,9 @@ def __init__( deployment. Defaults to "https://prod.cytetype.nygen.io". auth_token (str | None, optional): Bearer token for API authentication. If provided, will be included in the Authorization header as "Bearer {auth_token}". Defaults to None. + drop_na_cells (bool, optional): If True, cells with NaN values in the + ``group_key`` column are dropped with a warning. If False (default), + a ``ValueError`` is raised instead. Raises: KeyError: If the required keys are missing in `adata.obs` or `adata.uns` From 4124401e441b786e930d7e3f49b9d9f5e0a913d3 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:14:23 +0100 Subject: [PATCH 6/8] fix to no mod adata --- cytetype/main.py | 13 +++++++++++++ cytetype/preprocessing/validation.py | 9 --------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cytetype/main.py b/cytetype/main.py index 288c42f..38758e3 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -159,6 +159,19 @@ def __init__( adata, group_key, rank_key, self.gene_symbols_column, coordinates_key, drop_na_cells=drop_na_cells, ) + + if drop_na_cells: + nan_mask = adata.obs[group_key].isna() + if nan_mask.any(): + n_nan = int(nan_mask.sum()) + pct = round(100 * n_nan / adata.n_obs, 1) + logger.warning( + f"⚠️ Dropping {n_nan} cells ({pct}%) with NaN values in '{group_key}'. " + f"{adata.n_obs - n_nan} cells remaining." + ) + adata = adata[~nan_mask].copy() + self.adata = adata + ( self.gene_symbols_column, self._original_gene_symbols_column, diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index 2b2126d..71fb9e0 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -279,7 +279,6 @@ def validate_adata( nan_mask = adata.obs[cell_group_key].isna() n_nan = int(nan_mask.sum()) - _pending_nan_drop = False if n_nan > 0: pct = round(100 * n_nan / adata.n_obs, 1) if n_nan == adata.n_obs: @@ -292,7 +291,6 @@ def validate_adata( f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. " f"Either fix the data or set drop_na_cells=True to exclude these cells." ) - _pending_nan_drop = True if adata.X is None: raise ValueError( @@ -358,11 +356,4 @@ def validate_adata( f"Visualization will be disabled." ) - if _pending_nan_drop: - logger.warning( - f"⚠️ Dropping {n_nan} cells ({pct}%) with NaN values in '{cell_group_key}'. " - f"{adata.n_obs - n_nan} cells remaining." - ) - adata._inplace_subset_obs(~nan_mask) - return found_coordinates_key From 8dd88b1e1094a0509f0bcf7b9911b5af345d0a0e Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:53:01 +0100 Subject: [PATCH 7/8] label_na as unknown --- cytetype/main.py | 42 +++++++++++++++++++++------- cytetype/preprocessing/validation.py | 17 +++++++++-- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/cytetype/main.py b/cytetype/main.py index 38758e3..e5f28be 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -21,7 +21,10 @@ aggregate_cluster_metadata, extract_visualization_coordinates, ) -from .preprocessing.validation import materialize_canonical_gene_symbols_column +from .preprocessing.validation import ( + materialize_canonical_gene_symbols_column, + _generate_unique_na_label, +) from .core.payload import build_annotation_payload, save_query_to_file from .core.artifacts import ( _is_integer_valued, @@ -87,7 +90,7 @@ def __init__( max_metadata_categories: int = 500, api_url: str = "https://prod.cytetype.nygen.io", auth_token: str | None = None, - drop_na_cells: bool = False, + label_na: bool = False, ) -> None: """Initialize CyteType with AnnData object and perform data preparation. @@ -126,9 +129,11 @@ def __init__( deployment. Defaults to "https://prod.cytetype.nygen.io". auth_token (str | None, optional): Bearer token for API authentication. If provided, will be included in the Authorization header as "Bearer {auth_token}". Defaults to None. - drop_na_cells (bool, optional): If True, cells with NaN values in the - ``group_key`` column are dropped with a warning. If False (default), - a ``ValueError`` is raised instead. + label_na (bool, optional): If True, cells with NaN values in the + ``group_key`` column are assigned an ``'Unknown'`` cluster label + (or ``'Unknown 2'``, etc. if that label already exists). The original + AnnData object is not modified. If False (default), a ``ValueError`` + is raised instead. Raises: KeyError: If the required keys are missing in `adata.obs` or `adata.uns` @@ -157,19 +162,36 @@ def __init__( self.coordinates_key = validate_adata( adata, group_key, rank_key, self.gene_symbols_column, coordinates_key, - drop_na_cells=drop_na_cells, + label_na=label_na, ) - if drop_na_cells: + if label_na: nan_mask = adata.obs[group_key].isna() if nan_mask.any(): n_nan = int(nan_mask.sum()) pct = round(100 * n_nan / adata.n_obs, 1) + existing_labels = set( + str(v) for v in adata.obs[group_key].dropna().unique() + ) + na_label = _generate_unique_na_label(existing_labels) logger.warning( - f"⚠️ Dropping {n_nan} cells ({pct}%) with NaN values in '{group_key}'. " - f"{adata.n_obs - n_nan} cells remaining." + f"⚠️ Relabeling {n_nan} cells ({pct}%) with NaN values " + f"in '{group_key}' as '{na_label}'." + ) + adata = anndata.AnnData( + X=adata.X, + obs=adata.obs.copy(), + var=adata.var, + uns=adata.uns, + obsm=adata.obsm, + varm=adata.varm, + layers=adata.layers, + obsp=adata.obsp, + varp=adata.varp, + ) + adata.obs[group_key] = ( + adata.obs[group_key].fillna(na_label) ) - adata = adata[~nan_mask].copy() self.adata = adata ( diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index 71fb9e0..c785daf 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -266,13 +266,23 @@ def _ur_sort_key(ur: float) -> float: return None +def _generate_unique_na_label(existing_labels: set[str]) -> str: + label = "Unknown" + if label not in existing_labels: + return label + n = 2 + while f"{label} {n}" in existing_labels: + n += 1 + return f"{label} {n}" + + def validate_adata( adata: anndata.AnnData, cell_group_key: str, rank_genes_key: str, gene_symbols_col: str | None, coordinates_key: str, - drop_na_cells: bool = False, + label_na: bool = False, ) -> str | None: if cell_group_key not in adata.obs: raise KeyError(f"Cell group key '{cell_group_key}' not found in `adata.obs`.") @@ -286,10 +296,11 @@ def validate_adata( f"All {n_nan} cells have NaN values in '{cell_group_key}'. " f"Cannot proceed with annotation." ) - if not drop_na_cells: + if not label_na: raise ValueError( f"{n_nan} cells ({pct}%) have NaN values in '{cell_group_key}'. " - f"Either fix the data or set drop_na_cells=True to exclude these cells." + f"Either fix the data or set label_na=True to assign these cells " + f"an 'Unknown' cluster label." ) if adata.X is None: From b7c0fed091d1a17debae7e86c8a22a3250a086d9 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Fri, 20 Mar 2026 16:03:50 +0100 Subject: [PATCH 8/8] Update main.py --- cytetype/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cytetype/main.py b/cytetype/main.py index e5f28be..408f556 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -189,9 +189,10 @@ def __init__( obsp=adata.obsp, varp=adata.varp, ) - adata.obs[group_key] = ( - adata.obs[group_key].fillna(na_label) - ) + col = adata.obs[group_key] + if hasattr(col, "cat"): + col = col.cat.add_categories(na_label) + adata.obs[group_key] = col.fillna(na_label) self.adata = adata (