From f2d07ea1bbde9690117a9e1d9137519b0fc85b90 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:59:07 +0100 Subject: [PATCH 1/7] clean unused arg --- cytetype/main.py | 3 ++- cytetype/preprocessing/validation.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cytetype/main.py b/cytetype/main.py index 55b577a..4c07fee 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -146,13 +146,14 @@ def __init__( self._temporary_gene_symbols_column: str | None = None try: + self.gene_symbols_column = resolve_gene_symbols_column( adata, gene_symbols_column ) self._original_gene_symbols_column = self.gene_symbols_column self.coordinates_key = validate_adata( - adata, group_key, rank_key, self.gene_symbols_column, coordinates_key + adata, group_key, rank_key, coordinates_key ) ( self.gene_symbols_column, diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index bbede86..a6e1ef5 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -270,7 +270,6 @@ def validate_adata( adata: anndata.AnnData, cell_group_key: str, rank_genes_key: str, - gene_symbols_col: str | None, coordinates_key: str, ) -> str | None: if cell_group_key not in adata.obs: From 7dac18b386e70829e3b3aaa156f221eb14d64632 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Wed, 25 Mar 2026 16:41:18 +0100 Subject: [PATCH 2/7] Update validation.py --- cytetype/preprocessing/validation.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index a6e1ef5..1a641d7 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -28,7 +28,7 @@ def _is_gene_id_like(value: str) -> bool: if re.match(r"^[NX][MR]_\d+$", value): return True - if re.match(r"^\d+$", value): + if re.match(r"^\d+(?:\.0)?$", value): return True if re.match(r"^[A-Z0-9]+[._][A-Z0-9._]+$", value) and len(value) > 10: @@ -102,7 +102,21 @@ def materialize_canonical_gene_symbols_column( source_name = f"column '{gene_symbols_column}'" canonical_column = _temporary_gene_symbols_column_name(adata) - adata.var[canonical_column] = clean_gene_names(source_values) + cleaned = clean_gene_names(source_values) + + id_pct = _id_like_percentage(cleaned) + if id_pct > 50: + examples = [v for v in cleaned[:20] if _is_gene_id_like(v)][:5] + raise ValueError( + f"The resolved gene symbols from {source_name} contain more than 50% gene IDs rather than " + f"gene symbols ({id_pct:.0f}% of values look like identifiers, e.g., {examples}). " + f"CyteType requires human-readable gene symbols (e.g., 'TSPAN6', 'DPM1', 'SCYL3'). " + f"Please provide a gene_symbols_column pointing to a column in adata.var " + f"that contains gene symbols, or convert your gene identifiers to symbols " + f"before running CyteType." + ) + + adata.var[canonical_column] = cleaned logger.info( f"Materialized canonical gene symbols in temporary column '{canonical_column}' " f"from {source_name}." From a4216161dffc8deac110843460d67ed6688c3307 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Wed, 25 Mar 2026 16:44:53 +0100 Subject: [PATCH 3/7] Create test_validation.py --- tests/test_validation.py | 97 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 0000000..b93e610 --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,97 @@ +import pytest + +from cytetype.preprocessing.validation import _is_gene_id_like, _id_like_percentage + + +class TestIsGeneIdLike: + + @pytest.mark.parametrize("value", [ + "ENSG00000000003", + "ENSG00000000003.14", + "ENSMUSG00000000001", + "ensg00000000003", + ]) + def test_ensembl_ids(self, value: str) -> None: + assert _is_gene_id_like(value) is True + + @pytest.mark.parametrize("value", [ + "NM_001301", + "NR_046018", + "XM_011541", + "XR_001737", + ]) + def test_refseq_ids(self, value: str) -> None: + assert _is_gene_id_like(value) is True + + @pytest.mark.parametrize("value", [ + "7157", + "672", + "11286", + "0", + ]) + def test_integer_entrez_ids(self, value: str) -> None: + assert _is_gene_id_like(value) is True + + @pytest.mark.parametrize("value", [ + "7157.0", + "672.0", + "11286.0", + "0.0", + ]) + def test_float_stringified_entrez_ids(self, value: str) -> None: + assert _is_gene_id_like(value) is True + + @pytest.mark.parametrize("value", [ + "AFFY_HG_U133A.207163_S_AT", + "ILLUMINA_HUMANHT_12_V4.ILMN_1762337", + ]) + def test_long_dotted_ids(self, value: str) -> None: + assert _is_gene_id_like(value) is True + + @pytest.mark.parametrize("value", [ + "TSPAN6", + "DPM1", + "SCYL3", + "TP53", + "BRCA1", + "CD8A", + "MS4A1", + ]) + def test_gene_symbols_not_flagged(self, value: str) -> None: + assert _is_gene_id_like(value) is False + + @pytest.mark.parametrize("value", [ + "", + " ", + "7157.5", + ]) + def test_edge_cases(self, value: str) -> None: + assert _is_gene_id_like(value) is False + + +class TestIdLikePercentage: + + def test_all_gene_symbols(self) -> None: + values = ["TSPAN6", "DPM1", "SCYL3", "TP53", "BRCA1"] + assert _id_like_percentage(values) == 0.0 + + def test_all_ensembl_ids(self) -> None: + values = [f"ENSG{i:011d}" for i in range(20)] + assert _id_like_percentage(values) == 100.0 + + def test_all_integer_entrez_ids(self) -> None: + values = ["7157", "672", "3845", "11286", "9952"] + assert _id_like_percentage(values) == 100.0 + + def test_all_float_entrez_ids(self) -> None: + values = ["7157.0", "672.0", "3845.0", "11286.0", "9952.0"] + assert _id_like_percentage(values) == 100.0 + + def test_mixed_float_entrez_and_symbols(self) -> None: + symbols = ["TSPAN6", "DPM1", "SCYL3"] + entrez = ["7157.0", "672.0", "3845.0", "11286.0", "9952.0", "904.0", "405.0"] + pct = _id_like_percentage(symbols + entrez) + assert pct == 70.0 + + def test_empty_list(self) -> None: + assert _id_like_percentage([]) == 100.0 From 15a302e7f6bb63b8481cffe0cdcebff5beae6fa5 Mon Sep 17 00:00:00 2001 From: Yi Su Date: Wed, 25 Mar 2026 17:03:28 +0100 Subject: [PATCH 4/7] comma --- cytetype/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cytetype/main.py b/cytetype/main.py index 5cdd34d..410f021 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -162,7 +162,7 @@ def __init__( self._original_gene_symbols_column = self.gene_symbols_column self.coordinates_key = validate_adata( - adata, group_key, rank_key, coordinates_key + adata, group_key, rank_key, coordinates_key, label_na=label_na, ) From dbd59a6d3f1cff046788b1673c50281ef8870464 Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:18:28 +0100 Subject: [PATCH 5/7] qodo fix 1 --- cytetype/preprocessing/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index 1a641d7..62769a8 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -105,7 +105,7 @@ def materialize_canonical_gene_symbols_column( cleaned = clean_gene_names(source_values) id_pct = _id_like_percentage(cleaned) - if id_pct > 50: + if id_pct > 49: examples = [v for v in cleaned[:20] if _is_gene_id_like(v)][:5] raise ValueError( f"The resolved gene symbols from {source_name} contain more than 50% gene IDs rather than " From 6512bdfe57ad783f0040ea1a8d11d8850189072b Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:31:18 +0100 Subject: [PATCH 6/7] Update validation.py --- cytetype/preprocessing/validation.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index 580a27a..acebf09 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -106,14 +106,13 @@ def materialize_canonical_gene_symbols_column( id_pct = _id_like_percentage(cleaned) if id_pct > 49: - examples = [v for v in cleaned[:20] if _is_gene_id_like(v)][:5] raise ValueError( - f"The resolved gene symbols from {source_name} contain more than 50% gene IDs rather than " - f"gene symbols ({id_pct:.0f}% of values look like identifiers, e.g., {examples}). " - f"CyteType requires human-readable gene symbols (e.g., 'TSPAN6', 'DPM1', 'SCYL3'). " - f"Please provide a gene_symbols_column pointing to a column in adata.var " - f"that contains gene symbols, or convert your gene identifiers to symbols " - f"before running CyteType." + f"\n\nGene Symbol Detection Error\n" + f"{'─' * 50}\n" + f"CyteType requires human-readable gene symbols (e.g., TSPAN6, DPM1, SCYL3)\n" + f"To fix this, either:\n" + f" 1. Set gene_symbols_column to a column in adata.var that contains gene symbols\n" + f" 2. Convert your gene identifiers to symbols before running CyteType\n" ) adata.var[canonical_column] = cleaned From 49b095582f3a86defff39a03363bd732be0d3add Mon Sep 17 00:00:00 2001 From: Yi Su <90744702+suu-yi@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:35:45 +0100 Subject: [PATCH 7/7] revert arg cleanup --- cytetype/main.py | 3 +-- cytetype/preprocessing/validation.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cytetype/main.py b/cytetype/main.py index 410f021..408f556 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -155,14 +155,13 @@ def __init__( self._temporary_gene_symbols_column: str | None = None try: - self.gene_symbols_column = resolve_gene_symbols_column( adata, gene_symbols_column ) self._original_gene_symbols_column = self.gene_symbols_column self.coordinates_key = validate_adata( - adata, group_key, rank_key, coordinates_key, + adata, group_key, rank_key, self.gene_symbols_column, coordinates_key, label_na=label_na, ) diff --git a/cytetype/preprocessing/validation.py b/cytetype/preprocessing/validation.py index acebf09..d458848 100644 --- a/cytetype/preprocessing/validation.py +++ b/cytetype/preprocessing/validation.py @@ -293,6 +293,7 @@ def validate_adata( adata: anndata.AnnData, cell_group_key: str, rank_genes_key: str, + gene_symbols_col: str | None, coordinates_key: str, label_na: bool = False, ) -> str | None: