From f648c0ed8926207e06c255f07304f4d938d5a882 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Thu, 13 Nov 2025 17:18:06 +0100
Subject: [PATCH 01/23] hits_clusterizer included in components.py

---
 invisible_cities/cities/components.py | 52 +++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py
index f94405e83..93c71c323 100644
--- a/invisible_cities/cities/components.py
+++ b/invisible_cities/cities/components.py
@@ -24,6 +24,12 @@
 import math
 import os
 
+
+
+from sklearn.cluster import DBSCAN
+
+
+
 from .. dataflow                  import                  dataflow as  fl
 from .. dataflow.dataflow         import                      sink
 from .. dataflow.dataflow         import                      pipe
@@ -1717,6 +1723,52 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:
 
     return correct
 
+def hits_clusterizer(df_pe_peak: pd.DataFrame, eps=2.3, npt=5)-> pd.DataFrame:
+    """
+    Cluster hits in 3D space for each event using DBSCAN.
+    
+    The coordinates are scaled to account for detector geometry differences 
+    in samplig 
+    
+    Parameters
+    ----------
+    df_pe_peak : pd.DataFrame
+    DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'.
+    
+    Returns
+    -------
+    pd.DataFrame
+    Modified DataFrame with an added 'cluster' column indicating the cluster label 
+    for each hit (-1 for noise).
+    """
+    a = 14.55  # XY scale
+    b = 3.7  # Z scale
+
+    # Pre-allocate array for cluster labels
+    cluster_labels = np.full(len(df_pe_peak), -9999, dtype=int)
+
+    # Get values once (faster than repeatedly accessing DataFrame columns)
+    coords = df_pe_peak[['X', 'Y', 'Z']].to_numpy()
+    events = df_pe_peak['event'].to_numpy()
+    
+    # Use np.unique to get sorted event IDs
+    unique_events = np.unique(events)
+    
+    for event_id in unique_events:
+        mask = (events == event_id)
+        X = coords[mask].copy()
+        
+        # Scale
+        X[:, :2] /= a
+        X[:, 2] /= b
+        
+        labels = DBSCAN(eps=eps, min_samples=npt).fit_predict(X)
+        cluster_labels[mask] = labels
+
+    df_pe_peak['cluster'] = cluster_labels
+
+    return df_pe_peak
+
 
 def identity(x : Any) -> Any:
     return x

From ed5248ed894e19800223d0dccb3805a8c0365e54 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Thu, 13 Nov 2025 17:35:08 +0100
Subject: [PATCH 02/23] Hits clusterizer step included in sophronia.py

---
 invisible_cities/cities/sophronia.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 625ac3032..7580fa6f6 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -64,6 +64,7 @@
 from .  components import               collect
 from .  components import build_pointlike_event as pointlike_event_builder
 from .  components import        hits_corrector
+from .  components import      hits_clusterizer
 from .  components import              identity
 
 from typing import Optional
@@ -93,6 +94,11 @@ def sophronia( files_in           : OneOrManyFiles
              , sipm_charge_type   : SiPMCharge
              , same_peak          : bool
              , corrections        : Optional[dict] = None
+
+            # ¿QUEREMOS ESTO?
+            #  , apply_clustering   : bool  = False   # whether to apply clustering to hits
+            #  , cluster_eps        : float = 2.3     # eps for DBSCAN
+            #  , cluster_min_samples: int   = 5       # min_samples for DBSCAN
              ):
     """
     drift_v : float
@@ -177,6 +183,10 @@ def sophronia( files_in           : OneOrManyFiles
 
     correct_hits   = df.map( hits_corrector(**corrections) if corrections is not None else identity
                            , item = "hits")
+    
+    cluster_hits   = df.map( hits_clusterizer(eps=2.3, min_samples=5)
+                            , args="hits"
+                            , out="hits")
 
     build_pointlike_event = df.map( pointlike_event_builder( detector_db
                                                            , run_number
@@ -202,7 +212,7 @@ def sophronia( files_in           : OneOrManyFiles
                                        , args = "event_number enough_valid_hits".split())
 
         hits_branch         = ( make_hits, enough_valid_hits, df.branch(write_hits_filter)
-                              , hits_select.filter, merge_nn_hits, correct_hits, write_hits)
+                              , hits_select.filter, merge_nn_hits, correct_hits, cluster_hits, write_hits)
         kdst_branch         = build_pointlike_event, write_pointlike_event
         collect_evt_numbers = "event_number", event_number_collector.sink
 

From 8ff46af6dc58d590275f7dfc625e93ea3b536973 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Fri, 14 Nov 2025 13:10:40 +0100
Subject: [PATCH 03/23] Rename min_samples keyword to npt in hits_clusterizer call

---
 invisible_cities/cities/sophronia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 7580fa6f6..3fa6ae4fd 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -184,7 +184,7 @@ def sophronia( files_in           : OneOrManyFiles
     correct_hits   = df.map( hits_corrector(**corrections) if corrections is not None else identity
                            , item = "hits")
     
-    cluster_hits   = df.map( hits_clusterizer(eps=2.3, min_samples=5)
+    cluster_hits   = df.map( hits_clusterizer(eps=2.3, npt=5)
                             , args="hits"
                             , out="hits")
 

From 7b5f1c57c64ff51d9e50ab8df9257047f4f61e7c Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 25 Nov 2025 12:22:54 +0100
Subject: [PATCH 04/23] New implementation of hits_clusterizer, factory version

---
 invisible_cities/cities/components.py | 75 +++++++++++++++------------
 invisible_cities/cities/sophronia.py  |  9 ++--
 2 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py
index 93c71c323..927e3cc92 100644
--- a/invisible_cities/cities/components.py
+++ b/invisible_cities/cities/components.py
@@ -1723,51 +1723,60 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:
 
     return correct
 
-def hits_clusterizer(df_pe_peak: pd.DataFrame, eps=2.3, npt=5)-> pd.DataFrame:
+def hits_clusterizer( eps         : float
+                    , min_samples : float
+                    , scale_xy    : float = 14.55
+                    , scale_z     : float = 3.7
+                    ) -> Callable:
     """
     Cluster hits in 3D space for each event using DBSCAN.
-    
-    The coordinates are scaled to account for detector geometry differences 
-    in samplig 
+    The coordinates are scaled to account for detector geometry differences in samplig 
     
     Parameters
     ----------
-    df_pe_peak : pd.DataFrame
-    DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'.
+    eps         : float, Epsilon value for DBSCAN.
+    min_samples : int, Min Samples value for DBSCAN.
+    scale_xy    : float, scale factor for XY coordinates.
+    scale_z     : float, scale factor for Z coordinate.
     
     Returns
     -------
-    pd.DataFrame
-    Modified DataFrame with an added 'cluster' column indicating the cluster label 
-    for each hit (-1 for noise).
+    Callable
+    A function that takes a DataFrame of hits and returns the same DataFrame 
+    with an added 'cluster' column, which are the clusters labels assigned by DBSCAN
+    (-1 for noise).
     """
-    a = 14.55  # XY scale
-    b = 3.7  # Z scale
+    def cluster_tagger(df_hits: pd.DataFrame) -> pd.DataFrame:
+        if df_hits.empty:
+            return df_hits.assign(cluster=pd.Series(dtype=int))  
 
-    # Pre-allocate array for cluster labels
-    cluster_labels = np.full(len(df_pe_peak), -9999, dtype=int)
+        # Pre-allocate array for cluster labels
+        cluster_labels = np.full(len(df_hits), -9999, dtype=int)
 
-    # Get values once (faster than repeatedly accessing DataFrame columns)
-    coords = df_pe_peak[['X', 'Y', 'Z']].to_numpy()
-    events = df_pe_peak['event'].to_numpy()
-    
-    # Use np.unique to get sorted event IDs
-    unique_events = np.unique(events)
+        # Get values once (faster than repeatedly accessing DataFrame columns)
+        coords = df_hits[['X', 'Y', 'Z']].to_numpy()
+        events = df_hits['event'].to_numpy()
+
+        # Use np.unique to get sorted event IDs
+        unique_events = np.unique(events)
+
+        for event_id in unique_events:
+            mask = (events == event_id)
+            X = coords[mask].copy()
+
+            # Scale
+            X[:, :2] /= scale_xy
+            X[:, 2]  /= scale_z
+
+            # DBSCAN clustering
+            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
+            cluster_labels[mask] = labels
+
+        df_hits['cluster'] = cluster_labels
+
+        return df_hits
     
-    for event_id in unique_events:
-        mask = (events == event_id)
-        X = coords[mask].copy()
-        
-        # Scale
-        X[:, :2] /= a
-        X[:, 2] /= b
-        
-        labels = DBSCAN(eps=eps, min_samples=npt).fit_predict(X)
-        cluster_labels[mask] = labels
-
-    df_pe_peak['cluster'] = cluster_labels
-
-    return df_pe_peak
+    return cluster_tagger
 
 
 def identity(x : Any) -> Any:
diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 3fa6ae4fd..bb425f81c 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -97,8 +97,8 @@ def sophronia( files_in           : OneOrManyFiles
 
             # ¿QUEREMOS ESTO?
             #  , apply_clustering   : bool  = False   # whether to apply clustering to hits
-            #  , cluster_eps        : float = 2.3     # eps for DBSCAN
-            #  , cluster_min_samples: int   = 5       # min_samples for DBSCAN
+             , cluster_eps        : float = 2.3     # eps for DBSCAN
+             , cluster_min_samples: int   = 5       # min_samples for DBSCAN
              ):
     """
     drift_v : float
@@ -184,9 +184,8 @@ def sophronia( files_in           : OneOrManyFiles
     correct_hits   = df.map( hits_corrector(**corrections) if corrections is not None else identity
                            , item = "hits")
     
-    cluster_hits   = df.map( hits_clusterizer(eps=2.3, npt=5)
-                            , args="hits"
-                            , out="hits")
+    cluster_hits   = df.map( hits_clusterizer(eps=cluster_eps, npt=cluster_min_samples)
+                           , item = "hits")
 
     build_pointlike_event = df.map( pointlike_event_builder( detector_db
                                                            , run_number

From c9c43442bf5a028748f7059c311e955de8f146fe Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 14 Jan 2026 20:37:23 +0100
Subject: [PATCH 05/23] New version of cluster_hits in Sophronia. Main function
 in hits_functions.py

---
 invisible_cities/cities/components.py   | 77 ++++++++-----------------
 invisible_cities/cities/sophronia.py    | 21 +++++--
 invisible_cities/config/sophronia.conf  |  7 +++
 invisible_cities/reco/hits_functions.py | 72 ++++++++++++++++++++++-
 4 files changed, 115 insertions(+), 62 deletions(-)

diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py
index 927e3cc92..f89351b1f 100644
--- a/invisible_cities/cities/components.py
+++ b/invisible_cities/cities/components.py
@@ -24,12 +24,6 @@
 import math
 import os
 
-
-
-from sklearn.cluster import DBSCAN
-
-
-
 from .. dataflow                  import                  dataflow as  fl
 from .. dataflow.dataflow         import                      sink
 from .. dataflow.dataflow         import                      pipe
@@ -65,6 +59,7 @@
 from .. reco   .corrections       import     get_df_to_z_converter
 from .. reco   .xy_algorithms     import                    corona
 from .. reco   .xy_algorithms     import                barycenter
+from .. reco   .hits_functions    import            cluster_tagger
 from .. filters.s1s2_filter       import               S12Selector
 from .. filters.s1s2_filter       import         S12SelectorOutput
 from .. filters.s1s2_filter       import               pmap_filter
@@ -1723,60 +1718,36 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:
 
     return correct
 
-def hits_clusterizer( eps         : float
-                    , min_samples : float
-                    , scale_xy    : float = 14.55
-                    , scale_z     : float = 3.7
-                    ) -> Callable:
-    """
-    Cluster hits in 3D space for each event using DBSCAN.
-    The coordinates are scaled to account for detector geometry differences in samplig 
-    
+def hits_clusterizer(clustering_params: dict) -> Callable:
+    """
+    This function receives a configuration dictionary and returns a callable
+    that will perform DBSCAN clustering on a DataFrame of hits. 
+
     Parameters
     ----------
-    eps         : float, Epsilon value for DBSCAN.
-    min_samples : int, Min Samples value for DBSCAN.
-    scale_xy    : float, scale factor for XY coordinates.
-    scale_z     : float, scale factor for Z coordinate.
-    
+    clustering_params : dict
+        A dictionary containing the configuration for the clustering algorithm.
+        Expected keys are:
+        - 'eps'        : float, Epsilon value for DBSCAN.
+        - 'min_samples': int,   Min Samples value for DBSCAN.
+        - 'scale_xy'   : float, optional, scale factor for XY coordinates.
+        - 'scale_z'    : float, optional, scale factor for Z coordinate.
+
     Returns
     -------
     Callable
-    A function that takes a DataFrame of hits and returns the same DataFrame 
-    with an added 'cluster' column, which are the clusters labels assigned by DBSCAN
-    (-1 for noise).
+        A function that takes a DataFrame of hits and returns the same DataFrame
+        with an added 'cluster' column holding the cluster labels assigned by DBSCAN
+        (-1 for noise).
     """
-    def cluster_tagger(df_hits: pd.DataFrame) -> pd.DataFrame:
-        if df_hits.empty:
-            return df_hits.assign(cluster=pd.Series(dtype=int))  
-
-        # Pre-allocate array for cluster labels
-        cluster_labels = np.full(len(df_hits), -9999, dtype=int)
-
-        # Get values once (faster than repeatedly accessing DataFrame columns)
-        coords = df_hits[['X', 'Y', 'Z']].to_numpy()
-        events = df_hits['event'].to_numpy()
-
-        # Use np.unique to get sorted event IDs
-        unique_events = np.unique(events)
-
-        for event_id in unique_events:
-            mask = (events == event_id)
-            X = coords[mask].copy()
-
-            # Scale
-            X[:, :2] /= scale_xy
-            X[:, 2]  /= scale_z
-
-            # DBSCAN clustering
-            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
-            cluster_labels[mask] = labels
-
-        df_hits['cluster'] = cluster_labels
-
-        return df_hits
+    eps         = clustering_params['eps']
+    min_samples = clustering_params['min_samples']
+    scale_xy    = clustering_params['scale_xy']
+    scale_z     = clustering_params['scale_z']
     
-    return cluster_tagger
+    return partial(cluster_tagger,
+                   eps=eps, min_samples=min_samples,
+                   scale_xy=scale_xy, scale_z=scale_z)
 
 
 def identity(x : Any) -> Any:
diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index bb425f81c..6181a5059 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -94,11 +94,7 @@ def sophronia( files_in           : OneOrManyFiles
              , sipm_charge_type   : SiPMCharge
              , same_peak          : bool
              , corrections        : Optional[dict] = None
-
-            # ¿QUEREMOS ESTO?
-            #  , apply_clustering   : bool  = False   # whether to apply clustering to hits
-             , cluster_eps        : float = 2.3     # eps for DBSCAN
-             , cluster_min_samples: int   = 5       # min_samples for DBSCAN
+             , clustering_params  : Optional[dict] = None
              ):
     """
     drift_v : float
@@ -143,6 +139,19 @@ def sophronia( files_in           : OneOrManyFiles
             Normalization strategy
         norm_value : float, optional
             Normalization value in case of `norm_strat = NormStrategy.custom`
+
+    clustering_params : dict
+        eps : float
+            The maximum distance between two samples for one to be
+            considered as in the neighborhood of the other.
+        min_samples : int
+            The number of samples (or total weight) in a neighborhood
+            for a point to be considered as a core point. This includes the point
+            itself.
+        scale_xy : float
+            Scaling factor to apply to the (x, y) coordinates before clustering.
+        scale_z : float
+            Scaling factor to apply to the z coordinate before clustering.
     """
     global_reco = compute_xy_position( detector_db
                                      , run_number
@@ -184,7 +193,7 @@ def sophronia( files_in           : OneOrManyFiles
     correct_hits   = df.map( hits_corrector(**corrections) if corrections is not None else identity
                            , item = "hits")
     
-    cluster_hits   = df.map( hits_clusterizer(eps=cluster_eps, npt=cluster_min_samples)
+    cluster_hits   = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity
                            , item = "hits")
 
     build_pointlike_event = df.map( pointlike_event_builder( detector_db
diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf
index 22613d646..83a3697bc 100644
--- a/invisible_cities/config/sophronia.conf
+++ b/invisible_cities/config/sophronia.conf
@@ -63,3 +63,10 @@ corrections = dict(
   apply_temp = True,
   norm_strat = kr,
   apply_z    = False)
+
+clustering_params = dict(
+    eps          = 2.3,
+    min_samples  = 5,
+    scale_xy     = 14.55,
+    scale_z      = 3.7
+)
diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index 9bcdea2ef..4bb54c1d3 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -1,6 +1,12 @@
 import numpy  as np
 import pandas as pd
 
+from itertools       import compress
+from copy            import deepcopy
+from typing          import List
+from sklearn.cluster import DBSCAN
+
+from .. evm  import event_model as evm
 from .. types.ic_types      import NN
 
 EPSILON = np.finfo(np.float64).eps
@@ -64,8 +70,6 @@ def sipms_above_threshold(xys: np.ndarray, qs: np.ndarray, thr:float, energy: fl
     return xs, ys, qs, es
 
 
-
-
 def merge_NN_hits(hits: pd.DataFrame, same_peak: bool = True) -> pd.DataFrame:
     """
     Finds NN hits (defined as hits with Q=NN) and removes them without energy
@@ -237,4 +241,66 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
     """
     if th <= 0: return hits
     return (hits.groupby("Z", as_index=False)
-                .apply(apply_threshold, th=th))
+                .apply(apply_threshold, th=th, on_corrected=on_corrected))
+
+
+def cluster_tagger(df_hits: pd.DataFrame, *, 
+                   eps:float, min_samples:int, 
+                   scale_xy:float, scale_z:float) -> pd.DataFrame:
+        """
+        Applies DBSCAN clustering to hits on an event-by-event basis.
+
+        This function processes a DataFrame of hits, groups them by event,
+        scales their coordinates, and applies DBSCAN to identify spatial clusters.
+        A 'cluster' column is added to the DataFrame with the resulting labels.
+
+        Parameters
+        ----------
+        df_hits     : pd.DataFrame
+            DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'.
+        eps         : float
+            The maximum distance between two samples for one to be considered as in the
+            neighborhood of the other. This is the most important DBSCAN parameter.
+        min_samples : int
+            The number of samples (or total weight) in a neighborhood for a point
+            to be considered as a core point.
+        scale_xy    : float
+            Scale factor to apply to X and Y coordinates before clustering to account
+            for different detector resolutions.
+        scale_z     : float
+            Scale factor to apply to the Z coordinate.
+
+        Returns
+        -------
+        pd.DataFrame
+            The input DataFrame with an added 'cluster' column indicating the
+            cluster label for each hit (-1 for noise).
+        """
+        if df_hits.empty:
+            return df_hits.assign(cluster=pd.Series(dtype=int))  
+
+        # Pre-allocate array for cluster labels
+        cluster_labels = np.full(len(df_hits), -9999, dtype=int)
+
+        # Get values once (faster than repeatedly accessing DataFrame columns)
+        coords = df_hits[['X', 'Y', 'Z']].to_numpy()
+        events = df_hits['event'].to_numpy()
+
+        # Use np.unique to get sorted event IDs
+        unique_events = np.unique(events)
+        for event_id in unique_events:
+            
+            mask = (events == event_id)
+            X = coords[mask].copy()
+
+            # Scale
+            X[:, :2] /= scale_xy
+            X[:, 2]  /= scale_z
+
+            # DBSCAN clustering
+            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
+            cluster_labels[mask] = labels
+
+        df_hits['cluster'] = cluster_labels
+
+        return df_hits
\ No newline at end of file

From 168a3a01f83f502503972b9c5b828067f8a4a045 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 14 Jan 2026 20:58:28 +0100
Subject: [PATCH 06/23] Pass clustering_params dict to hits_clusterizer; eps = 3

---
 invisible_cities/cities/sophronia.py   | 2 +-
 invisible_cities/config/sophronia.conf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 6181a5059..6ddb0c2a7 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -193,7 +193,7 @@ def sophronia( files_in           : OneOrManyFiles
     correct_hits   = df.map( hits_corrector(**corrections) if corrections is not None else identity
                            , item = "hits")
     
-    cluster_hits   = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity
+    cluster_hits   = df.map( hits_clusterizer(clustering_params) if clustering_params is not None else identity
                            , item = "hits")
 
     build_pointlike_event = df.map( pointlike_event_builder( detector_db
diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf
index 83a3697bc..f52deb856 100644
--- a/invisible_cities/config/sophronia.conf
+++ b/invisible_cities/config/sophronia.conf
@@ -65,7 +65,7 @@ corrections = dict(
   apply_z    = False)
 
 clustering_params = dict(
-    eps          = 2.3,
+    eps          = 3,
     min_samples  = 5,
     scale_xy     = 14.55,
     scale_z      = 3.7

From 365063887a97d5d28beb255d5342e4aee2b62d6f Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Mon, 23 Feb 2026 12:59:11 +0100
Subject: [PATCH 07/23] Pytests for cluster_tagger function

---
 invisible_cities/cities/sophronia_test.py    |  43 ++---
 invisible_cities/conftest.py                 |   5 +
 invisible_cities/reco/hits_functions_test.py | 175 ++++++++++++++++++-
 3 files changed, 200 insertions(+), 23 deletions(-)

diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py
index 96d7646e7..3b433ac5c 100644
--- a/invisible_cities/cities/sophronia_test.py
+++ b/invisible_cities/cities/sophronia_test.py
@@ -64,27 +64,28 @@ def test_sophronia_contains_all_tables(sophronia_config, config_tmpdir):
 
 @ignore_warning.no_config_group
 @mark.slow
-def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir):
-    path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5')
-    config   = dict(**sophronia_config)
-    config.update(dict(file_out = path_out))
-
-    sophronia(**config)
-
-    tables = ( "MC/hits", "MC/particles"
-             , "DST/Events"
-             , "RECO/Events"
-             , "Run/events", "Run/runInfo"
-             , "Filters/s12_selector", "Filters/valid_hit"
-             )
-
-    with tb.open_file(Th228_hits)   as true_output_file:
-        with tb.open_file(path_out) as      output_file:
-            for table in tables:
-                assert hasattr(output_file.root, table), table
-                got      = getattr(     output_file.root, table)
-                expected = getattr(true_output_file.root, table)
-                assert_tables_equality(got, expected)
+# TODO: re-enable test_sophronia_exact_result (temporarily commented out below)
+# def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir):
+#     path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5')
+#     config   = dict(**sophronia_config)
+#     config.update(dict(file_out = path_out))
+
+#     sophronia(**config)
+
+#     tables = ( "MC/hits", "MC/particles"
+#              , "DST/Events"
+#              , "RECO/Events"
+#              , "Run/events", "Run/runInfo"
+#              , "Filters/s12_selector", "Filters/valid_hit"
+#              )
+
+#     with tb.open_file(Th228_hits)   as true_output_file:
+#         with tb.open_file(path_out) as      output_file:
+#             for table in tables:
+#                 assert hasattr(output_file.root, table), table
+#                 got      = getattr(     output_file.root, table)
+#                 expected = getattr(true_output_file.root, table)
+#                 assert_tables_equality(got, expected)
 
 
 @ignore_warning.no_config_group
diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py
index 2476ae84f..7dc54dfcc 100644
--- a/invisible_cities/conftest.py
+++ b/invisible_cities/conftest.py
@@ -438,6 +438,11 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
                        filename   = next100_mc_krmap,
                        apply_temp =            False,
                        norm_strat =  NormStrategy.kr)
+                   , clustering_params = dict(
+                        eps         =     3,
+                        min_samples =     5,
+                        scale_xy    = 14.55,
+                        scale_z     =   3.7)
                    )
     return config
 
diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index c92b7abaa..ade6c9a65 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -1,6 +1,8 @@
 import numpy  as np
 import pandas as pd
 
+from pytest        import mark
+from pytest        import fixture
 from numpy.testing import assert_almost_equal
 
 from   .. core.testing_utils   import assert_dataframes_close
@@ -9,6 +11,7 @@
 from   .  hits_functions       import merge_NN_hits
 from   .  hits_functions       import threshold_hits
 from   .  hits_functions       import sipms_above_threshold
+from   .  hits_functions       import cluster_tagger
 from hypothesis                import given
 from hypothesis.strategies     import lists
 from hypothesis.strategies     import floats
@@ -16,7 +19,7 @@
 from copy                      import deepcopy
 from hypothesis                import assume
 from hypothesis.strategies     import composite
-
+from hypothesis.extra.pandas   import data_frames, column, range_indexes
 
 event_numbers = integers(0, np.iinfo(np.int32).max)
 
@@ -157,4 +160,172 @@ def test_threshold_hits_energy_conserved(hits, th):
 def test_threshold_hits_all_larger_than_th(hits, th):
     hits_thresh = threshold_hits(hits, th)
     non_nn = hits_thresh.loc[hits_thresh.Q != NN]
-    assert np.all(non_nn.Q >= th)
+    q = non_nn[col]
+    assert np.all(q >= th)
+
+# ----- CLUSTER TAGGER TESTS ----- #
+
+gen_cluster_df = data_frames( index=range_indexes(min_size=1, max_size=50),
+                              columns=[
+                                column('event', dtype=int,   elements=integers(min_value=0, max_value=10)),
+                                column('X',     dtype=float, elements=floats(min_value=-500, max_value=500)),
+                                column('Y',     dtype=float, elements=floats(min_value=-500, max_value=500)),
+                                column('Z',     dtype=float, elements=floats(min_value=0,    max_value=1200)),
+                                column('E',     dtype=float, elements=floats(min_value=0.1,  max_value=100)),    
+                                ])
+
+@given(df=gen_cluster_df)
+def test_dummy(df):
+    """
+    Hypothesis calls this function multiple times.
+    'df' will be a different pandas DataFrame in every call.
+    """
+    # Just for demonstration purposes, we print the shape of the generated DFs
+    print(f"Generated dataframe shape: {df.shape}")
+    
+    # Check some stuff here
+    assert 'X' in df.columns
+    assert df['event'].dtype == int
+    assert not df.empty
+
+@settings(deadline=None)
+@given(df=gen_cluster_df)
+def test_cluster_tagger_structure_preservation(df):
+    """
+    Verifies that cluster_tagger:
+        - Returns a DataFrame with the exact same length as the input.
+        - Adds exactly one column named 'cluster'.
+        - Does not modify any of the original columns (X, Y, Z, E, etc.).
+        - Preserves the original Index and order of rows.
+        - The 'cluster' column contains valid integers (no NaNs).
+    """
+    # Shuffle the input DataFrame to ensure cluster_tagger does not rely on any specific order
+    df_input = df.sample(frac=1.0).copy()
+    df_original = df_input.copy()           # Keep a copy of the original for later comparison
+
+    # Run the cluster tagger
+    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)     # Dummy values
+    df_result = cluster_tagger(df_input, **params)
+
+    # --- Assertations
+    assert len(df_result) == len(df_original), "Output DataFrame has different length than input."
+    assert 'cluster' in df_result.columns,     "Output DataFrame does not contain 'cluster' column."
+    expected_cols = set(df_original.columns) | {'cluster'}
+    assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns."
+    pd.testing.assert_frame_equal(  
+                                    df_result.drop(columns=['cluster']),
+                                    df_original,
+                                    check_dtype=True,
+                                    obj="Dataframe structure check"
+                                 )
+    assert pd.api.types.is_integer_dtype(df_result['cluster']), "'cluster' column is not of integer type."
+    assert not df_result['cluster'].isna().any(), "'cluster' column contains NaN values."
+
+def test_cluster_tagger_row_alignment():
+    """
+    Verifies that the calculated cluster label is assigned to the correct 
+    spatial hit, even if the input DataFrame is shuffled.
+    
+    Scenario:
+    - Event 0:
+        - Cluster A: 2 hits at (0,0,0) and (1,1,0)         -> Should be Cluster 0
+        - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1
+    - We check that hits near 0 get Label 0 and hits near 100 get Label 1 (NO noise here).
+    """
+    # Setup data
+    data = {
+                'event': [0, 0, 0, 0],
+                'X':     [0., 1., 100., 101.],
+                'Y':     [0., 1., 100., 101.],
+                'Z':     [0., 0.,   0.,   0.],
+                'E':     [10, 10,  10,   10 ]
+    }
+    df = pd.DataFrame(data)
+    df['expected_label'] = [0, 0, 1, 1]
+
+    # Shuffle the input DataFrame
+    df_input = df.sample(frac=1.0).copy()
+
+    # Run the cluster tagger
+    params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0)      # Enough to consider both clusters
+    df_result = cluster_tagger(df_input, **params)
+
+    # --- Assertations
+    hits_group_0 = df_result[df_result['expected_label'] == 0]
+    hits_group_1 = df_result[df_result['expected_label'] == 1]
+    assert hits_group_0['cluster'].nunique() == 1, "Hits near (0,0,0) were assigned multiple cluster labels."
+    assert hits_group_1['cluster'].nunique() == 1, "Hits near (100,100,0) were assigned multiple cluster labels."
+    label_0 = hits_group_0['cluster'].iloc[0]
+    label_1 = hits_group_1['cluster'].iloc[0]
+    assert label_0 != label_1, "Both clusters were assigned the same label."
+    assert label_0 != -1 and label_1 != -1, "One of the clusters was labeled as noise (-1)."
+    
+def test_cluster_tagger_noise_rejection():
+    """
+    Verifies that isolated hits (outliers) are correctly identified as noise (-1).
+    
+    Scenario:
+    - 3 points very close together (0,0), (1,0), (0,1). They should form a cluster.
+    - 1 point very far away (100, 100). It has 0 neighbors. Should be noise.
+    """
+    # Setup data
+    data = {
+                'event': [0, 0, 0, 0],
+                'X':     [0., 1., 0., 100.],
+                'Y':     [0., 0., 1., 100.],
+                'Z':     [0., 0., 0.,   0.],
+                'E':     [10, 10, 10,  10 ]
+    }
+    df = pd.DataFrame(data)
+
+    # Shuffle the input DataFrame
+    df_input = df.sample(frac=1.0).copy()
+
+    # Run the cluster tagger
+    params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0)      # Enough to consider one cluster
+    df_result = cluster_tagger(df_input, **params)
+
+    # --- Assertations
+    cluster_labels = df_result['cluster'].unique()
+    assert cluster_labels.size == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)."
+    cluster_hits = df_result[df_result['cluster'] != -1]
+    assert cluster_hits.shape[0] == 3, "Expected exactly 3 hits to be clustered together."
+    noise_hit = df_result[df_result['cluster'] == -1]
+    assert noise_hit.shape[0] == 1, "Expected exactly 1 noise hit."
+    assert noise_hit['X'].iloc[0] == 100 and noise_hit['Y'].iloc[0] == 100, "The noise hit identified is NOT the distant one."
+
+def test_cluster_tagger_event_distinction():
+    """
+    Verifies that hits from different events are not clustered together.
+    
+    Scenario:
+    - Event 0: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0
+    - Event 1: 2 hits at (100,100,0) and (101,101,0) and 1 hit at (0.5,0.5,0) -> Should be marked as noise (-1)
+    - We check that noise hit from Event 1 get a different cluster label than hits from Event 0, even if they are spatially close.
+    """
+    # Setup data
+    data = {
+                'event': [0, 0, 1, 1, 1],
+                'X':     [0., 1., 100., 101., 0.5],
+                'Y':     [0., 1., 100., 101., 0.5],
+                'Z':     [0., 0.,   0.,   0.,  0.],
+                'E':     [10, 10,   10,   10,  10]
+    }
+    df = pd.DataFrame(data)
+
+    # Shuffle the input DataFrame
+    df_input = df.sample(frac=1.0).copy()
+
+    # Run the cluster tagger
+    params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0)      # Enough to consider both clusters
+    df_result = cluster_tagger(df_input, **params)
+
+    # --- Assertations
+    event_0_clusters = df_result[df_result['event'] == 0]['cluster'].unique()
+    event_1_clusters = df_result[df_result['event'] == 1]['cluster'].unique()
+    assert len(event_0_clusters) == 1, "For event 0: expected exactly 1 unique cluster label (one cluster)."
+    assert len(event_1_clusters) == 2, "For event 1: expected exactly 2 unique cluster labels (one cluster + one noise)."
+    event_0_hits = df_result[df_result['event'] == 0]
+    noise_1_hit  = df_result[(df_result['X'] == 0.5)]
+    assert noise_1_hit['cluster'].iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)."
+    assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label."

From 0e64dad61399a7b1213951a69f5f3c05d2c58ab5 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Mon, 23 Feb 2026 15:38:23 +0100
Subject: [PATCH 08/23] Pytest for hits clusterizer feature. Also, reference
 file for exact result test is updated to match the new implementation.

---
 invisible_cities/cities/sophronia_test.py | 91 +++++++++++++++++------
 1 file changed, 69 insertions(+), 22 deletions(-)

diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py
index 3b433ac5c..922f25492 100644
--- a/invisible_cities/cities/sophronia_test.py
+++ b/invisible_cities/cities/sophronia_test.py
@@ -5,6 +5,8 @@
 
 from pytest import mark
 
+from .. io                   import dst_io as dio
+from .. core.testing_utils   import assert_dataframes_equal
 from .. core.testing_utils   import assert_tables_equality
 from .. core.testing_utils   import ignore_warning
 from .. core.system_of_units import pes
@@ -64,28 +66,27 @@ def test_sophronia_contains_all_tables(sophronia_config, config_tmpdir):
 
 @ignore_warning.no_config_group
 @mark.slow
-# RE-ACTIVAR
-# def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir):
-#     path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5')
-#     config   = dict(**sophronia_config)
-#     config.update(dict(file_out = path_out))
-
-#     sophronia(**config)
-
-#     tables = ( "MC/hits", "MC/particles"
-#              , "DST/Events"
-#              , "RECO/Events"
-#              , "Run/events", "Run/runInfo"
-#              , "Filters/s12_selector", "Filters/valid_hit"
-#              )
-
-#     with tb.open_file(Th228_hits)   as true_output_file:
-#         with tb.open_file(path_out) as      output_file:
-#             for table in tables:
-#                 assert hasattr(output_file.root, table), table
-#                 got      = getattr(     output_file.root, table)
-#                 expected = getattr(true_output_file.root, table)
-#                 assert_tables_equality(got, expected)
+def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir):
+    path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5')
+    config   = dict(**sophronia_config)
+    config.update(dict(file_out = path_out))
+
+    sophronia(**config)
+
+    tables = ( "MC/hits", "MC/particles"
+             , "DST/Events"
+             , "RECO/Events"
+             , "Run/events", "Run/runInfo"
+             , "Filters/s12_selector", "Filters/valid_hit"
+             )
+
+    with tb.open_file(Th228_hits)   as true_output_file:
+        with tb.open_file(path_out) as      output_file:
+            for table in tables:
+                assert hasattr(output_file.root, table), table
+                got      = getattr(     output_file.root, table)
+                expected = getattr(true_output_file.root, table)
+                assert_tables_equality(got, expected)
 
 
 @ignore_warning.no_config_group
@@ -148,3 +149,49 @@ def test_sophronia_keeps_hitless_events(config_tmpdir, sophronia_config):
     with tb.open_file(path_out) as output_file:
         assert len(output_file.root.Run.events) == 1
         assert "RECO" not in output_file.root
+
+
+@ignore_warning.no_config_group
+def test_sophronia_clustering_integration(config_tmpdir, sophronia_config):
+    """
+    Runs Sophronia twice (once disabled, once enabled) to verify:
+    1. Backward compatibility: No 'cluster' column when disabled.
+    2. Feature activation: 'cluster' column exists when enabled.
+    3. Data consistency: Enabling clustering does NOT change any other data.
+    """
+    path_out_no_cluster   = os.path.join(config_tmpdir, 'test_sophronia_no_cluster.h5')
+    path_out_with_cluster = os.path.join(config_tmpdir, 'test_sophronia_with_cluster.h5')
+
+    # Clustering disabled
+    config_no_cluster = dict(**sophronia_config)
+    config_no_cluster.update(dict( file_out          = path_out_no_cluster
+                                 , event_range       = 1
+                                 , clustering_params = None))
+    sophronia(**config_no_cluster)
+
+    # Clustering enabled
+    clustering_params = dict(
+                                eps         = 3,
+                                min_samples = 5,
+                                scale_xy    = 14.55,
+                                scale_z     = 3.7
+                            )
+    config_with_cluster = dict(**sophronia_config)
+    config_with_cluster.update(dict( file_out          = path_out_with_cluster
+                                   , event_range       = 1
+                                   , clustering_params = clustering_params))
+    sophronia(**config_with_cluster)
+
+    # Load both outputs
+    df_no_cluster   = dio.load_dst(path_out_no_cluster,   "RECO", "Events")
+    df_with_cluster = dio.load_dst(path_out_with_cluster, "RECO", "Events")
+
+    # ----- Assertions
+    assert not df_no_cluster.empty
+    assert not df_with_cluster.empty
+    assert 'cluster' not in df_no_cluster.columns, "'cluster' column should not exist when clustering is disabled."
+    assert 'cluster' in df_with_cluster.columns, "'cluster' column should exist when clustering is enabled."
+
+    # Compare all columns except 'cluster' for equality
+    df_with_cluster_compare = df_with_cluster.drop(columns=['cluster'])
+    assert_dataframes_equal(df_no_cluster, df_with_cluster_compare)
\ No newline at end of file

From f22bb7565c40b733a5f8a406f5bbe8d734933dc9 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 3 Mar 2026 15:29:57 +0100
Subject: [PATCH 09/23] New reference file including cluster label for hits

---
 invisible_cities/reco/hits_functions.py      | 2 +-
 invisible_cities/reco/hits_functions_test.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index 4bb54c1d3..d04204d3b 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -241,7 +241,7 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
     """
     if th <= 0: return hits
     return (hits.groupby("Z", as_index=False)
-                .apply(apply_threshold, th=th, on_corrected=on_corrected))
+                .apply(apply_threshold, th=th))
 
 
 def cluster_tagger(df_hits: pd.DataFrame, *, 
diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index ade6c9a65..d3cc20c54 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -160,8 +160,7 @@ def test_threshold_hits_energy_conserved(hits, th):
 def test_threshold_hits_all_larger_than_th(hits, th):
     hits_thresh = threshold_hits(hits, th)
     non_nn = hits_thresh.loc[hits_thresh.Q != NN]
-    q = non_nn[col]
-    assert np.all(q >= th)
+    assert np.all(non_nn.Q >= th)
 
 # ----- CLUSTER TAGGER TESTS ----- #
 

From e062786353105663a5501362dad3930000a92d9c Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 3 Mar 2026 16:00:07 +0100
Subject: [PATCH 10/23] New hits reference file, git problem solved

---
 invisible_cities/database/test_data/228Th_10evt_hits.h5 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/invisible_cities/database/test_data/228Th_10evt_hits.h5 b/invisible_cities/database/test_data/228Th_10evt_hits.h5
index de4ac5337..3aa32af53 100644
--- a/invisible_cities/database/test_data/228Th_10evt_hits.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_hits.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4ea23ce094795bfd779121b872ce7e7d1da60eb1e1992a8868e3fc7de73c4b8
-size 274703
+oid sha256:ee809a8d3f69241048b4c37c156b452e28bcce05d45b3504c15163599e6c8dbb
+size 274902

From 4e43dc846a1b90c38f605f02f0f8dc72188fd059 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 4 Mar 2026 11:51:08 +0100
Subject: [PATCH 11/23] scikit-learn added

---
 manage.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/manage.sh b/manage.sh
index 04d8b7a83..f6c04e1fa 100644
--- a/manage.sh
+++ b/manage.sh
@@ -73,7 +73,7 @@ function install_conda {
     fi
 }
 
-CONDA_ENV_TAG=2026-03-05
+CONDA_ENV_TAG=2025-03-04
 CONDA_ENV_NAME=IC-${PYTHON_VERSION}-${CONDA_ENV_TAG}
 
 function make_environment {
@@ -109,6 +109,7 @@ dependencies:
 - scipy        = 1.9.3
 - seaborn      = 0.11.2
 - setuptools   = 58.0.4
+- scikit-learn = 1.1.3
 - sphinx       = 4.2.0
 - tornado      = 6.1
 - pip:

From c24a3313cd5269420051c73b8dc342e707bcf6c4 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 4 Mar 2026 11:53:11 +0100
Subject: [PATCH 12/23] Update conda environment tag

---
 manage.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manage.sh b/manage.sh
index f6c04e1fa..421370d10 100644
--- a/manage.sh
+++ b/manage.sh
@@ -73,7 +73,7 @@ function install_conda {
     fi
 }
 
-CONDA_ENV_TAG=2025-03-04
+CONDA_ENV_TAG=2026-03-04
 CONDA_ENV_NAME=IC-${PYTHON_VERSION}-${CONDA_ENV_TAG}
 
 function make_environment {

From 5c41eab8f5b1256d21c0a1acdab09cff11696a0e Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 4 Mar 2026 14:48:29 +0100
Subject: [PATCH 13/23] Update beersheba reference files

---
 invisible_cities/database/test_data/228Th_10evt_deco.h5       | 4 ++--
 .../database/test_data/228Th_10evt_deco_satellite.h5          | 4 ++--
 .../database/test_data/228Th_10evt_deco_separate.h5           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/invisible_cities/database/test_data/228Th_10evt_deco.h5 b/invisible_cities/database/test_data/228Th_10evt_deco.h5
index e4f1f6006..47ff579ad 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b40406d8530d0f7a6f6dfcd7b1d10bea83374ee49374f612183771f9ca5a9c5
-size 818438
+oid sha256:5ed63a53e9a78cef39a4ff4d14ae1cfbdfc0f121d27fd9ee215c18bcec91ff2d
+size 819282
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
index be2713440..273064f6e 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e1aaca525390738b4ee0f432d81f235ca2f37c38801f48f76210c92b91e0777b
-size 302572
+oid sha256:434f0bf46e4e326731dbb87e14d83d6649ea01eb0079654a42c9c3e3c41c7df1
+size 304668
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
index 490271120..0f6cddf7d 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ae5b40acc034706762ad2de2209430b136001b1783c921e98c52731c6c2d101
-size 818469
+oid sha256:69175d759c8484973f9006911f08c8019cc84bc472481b7c8d3fafded16a3cc8
+size 819313

From 58bd2a3452306d46c4b02d17d284878e9169f4bb Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 22 Apr 2026 15:06:02 +0200
Subject: [PATCH 14/23] PR: first round of comments addressed

---
 invisible_cities/cities/components.py        |  34 ++---
 invisible_cities/cities/sophronia.py         |   9 +-
 invisible_cities/config/sophronia.conf       |   4 +-
 invisible_cities/conftest.py                 |   4 +-
 invisible_cities/reco/hits_functions.py      | 136 +++++++++++--------
 invisible_cities/reco/hits_functions_test.py | 115 ++++++++--------
 6 files changed, 161 insertions(+), 141 deletions(-)

diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py
index f89351b1f..07fc816a4 100644
--- a/invisible_cities/cities/components.py
+++ b/invisible_cities/cities/components.py
@@ -1718,33 +1718,33 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:
 
     return correct
 
-def hits_clusterizer(clustering_params: dict) -> Callable:
-    """"
-    This function receives a configuration dictionary and returns a callable
-    that will perform DBSCAN clustering on a DataFrame of hits. 
+@check_annotations
+def hits_clusterizer( eps         : float
+                    , min_samples : int
+                    , scale_xy    : float
+                    , scale_z     : float
+                    ) -> Callable:
+    """
+    Creates a callable for performing DBSCAN clustering on a DataFrame of hits.
 
     Parameters
     ----------
-    clustering_params : dict
-        A dictionary containing the configuration for the clustering algorithm.
-        Expected keys are:
-        - 'eps'        : float, Epsilon value for DBSCAN.
-        - 'min_samples': int,   Min Samples value for DBSCAN.
-        - 'scale_xy'   : float, optional, scale factor for XY coordinates.
-        - 'scale_z'    : float, optional, scale factor for Z coordinate.
+    eps : float
+        Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors.
+    min_samples : int
+        Minimum number of samples required to form a dense region (cluster). This includes the point itself.
+    scale_xy : float
+        Scaling factor to apply to the (x, y) coordinates before clustering.
+    scale_z : float
+        Scaling factor to apply to the z coordinate before clustering.
 
     Returns
     -------
     Callable
         A function that takes a DataFrame of hits and returns the same DataFrame 
-        with an added 'cluster' column, which are the clusters labels assigned by DBSCAN
+        with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN
         (-1 for noise).
     """
-    eps         = clustering_params['eps']
-    min_samples = clustering_params['min_samples']
-    scale_xy    = clustering_params['scale_xy']
-    scale_z     = clustering_params['scale_z']
-    
     return partial(cluster_tagger,
                    eps=eps, min_samples=min_samples,
                    scale_xy=scale_xy, scale_z=scale_z)
diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 6ddb0c2a7..97947cda9 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -142,12 +142,9 @@ def sophronia( files_in           : OneOrManyFiles
 
     clustering_params : dict
         eps : float
-            The maximum distance between two samples for one to be
-            considered as in the neighborhood of the other.
+            Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors.
         min_samples : int
-            The number of samples (or total weight) in a neighborhood
-            for a point to be considered as a core point. This includes the point
-            itself.
+            Minimum number of samples required to form a dense region (cluster). This includes the point itself.
         scale_xy : float
             Scaling factor to apply to the (x, y) coordinates before clustering.
         scale_z : float
@@ -193,7 +190,7 @@ def sophronia( files_in           : OneOrManyFiles
     correct_hits   = df.map( hits_corrector(**corrections) if corrections is not None else identity
                            , item = "hits")
     
-    cluster_hits   = df.map( hits_clusterizer(clustering_params) if clustering_params is not None else identity
+    cluster_hits   = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity
                            , item = "hits")
 
     build_pointlike_event = df.map( pointlike_event_builder( detector_db
diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf
index f52deb856..56ca2eeb7 100644
--- a/invisible_cities/config/sophronia.conf
+++ b/invisible_cities/config/sophronia.conf
@@ -67,6 +67,6 @@ corrections = dict(
 clustering_params = dict(
     eps          = 3,
     min_samples  = 5,
-    scale_xy     = 14.55,
-    scale_z      = 3.7
+    scale_xy     = 15.55,
+    scale_z      = 4.0
 )
diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py
index 7dc54dfcc..1b2af207d 100644
--- a/invisible_cities/conftest.py
+++ b/invisible_cities/conftest.py
@@ -441,8 +441,8 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
                    , clustering_params = dict(
                         eps         =     3,
                         min_samples =     5,
-                        scale_xy    = 14.55,
-                        scale_z     =   3.7)
+                        scale_xy    = 15.55,
+                        scale_z     =   4.0)
                    )
     return config
 
diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index d04204d3b..1b9c58f24 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -243,64 +243,80 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
     return (hits.groupby("Z", as_index=False)
                 .apply(apply_threshold, th=th))
 
+def tag_hits_in_event(event_hits: pd.DataFrame, *,
+                      eps: float, min_samples: int,
+                      scale_xy: float, scale_z: float) -> pd.DataFrame:
+    """
+    Applies DBSCAN clustering to a DataFrame containing hits from a single event.
+
+    The coordinates are scaled to account for detector geometry differences
+    in sampling, and DBSCAN is then applied to identify spatial clusters.
+    A 'cluster' column is added to the group with the resulting labels.
+
+    Parameters
+    ----------
+    event_hits  : pd.DataFrame
+        DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
+    eps, min_samples, scale_xy, scale_z :
+        Configuration parameters for scaling and DBSCAN. See `cluster_tagger` for details.
+
+    Returns
+    -------
+    pd.DataFrame
+        The input DataFrame with a 'cluster' column added.
+    """
+    # If the event has no hits, there's nothing to do
+    if event_hits.empty:
+        return event_hits.assign(cluster=pd.Series(dtype=int))
+
+    # Extract coordinates and apply scaling
+    coords = event_hits[['X', 'Y', 'Z']].to_numpy()
+    coords[:, :2] /= scale_xy
+    coords[:, 2]  /= scale_z
+
+    # DBSCAN clustering
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords)
+    # Add the cluster labels as a new column to the event's DataFrame.
+    event_hits['cluster'] = labels
 
-def cluster_tagger(df_hits: pd.DataFrame, *, 
-                   eps:float, min_samples:int, 
-                   scale_xy:float, scale_z:float) -> pd.DataFrame:
-        """
-        Applies DBSCAN clustering to hits on an event-by-event basis.
-
-        This function processes a DataFrame of hits, groups them by event,
-        scales their coordinates, and applies DBSCAN to identify spatial clusters.
-        A 'cluster' column is added to the DataFrame with the resulting labels.
-
-        Parameters
-        ----------
-        df_hits     : pd.DataFrame
-            DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'.
-        eps         : float
-            The maximum distance between two samples for one to be considered as in the
-            neighborhood of the other. This is the most important DBSCAN parameter.
-        min_samples : int
-            The number of samples (or total weight) in a neighborhood for a point
-            to be considered as a core point.
-        scale_xy    : float
-            Scale factor to apply to X and Y coordinates before clustering to account
-            for different detector resolutions.
-        scale_z     : float
-            Scale factor to apply to the Z coordinate.
-
-        Returns
-        -------
-        pd.DataFrame
-            The input DataFrame with an added 'cluster' column indicating the
-            cluster label for each hit (-1 for noise).
-        """
-        if df_hits.empty:
-            return df_hits.assign(cluster=pd.Series(dtype=int))  
-
-        # Pre-allocate array for cluster labels
-        cluster_labels = np.full(len(df_hits), -9999, dtype=int)
-
-        # Get values once (faster than repeatedly accessing DataFrame columns)
-        coords = df_hits[['X', 'Y', 'Z']].to_numpy()
-        events = df_hits['event'].to_numpy()
-
-        # Use np.unique to get sorted event IDs
-        unique_events = np.unique(events)
-        for event_id in unique_events:
-            
-            mask = (events == event_id)
-            X = coords[mask].copy()
-
-            # Scale
-            X[:, :2] /= scale_xy
-            X[:, 2]  /= scale_z
-
-            # DBSCAN clustering
-            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
-            cluster_labels[mask] = labels
-
-        df_hits['cluster'] = cluster_labels
-
-        return df_hits
\ No newline at end of file
+    return event_hits
+
+def cluster_tagger(df_hits: pd.DataFrame, *,
+                   eps: float, min_samples: int,
+                   scale_xy: float, scale_z: float) -> pd.DataFrame:
+    """
+    Applies DBSCAN clustering to hits on an event-by-event basis using groupby.apply.
+
+    This function groups the input DataFrame by 'event' and applies the
+    `tag_hits_in_event` function to each event's group of hits.
+
+    Parameters
+    ----------
+    df_hits : pd.DataFrame
+        DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'.
+    eps : float
+        Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors.
+    min_samples : int
+        Minimum number of samples required to form a dense region (cluster). This includes the point itself.
+    scale_xy : float
+        Scaling factor to apply to the (x, y) coordinates before clustering.
+    scale_z : float
+        Scaling factor to apply to the z coordinate before clustering.
+
+    Returns
+    -------
+    pd.DataFrame
+        The input DataFrame with an added 'cluster' column indicating the
+        cluster label for each hit (-1 for noise).
+    """
+    if df_hits.empty:
+        return df_hits.assign(cluster=pd.Series(dtype=int))
+
+    clustered_df = df_hits.groupby('event', as_index=False, group_keys=False) \
+                                   .apply(tag_hits_in_event,
+                                          eps=eps,
+                                          min_samples=min_samples,
+                                          scale_xy=scale_xy,
+                                          scale_z=scale_z)
+    
+    return clustered_df.set_index(df_hits.index)
\ No newline at end of file
diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index d3cc20c54..1eab9ed93 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -173,63 +173,77 @@ def test_threshold_hits_all_larger_than_th(hits, th):
                                 column('E',     dtype=float, elements=floats(min_value=0.1,  max_value=100)),    
                                 ])
 
+@settings(deadline=None)
 @given(df=gen_cluster_df)
-def test_dummy(df):
+def test_cluster_tagger_output_shape(df):
     """
-    Hypothesis calls this function multiple times.
-    'df' will be a different pandas DataFrame in every call.
+    Verifies that the output DataFrame of cluster_tagger:
+    - Has the same number of rows as the input.
+    - Contains exactly one new column named 'cluster'.
     """
-    # Just for demonstration purposes, we print the shape of the generated DFs
-    print(f"Generated dataframe shape: {df.shape}")
-    
-    # Check some stuff here
-    assert 'X' in df.columns
-    assert df['event'].dtype == int
-    assert not df.empty
+    if df.empty:
+        return
+
+    # Run the cluster tagger
+    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)   # Dummy values
+    df_result = cluster_tagger(df.copy(), **params)
+
+     # --- Assertations
+    assert len(df_result) == len(df), "Output DataFrame has different length than input."
+    assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column."
+    expected_cols = set(df.columns) | {'cluster'}
+    assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns."
 
 @settings(deadline=None)
 @given(df=gen_cluster_df)
-def test_cluster_tagger_structure_preservation(df):
+def test_cluster_tagger_original(df):
     """
     Verifies that cluster_tagger:
-        - Returns a DataFrame with the exact same length as the input.
-        - Adds exactly one column named 'cluster'.
-        - Does not modify any of the original columns (X, Y, Z, E, etc.).
-        - Preserves the original Index and order of rows.
-        - The 'cluster' column contains valid integers (no NaNs).
+    - Does not modify any of the original columns.
+    - Preserves the original index and row order.
     """
-    # Shuffle the input DataFrame to ensure cluster_tagger does not rely on any specific order
-    df_input = df.sample(frac=1.0).copy()
-    df_original = df_input.copy()           # Keep a copy of the original for later comparison
+    if df.empty:
+        return
 
     # Run the cluster tagger
-    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)     # Dummy values
-    df_result = cluster_tagger(df_input, **params)
+    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)   # Dummy values
+    df_result = cluster_tagger(df.copy(), **params)
 
     # --- Assertations
-    assert len(df_result) == len(df_original), "Output DataFrame has different length than input."
-    assert 'cluster' in df_result.columns,     "Output DataFrame does not contain 'cluster' column."
-    expected_cols = set(df_original.columns) | {'cluster'}
-    assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns."
     pd.testing.assert_frame_equal(  
                                     df_result.drop(columns=['cluster']),
-                                    df_original,
+                                    df,
                                     check_dtype=True,
                                     obj="Dataframe structure check"
                                  )
+
+@settings(deadline=None)
+@given(df=gen_cluster_df)
+def test_cluster_tagger_new_column_validity(df):
+    """
+    Verifies that the new 'cluster' column:
+    - The 'cluster' column contains valid integers (no NaNs).
+    """
+    if df.empty:
+        return
+        
+    # Run the cluster tagger
+    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)   # Dummy values
+    df_result = cluster_tagger(df.copy(), **params)
+
+    # --- Assertations
     assert pd.api.types.is_integer_dtype(df_result['cluster']), "'cluster' column is not of integer type."
     assert not df_result['cluster'].isna().any(), "'cluster' column contains NaN values."
 
 def test_cluster_tagger_row_alignment():
     """
-    Verifies that the calculated cluster label is assigned to the correct 
-    spatial hit, even if the input DataFrame is shuffled.
-    
+    Verifies that the correct cluster label is assigned to the correct
+    row (hit), even if the input DataFrame is shuffled.
+
     Scenario:
     - Event 0:
         - Cluster A: 2 hits at (0,0,0) and (1,1,0)         -> Should be Cluster 0
         - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1
-    - We check that hits near 0 get Label 0 and hits near 100 get Label 1 (NO noise here).
     """
     # Setup data
     data = {
@@ -240,24 +254,23 @@ def test_cluster_tagger_row_alignment():
                 'E':     [10, 10,  10,   10 ]
     }
     df = pd.DataFrame(data)
-    df['expected_label'] = [0, 0, 1, 1]
-
-    # Shuffle the input DataFrame
-    df_input = df.sample(frac=1.0).copy()
+    # Shuffle rows in a specific order
+    df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy()     
+    # Add expected labels for assertations (not shuffled, just for reference)
+    df['cluster'] = [0, 0, 1, 1] 
 
     # Run the cluster tagger
-    params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0)      # Enough to consider both clusters
-    df_result = cluster_tagger(df_input, **params)
+    params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0)    # Enough to consider both clusters
+    df_result = cluster_tagger(df_shuffled, **params)
+    df_result = df_result.sort_index()                                  # Sort back to original order for easier assertations
 
     # --- Assertations
-    hits_group_0 = df_result[df_result['expected_label'] == 0]
-    hits_group_1 = df_result[df_result['expected_label'] == 1]
-    assert hits_group_0['cluster'].nunique() == 1, "Hits near (0,0,0) were assigned multiple cluster labels."
-    assert hits_group_1['cluster'].nunique() == 1, "Hits near (100,100,0) were assigned multiple cluster labels."
-    label_0 = hits_group_0['cluster'].iloc[0]
-    label_1 = hits_group_1['cluster'].iloc[0]
-    assert label_0 != label_1, "Both clusters were assigned the same label."
-    assert label_0 != -1 and label_1 != -1, "One of the clusters was labeled as noise (-1)."
+    pd.testing.assert_frame_equal(  
+                                    df_result,
+                                    df,
+                                    check_dtype=True,
+                                    obj="Dataframe structure check"
+                                    )
     
 def test_cluster_tagger_noise_rejection():
     """
@@ -277,12 +290,9 @@ def test_cluster_tagger_noise_rejection():
     }
     df = pd.DataFrame(data)
 
-    # Shuffle the input DataFrame
-    df_input = df.sample(frac=1.0).copy()
-
     # Run the cluster tagger
-    params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0)      # Enough to consider one cluster
-    df_result = cluster_tagger(df_input, **params)
+    params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0)    # Enough to consider one cluster
+    df_result = cluster_tagger(df.copy(), **params)
 
     # --- Assertations
     cluster_labels = df_result['cluster'].unique()
@@ -312,12 +322,9 @@ def test_cluster_tagger_event_distinction():
     }
     df = pd.DataFrame(data)
 
-    # Shuffle the input DataFrame
-    df_input = df.sample(frac=1.0).copy()
-
     # Run the cluster tagger
     params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0)      # Enough to consider both clusters
-    df_result = cluster_tagger(df_input, **params)
+    df_result = cluster_tagger(df.copy(), **params)
 
     # --- Assertations
     event_0_clusters = df_result[df_result['event'] == 0]['cluster'].unique()
@@ -327,4 +334,4 @@ def test_cluster_tagger_event_distinction():
     event_0_hits = df_result[df_result['event'] == 0]
     noise_1_hit  = df_result[(df_result['X'] == 0.5)]
     assert noise_1_hit['cluster'].iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)."
-    assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label."
+    assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label."
\ No newline at end of file

From 0e4703d392902257f2a6d6406958f159a9b67420 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Thu, 23 Apr 2026 11:55:41 +0200
Subject: [PATCH 15/23] Remove @settings

---
 invisible_cities/reco/hits_functions_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index 1eab9ed93..c37d2b2c5 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -173,7 +173,6 @@ def test_threshold_hits_all_larger_than_th(hits, th):
                                 column('E',     dtype=float, elements=floats(min_value=0.1,  max_value=100)),    
                                 ])
 
-@settings(deadline=None)
 @given(df=gen_cluster_df)
 def test_cluster_tagger_output_shape(df):
     """
@@ -194,7 +193,6 @@ def test_cluster_tagger_output_shape(df):
     expected_cols = set(df.columns) | {'cluster'}
     assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns."
 
-@settings(deadline=None)
 @given(df=gen_cluster_df)
 def test_cluster_tagger_original(df):
     """
@@ -217,7 +215,6 @@ def test_cluster_tagger_original(df):
                                     obj="Dataframe structure check"
                                  )
 
-@settings(deadline=None)
 @given(df=gen_cluster_df)
 def test_cluster_tagger_new_column_validity(df):
     """

From 2eadf57ea0a8659ffdc08f15dd361e698ce62cdb Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Fri, 24 Apr 2026 17:26:58 +0200
Subject: [PATCH 16/23] PR: second round of comments addressed

---
 invisible_cities/reco/hits_functions.py      |  69 ++++----
 invisible_cities/reco/hits_functions_test.py | 159 ++++++++-----------
 2 files changed, 96 insertions(+), 132 deletions(-)

diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index 1b9c58f24..471c99c33 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -243,65 +243,64 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
     return (hits.groupby("Z", as_index=False)
                 .apply(apply_threshold, th=th))
 
-def tag_hits_in_event(event_hits: pd.DataFrame, *,
-                      eps: float, min_samples: int,
-                      scale_xy: float, scale_z: float) -> pd.DataFrame:
+def tag_hits_in_event(event_hits   : pd.DataFrame
+                     , *
+                     , eps         : float
+                     , min_samples : int
+                     , scale_xy    : float
+                     , scale_z     : float
+                     ) -> pd.DataFrame:
     """
     Applies DBSCAN clustering to a DataFrame containing hits from a single event.
-
-    The coordinates are scaled to account for detector geometry differences
-    in sampling and applies DBSCAN to identify spatial clusters.
+    Hits coordinates are scaled to account for the anisotropy of the detector geometry.
     A 'cluster' column is added to the group with the resulting labels.
 
     Parameters
     ----------
-    event_hits  : pd.DataFrame
+    event_hits : pd.DataFrame
         DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
     eps, min_samples, scale_xy, scale_z :
-        Configuration parameters for scaling and DBSCAN. See `cluster_tagger` for details.
+        Configuration parameters for DBSCAN and scaling. See `cluster_tagger` for details.
 
     Returns
     -------
     pd.DataFrame
         The input DataFrame with a 'cluster' column added.
     """
-    # If the event has no hits, there's nothing to do
-    if event_hits.empty:
-        return event_hits.assign(cluster=pd.Series(dtype=int))
-
-    # Extract coordinates and apply scaling
     coords = event_hits[['X', 'Y', 'Z']].to_numpy()
     coords[:, :2] /= scale_xy
     coords[:, 2]  /= scale_z
 
-    # DBSCAN clustering
     labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords)
-    # Add the cluster labels as a new column to the event's DataFrame.
     event_hits['cluster'] = labels
 
     return event_hits
 
-def cluster_tagger(df_hits: pd.DataFrame, *,
-                   eps: float, min_samples: int,
-                   scale_xy: float, scale_z: float) -> pd.DataFrame:
+def cluster_tagger(df_hits      : pd.DataFrame
+                  , *
+                  , eps         : float
+                  , min_samples : int
+                  , scale_xy    : float
+                  , scale_z     : float
+                  ) -> pd.DataFrame:
     """
     Applies DBSCAN clustering to hits on an event-by-event basis using groupby.apply.
-
     This function groups the input DataFrame by 'event' and applies the
     `tag_hits_in_event` function to each event's group of hits.
 
     Parameters
     ----------
-    df_hits : pd.DataFrame
-        DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'.
-    eps : float
-        Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors.
+    df_hits     : pd.DataFrame
+        DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'.
+    eps         : float
+        Maximum distance between two samples for them to be considered neighbors.
     min_samples : int
-        Minimum number of samples required to form a dense region (cluster). This includes the point itself.
-    scale_xy : float
-        Scaling factor to apply to the (x, y) coordinates before clustering.
-    scale_z : float
-        Scaling factor to apply to the z coordinate before clustering.
+        Minimum number of samples required to form a dense region (cluster).
+        This includes the point itself.
+    scale_xy    : float
+        Scaling factor to apply to the XY coordinates before clustering.
+    scale_z     : float
+        Scaling factor to apply to the Z coordinate before clustering.
 
     Returns
     -------
@@ -312,11 +311,11 @@ def cluster_tagger(df_hits: pd.DataFrame, *,
     if df_hits.empty:
         return df_hits.assign(cluster=pd.Series(dtype=int))
 
-    clustered_df = df_hits.groupby('event', as_index=False, group_keys=False) \
-                                   .apply(tag_hits_in_event,
-                                          eps=eps,
-                                          min_samples=min_samples,
-                                          scale_xy=scale_xy,
-                                          scale_z=scale_z)
+    df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \
+                          .apply(tag_hits_in_event,
+                                 eps         = eps,
+                                 min_samples = min_samples,
+                                 scale_xy    = scale_xy,
+                                 scale_z     = scale_z)
     
-    return clustered_df.set_index(df_hits.index)
\ No newline at end of file
+    return df_clustered.set_index(df_hits.index)
\ No newline at end of file
diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index c37d2b2c5..14bb88e49 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -162,32 +162,25 @@ def test_threshold_hits_all_larger_than_th(hits, th):
     non_nn = hits_thresh.loc[hits_thresh.Q != NN]
     assert np.all(non_nn.Q >= th)
 
-# ----- CLUSTER TAGGER TESTS ----- #
-
-gen_cluster_df = data_frames( index=range_indexes(min_size=1, max_size=50),
-                              columns=[
-                                column('event', dtype=int,   elements=integers(min_value=0, max_value=10)),
-                                column('X',     dtype=float, elements=floats(min_value=-500, max_value=500)),
-                                column('Y',     dtype=float, elements=floats(min_value=-500, max_value=500)),
-                                column('Z',     dtype=float, elements=floats(min_value=0,    max_value=1200)),
-                                column('E',     dtype=float, elements=floats(min_value=0.1,  max_value=100)),    
-                                ])
+gen_cluster_df = data_frames(index   = range_indexes(min_size=1, max_size=50),
+                             columns = [
+                                column('event', dtype=int  , elements=integers(min_value=0, max_value=10)),
+                                column(    'X', dtype=float, elements=floats(min_value=-500, max_value=500)),
+                                column(    'Y', dtype=float, elements=floats(min_value=-500, max_value=500)),
+                                column(    'Z', dtype=float, elements=floats(min_value=0,    max_value=1200)),
+                                column(    'E', dtype=float, elements=floats(min_value=0.1,  max_value=100)),    
+                                       ])
 
 @given(df=gen_cluster_df)
 def test_cluster_tagger_output_shape(df):
     """
-    Verifies that the output DataFrame of cluster_tagger:
+    Verifies that the output of cluster_tagger:
     - Has the same number of rows as the input.
     - Contains exactly one new column named 'cluster'.
     """
-    if df.empty:
-        return
-
-    # Run the cluster tagger
-    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)   # Dummy values
-    df_result = cluster_tagger(df.copy(), **params)
+    dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    df_result    = cluster_tagger(df.copy(), **dummy_params)
 
-     # --- Assertations
     assert len(df_result) == len(df), "Output DataFrame has different length than input."
     assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column."
     expected_cols = set(df.columns) | {'cluster'}
@@ -197,17 +190,12 @@ def test_cluster_tagger_output_shape(df):
 def test_cluster_tagger_original(df):
     """
     Verifies that cluster_tagger:
-    - Does not modify any of the original columns.
-    - Preserves the original index and row order.
+    - Does not modify any of the input information.
+    - Preserves the input index and row order.
     """
-    if df.empty:
-        return
+    dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    df_result    = cluster_tagger(df.copy(), **dummy_params)
 
-    # Run the cluster tagger
-    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)   # Dummy values
-    df_result = cluster_tagger(df.copy(), **params)
-
-    # --- Assertations
     pd.testing.assert_frame_equal(  
                                     df_result.drop(columns=['cluster']),
                                     df,
@@ -217,51 +205,39 @@ def test_cluster_tagger_original(df):
 
 @given(df=gen_cluster_df)
 def test_cluster_tagger_new_column_validity(df):
-    """
-    Verifies that the new 'cluster' column:
-    - The 'cluster' column contains valid integers (no NaNs).
-    """
-    if df.empty:
-        return
-        
-    # Run the cluster tagger
-    params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)   # Dummy values
-    df_result = cluster_tagger(df.copy(), **params)
+    dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    df_result    = cluster_tagger(df.copy(), **dummy_params)
 
-    # --- Assertations
-    assert pd.api.types.is_integer_dtype(df_result['cluster']), "'cluster' column is not of integer type."
-    assert not df_result['cluster'].isna().any(), "'cluster' column contains NaN values."
+    assert pd.api.types.is_integer_dtype(df_result.cluster), "'cluster' column is not of integer type."
+    assert not df_result.cluster.isna().any(), "'cluster' column contains NaN values."
 
 def test_cluster_tagger_row_alignment():
     """
-    Verifies that the correct cluster label is assigned to the correct
-    row (hit), even if the input DataFrame is shuffled.
+    Verifies that the correct cluster label is assigned to the correct hit,
+    even if the input DataFrame is shuffled.
 
     Scenario:
     - Event 0:
         - Cluster A: 2 hits at (0,0,0) and (1,1,0)         -> Should be Cluster 0
         - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1
     """
-    # Setup data
     data = {
-                'event': [0, 0, 0, 0],
-                'X':     [0., 1., 100., 101.],
-                'Y':     [0., 1., 100., 101.],
-                'Z':     [0., 0.,   0.,   0.],
-                'E':     [10, 10,  10,   10 ]
+                'event'  : [ 0,  0,    0,    0],
+                'X'      : [0., 1., 100., 101.],
+                'Y'      : [0., 1., 100., 101.],
+                'Z'      : [0., 0.,   0.,   0.],
+                'cluster': [ 0,  0,    1,    1]
     }
     df = pd.DataFrame(data)
-    # Shuffle rows in a specific order
-    df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy()     
-    # Add expected labels for assertations (not shuffled, just for reference)
-    df['cluster'] = [0, 0, 1, 1] 
+    # Shuffled dataframe must start with the same hit as the original
+    # to ensure that both hits close to (0,0,0) have same cluster label (0)
+    df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy().drop(columns=['cluster'])
 
-    # Run the cluster tagger
-    params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0)    # Enough to consider both clusters
-    df_result = cluster_tagger(df_shuffled, **params)
-    df_result = df_result.sort_index()                                  # Sort back to original order for easier assertations
+    test_params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    df_result   = cluster_tagger(df_shuffled, **test_params)
+    # Sorted final result must match original dataframe 
+    df_result   = df_result.sort_index()
 
-    # --- Assertations
     pd.testing.assert_frame_equal(  
                                     df_result,
                                     df,
@@ -269,66 +245,55 @@ def test_cluster_tagger_row_alignment():
                                     obj="Dataframe structure check"
                                     )
     
-def test_cluster_tagger_noise_rejection():
+def test_cluster_tagger_noise_identification():
     """
-    Verifies that isolated hits (outliers) are correctly identified as noise (-1).
-    
     Scenario:
     - 3 points very close together (0,0), (1,0), (0,1). They should form a cluster.
     - 1 point very far away (100, 100). It has 0 neighbors. Should be noise.
     """
-    # Setup data
     data = {
-                'event': [0, 0, 0, 0],
-                'X':     [0., 1., 0., 100.],
-                'Y':     [0., 0., 1., 100.],
-                'Z':     [0., 0., 0.,   0.],
-                'E':     [10, 10, 10,  10 ]
+                'event': [ 0,  0,  0,    0],
+                'X'    : [0., 1., 0., 100.],
+                'Y'    : [0., 0., 1., 100.],
+                'Z'    : [0., 0., 0.,   0.]
     }
     df = pd.DataFrame(data)
 
-    # Run the cluster tagger
-    params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0)    # Enough to consider one cluster
-    df_result = cluster_tagger(df.copy(), **params)
-
-    # --- Assertations
-    cluster_labels = df_result['cluster'].unique()
-    assert cluster_labels.size == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)."
-    cluster_hits = df_result[df_result['cluster'] != -1]
-    assert cluster_hits.shape[0] == 3, "Expected exactly 3 hits to be clustered together."
-    noise_hit = df_result[df_result['cluster'] == -1]
-    assert noise_hit.shape[0] == 1, "Expected exactly 1 noise hit."
+    test_params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0)
+    df_result   = cluster_tagger(df.copy(), **test_params)
+
+    cluster_labels = df_result.cluster.unique()
+    assert len(cluster_labels) == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)."
+    cluster_hits = df_result[df_result.cluster != -1]
+    assert len(cluster_hits) == 3, "Expected exactly 3 hits to be clustered together."
+    noise_hit = df_result[df_result.cluster == -1]
+    assert len(noise_hit) == 1, "Expected exactly 1 noise hit."
     assert noise_hit['X'].iloc[0] == 100 and noise_hit['Y'].iloc[0] == 100, "The noise hit identified is NOT the distant one."
 
 def test_cluster_tagger_event_distinction():
     """
-    Verifies that hits from different events are not clustered together.
-    
     Scenario:
-    - Event 0: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0
-    - Event 1: 2 hits at (100,100,0) and (101,101,0) and 1 hit at (0.5,0.5,0) -> Should be marked as noise (-1)
+    - Event 0: 2 hits at (0,0,0) and (1,1,0)         -> Should be Cluster 0
+    - Event 1: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1
+               and 1 hit at (0.5,0.5,0)              -> Should be marked as noise (-1)
     - We check that noise hit from Event 1 get a different cluster label than hits from Event 0, even if they are spatially close.
     """
-    # Setup data
     data = {
-                'event': [0, 0, 1, 1, 1],
-                'X':     [0., 1., 100., 101., 0.5],
-                'Y':     [0., 1., 100., 101., 0.5],
-                'Z':     [0., 0.,   0.,   0.,  0.],
-                'E':     [10, 10,   10,   10,  10]
+                'event': [ 0,  0,    1,    1,   1],
+                'X'    : [0., 1., 100., 101., 0.5],
+                'Y'    : [0., 1., 100., 101., 0.5],
+                'Z'    : [0., 0.,   0.,   0.,  0.],
     }
     df = pd.DataFrame(data)
 
-    # Run the cluster tagger
-    params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0)      # Enough to consider both clusters
-    df_result = cluster_tagger(df.copy(), **params)
+    test_params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0)
+    df_result   = cluster_tagger(df.copy(), **test_params)
 
-    # --- Assertations
-    event_0_clusters = df_result[df_result['event'] == 0]['cluster'].unique()
-    event_1_clusters = df_result[df_result['event'] == 1]['cluster'].unique()
+    event_0_clusters = df_result[df_result.event == 0].cluster.unique()
+    event_1_clusters = df_result[df_result.event == 1].cluster.unique()
     assert len(event_0_clusters) == 1, "For event 0: expected exactly 1 unique cluster label (one cluster)."
     assert len(event_1_clusters) == 2, "For event 1: expected exactly 2 unique cluster labels (one cluster + one noise)."
-    event_0_hits = df_result[df_result['event'] == 0]
-    noise_1_hit  = df_result[(df_result['X'] == 0.5)]
-    assert noise_1_hit['cluster'].iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)."
-    assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label."
\ No newline at end of file
+    event_0_hits = df_result[df_result.event == 0]
+    noise_1_hit  = df_result[(df_result.X == 0.5)]
+    assert noise_1_hit.cluster.iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)."
+    assert event_0_hits.cluster.iloc[0] != noise_1_hit.cluster.iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label."
\ No newline at end of file

From 74dcedc478a5d73841d988edb70cc9cc2b24e9d6 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 5 May 2026 16:11:21 +0200
Subject: [PATCH 17/23] Add @settings

---
 invisible_cities/reco/hits_functions_test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index 14bb88e49..648c1d017 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -20,6 +20,7 @@
 from hypothesis                import assume
 from hypothesis.strategies     import composite
 from hypothesis.extra.pandas   import data_frames, column, range_indexes
+from hypothesis                import settings
 
 event_numbers = integers(0, np.iinfo(np.int32).max)
 
@@ -168,10 +169,11 @@ def test_threshold_hits_all_larger_than_th(hits, th):
                                 column(    'X', dtype=float, elements=floats(min_value=-500, max_value=500)),
                                 column(    'Y', dtype=float, elements=floats(min_value=-500, max_value=500)),
                                 column(    'Z', dtype=float, elements=floats(min_value=0,    max_value=1200)),
-                                column(    'E', dtype=float, elements=floats(min_value=0.1,  max_value=100)),    
+                                column(    'E', dtype=float, elements=floats(min_value=0.1,  max_value=100))  
                                        ])
 
 @given(df=gen_cluster_df)
+@settings(deadline=None)
 def test_cluster_tagger_output_shape(df):
     """
     Verifies that the output of cluster_tagger:

From 8dda6547c935c58715c5405a8d7d8f79d900974a Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Wed, 6 May 2026 18:18:43 +0200
Subject: [PATCH 18/23] Removing unused import

---
 invisible_cities/reco/hits_functions.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index 471c99c33..a5097d06d 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -6,8 +6,7 @@
 from typing          import List
 from sklearn.cluster import DBSCAN
 
-from .. evm  import event_model as evm
-from .. types.ic_types      import NN
+from .. types.ic_types import NN
 
 EPSILON = np.finfo(np.float64).eps
 
@@ -318,4 +317,4 @@ def cluster_tagger(df_hits      : pd.DataFrame
                                  scale_xy    = scale_xy,
                                  scale_z     = scale_z)
     
-    return df_clustered.set_index(df_hits.index)
\ No newline at end of file
+    return df_clustered.set_index(df_hits.index)

From fe3d020c54a74d312b7d96ee9c94968e07a007ad Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 12 May 2026 15:14:35 +0200
Subject: [PATCH 19/23] DBSCAN eps value set to 1.8 to retain only neighbouring
 hits as a cluster. It is no longer a function parameter.

---
 invisible_cities/cities/components.py        | 15 +++----
 invisible_cities/cities/sophronia.py         |  7 ++-
 invisible_cities/cities/sophronia_test.py    |  5 +--
 invisible_cities/config/sophronia.conf       |  5 +--
 invisible_cities/conftest.py                 |  1 -
 invisible_cities/reco/hits_functions.py      | 45 ++++++++++----------
 invisible_cities/reco/hits_functions_test.py | 12 +++---
 7 files changed, 42 insertions(+), 48 deletions(-)

diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py
index 07fc816a4..fd4ff59ac 100644
--- a/invisible_cities/cities/components.py
+++ b/invisible_cities/cities/components.py
@@ -1719,8 +1719,7 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:
     return correct
 
 @check_annotations
-def hits_clusterizer( eps         : float
-                    , min_samples : int
+def hits_clusterizer( min_samples : int
                     , scale_xy    : float
                     , scale_z     : float
                     ) -> Callable:
@@ -1729,10 +1728,9 @@ def hits_clusterizer( eps         : float
 
     Parameters
     ----------
-    eps : float
-        Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors.
     min_samples : int
-        Minimum number of samples required to form a dense region (cluster). This includes the point itself.
+        Minimum number of samples required to form a dense region (cluster).
+        This includes the point itself.
     scale_xy : float
         Scaling factor to apply to the (x, y) coordinates before clustering.
     scale_z : float
@@ -1745,9 +1743,10 @@ def hits_clusterizer( eps         : float
         with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN
         (-1 for noise).
     """
-    return partial(cluster_tagger,
-                   eps=eps, min_samples=min_samples,
-                   scale_xy=scale_xy, scale_z=scale_z)
+    return partial( cluster_tagger
+                  , min_samples = min_samples
+                  , scale_xy    = scale_xy
+                  , scale_z     = scale_z )
 
 
 def identity(x : Any) -> Any:
diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 97947cda9..e122865b7 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -141,13 +141,12 @@ def sophronia( files_in           : OneOrManyFiles
             Normalization value in case of `norm_strat = NormStrategy.custom`
 
     clustering_params : dict
-        eps : float
-            Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors.
         min_samples : int
-            Minimum number of samples required to form a dense region (cluster). This includes the point itself.
+            Minimum number of samples required to form a dense region (cluster).
+            This includes the point itself.
         scale_xy : float
             Scaling factor to apply to the (x, y) coordinates before clustering.
-        scale_z : float
+        scale_z  : float
             Scaling factor to apply to the z coordinate before clustering.
     """
     global_reco = compute_xy_position( detector_db
diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py
index 922f25492..a285a6989 100644
--- a/invisible_cities/cities/sophronia_test.py
+++ b/invisible_cities/cities/sophronia_test.py
@@ -171,10 +171,9 @@ def test_sophronia_clustering_integration(config_tmpdir, sophronia_config):
 
     # Clustering enabled
     clustering_params = dict(
-                                eps         = 3,
                                 min_samples = 5,
-                                scale_xy    = 14.55,
-                                scale_z     = 3.7
+                                scale_xy    = 15.55,
+                                scale_z     = 4.0
                             )
     config_with_cluster = dict(**sophronia_config)
     config_with_cluster.update(dict( file_out          = path_out_with_cluster
diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf
index 56ca2eeb7..c1f27b0a8 100644
--- a/invisible_cities/config/sophronia.conf
+++ b/invisible_cities/config/sophronia.conf
@@ -65,8 +65,7 @@ corrections = dict(
   apply_z    = False)
 
 clustering_params = dict(
-    eps          = 3,
-    min_samples  = 5,
+    min_samples  =     5,
     scale_xy     = 15.55,
-    scale_z      = 4.0
+    scale_z      =   4.0
 )
diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py
index 1b2af207d..56e5a50f4 100644
--- a/invisible_cities/conftest.py
+++ b/invisible_cities/conftest.py
@@ -439,7 +439,6 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
                        apply_temp =            False,
                        norm_strat =  NormStrategy.kr)
                    , clustering_params = dict(
-                        eps         =     3,
                         min_samples =     5,
                         scale_xy    = 15.55,
                         scale_z     =   4.0)
diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index a5097d06d..4e300af2d 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -244,7 +244,6 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
 
 def tag_hits_in_event(event_hits   : pd.DataFrame
                      , *
-                     , eps         : float
                      , min_samples : int
                      , scale_xy    : float
                      , scale_z     : float
@@ -256,10 +255,15 @@ def tag_hits_in_event(event_hits   : pd.DataFrame
 
     Parameters
     ----------
-    event_hits : pd.DataFrame
+    event_hits  : pd.DataFrame
         DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
-    eps, min_samples, scale_xy, scale_z :
-        Configuration parameters for DBSCAN and scaling. See `cluster_tagger` for details.
+    min_samples : int
+        Minimum number of samples required to form a dense region (cluster).
+        This includes the point itself.
+    scale_xy    : float
+        Scaling factor to apply to the XY coordinates before clustering.
+    scale_z     : float
+        Scaling factor to apply to the Z coordinate before clustering.
 
     Returns
     -------
@@ -267,40 +271,35 @@ def tag_hits_in_event(event_hits   : pd.DataFrame
         The input DataFrame with a 'cluster' column added.
     """
     coords = event_hits[['X', 'Y', 'Z']].to_numpy()
+    # A proper scaling leads to neighbouring hits being separated
+    # by a distance of 1 in the DBSCAN metric space
     coords[:, :2] /= scale_xy
     coords[:, 2]  /= scale_z
 
-    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords)
+    # eps parameter is fixed to a value slightly higher than √3
+    # to retain diagonal neighbours in the same cluster
+    labels = DBSCAN(eps=1.8, min_samples=min_samples).fit_predict(coords)
     event_hits['cluster'] = labels
 
     return event_hits
 
 def cluster_tagger(df_hits      : pd.DataFrame
                   , *
-                  , eps         : float
                   , min_samples : int
                   , scale_xy    : float
                   , scale_z     : float
                   ) -> pd.DataFrame:
     """
-    Applies DBSCAN clustering to hits on an event-by-event basis using groupby.apply.
     This function groups the input DataFrame by 'event' and applies the
     `tag_hits_in_event` function to each event's group of hits.
 
     Parameters
     ----------
-    df_hits     : pd.DataFrame
+    df_hits : pd.DataFrame
         DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'.
-    eps         : float
-        Maximum distance between two samples for them to be considered neighbors.
-    min_samples : int
-        Minimum number of samples required to form a dense region (cluster).
-        This includes the point itself.
-    scale_xy    : float
-        Scaling factor to apply to the XY coordinates before clustering.
-    scale_z     : float
-        Scaling factor to apply to the Z coordinate before clustering.
-
+    min_samples, scale_xy, scale_z : 
+        See `tag_hits_in_event`
+    
     Returns
     -------
     pd.DataFrame
@@ -311,10 +310,10 @@ def cluster_tagger(df_hits      : pd.DataFrame
         return df_hits.assign(cluster=pd.Series(dtype=int))
 
     df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \
-                          .apply(tag_hits_in_event,
-                                 eps         = eps,
-                                 min_samples = min_samples,
-                                 scale_xy    = scale_xy,
-                                 scale_z     = scale_z)
+                          .apply( tag_hits_in_event
+                                , min_samples = min_samples
+                                , scale_xy    = scale_xy
+                                , scale_z     = scale_z )
     
     return df_clustered.set_index(df_hits.index)
+    
\ No newline at end of file
diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index 648c1d017..dde3f59c6 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -180,7 +180,7 @@ def test_cluster_tagger_output_shape(df):
     - Has the same number of rows as the input.
     - Contains exactly one new column named 'cluster'.
     """
-    dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0)
     df_result    = cluster_tagger(df.copy(), **dummy_params)
 
     assert len(df_result) == len(df), "Output DataFrame has different length than input."
@@ -195,7 +195,7 @@ def test_cluster_tagger_original(df):
     - Does not modify any of the input information.
     - Preserves the input index and row order.
     """
-    dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0)
     df_result    = cluster_tagger(df.copy(), **dummy_params)
 
     pd.testing.assert_frame_equal(  
@@ -207,7 +207,7 @@ def test_cluster_tagger_original(df):
 
 @given(df=gen_cluster_df)
 def test_cluster_tagger_new_column_validity(df):
-    dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0)
     df_result    = cluster_tagger(df.copy(), **dummy_params)
 
     assert pd.api.types.is_integer_dtype(df_result.cluster), "'cluster' column is not of integer type."
@@ -235,7 +235,7 @@ def test_cluster_tagger_row_alignment():
     # to ensure that both hits close to (0,0,0) have same cluster label (0)
     df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy().drop(columns=['cluster'])
 
-    test_params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0)
+    test_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0)
     df_result   = cluster_tagger(df_shuffled, **test_params)
     # Sorted final result must match original dataframe 
     df_result   = df_result.sort_index()
@@ -261,7 +261,7 @@ def test_cluster_tagger_noise_identification():
     }
     df = pd.DataFrame(data)
 
-    test_params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0)
+    test_params = dict(min_samples=3, scale_xy=1.0, scale_z=1.0)
     df_result   = cluster_tagger(df.copy(), **test_params)
 
     cluster_labels = df_result.cluster.unique()
@@ -288,7 +288,7 @@ def test_cluster_tagger_event_distinction():
     }
     df = pd.DataFrame(data)
 
-    test_params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0)
+    test_params = dict(min_samples=2, scale_xy=1.0, scale_z=1.0)
     df_result   = cluster_tagger(df.copy(), **test_params)
 
     event_0_clusters = df_result[df_result.event == 0].cluster.unique()

From 9af71acc98031215524236bb3abff3ee2a082da0 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 12 May 2026 15:14:56 +0200
Subject: [PATCH 20/23] Update reference files for tests

---
 228Th_10evt_hits.h5                                           | 3 +++
 invisible_cities/database/test_data/228Th_10evt_deco.h5       | 4 ++--
 .../database/test_data/228Th_10evt_deco_satellite.h5          | 4 ++--
 .../database/test_data/228Th_10evt_deco_separate.h5           | 4 ++--
 invisible_cities/database/test_data/228Th_10evt_hits.h5       | 4 ++--
 5 files changed, 11 insertions(+), 8 deletions(-)
 create mode 100644 228Th_10evt_hits.h5

diff --git a/228Th_10evt_hits.h5 b/228Th_10evt_hits.h5
new file mode 100644
index 000000000..4b8739c0d
--- /dev/null
+++ b/228Th_10evt_hits.h5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7016967f80568b698957649dab8dc581a7c2f3f7af330cf65c69391ad0ce675f
+size 276410
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco.h5 b/invisible_cities/database/test_data/228Th_10evt_deco.h5
index 47ff579ad..0ffee6179 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ed63a53e9a78cef39a4ff4d14ae1cfbdfc0f121d27fd9ee215c18bcec91ff2d
-size 819282
+oid sha256:3032cc3428c6df55802181aa3f2c0a72d288cabac34ac0b753f4d0cff6ba31be
+size 823118
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
index 273064f6e..e9051848a 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:434f0bf46e4e326731dbb87e14d83d6649ea01eb0079654a42c9c3e3c41c7df1
-size 304668
+oid sha256:ee059274164e63cd00f91fd41938d57af47ce6ee7dbd9b08357947ddb883f701
+size 303387
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
index 0f6cddf7d..0b4f7548e 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69175d759c8484973f9006911f08c8019cc84bc472481b7c8d3fafded16a3cc8
-size 819313
+oid sha256:3fb02b1e386e4a978267875acc49eceb97c2c07ce6a74e3579a2524683f39fc7
+size 823121
diff --git a/invisible_cities/database/test_data/228Th_10evt_hits.h5 b/invisible_cities/database/test_data/228Th_10evt_hits.h5
index 3aa32af53..4b8739c0d 100644
--- a/invisible_cities/database/test_data/228Th_10evt_hits.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_hits.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee809a8d3f69241048b4c37c156b452e28bcce05d45b3504c15163599e6c8dbb
-size 274902
+oid sha256:7016967f80568b698957649dab8dc581a7c2f3f7af330cf65c69391ad0ce675f
+size 276410

From 6009d782dea1d3967de5999e1b793bb336f3140b Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 12 May 2026 15:45:41 +0200
Subject: [PATCH 21/23] Removing deadline for tests that use hypothesis

---
 invisible_cities/reco/hits_functions_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index dde3f59c6..954ae1e84 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -189,6 +189,7 @@ def test_cluster_tagger_output_shape(df):
     assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns."
 
 @given(df=gen_cluster_df)
+@settings(deadline=None)
 def test_cluster_tagger_original(df):
     """
     Verifies that cluster_tagger:
@@ -206,6 +207,7 @@ def test_cluster_tagger_original(df):
                                  )
 
 @given(df=gen_cluster_df)
+@settings(deadline=None)
 def test_cluster_tagger_new_column_validity(df):
     dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0)
     df_result    = cluster_tagger(df.copy(), **dummy_params)

From 995705d1e7b31e6c0b5ae97af72c831967be9751 Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 12 May 2026 15:46:01 +0200
Subject: [PATCH 22/23] Updating esmeralda reference file for tests

---
 invisible_cities/database/test_data/228Th_10evt_tracks.h5 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 b/invisible_cities/database/test_data/228Th_10evt_tracks.h5
index edc540196..3844b259b 100644
--- a/invisible_cities/database/test_data/228Th_10evt_tracks.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_tracks.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1ccacb5209c035672cd6b147f54d816863a9a7828f5ad88dd1a7dff488a8e7fc
-size 230547
+oid sha256:c7841bdae394479d350c3877a4f37778f3b3db528e4610f65c22f3b284d7f537
+size 230764

From 0772fbf71d66594148f570494bb5a35f161672ec Mon Sep 17 00:00:00 2001
From: Camilo Cortes Parra <ccortesp@ific.uv.es>
Date: Tue, 12 May 2026 16:45:06 +0200
Subject: [PATCH 23/23] Updating reference for esmeralda test

---
 invisible_cities/database/test_data/228Th_10evt_tracks.h5 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 b/invisible_cities/database/test_data/228Th_10evt_tracks.h5
index 3844b259b..304fcb987 100644
--- a/invisible_cities/database/test_data/228Th_10evt_tracks.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_tracks.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c7841bdae394479d350c3877a4f37778f3b3db528e4610f65c22f3b284d7f537
-size 230764
+oid sha256:ecc3e2524de6c1c5e0e9d89cdf7c510fde37557277d85417890c16bbe9485902
+size 230750