3 changes: 3 additions & 0 deletions 228Th_10evt_hits.h5
Git LFS file not shown
31 changes: 31 additions & 0 deletions invisible_cities/cities/components.py
@@ -59,6 +59,7 @@
from .. reco .corrections import get_df_to_z_converter
from .. reco .xy_algorithms import corona
from .. reco .xy_algorithms import barycenter
from .. reco .hits_functions import cluster_tagger
from .. filters.s1s2_filter import S12Selector
from .. filters.s1s2_filter import S12SelectorOutput
from .. filters.s1s2_filter import pmap_filter
@@ -1717,6 +1718,36 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:

return correct

@check_annotations
def hits_clusterizer( min_samples : int
, scale_xy : float
, scale_z : float
) -> Callable:
"""
    Creates a callable that performs DBSCAN clustering on a DataFrame of hits.

Parameters
----------
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the (x, y) coordinates before clustering.
scale_z : float
Scaling factor to apply to the z coordinate before clustering.

Returns
-------
Callable
A function that takes a DataFrame of hits and returns the same DataFrame
with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN
(-1 for noise).
"""
return partial( cluster_tagger
, min_samples = min_samples
, scale_xy = scale_xy
, scale_z = scale_z )


def identity(x : Any) -> Any:
return x
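
A rough usage sketch of the new component (not part of the diff): the returned partial is applied directly to a hits table. The hit coordinates below are invented, and min_samples is lowered to 2 so the toy table can form clusters.

import pandas as pd
from invisible_cities.cities.components import hits_clusterizer

# Toy hits table with the columns the tagger expects: 'event', 'X', 'Y', 'Z' (mm).
hits = pd.DataFrame(dict( event = [0, 0, 0, 0]
                        , X     = [0.0, 15.55, 31.10, 500.0]
                        , Y     = [0.0,  0.00,  0.00,   0.0]
                        , Z     = [0.0,  4.00,  8.00, 300.0]))

tag_clusters = hits_clusterizer(min_samples = 2, scale_xy = 15.55, scale_z = 4.0)
tagged       = tag_clusters(hits)
print(tagged.cluster.tolist())   # e.g. [0, 0, 0, -1]: the three nearby hits form one cluster, the isolated hit is noise
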
16 changes: 15 additions & 1 deletion invisible_cities/cities/sophronia.py
@@ -64,6 +64,7 @@
from . components import collect
from . components import build_pointlike_event as pointlike_event_builder
from . components import hits_corrector
from . components import hits_clusterizer
from . components import identity

from typing import Optional
@@ -93,6 +94,7 @@ def sophronia( files_in : OneOrManyFiles
, sipm_charge_type : SiPMCharge
, same_peak : bool
, corrections : Optional[dict] = None
, clustering_params : Optional[dict] = None
):
"""
drift_v : float
@@ -137,6 +139,15 @@ def sophronia( files_in : OneOrManyFiles
Normalization strategy
norm_value : float, optional
Normalization value in case of `norm_strat = NormStrategy.custom`

    clustering_params : dict, optional
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the (x, y) coordinates before clustering.
scale_z : float
Scaling factor to apply to the z coordinate before clustering.
"""
global_reco = compute_xy_position( detector_db
, run_number
@@ -177,6 +188,9 @@ def sophronia( files_in : OneOrManyFiles

correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity
, item = "hits")

cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity
, item = "hits")

build_pointlike_event = df.map( pointlike_event_builder( detector_db
, run_number
, args = "event_number enough_valid_hits".split())

hits_branch = ( make_hits, enough_valid_hits, df.branch(write_hits_filter)
, hits_select.filter, merge_nn_hits, correct_hits, write_hits)
, hits_select.filter, merge_nn_hits, correct_hits, cluster_hits, write_hits)
kdst_branch = build_pointlike_event, write_pointlike_event
collect_evt_numbers = "event_number", event_number_collector.sink

47 changes: 47 additions & 0 deletions invisible_cities/cities/sophronia_test.py
@@ -5,6 +5,8 @@

from pytest import mark

from .. io import dst_io as dio
from .. core.testing_utils import assert_dataframes_equal
from .. core.testing_utils import assert_tables_equality
from .. core.testing_utils import ignore_warning
from .. core.system_of_units import pes
@@ -147,3 +149,48 @@ def test_sophronia_keeps_hitless_events(config_tmpdir, sophronia_config):
with tb.open_file(path_out) as output_file:
assert len(output_file.root.Run.events) == 1
assert "RECO" not in output_file.root


@ignore_warning.no_config_group
def test_sophronia_clustering_integration(config_tmpdir, sophronia_config):
"""
    Runs Sophronia twice (once with clustering disabled, once with it enabled) to verify:
1. Backward compatibility: No 'cluster' column when disabled.
2. Feature activation: 'cluster' column exists when enabled.
3. Data consistency: Enabling clustering does NOT change any other data.
"""
path_out_no_cluster = os.path.join(config_tmpdir, 'test_sophronia_no_cluster.h5')
path_out_with_cluster = os.path.join(config_tmpdir, 'test_sophronia_with_cluster.h5')

# Clustering disabled
config_no_cluster = dict(**sophronia_config)
config_no_cluster.update(dict( file_out = path_out_no_cluster
, event_range = 1
, clustering_params = None))
sophronia(**config_no_cluster)

# Clustering enabled
clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0
)
config_with_cluster = dict(**sophronia_config)
config_with_cluster.update(dict( file_out = path_out_with_cluster
, event_range = 1
, clustering_params = clustering_params))
sophronia(**config_with_cluster)

# Load both outputs
df_no_cluster = dio.load_dst(path_out_no_cluster, "RECO", "Events")
df_with_cluster = dio.load_dst(path_out_with_cluster, "RECO", "Events")

# ----- Assertions
assert not df_no_cluster.empty
assert not df_with_cluster.empty
assert 'cluster' not in df_no_cluster.columns, "'cluster' column should not exist when clustering is disabled."
assert 'cluster' in df_with_cluster.columns, "'cluster' column should exist when clustering is enabled."

# Compare all columns except 'cluster' for equality
df_with_cluster_compare = df_with_cluster.drop(columns=['cluster'])
assert_dataframes_equal(df_no_cluster, df_with_cluster_compare)
6 changes: 6 additions & 0 deletions invisible_cities/config/sophronia.conf
@@ -63,3 +63,9 @@ corrections = dict(
apply_temp = True,
norm_strat = kr,
apply_z = False)

clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0
)
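
Since clustering_params defaults to None in the city signature, the feature is opt-in. A sketch of the opt-out (assuming the config accepts None here the same way the function default does for corrections):

# Keep the previous behaviour: hits pass through the identity map untouched.
clustering_params = None
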
4 changes: 4 additions & 0 deletions invisible_cities/conftest.py
@@ -438,6 +438,10 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
filename = next100_mc_krmap,
apply_temp = False,
norm_strat = NormStrategy.kr)
, clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0)
)
return config

4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_deco.h5
Git LFS file not shown
4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_hits.h5
Git LFS file not shown
85 changes: 82 additions & 3 deletions invisible_cities/reco/hits_functions.py
@@ -1,7 +1,12 @@
import numpy as np
import pandas as pd

from .. types.ic_types import NN
from itertools import compress
from copy import deepcopy
from typing import List
from sklearn.cluster import DBSCAN

from .. types.ic_types import NN

EPSILON = np.finfo(np.float64).eps

@@ -64,8 +69,6 @@ def sipms_above_threshold(xys: np.ndarray, qs: np.ndarray, thr:float, energy: fl
return xs, ys, qs, es




def merge_NN_hits(hits: pd.DataFrame, same_peak: bool = True) -> pd.DataFrame:
"""
Finds NN hits (defined as hits with Q=NN) and removes them without energy
@@ -238,3 +241,79 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
if th <= 0: return hits
return (hits.groupby("Z", as_index=False)
.apply(apply_threshold, th=th))

def tag_hits_in_event(event_hits : pd.DataFrame
, *
, min_samples : int
, scale_xy : float
, scale_z : float
) -> pd.DataFrame:
"""
Applies DBSCAN clustering to a DataFrame containing hits from a single event.
    Hit coordinates are scaled to account for the anisotropy of the detector geometry.
    A 'cluster' column with the resulting labels is added to the DataFrame.

Parameters
----------
event_hits : pd.DataFrame
DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the XY coordinates before clustering.
scale_z : float
Scaling factor to apply to the Z coordinate before clustering.

Returns
-------
pd.DataFrame
The input DataFrame with a 'cluster' column added.
"""
coords = event_hits[['X', 'Y', 'Z']].to_numpy()
    # A proper scaling leads to neighbouring hits being separated
    # by a distance of 1 in the DBSCAN metric space
coords[:, :2] /= scale_xy
coords[:, 2] /= scale_z

    # The eps parameter is fixed to a value slightly above √3
    # so that diagonal neighbours are kept in the same cluster
labels = DBSCAN(eps=1.8, min_samples=min_samples).fit_predict(coords)
event_hits['cluster'] = labels

return event_hits

def cluster_tagger(df_hits : pd.DataFrame
, *
, min_samples : int
, scale_xy : float
, scale_z : float
) -> pd.DataFrame:
"""
    Groups the input DataFrame by 'event' and applies `tag_hits_in_event`
    to each event's group of hits.

Parameters
----------
df_hits : pd.DataFrame
DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'.
min_samples, scale_xy, scale_z :
See `tag_hits_in_event`

Returns
-------
pd.DataFrame
The input DataFrame with an added 'cluster' column indicating the
cluster label for each hit (-1 for noise).
"""
if df_hits.empty:
return df_hits.assign(cluster=pd.Series(dtype=int))

df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \
.apply( tag_hits_in_event
, min_samples = min_samples
, scale_xy = scale_xy
, scale_z = scale_z )

return df_clustered.set_index(df_hits.index)
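
A quick numerical check of the scaling and of the eps = 1.8 choice explained in the comments above (a sketch, not part of the diff; 15.55 mm and 4.0 mm are the scale values used in sophronia.conf):

import numpy as np

scale_xy, scale_z = 15.55, 4.0      # xy pitch-like and z-slice-like scales from the configuration

def scaled_distance(p, q):
    """Euclidean distance in the rescaled space that DBSCAN actually sees."""
    return np.linalg.norm((np.asarray(p) - np.asarray(q)) / np.array([scale_xy, scale_xy, scale_z]))

a = ( 0.0 ,  0.0 , 0.0)             # reference hit (x, y, z) in mm
b = (15.55, 15.55, 4.0)             # diagonal neighbour: one step in x, y and z
c = (31.10,  0.0 , 0.0)             # two steps away in x only

print(scaled_distance(a, b))        # ~1.73 = sqrt(3), below eps = 1.8 -> stays in the same neighbourhood
print(scaled_distance(a, c))        # 2.0, above eps = 1.8            -> not a direct neighbour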
