From f648c0ed8926207e06c255f07304f4d938d5a882 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Thu, 13 Nov 2025 17:18:06 +0100 Subject: [PATCH 01/23] hits_clusterizer included in components.py --- invisible_cities/cities/components.py | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py index f94405e83..93c71c323 100644 --- a/invisible_cities/cities/components.py +++ b/invisible_cities/cities/components.py @@ -24,6 +24,12 @@ import math import os + + +from sklearn.cluster import DBSCAN + + + from .. dataflow import dataflow as fl from .. dataflow.dataflow import sink from .. dataflow.dataflow import pipe @@ -1717,6 +1723,52 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame: return correct +def hits_clusterizer(df_pe_peak: pd.DataFrame, eps=2.3, npt=5)-> pd.DataFrame: + """ + Cluster hits in 3D space for each event using DBSCAN. + + The coordinates are scaled to account for detector geometry differences + in samplig + + Parameters + ---------- + df_pe_peak : pd.DataFrame + DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'. + + Returns + ------- + pd.DataFrame + Modified DataFrame with an added 'cluster' column indicating the cluster label + for each hit (-1 for noise). 
+ """ + a = 14.55 # XY scale + b = 3.7 # Z scale + + # Pre-allocate array for cluster labels + cluster_labels = np.full(len(df_pe_peak), -9999, dtype=int) + + # Get values once (faster than repeatedly accessing DataFrame columns) + coords = df_pe_peak[['X', 'Y', 'Z']].to_numpy() + events = df_pe_peak['event'].to_numpy() + + # Use np.unique to get sorted event IDs + unique_events = np.unique(events) + + for event_id in unique_events: + mask = (events == event_id) + X = coords[mask].copy() + + # Scale + X[:, :2] /= a + X[:, 2] /= b + + labels = DBSCAN(eps=eps, min_samples=npt).fit_predict(X) + cluster_labels[mask] = labels + + df_pe_peak['cluster'] = cluster_labels + + return df_pe_peak + def identity(x : Any) -> Any: return x From ed5248ed894e19800223d0dccb3805a8c0365e54 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Thu, 13 Nov 2025 17:35:08 +0100 Subject: [PATCH 02/23] Hits clusterizer step included in sophronia.py --- invisible_cities/cities/sophronia.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index 625ac3032..7580fa6f6 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -64,6 +64,7 @@ from . components import collect from . components import build_pointlike_event as pointlike_event_builder from . components import hits_corrector +from . components import hits_clusterizer from . components import identity from typing import Optional @@ -93,6 +94,11 @@ def sophronia( files_in : OneOrManyFiles , sipm_charge_type : SiPMCharge , same_peak : bool , corrections : Optional[dict] = None + + # ¿QUEREMOS ESTO? 
+ # , apply_clustering : bool = False # whether to apply clustering to hits + # , cluster_eps : float = 2.3 # eps for DBSCAN + # , cluster_min_samples: int = 5 # min_samples for DBSCAN ): """ drift_v : float @@ -177,6 +183,10 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") + + cluster_hits = df.map( hits_clusterizer(eps=2.3, min_samples=5) + , args="hits" + , out="hits") build_pointlike_event = df.map( pointlike_event_builder( detector_db , run_number @@ -202,7 +212,7 @@ def sophronia( files_in : OneOrManyFiles , args = "event_number enough_valid_hits".split()) hits_branch = ( make_hits, enough_valid_hits, df.branch(write_hits_filter) - , hits_select.filter, merge_nn_hits, correct_hits, write_hits) + , hits_select.filter, merge_nn_hits, correct_hits, cluster_hits, write_hits) kdst_branch = build_pointlike_event, write_pointlike_event collect_evt_numbers = "event_number", event_number_collector.sink From 8ff46af6dc58d590275f7dfc625e93ea3b536973 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Fri, 14 Nov 2025 13:10:40 +0100 Subject: [PATCH 03/23] Update on 14/11 --- invisible_cities/cities/sophronia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index 7580fa6f6..3fa6ae4fd 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -184,7 +184,7 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") - cluster_hits = df.map( hits_clusterizer(eps=2.3, min_samples=5) + cluster_hits = df.map( hits_clusterizer(eps=2.3, npt=5) , args="hits" , out="hits") From 7b5f1c57c64ff51d9e50ab8df9257047f4f61e7c Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 25 Nov 2025 12:22:54 +0100 Subject: [PATCH 04/23] New implementation of 
hits_clusterizer, factory version --- invisible_cities/cities/components.py | 75 +++++++++++++++------------ invisible_cities/cities/sophronia.py | 9 ++-- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py index 93c71c323..927e3cc92 100644 --- a/invisible_cities/cities/components.py +++ b/invisible_cities/cities/components.py @@ -1723,51 +1723,60 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame: return correct -def hits_clusterizer(df_pe_peak: pd.DataFrame, eps=2.3, npt=5)-> pd.DataFrame: +def hits_clusterizer( eps : float + , min_samples : float + , scale_xy : float = 14.55 + , scale_z : float = 3.7 + ) -> Callable: """ Cluster hits in 3D space for each event using DBSCAN. - - The coordinates are scaled to account for detector geometry differences - in samplig + The coordinates are scaled to account for detector geometry differences in samplig Parameters ---------- - df_pe_peak : pd.DataFrame - DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'. + eps : float, Epsilon value for DBSCAN. + min_samples : int, Min Samples value for DBSCAN. + scale_xy : float, scale factor for XY coordinates. + scale_z : float, scale factor for Z coordinate. Returns ------- - pd.DataFrame - Modified DataFrame with an added 'cluster' column indicating the cluster label - for each hit (-1 for noise). + Callable + A function that takes a DataFrame of hits and returns the same DataFrame + with an added 'cluster' column, which are the clusters labels assigned by DBSCAN + (-1 for noise). 
""" - a = 14.55 # XY scale - b = 3.7 # Z scale + def cluster_tagger(df_hits: pd.DataFrame) -> pd.DataFrame: + if df_hits.empty: + return df_hits.assign(cluster=pd.Series(dtype=int)) - # Pre-allocate array for cluster labels - cluster_labels = np.full(len(df_pe_peak), -9999, dtype=int) + # Pre-allocate array for cluster labels + cluster_labels = np.full(len(df_hits), -9999, dtype=int) - # Get values once (faster than repeatedly accessing DataFrame columns) - coords = df_pe_peak[['X', 'Y', 'Z']].to_numpy() - events = df_pe_peak['event'].to_numpy() - - # Use np.unique to get sorted event IDs - unique_events = np.unique(events) + # Get values once (faster than repeatedly accessing DataFrame columns) + coords = df_hits[['X', 'Y', 'Z']].to_numpy() + events = df_hits['event'].to_numpy() + + # Use np.unique to get sorted event IDs + unique_events = np.unique(events) + + for event_id in unique_events: + mask = (events == event_id) + X = coords[mask].copy() + + # Scale + X[:, :2] /= scale_xy + X[:, 2] /= scale_z + + # DBSCAN clustering + labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X) + cluster_labels[mask] = labels + + df_hits['cluster'] = cluster_labels + + return df_hits - for event_id in unique_events: - mask = (events == event_id) - X = coords[mask].copy() - - # Scale - X[:, :2] /= a - X[:, 2] /= b - - labels = DBSCAN(eps=eps, min_samples=npt).fit_predict(X) - cluster_labels[mask] = labels - - df_pe_peak['cluster'] = cluster_labels - - return df_pe_peak + return cluster_tagger def identity(x : Any) -> Any: diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index 3fa6ae4fd..bb425f81c 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -97,8 +97,8 @@ def sophronia( files_in : OneOrManyFiles # ¿QUEREMOS ESTO? 
# , apply_clustering : bool = False # whether to apply clustering to hits - # , cluster_eps : float = 2.3 # eps for DBSCAN - # , cluster_min_samples: int = 5 # min_samples for DBSCAN + , cluster_eps : float = 2.3 # eps for DBSCAN + , cluster_min_samples: int = 5 # min_samples for DBSCAN ): """ drift_v : float @@ -184,9 +184,8 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") - cluster_hits = df.map( hits_clusterizer(eps=2.3, npt=5) - , args="hits" - , out="hits") + cluster_hits = df.map( hits_clusterizer(eps=cluster_eps, npt=cluster_min_samples) + , item = "hits") build_pointlike_event = df.map( pointlike_event_builder( detector_db , run_number From c9c43442bf5a028748f7059c311e955de8f146fe Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 14 Jan 2026 20:37:23 +0100 Subject: [PATCH 05/23] New version of cluster_hits in Sophronia. Main function in hits_functions.py --- invisible_cities/cities/components.py | 77 ++++++++----------------- invisible_cities/cities/sophronia.py | 21 +++++-- invisible_cities/config/sophronia.conf | 7 +++ invisible_cities/reco/hits_functions.py | 72 ++++++++++++++++++++++- 4 files changed, 115 insertions(+), 62 deletions(-) diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py index 927e3cc92..f89351b1f 100644 --- a/invisible_cities/cities/components.py +++ b/invisible_cities/cities/components.py @@ -24,12 +24,6 @@ import math import os - - -from sklearn.cluster import DBSCAN - - - from .. dataflow import dataflow as fl from .. dataflow.dataflow import sink from .. dataflow.dataflow import pipe @@ -65,6 +59,7 @@ from .. reco .corrections import get_df_to_z_converter from .. reco .xy_algorithms import corona from .. reco .xy_algorithms import barycenter +from .. reco .hits_functions import cluster_tagger from .. filters.s1s2_filter import S12Selector from .. 
filters.s1s2_filter import S12SelectorOutput from .. filters.s1s2_filter import pmap_filter @@ -1723,60 +1718,36 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame: return correct -def hits_clusterizer( eps : float - , min_samples : float - , scale_xy : float = 14.55 - , scale_z : float = 3.7 - ) -> Callable: - """ - Cluster hits in 3D space for each event using DBSCAN. - The coordinates are scaled to account for detector geometry differences in samplig - +def hits_clusterizer(clustering_params: dict) -> Callable: + """" + This function receives a configuration dictionary and returns a callable + that will perform DBSCAN clustering on a DataFrame of hits. + Parameters ---------- - eps : float, Epsilon value for DBSCAN. - min_samples : int, Min Samples value for DBSCAN. - scale_xy : float, scale factor for XY coordinates. - scale_z : float, scale factor for Z coordinate. - + clustering_params : dict + A dictionary containing the configuration for the clustering algorithm. + Expected keys are: + - 'eps' : float, Epsilon value for DBSCAN. + - 'min_samples': int, Min Samples value for DBSCAN. + - 'scale_xy' : float, optional, scale factor for XY coordinates. + - 'scale_z' : float, optional, scale factor for Z coordinate. + Returns ------- Callable - A function that takes a DataFrame of hits and returns the same DataFrame - with an added 'cluster' column, which are the clusters labels assigned by DBSCAN - (-1 for noise). + A function that takes a DataFrame of hits and returns the same DataFrame + with an added 'cluster' column, which are the clusters labels assigned by DBSCAN + (-1 for noise). 
""" - def cluster_tagger(df_hits: pd.DataFrame) -> pd.DataFrame: - if df_hits.empty: - return df_hits.assign(cluster=pd.Series(dtype=int)) - - # Pre-allocate array for cluster labels - cluster_labels = np.full(len(df_hits), -9999, dtype=int) - - # Get values once (faster than repeatedly accessing DataFrame columns) - coords = df_hits[['X', 'Y', 'Z']].to_numpy() - events = df_hits['event'].to_numpy() - - # Use np.unique to get sorted event IDs - unique_events = np.unique(events) - - for event_id in unique_events: - mask = (events == event_id) - X = coords[mask].copy() - - # Scale - X[:, :2] /= scale_xy - X[:, 2] /= scale_z - - # DBSCAN clustering - labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X) - cluster_labels[mask] = labels - - df_hits['cluster'] = cluster_labels - - return df_hits + eps = clustering_params['eps'] + min_samples = clustering_params['min_samples'] + scale_xy = clustering_params['scale_xy'] + scale_z = clustering_params['scale_z'] - return cluster_tagger + return partial(cluster_tagger, + eps=eps, min_samples=min_samples, + scale_xy=scale_xy, scale_z=scale_z) def identity(x : Any) -> Any: diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index bb425f81c..6181a5059 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -94,11 +94,7 @@ def sophronia( files_in : OneOrManyFiles , sipm_charge_type : SiPMCharge , same_peak : bool , corrections : Optional[dict] = None - - # ¿QUEREMOS ESTO? 
- # , apply_clustering : bool = False # whether to apply clustering to hits - , cluster_eps : float = 2.3 # eps for DBSCAN - , cluster_min_samples: int = 5 # min_samples for DBSCAN + , clustering_params : Optional[dict] = None ): """ drift_v : float @@ -143,6 +139,19 @@ def sophronia( files_in : OneOrManyFiles Normalization strategy norm_value : float, optional Normalization value in case of `norm_strat = NormStrategy.custom` + + clustering_params : dict + eps : float + The maximum distance between two samples for one to be + considered as in the neighborhood of the other. + min_samples : int + The number of samples (or total weight) in a neighborhood + for a point to be considered as a core point. This includes the point + itself. + scale_xy : float + Scaling factor to apply to the (x, y) coordinates before clustering. + scale_z : float + Scaling factor to apply to the z coordinate before clustering. """ global_reco = compute_xy_position( detector_db , run_number @@ -184,7 +193,7 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") - cluster_hits = df.map( hits_clusterizer(eps=cluster_eps, npt=cluster_min_samples) + cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity , item = "hits") build_pointlike_event = df.map( pointlike_event_builder( detector_db diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf index 22613d646..83a3697bc 100644 --- a/invisible_cities/config/sophronia.conf +++ b/invisible_cities/config/sophronia.conf @@ -63,3 +63,10 @@ corrections = dict( apply_temp = True, norm_strat = kr, apply_z = False) + +clustering_params = dict( + eps = 2.3, + min_samples = 5, + scale_xy = 14.55, + scale_z = 3.7 +) diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py index 9bcdea2ef..4bb54c1d3 100644 --- 
a/invisible_cities/reco/hits_functions.py +++ b/invisible_cities/reco/hits_functions.py @@ -1,6 +1,12 @@ import numpy as np import pandas as pd +from itertools import compress +from copy import deepcopy +from typing import List +from sklearn.cluster import DBSCAN + +from .. evm import event_model as evm from .. types.ic_types import NN EPSILON = np.finfo(np.float64).eps @@ -64,8 +70,6 @@ def sipms_above_threshold(xys: np.ndarray, qs: np.ndarray, thr:float, energy: fl return xs, ys, qs, es - - def merge_NN_hits(hits: pd.DataFrame, same_peak: bool = True) -> pd.DataFrame: """ Finds NN hits (defined as hits with Q=NN) and removes them without energy @@ -237,4 +241,66 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame: """ if th <= 0: return hits return (hits.groupby("Z", as_index=False) - .apply(apply_threshold, th=th)) + .apply(apply_threshold, th=th, on_corrected=on_corrected)) + + +def cluster_tagger(df_hits: pd.DataFrame, *, + eps:float, min_samples:int, + scale_xy:float, scale_z:float) -> pd.DataFrame: + """ + Applies DBSCAN clustering to hits on an event-by-event basis. + + This function processes a DataFrame of hits, groups them by event, + scales their coordinates, and applies DBSCAN to identify spatial clusters. + A 'cluster' column is added to the DataFrame with the resulting labels. + + Parameters + ---------- + df_hits : pd.DataFrame + DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'. + eps : float + The maximum distance between two samples for one to be considered as in the + neighborhood of the other. This is the most important DBSCAN parameter. + min_samples : int + The number of samples (or total weight) in a neighborhood for a point + to be considered as a core point. + scale_xy : float + Scale factor to apply to X and Y coordinates before clustering to account + for different detector resolutions. + scale_z : float + Scale factor to apply to the Z coordinate. 
+ + Returns + ------- + pd.DataFrame + The input DataFrame with an added 'cluster' column indicating the + cluster label for each hit (-1 for noise). + """ + if df_hits.empty: + return df_hits.assign(cluster=pd.Series(dtype=int)) + + # Pre-allocate array for cluster labels + cluster_labels = np.full(len(df_hits), -9999, dtype=int) + + # Get values once (faster than repeatedly accessing DataFrame columns) + coords = df_hits[['X', 'Y', 'Z']].to_numpy() + events = df_hits['event'].to_numpy() + + # Use np.unique to get sorted event IDs + unique_events = np.unique(events) + for event_id in unique_events: + + mask = (events == event_id) + X = coords[mask].copy() + + # Scale + X[:, :2] /= scale_xy + X[:, 2] /= scale_z + + # DBSCAN clustering + labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X) + cluster_labels[mask] = labels + + df_hits['cluster'] = cluster_labels + + return df_hits \ No newline at end of file From 168a3a01f83f502503972b9c5b828067f8a4a045 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 14 Jan 2026 20:58:28 +0100 Subject: [PATCH 06/23] Update on how to call cluster_hits in Sophronia --- invisible_cities/cities/sophronia.py | 2 +- invisible_cities/config/sophronia.conf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index 6181a5059..6ddb0c2a7 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -193,7 +193,7 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") - cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity + cluster_hits = df.map( hits_clusterizer(clustering_params) if clustering_params is not None else identity , item = "hits") build_pointlike_event = df.map( pointlike_event_builder( detector_db diff --git 
a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf index 83a3697bc..f52deb856 100644 --- a/invisible_cities/config/sophronia.conf +++ b/invisible_cities/config/sophronia.conf @@ -65,7 +65,7 @@ corrections = dict( apply_z = False) clustering_params = dict( - eps = 2.3, + eps = 3, min_samples = 5, scale_xy = 14.55, scale_z = 3.7 From 365063887a97d5d28beb255d5342e4aee2b62d6f Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Mon, 23 Feb 2026 12:59:11 +0100 Subject: [PATCH 07/23] Pytests for cluster_tagger function --- invisible_cities/cities/sophronia_test.py | 43 ++--- invisible_cities/conftest.py | 5 + invisible_cities/reco/hits_functions_test.py | 175 ++++++++++++++++++- 3 files changed, 200 insertions(+), 23 deletions(-) diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py index 96d7646e7..3b433ac5c 100644 --- a/invisible_cities/cities/sophronia_test.py +++ b/invisible_cities/cities/sophronia_test.py @@ -64,27 +64,28 @@ def test_sophronia_contains_all_tables(sophronia_config, config_tmpdir): @ignore_warning.no_config_group @mark.slow -def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir): - path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5') - config = dict(**sophronia_config) - config.update(dict(file_out = path_out)) - - sophronia(**config) - - tables = ( "MC/hits", "MC/particles" - , "DST/Events" - , "RECO/Events" - , "Run/events", "Run/runInfo" - , "Filters/s12_selector", "Filters/valid_hit" - ) - - with tb.open_file(Th228_hits) as true_output_file: - with tb.open_file(path_out) as output_file: - for table in tables: - assert hasattr(output_file.root, table), table - got = getattr( output_file.root, table) - expected = getattr(true_output_file.root, table) - assert_tables_equality(got, expected) +# RE-ACTIVAR +# def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir): +# path_out = os.path.join(config_tmpdir, 
'test_sophronia_exact_result.h5') +# config = dict(**sophronia_config) +# config.update(dict(file_out = path_out)) + +# sophronia(**config) + +# tables = ( "MC/hits", "MC/particles" +# , "DST/Events" +# , "RECO/Events" +# , "Run/events", "Run/runInfo" +# , "Filters/s12_selector", "Filters/valid_hit" +# ) + +# with tb.open_file(Th228_hits) as true_output_file: +# with tb.open_file(path_out) as output_file: +# for table in tables: +# assert hasattr(output_file.root, table), table +# got = getattr( output_file.root, table) +# expected = getattr(true_output_file.root, table) +# assert_tables_equality(got, expected) @ignore_warning.no_config_group diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py index 2476ae84f..7dc54dfcc 100644 --- a/invisible_cities/conftest.py +++ b/invisible_cities/conftest.py @@ -438,6 +438,11 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap): filename = next100_mc_krmap, apply_temp = False, norm_strat = NormStrategy.kr) + , clustering_params = dict( + eps = 3, + min_samples = 5, + scale_xy = 14.55, + scale_z = 3.7) ) return config diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index c92b7abaa..ade6c9a65 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -1,6 +1,8 @@ import numpy as np import pandas as pd +from pytest import mark +from pytest import fixture from numpy.testing import assert_almost_equal from .. core.testing_utils import assert_dataframes_close @@ -9,6 +11,7 @@ from . hits_functions import merge_NN_hits from . hits_functions import threshold_hits from . hits_functions import sipms_above_threshold +from . 
hits_functions import cluster_tagger from hypothesis import given from hypothesis.strategies import lists from hypothesis.strategies import floats @@ -16,7 +19,7 @@ from copy import deepcopy from hypothesis import assume from hypothesis.strategies import composite - +from hypothesis.extra.pandas import data_frames, column, range_indexes event_numbers = integers(0, np.iinfo(np.int32).max) @@ -157,4 +160,172 @@ def test_threshold_hits_energy_conserved(hits, th): def test_threshold_hits_all_larger_than_th(hits, th): hits_thresh = threshold_hits(hits, th) non_nn = hits_thresh.loc[hits_thresh.Q != NN] - assert np.all(non_nn.Q >= th) + q = non_nn[col] + assert np.all(q >= th) + +# ----- CLUSTER TAGGER TESTS ----- # + +gen_cluster_df = data_frames( index=range_indexes(min_size=1, max_size=50), + columns=[ + column('event', dtype=int, elements=integers(min_value=0, max_value=10)), + column('X', dtype=float, elements=floats(min_value=-500, max_value=500)), + column('Y', dtype=float, elements=floats(min_value=-500, max_value=500)), + column('Z', dtype=float, elements=floats(min_value=0, max_value=1200)), + column('E', dtype=float, elements=floats(min_value=0.1, max_value=100)), + ]) + +@given(df=gen_cluster_df) +def test_dummy(df): + """ + Hypothesis calls this function multiple times. + 'df' will be a different pandas DataFrame in every call. + """ + # Just for demonstration purposes, we print the shape of the generated DFs + print(f"Generated dataframe shape: {df.shape}") + + # Check some stuff here + assert 'X' in df.columns + assert df['event'].dtype == int + assert not df.empty + +@settings(deadline=None) +@given(df=gen_cluster_df) +def test_cluster_tagger_structure_preservation(df): + """ + Verifies that cluster_tagger: + - Returns a DataFrame with the exact same length as the input. + - Adds exactly one column named 'cluster'. + - Does not modify any of the original columns (X, Y, Z, E, etc.). + - Preserves the original Index and order of rows. 
+ - The 'cluster' column contains valid integers (no NaNs). + """ + # Shuffle the input DataFrame to ensure cluster_tagger does not rely on any specific order + df_input = df.sample(frac=1.0).copy() + df_original = df_input.copy() # Keep a copy of the original for later comparison + + # Run the cluster tagger + params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values + df_result = cluster_tagger(df_input, **params) + + # --- Assertations + assert len(df_result) == len(df_original), "Output DataFrame has different length than input." + assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column." + expected_cols = set(df_original.columns) | {'cluster'} + assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns." + pd.testing.assert_frame_equal( + df_result.drop(columns=['cluster']), + df_original, + check_dtype=True, + obj="Dataframe structure check" + ) + assert pd.api.types.is_integer_dtype(df_result['cluster']), "'cluster' column is not of integer type." + assert not df_result['cluster'].isna().any(), "'cluster' column contains NaN values." + +def test_cluster_tagger_row_alignment(): + """ + Verifies that the calculated cluster label is assigned to the correct + spatial hit, even if the input DataFrame is shuffled. + + Scenario: + - Event 0: + - Cluster A: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 + - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1 + - We check that hits near 0 get Label 0 and hits near 100 get Label 1 (NO noise here). 
+ """ + # Setup data + data = { + 'event': [0, 0, 0, 0], + 'X': [0., 1., 100., 101.], + 'Y': [0., 1., 100., 101.], + 'Z': [0., 0., 0., 0.], + 'E': [10, 10, 10, 10 ] + } + df = pd.DataFrame(data) + df['expected_label'] = [0, 0, 1, 1] + + # Shuffle the input DataFrame + df_input = df.sample(frac=1.0).copy() + + # Run the cluster tagger + params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters + df_result = cluster_tagger(df_input, **params) + + # --- Assertations + hits_group_0 = df_result[df_result['expected_label'] == 0] + hits_group_1 = df_result[df_result['expected_label'] == 1] + assert hits_group_0['cluster'].nunique() == 1, "Hits near (0,0,0) were assigned multiple cluster labels." + assert hits_group_1['cluster'].nunique() == 1, "Hits near (100,100,0) were assigned multiple cluster labels." + label_0 = hits_group_0['cluster'].iloc[0] + label_1 = hits_group_1['cluster'].iloc[0] + assert label_0 != label_1, "Both clusters were assigned the same label." + assert label_0 != -1 and label_1 != -1, "One of the clusters was labeled as noise (-1)." + +def test_cluster_tagger_noise_rejection(): + """ + Verifies that isolated hits (outliers) are correctly identified as noise (-1). + + Scenario: + - 3 points very close together (0,0), (1,0), (0,1). They should form a cluster. + - 1 point very far away (100, 100). It has 0 neighbors. Should be noise. 
+ """ + # Setup data + data = { + 'event': [0, 0, 0, 0], + 'X': [0., 1., 0., 100.], + 'Y': [0., 0., 1., 100.], + 'Z': [0., 0., 0., 0.], + 'E': [10, 10, 10, 10 ] + } + df = pd.DataFrame(data) + + # Shuffle the input DataFrame + df_input = df.sample(frac=1.0).copy() + + # Run the cluster tagger + params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0) # Enough to consider one cluster + df_result = cluster_tagger(df_input, **params) + + # --- Assertations + cluster_labels = df_result['cluster'].unique() + assert cluster_labels.size == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)." + cluster_hits = df_result[df_result['cluster'] != -1] + assert cluster_hits.shape[0] == 3, "Expected exactly 3 hits to be clustered together." + noise_hit = df_result[df_result['cluster'] == -1] + assert noise_hit.shape[0] == 1, "Expected exactly 1 noise hit." + assert noise_hit['X'].iloc[0] == 100 and noise_hit['Y'].iloc[0] == 100, "The noise hit identified is NOT the distant one." + +def test_cluster_tagger_event_distinction(): + """ + Verifies that hits from different events are not clustered together. + + Scenario: + - Event 0: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 + - Event 1: 2 hits at (100,100,0) and (101,101,0) and 1 hit at (0.5,0.5,0) -> Should be marked as noise (-1) + - We check that noise hit from Event 1 get a different cluster label than hits from Event 0, even if they are spatially close. 
+ """ + # Setup data + data = { + 'event': [0, 0, 1, 1, 1], + 'X': [0., 1., 100., 101., 0.5], + 'Y': [0., 1., 100., 101., 0.5], + 'Z': [0., 0., 0., 0., 0.], + 'E': [10, 10, 10, 10, 10] + } + df = pd.DataFrame(data) + + # Shuffle the input DataFrame + df_input = df.sample(frac=1.0).copy() + + # Run the cluster tagger + params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters + df_result = cluster_tagger(df_input, **params) + + # --- Assertations + event_0_clusters = df_result[df_result['event'] == 0]['cluster'].unique() + event_1_clusters = df_result[df_result['event'] == 1]['cluster'].unique() + assert len(event_0_clusters) == 1, "For event 0: expected exactly 1 unique cluster label (one cluster)." + assert len(event_1_clusters) == 2, "For event 1: expected exactly 2 unique cluster labels (one cluster + one noise)." + event_0_hits = df_result[df_result['event'] == 0] + noise_1_hit = df_result[(df_result['X'] == 0.5)] + assert noise_1_hit['cluster'].iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)." + assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label." From 0e64dad61399a7b1213951a69f5f3c05d2c58ab5 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Mon, 23 Feb 2026 15:38:23 +0100 Subject: [PATCH 08/23] Pytest for hits clusterizer feature. Also, reference file for exact result test is updated to match the new implementation. --- invisible_cities/cities/sophronia_test.py | 91 +++++++++++++++++------ 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py index 3b433ac5c..922f25492 100644 --- a/invisible_cities/cities/sophronia_test.py +++ b/invisible_cities/cities/sophronia_test.py @@ -5,6 +5,8 @@ from pytest import mark +from .. io import dst_io as dio +from .. 
core.testing_utils import assert_dataframes_equal from .. core.testing_utils import assert_tables_equality from .. core.testing_utils import ignore_warning from .. core.system_of_units import pes @@ -64,28 +66,27 @@ def test_sophronia_contains_all_tables(sophronia_config, config_tmpdir): @ignore_warning.no_config_group @mark.slow -# RE-ACTIVAR -# def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir): -# path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5') -# config = dict(**sophronia_config) -# config.update(dict(file_out = path_out)) - -# sophronia(**config) - -# tables = ( "MC/hits", "MC/particles" -# , "DST/Events" -# , "RECO/Events" -# , "Run/events", "Run/runInfo" -# , "Filters/s12_selector", "Filters/valid_hit" -# ) - -# with tb.open_file(Th228_hits) as true_output_file: -# with tb.open_file(path_out) as output_file: -# for table in tables: -# assert hasattr(output_file.root, table), table -# got = getattr( output_file.root, table) -# expected = getattr(true_output_file.root, table) -# assert_tables_equality(got, expected) +def test_sophronia_exact_result(sophronia_config, Th228_hits, config_tmpdir): + path_out = os.path.join(config_tmpdir, 'test_sophronia_exact_result.h5') + config = dict(**sophronia_config) + config.update(dict(file_out = path_out)) + + sophronia(**config) + + tables = ( "MC/hits", "MC/particles" + , "DST/Events" + , "RECO/Events" + , "Run/events", "Run/runInfo" + , "Filters/s12_selector", "Filters/valid_hit" + ) + + with tb.open_file(Th228_hits) as true_output_file: + with tb.open_file(path_out) as output_file: + for table in tables: + assert hasattr(output_file.root, table), table + got = getattr( output_file.root, table) + expected = getattr(true_output_file.root, table) + assert_tables_equality(got, expected) @ignore_warning.no_config_group @@ -148,3 +149,49 @@ def test_sophronia_keeps_hitless_events(config_tmpdir, sophronia_config): with tb.open_file(path_out) as output_file: assert 
len(output_file.root.Run.events) == 1 assert "RECO" not in output_file.root + + +@ignore_warning.no_config_group +def test_sophronia_clustering_integration(config_tmpdir, sophronia_config): + """ + Runs Sophronia twice (once disabled, once enabled) to verify: + 1. Backward compatibility: No 'cluster' column when disabled. + 2. Feature activation: 'cluster' column exists when enabled. + 3. Data consistency: Enabling clustering does NOT change any other data. + """ + path_out_no_cluster = os.path.join(config_tmpdir, 'test_sophronia_no_cluster.h5') + path_out_with_cluster = os.path.join(config_tmpdir, 'test_sophronia_with_cluster.h5') + + # Clustering disabled + config_no_cluster = dict(**sophronia_config) + config_no_cluster.update(dict( file_out = path_out_no_cluster + , event_range = 1 + , clustering_params = None)) + sophronia(**config_no_cluster) + + # Clustering enabled + clustering_params = dict( + eps = 3, + min_samples = 5, + scale_xy = 14.55, + scale_z = 3.7 + ) + config_with_cluster = dict(**sophronia_config) + config_with_cluster.update(dict( file_out = path_out_with_cluster + , event_range = 1 + , clustering_params = clustering_params)) + sophronia(**config_with_cluster) + + # Load both outputs + df_no_cluster = dio.load_dst(path_out_no_cluster, "RECO", "Events") + df_with_cluster = dio.load_dst(path_out_with_cluster, "RECO", "Events") + + # ----- Assertions + assert not df_no_cluster.empty + assert not df_with_cluster.empty + assert 'cluster' not in df_no_cluster.columns, "'cluster' column should not exist when clustering is disabled." + assert 'cluster' in df_with_cluster.columns, "'cluster' column should exist when clustering is enabled." 
+ + # Compare all columns except 'cluster' for equality + df_with_cluster_compare = df_with_cluster.drop(columns=['cluster']) + assert_dataframes_equal(df_no_cluster, df_with_cluster_compare) \ No newline at end of file From f22bb7565c40b733a5f8a406f5bbe8d734933dc9 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 3 Mar 2026 15:29:57 +0100 Subject: [PATCH 09/23] New reference file including cluster label for hits --- invisible_cities/reco/hits_functions.py | 2 +- invisible_cities/reco/hits_functions_test.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py index 4bb54c1d3..d04204d3b 100644 --- a/invisible_cities/reco/hits_functions.py +++ b/invisible_cities/reco/hits_functions.py @@ -241,7 +241,7 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame: """ if th <= 0: return hits return (hits.groupby("Z", as_index=False) - .apply(apply_threshold, th=th, on_corrected=on_corrected)) + .apply(apply_threshold, th=th)) def cluster_tagger(df_hits: pd.DataFrame, *, diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index ade6c9a65..d3cc20c54 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -160,8 +160,7 @@ def test_threshold_hits_energy_conserved(hits, th): def test_threshold_hits_all_larger_than_th(hits, th): hits_thresh = threshold_hits(hits, th) non_nn = hits_thresh.loc[hits_thresh.Q != NN] - q = non_nn[col] - assert np.all(q >= th) + assert np.all(non_nn.Q >= th) # ----- CLUSTER TAGGER TESTS ----- # From e062786353105663a5501362dad3930000a92d9c Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 3 Mar 2026 16:00:07 +0100 Subject: [PATCH 10/23] New hits reference file, git problem solved --- invisible_cities/database/test_data/228Th_10evt_hits.h5 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/invisible_cities/database/test_data/228Th_10evt_hits.h5 b/invisible_cities/database/test_data/228Th_10evt_hits.h5 index de4ac5337..3aa32af53 100644 --- a/invisible_cities/database/test_data/228Th_10evt_hits.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_hits.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4ea23ce094795bfd779121b872ce7e7d1da60eb1e1992a8868e3fc7de73c4b8 -size 274703 +oid sha256:ee809a8d3f69241048b4c37c156b452e28bcce05d45b3504c15163599e6c8dbb +size 274902 From 4e43dc846a1b90c38f605f02f0f8dc72188fd059 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 4 Mar 2026 11:51:08 +0100 Subject: [PATCH 11/23] scikit-learn added --- manage.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/manage.sh b/manage.sh index 04d8b7a83..f6c04e1fa 100644 --- a/manage.sh +++ b/manage.sh @@ -73,7 +73,7 @@ function install_conda { fi } -CONDA_ENV_TAG=2026-03-05 +CONDA_ENV_TAG=2025-03-04 CONDA_ENV_NAME=IC-${PYTHON_VERSION}-${CONDA_ENV_TAG} function make_environment { @@ -109,6 +109,7 @@ dependencies: - scipy = 1.9.3 - seaborn = 0.11.2 - setuptools = 58.0.4 +- scikit-learn = 1.1.3 - sphinx = 4.2.0 - tornado = 6.1 - pip: From c24a3313cd5269420051c73b8dc342e707bcf6c4 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 4 Mar 2026 11:53:11 +0100 Subject: [PATCH 12/23] Update conda environment tag --- manage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manage.sh b/manage.sh index f6c04e1fa..421370d10 100644 --- a/manage.sh +++ b/manage.sh @@ -73,7 +73,7 @@ function install_conda { fi } -CONDA_ENV_TAG=2025-03-04 +CONDA_ENV_TAG=2026-03-04 CONDA_ENV_NAME=IC-${PYTHON_VERSION}-${CONDA_ENV_TAG} function make_environment { From 5c41eab8f5b1256d21c0a1acdab09cff11696a0e Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 4 Mar 2026 14:48:29 +0100 Subject: [PATCH 13/23] Update beersheba reference files --- invisible_cities/database/test_data/228Th_10evt_deco.h5 | 4 ++-- 
.../database/test_data/228Th_10evt_deco_satellite.h5 | 4 ++-- .../database/test_data/228Th_10evt_deco_separate.h5 | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/invisible_cities/database/test_data/228Th_10evt_deco.h5 b/invisible_cities/database/test_data/228Th_10evt_deco.h5 index e4f1f6006..47ff579ad 100644 --- a/invisible_cities/database/test_data/228Th_10evt_deco.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_deco.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b40406d8530d0f7a6f6dfcd7b1d10bea83374ee49374f612183771f9ca5a9c5 -size 818438 +oid sha256:5ed63a53e9a78cef39a4ff4d14ae1cfbdfc0f121d27fd9ee215c18bcec91ff2d +size 819282 diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 index be2713440..273064f6e 100644 --- a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1aaca525390738b4ee0f432d81f235ca2f37c38801f48f76210c92b91e0777b -size 302572 +oid sha256:434f0bf46e4e326731dbb87e14d83d6649ea01eb0079654a42c9c3e3c41c7df1 +size 304668 diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 index 490271120..0f6cddf7d 100644 --- a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ae5b40acc034706762ad2de2209430b136001b1783c921e98c52731c6c2d101 -size 818469 +oid sha256:69175d759c8484973f9006911f08c8019cc84bc472481b7c8d3fafded16a3cc8 +size 819313 From 58bd2a3452306d46c4b02d17d284878e9169f4bb Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 22 Apr 2026 15:06:02 +0200 Subject: [PATCH 14/23] PR: first round of 
comments addressed --- invisible_cities/cities/components.py | 34 ++--- invisible_cities/cities/sophronia.py | 9 +- invisible_cities/config/sophronia.conf | 4 +- invisible_cities/conftest.py | 4 +- invisible_cities/reco/hits_functions.py | 136 +++++++++++-------- invisible_cities/reco/hits_functions_test.py | 115 ++++++++-------- 6 files changed, 161 insertions(+), 141 deletions(-) diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py index f89351b1f..07fc816a4 100644 --- a/invisible_cities/cities/components.py +++ b/invisible_cities/cities/components.py @@ -1718,33 +1718,33 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame: return correct -def hits_clusterizer(clustering_params: dict) -> Callable: - """" - This function receives a configuration dictionary and returns a callable - that will perform DBSCAN clustering on a DataFrame of hits. +@check_annotations +def hits_clusterizer( eps : float + , min_samples : int + , scale_xy : float + , scale_z : float + ) -> Callable: + """ + Creates a callable for performing DBSCAN clustering on a dataFrame of hits. Parameters ---------- - clustering_params : dict - A dictionary containing the configuration for the clustering algorithm. - Expected keys are: - - 'eps' : float, Epsilon value for DBSCAN. - - 'min_samples': int, Min Samples value for DBSCAN. - - 'scale_xy' : float, optional, scale factor for XY coordinates. - - 'scale_z' : float, optional, scale factor for Z coordinate. + eps : float + Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors. + min_samples : int + Minimum number of samples required to form a dense region (cluster). This includes the point itself. + scale_xy : float + Scaling factor to apply to the (x, y) coordinates before clustering. + scale_z : float + Scaling factor to apply to the z coordinate before clustering. 
Returns ------- Callable A function that takes a DataFrame of hits and returns the same DataFrame - with an added 'cluster' column, which are the clusters labels assigned by DBSCAN + with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN (-1 for noise). """ - eps = clustering_params['eps'] - min_samples = clustering_params['min_samples'] - scale_xy = clustering_params['scale_xy'] - scale_z = clustering_params['scale_z'] - return partial(cluster_tagger, eps=eps, min_samples=min_samples, scale_xy=scale_xy, scale_z=scale_z) diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index 6ddb0c2a7..97947cda9 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -142,12 +142,9 @@ def sophronia( files_in : OneOrManyFiles clustering_params : dict eps : float - The maximum distance between two samples for one to be - considered as in the neighborhood of the other. + Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors. min_samples : int - The number of samples (or total weight) in a neighborhood - for a point to be considered as a core point. This includes the point - itself. + Minimum number of samples required to form a dense region (cluster). This includes the point itself. scale_xy : float Scaling factor to apply to the (x, y) coordinates before clustering. 
scale_z : float @@ -193,7 +190,7 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") - cluster_hits = df.map( hits_clusterizer(clustering_params) if clustering_params is not None else identity + cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity , item = "hits") build_pointlike_event = df.map( pointlike_event_builder( detector_db diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf index f52deb856..56ca2eeb7 100644 --- a/invisible_cities/config/sophronia.conf +++ b/invisible_cities/config/sophronia.conf @@ -67,6 +67,6 @@ corrections = dict( clustering_params = dict( eps = 3, min_samples = 5, - scale_xy = 14.55, - scale_z = 3.7 + scale_xy = 15.55, + scale_z = 4.0 ) diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py index 7dc54dfcc..1b2af207d 100644 --- a/invisible_cities/conftest.py +++ b/invisible_cities/conftest.py @@ -441,8 +441,8 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap): , clustering_params = dict( eps = 3, min_samples = 5, - scale_xy = 14.55, - scale_z = 3.7) + scale_xy = 15.55, + scale_z = 4.0) ) return config diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py index d04204d3b..1b9c58f24 100644 --- a/invisible_cities/reco/hits_functions.py +++ b/invisible_cities/reco/hits_functions.py @@ -243,64 +243,80 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame: return (hits.groupby("Z", as_index=False) .apply(apply_threshold, th=th)) +def tag_hits_in_event(event_hits: pd.DataFrame, *, + eps: float, min_samples: int, + scale_xy: float, scale_z: float) -> pd.DataFrame: + """ + Applies DBSCAN clustering to a DataFrame containing hits from a single event. 
+ + The coordinates are scaled to account for detector geometry differences + in sampling and applies DBSCAN to identify spatial clusters. + A 'cluster' column is added to the group with the resulting labels. + + Parameters + ---------- + event_hits : pd.DataFrame + DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns. + eps, min_samples, scale_xy, scale_z : + Configuration parameters for scaling and DBSCAN. See `cluster_tagger` for details. + + Returns + ------- + pd.DataFrame + The input DataFrame with a 'cluster' column added. + """ + # If the event has no hits, there's nothing to do + if event_hits.empty: + return event_hits.assign(cluster=pd.Series(dtype=int)) + + # Extract coordinates and apply scaling + coords = event_hits[['X', 'Y', 'Z']].to_numpy() + coords[:, :2] /= scale_xy + coords[:, 2] /= scale_z + + # DBSCAN clustering + labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords) + # Add the cluster labels as a new column to the event's DataFrame. + event_hits['cluster'] = labels -def cluster_tagger(df_hits: pd.DataFrame, *, - eps:float, min_samples:int, - scale_xy:float, scale_z:float) -> pd.DataFrame: - """ - Applies DBSCAN clustering to hits on an event-by-event basis. - - This function processes a DataFrame of hits, groups them by event, - scales their coordinates, and applies DBSCAN to identify spatial clusters. - A 'cluster' column is added to the DataFrame with the resulting labels. - - Parameters - ---------- - df_hits : pd.DataFrame - DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'. - eps : float - The maximum distance between two samples for one to be considered as in the - neighborhood of the other. This is the most important DBSCAN parameter. - min_samples : int - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. 
- scale_xy : float - Scale factor to apply to X and Y coordinates before clustering to account - for different detector resolutions. - scale_z : float - Scale factor to apply to the Z coordinate. - - Returns - ------- - pd.DataFrame - The input DataFrame with an added 'cluster' column indicating the - cluster label for each hit (-1 for noise). - """ - if df_hits.empty: - return df_hits.assign(cluster=pd.Series(dtype=int)) - - # Pre-allocate array for cluster labels - cluster_labels = np.full(len(df_hits), -9999, dtype=int) - - # Get values once (faster than repeatedly accessing DataFrame columns) - coords = df_hits[['X', 'Y', 'Z']].to_numpy() - events = df_hits['event'].to_numpy() - - # Use np.unique to get sorted event IDs - unique_events = np.unique(events) - for event_id in unique_events: - - mask = (events == event_id) - X = coords[mask].copy() - - # Scale - X[:, :2] /= scale_xy - X[:, 2] /= scale_z - - # DBSCAN clustering - labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X) - cluster_labels[mask] = labels - - df_hits['cluster'] = cluster_labels - - return df_hits \ No newline at end of file + return event_hits + +def cluster_tagger(df_hits: pd.DataFrame, *, + eps: float, min_samples: int, + scale_xy: float, scale_z: float) -> pd.DataFrame: + """ + Applies DBSCAN clustering to hits on an event-by-event basis using groupby.apply. + + This function groups the input DataFrame by 'event' and applies the + `tag_hits_in_event` function to each event's group of hits. + + Parameters + ---------- + df_hits : pd.DataFrame + DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'. + eps : float + Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors. + min_samples : int + Minimum number of samples required to form a dense region (cluster). This includes the point itself. + scale_xy : float + Scaling factor to apply to the (x, y) coordinates before clustering. 
+ scale_z : float + Scaling factor to apply to the z coordinate before clustering. + + Returns + ------- + pd.DataFrame + The input DataFrame with an added 'cluster' column indicating the + cluster label for each hit (-1 for noise). + """ + if df_hits.empty: + return df_hits.assign(cluster=pd.Series(dtype=int)) + + clustered_df = df_hits.groupby('event', as_index=False, group_keys=False) \ + .apply(tag_hits_in_event, + eps=eps, + min_samples=min_samples, + scale_xy=scale_xy, + scale_z=scale_z) + + return clustered_df.set_index(df_hits.index) \ No newline at end of file diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index d3cc20c54..1eab9ed93 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -173,63 +173,77 @@ def test_threshold_hits_all_larger_than_th(hits, th): column('E', dtype=float, elements=floats(min_value=0.1, max_value=100)), ]) +@settings(deadline=None) @given(df=gen_cluster_df) -def test_dummy(df): +def test_cluster_tagger_output_shape(df): """ - Hypothesis calls this function multiple times. - 'df' will be a different pandas DataFrame in every call. + Verifies that the output DataFrame of cluster_tagger: + - Has the same number of rows as the input. + - Contains exactly one new column named 'cluster'. """ - # Just for demonstration purposes, we print the shape of the generated DFs - print(f"Generated dataframe shape: {df.shape}") - - # Check some stuff here - assert 'X' in df.columns - assert df['event'].dtype == int - assert not df.empty + if df.empty: + return + + # Run the cluster tagger + params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values + df_result = cluster_tagger(df.copy(), **params) + + # --- Assertations + assert len(df_result) == len(df), "Output DataFrame has different length than input." + assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column." 
+ expected_cols = set(df.columns) | {'cluster'} + assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns." @settings(deadline=None) @given(df=gen_cluster_df) -def test_cluster_tagger_structure_preservation(df): +def test_cluster_tagger_original(df): """ Verifies that cluster_tagger: - - Returns a DataFrame with the exact same length as the input. - - Adds exactly one column named 'cluster'. - - Does not modify any of the original columns (X, Y, Z, E, etc.). - - Preserves the original Index and order of rows. - - The 'cluster' column contains valid integers (no NaNs). + - Does not modify any of the original columns. + - Preserves the original index and row order. """ - # Shuffle the input DataFrame to ensure cluster_tagger does not rely on any specific order - df_input = df.sample(frac=1.0).copy() - df_original = df_input.copy() # Keep a copy of the original for later comparison + if df.empty: + return # Run the cluster tagger - params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values - df_result = cluster_tagger(df_input, **params) + params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values + df_result = cluster_tagger(df.copy(), **params) # --- Assertations - assert len(df_result) == len(df_original), "Output DataFrame has different length than input." - assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column." - expected_cols = set(df_original.columns) | {'cluster'} - assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns." pd.testing.assert_frame_equal( df_result.drop(columns=['cluster']), - df_original, + df, check_dtype=True, obj="Dataframe structure check" ) + +@settings(deadline=None) +@given(df=gen_cluster_df) +def test_cluster_tagger_new_column_validity(df): + """ + Verifies that the new 'cluster' column: + - The 'cluster' column contains valid integers (no NaNs). 
+ """ + if df.empty: + return + + # Run the cluster tagger + params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values + df_result = cluster_tagger(df.copy(), **params) + + # --- Assertations assert pd.api.types.is_integer_dtype(df_result['cluster']), "'cluster' column is not of integer type." assert not df_result['cluster'].isna().any(), "'cluster' column contains NaN values." def test_cluster_tagger_row_alignment(): """ - Verifies that the calculated cluster label is assigned to the correct - spatial hit, even if the input DataFrame is shuffled. - + Verifies that the correct cluster label is assigned to the correct + row (hit), even if the input DataFrame is shuffled. + Scenario: - Event 0: - Cluster A: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1 - - We check that hits near 0 get Label 0 and hits near 100 get Label 1 (NO noise here). """ # Setup data data = { @@ -240,24 +254,23 @@ def test_cluster_tagger_row_alignment(): 'E': [10, 10, 10, 10 ] } df = pd.DataFrame(data) - df['expected_label'] = [0, 0, 1, 1] - - # Shuffle the input DataFrame - df_input = df.sample(frac=1.0).copy() + # Shuffle rows in a specific order + df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy() + # Add expected labels for assertations (not shuffled, just for reference) + df['cluster'] = [0, 0, 1, 1] # Run the cluster tagger - params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters - df_result = cluster_tagger(df_input, **params) + params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters + df_result = cluster_tagger(df_shuffled, **params) + df_result = df_result.sort_index() # Sort back to original order for easier assertations # --- Assertations - hits_group_0 = df_result[df_result['expected_label'] == 0] - hits_group_1 = df_result[df_result['expected_label'] == 1] - assert 
hits_group_0['cluster'].nunique() == 1, "Hits near (0,0,0) were assigned multiple cluster labels." - assert hits_group_1['cluster'].nunique() == 1, "Hits near (100,100,0) were assigned multiple cluster labels." - label_0 = hits_group_0['cluster'].iloc[0] - label_1 = hits_group_1['cluster'].iloc[0] - assert label_0 != label_1, "Both clusters were assigned the same label." - assert label_0 != -1 and label_1 != -1, "One of the clusters was labeled as noise (-1)." + pd.testing.assert_frame_equal( + df_result, + df, + check_dtype=True, + obj="Dataframe structure check" + ) def test_cluster_tagger_noise_rejection(): """ @@ -277,12 +290,9 @@ def test_cluster_tagger_noise_rejection(): } df = pd.DataFrame(data) - # Shuffle the input DataFrame - df_input = df.sample(frac=1.0).copy() - # Run the cluster tagger - params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0) # Enough to consider one cluster - df_result = cluster_tagger(df_input, **params) + params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0) # Enough to consider one cluster + df_result = cluster_tagger(df.copy(), **params) # --- Assertations cluster_labels = df_result['cluster'].unique() @@ -312,12 +322,9 @@ def test_cluster_tagger_event_distinction(): } df = pd.DataFrame(data) - # Shuffle the input DataFrame - df_input = df.sample(frac=1.0).copy() - # Run the cluster tagger params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters - df_result = cluster_tagger(df_input, **params) + df_result = cluster_tagger(df.copy(), **params) # --- Assertations event_0_clusters = df_result[df_result['event'] == 0]['cluster'].unique() @@ -327,4 +334,4 @@ def test_cluster_tagger_event_distinction(): event_0_hits = df_result[df_result['event'] == 0] noise_1_hit = df_result[(df_result['X'] == 0.5)] assert noise_1_hit['cluster'].iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)." 
- assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label." + assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label." \ No newline at end of file From 0e4703d392902257f2a6d6406958f159a9b67420 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Thu, 23 Apr 2026 11:55:41 +0200 Subject: [PATCH 15/23] Remove @settings --- invisible_cities/reco/hits_functions_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index 1eab9ed93..c37d2b2c5 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -173,7 +173,6 @@ def test_threshold_hits_all_larger_than_th(hits, th): column('E', dtype=float, elements=floats(min_value=0.1, max_value=100)), ]) -@settings(deadline=None) @given(df=gen_cluster_df) def test_cluster_tagger_output_shape(df): """ @@ -194,7 +193,6 @@ def test_cluster_tagger_output_shape(df): expected_cols = set(df.columns) | {'cluster'} assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns." 
-@settings(deadline=None) @given(df=gen_cluster_df) def test_cluster_tagger_original(df): """ @@ -217,7 +215,6 @@ def test_cluster_tagger_original(df): obj="Dataframe structure check" ) -@settings(deadline=None) @given(df=gen_cluster_df) def test_cluster_tagger_new_column_validity(df): """ From 2eadf57ea0a8659ffdc08f15dd361e698ce62cdb Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Fri, 24 Apr 2026 17:26:58 +0200 Subject: [PATCH 16/23] PR: second round of comments addressed --- invisible_cities/reco/hits_functions.py | 69 ++++---- invisible_cities/reco/hits_functions_test.py | 159 ++++++++----------- 2 files changed, 96 insertions(+), 132 deletions(-) diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py index 1b9c58f24..471c99c33 100644 --- a/invisible_cities/reco/hits_functions.py +++ b/invisible_cities/reco/hits_functions.py @@ -243,65 +243,64 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame: return (hits.groupby("Z", as_index=False) .apply(apply_threshold, th=th)) -def tag_hits_in_event(event_hits: pd.DataFrame, *, - eps: float, min_samples: int, - scale_xy: float, scale_z: float) -> pd.DataFrame: +def tag_hits_in_event(event_hits : pd.DataFrame + , * + , eps : float + , min_samples : int + , scale_xy : float + , scale_z : float + ) -> pd.DataFrame: """ Applies DBSCAN clustering to a DataFrame containing hits from a single event. - - The coordinates are scaled to account for detector geometry differences - in sampling and applies DBSCAN to identify spatial clusters. + Hits coordinates are scaled to account for the anisotropy of the detector geometry. A 'cluster' column is added to the group with the resulting labels. Parameters ---------- - event_hits : pd.DataFrame + event_hits : pd.DataFrame DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns. eps, min_samples, scale_xy, scale_z : - Configuration parameters for scaling and DBSCAN. See `cluster_tagger` for details. 
+ Configuration parameters for DBSCAN and scaling. See `cluster_tagger` for details. Returns ------- pd.DataFrame The input DataFrame with a 'cluster' column added. """ - # If the event has no hits, there's nothing to do - if event_hits.empty: - return event_hits.assign(cluster=pd.Series(dtype=int)) - - # Extract coordinates and apply scaling coords = event_hits[['X', 'Y', 'Z']].to_numpy() coords[:, :2] /= scale_xy coords[:, 2] /= scale_z - # DBSCAN clustering labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords) - # Add the cluster labels as a new column to the event's DataFrame. event_hits['cluster'] = labels return event_hits -def cluster_tagger(df_hits: pd.DataFrame, *, - eps: float, min_samples: int, - scale_xy: float, scale_z: float) -> pd.DataFrame: +def cluster_tagger(df_hits : pd.DataFrame + , * + , eps : float + , min_samples : int + , scale_xy : float + , scale_z : float + ) -> pd.DataFrame: """ Applies DBSCAN clustering to hits on an event-by-event basis using groupby.apply. - This function groups the input DataFrame by 'event' and applies the `tag_hits_in_event` function to each event's group of hits. Parameters ---------- - df_hits : pd.DataFrame - DataFrame containing hit information with columns 'X', 'Y', 'Z', and 'event'. - eps : float - Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors. + df_hits : pd.DataFrame + DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'. + eps : float + Maximum distance between two samples for them to be considered neighbors. min_samples : int - Minimum number of samples required to form a dense region (cluster). This includes the point itself. - scale_xy : float - Scaling factor to apply to the (x, y) coordinates before clustering. - scale_z : float - Scaling factor to apply to the z coordinate before clustering. + Minimum number of samples required to form a dense region (cluster). + This includes the point itself. 
+ scale_xy : float + Scaling factor to apply to the XY coordinates before clustering. + scale_z : float + Scaling factor to apply to the Z coordinate before clustering. Returns ------- @@ -312,11 +311,11 @@ def cluster_tagger(df_hits: pd.DataFrame, *, if df_hits.empty: return df_hits.assign(cluster=pd.Series(dtype=int)) - clustered_df = df_hits.groupby('event', as_index=False, group_keys=False) \ - .apply(tag_hits_in_event, - eps=eps, - min_samples=min_samples, - scale_xy=scale_xy, - scale_z=scale_z) + df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \ + .apply(tag_hits_in_event, + eps = eps, + min_samples = min_samples, + scale_xy = scale_xy, + scale_z = scale_z) - return clustered_df.set_index(df_hits.index) \ No newline at end of file + return df_clustered.set_index(df_hits.index) \ No newline at end of file diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index c37d2b2c5..14bb88e49 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -162,32 +162,25 @@ def test_threshold_hits_all_larger_than_th(hits, th): non_nn = hits_thresh.loc[hits_thresh.Q != NN] assert np.all(non_nn.Q >= th) -# ----- CLUSTER TAGGER TESTS ----- # - -gen_cluster_df = data_frames( index=range_indexes(min_size=1, max_size=50), - columns=[ - column('event', dtype=int, elements=integers(min_value=0, max_value=10)), - column('X', dtype=float, elements=floats(min_value=-500, max_value=500)), - column('Y', dtype=float, elements=floats(min_value=-500, max_value=500)), - column('Z', dtype=float, elements=floats(min_value=0, max_value=1200)), - column('E', dtype=float, elements=floats(min_value=0.1, max_value=100)), - ]) +gen_cluster_df = data_frames(index = range_indexes(min_size=1, max_size=50), + columns = [ + column('event', dtype=int , elements=integers(min_value=0, max_value=10)), + column( 'X', dtype=float, elements=floats(min_value=-500, max_value=500)), + 
column( 'Y', dtype=float, elements=floats(min_value=-500, max_value=500)), + column( 'Z', dtype=float, elements=floats(min_value=0, max_value=1200)), + column( 'E', dtype=float, elements=floats(min_value=0.1, max_value=100)), + ]) @given(df=gen_cluster_df) def test_cluster_tagger_output_shape(df): """ - Verifies that the output DataFrame of cluster_tagger: + Verifies that the output of cluster_tagger: - Has the same number of rows as the input. - Contains exactly one new column named 'cluster'. """ - if df.empty: - return - - # Run the cluster tagger - params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values - df_result = cluster_tagger(df.copy(), **params) + dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **dummy_params) - # --- Assertations assert len(df_result) == len(df), "Output DataFrame has different length than input." assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column." expected_cols = set(df.columns) | {'cluster'} @@ -197,17 +190,12 @@ def test_cluster_tagger_output_shape(df): def test_cluster_tagger_original(df): """ Verifies that cluster_tagger: - - Does not modify any of the original columns. - - Preserves the original index and row order. + - Does not modify any of the input information. + - Preserves the input index and row order. 
""" - if df.empty: - return + dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **dummy_params) - # Run the cluster tagger - params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values - df_result = cluster_tagger(df.copy(), **params) - - # --- Assertations pd.testing.assert_frame_equal( df_result.drop(columns=['cluster']), df, @@ -217,51 +205,39 @@ def test_cluster_tagger_original(df): @given(df=gen_cluster_df) def test_cluster_tagger_new_column_validity(df): - """ - Verifies that the new 'cluster' column: - - The 'cluster' column contains valid integers (no NaNs). - """ - if df.empty: - return - - # Run the cluster tagger - params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Dummy values - df_result = cluster_tagger(df.copy(), **params) + dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **dummy_params) - # --- Assertations - assert pd.api.types.is_integer_dtype(df_result['cluster']), "'cluster' column is not of integer type." - assert not df_result['cluster'].isna().any(), "'cluster' column contains NaN values." + assert pd.api.types.is_integer_dtype(df_result.cluster), "'cluster' column is not of integer type." + assert not df_result.cluster.isna().any(), "'cluster' column contains NaN values." def test_cluster_tagger_row_alignment(): """ - Verifies that the correct cluster label is assigned to the correct - row (hit), even if the input DataFrame is shuffled. + Verifies that the correct cluster label is assigned to the correct hit, + even if the input DataFrame is shuffled. 
Scenario: - Event 0: - Cluster A: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1 """ - # Setup data data = { - 'event': [0, 0, 0, 0], - 'X': [0., 1., 100., 101.], - 'Y': [0., 1., 100., 101.], - 'Z': [0., 0., 0., 0.], - 'E': [10, 10, 10, 10 ] + 'event' : [ 0, 0, 0, 0], + 'X' : [0., 1., 100., 101.], + 'Y' : [0., 1., 100., 101.], + 'Z' : [0., 0., 0., 0.], + 'cluster': [ 0, 0, 1, 1] } df = pd.DataFrame(data) - # Shuffle rows in a specific order - df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy() - # Add expected labels for assertations (not shuffled, just for reference) - df['cluster'] = [0, 0, 1, 1] + # Shuffled dataframe must start with the same hit as the original + # to ensure that both hits close to (0,0,0) have same cluster label (0) + df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy().drop(columns=['cluster']) - # Run the cluster tagger - params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters - df_result = cluster_tagger(df_shuffled, **params) - df_result = df_result.sort_index() # Sort back to original order for easier assertations + test_params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df_shuffled, **test_params) + # Sorted final result must match original dataframe + df_result = df_result.sort_index() - # --- Assertations pd.testing.assert_frame_equal( df_result, df, @@ -269,66 +245,55 @@ def test_cluster_tagger_row_alignment(): obj="Dataframe structure check" ) -def test_cluster_tagger_noise_rejection(): +def test_cluster_tagger_noise_identification(): """ - Verifies that isolated hits (outliers) are correctly identified as noise (-1). - Scenario: - 3 points very close together (0,0), (1,0), (0,1). They should form a cluster. - 1 point very far away (100, 100). It has 0 neighbors. Should be noise. 
""" - # Setup data data = { - 'event': [0, 0, 0, 0], - 'X': [0., 1., 0., 100.], - 'Y': [0., 0., 1., 100.], - 'Z': [0., 0., 0., 0.], - 'E': [10, 10, 10, 10 ] + 'event': [ 0, 0, 0, 0], + 'X' : [0., 1., 0., 100.], + 'Y' : [0., 0., 1., 100.], + 'Z' : [0., 0., 0., 0.] } df = pd.DataFrame(data) - # Run the cluster tagger - params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0) # Enough to consider one cluster - df_result = cluster_tagger(df.copy(), **params) - - # --- Assertations - cluster_labels = df_result['cluster'].unique() - assert cluster_labels.size == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)." - cluster_hits = df_result[df_result['cluster'] != -1] - assert cluster_hits.shape[0] == 3, "Expected exactly 3 hits to be clustered together." - noise_hit = df_result[df_result['cluster'] == -1] - assert noise_hit.shape[0] == 1, "Expected exactly 1 noise hit." + test_params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **test_params) + + cluster_labels = df_result.cluster.unique() + assert len(cluster_labels) == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)." + cluster_hits = df_result[df_result.cluster != -1] + assert len(cluster_hits) == 3, "Expected exactly 3 hits to be clustered together." + noise_hit = df_result[df_result.cluster == -1] + assert len(noise_hit) == 1, "Expected exactly 1 noise hit." assert noise_hit['X'].iloc[0] == 100 and noise_hit['Y'].iloc[0] == 100, "The noise hit identified is NOT the distant one." def test_cluster_tagger_event_distinction(): """ - Verifies that hits from different events are not clustered together. 
- Scenario: - - Event 0: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 - - Event 1: 2 hits at (100,100,0) and (101,101,0) and 1 hit at (0.5,0.5,0) -> Should be marked as noise (-1) + - Event 0: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 + - Event 1: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 0 (labels restart in each event) + and 1 hit at (0.5,0.5,0) -> Should be marked as noise (-1) - We check that noise hit from Event 1 get a different cluster label than hits from Event 0, even if they are spatially close. """ - # Setup data data = { - 'event': [0, 0, 1, 1, 1], - 'X': [0., 1., 100., 101., 0.5], - 'Y': [0., 1., 100., 101., 0.5], - 'Z': [0., 0., 0., 0., 0.], - 'E': [10, 10, 10, 10, 10] + 'event': [ 0, 0, 1, 1, 1], + 'X' : [0., 1., 100., 101., 0.5], + 'Y' : [0., 1., 100., 101., 0.5], + 'Z' : [0., 0., 0., 0., 0.], } df = pd.DataFrame(data) - # Run the cluster tagger - params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0) # Enough to consider both clusters - df_result = cluster_tagger(df.copy(), **params) + test_params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **test_params) - # --- Assertations - event_0_clusters = df_result[df_result['event'] == 0]['cluster'].unique() - event_1_clusters = df_result[df_result['event'] == 1]['cluster'].unique() + event_0_clusters = df_result[df_result.event == 0].cluster.unique() + event_1_clusters = df_result[df_result.event == 1].cluster.unique() assert len(event_0_clusters) == 1, "For event 0: expected exactly 1 unique cluster label (one cluster)." assert len(event_1_clusters) == 2, "For event 1: expected exactly 2 unique cluster labels (one cluster + one noise)." - event_0_hits = df_result[df_result['event'] == 0] - noise_1_hit = df_result[(df_result['X'] == 0.5)] - assert noise_1_hit['cluster'].iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)."
- assert event_0_hits['cluster'].iloc[0] != noise_1_hit['cluster'].iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label." \ No newline at end of file + event_0_hits = df_result[df_result.event == 0] + noise_1_hit = df_result[(df_result.X == 0.5)] + assert noise_1_hit.cluster.iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)." + assert event_0_hits.cluster.iloc[0] != noise_1_hit.cluster.iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label." \ No newline at end of file From 74dcedc478a5d73841d988edb70cc9cc2b24e9d6 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 5 May 2026 16:11:21 +0200 Subject: [PATCH 17/23] Add @settings --- invisible_cities/reco/hits_functions_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index 14bb88e49..648c1d017 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -20,6 +20,7 @@ from hypothesis import assume from hypothesis.strategies import composite from hypothesis.extra.pandas import data_frames, column, range_indexes +from hypothesis import settings event_numbers = integers(0, np.iinfo(np.int32).max) @@ -168,10 +169,11 @@ def test_threshold_hits_all_larger_than_th(hits, th): column( 'X', dtype=float, elements=floats(min_value=-500, max_value=500)), column( 'Y', dtype=float, elements=floats(min_value=-500, max_value=500)), column( 'Z', dtype=float, elements=floats(min_value=0, max_value=1200)), - column( 'E', dtype=float, elements=floats(min_value=0.1, max_value=100)), + column( 'E', dtype=float, elements=floats(min_value=0.1, max_value=100)) ]) @given(df=gen_cluster_df) +@settings(deadline=None) def test_cluster_tagger_output_shape(df): """ Verifies that the output of cluster_tagger: From 8dda6547c935c58715c5405a8d7d8f79d900974a Mon 
Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Wed, 6 May 2026 18:18:43 +0200 Subject: [PATCH 18/23] Removing unused import --- invisible_cities/reco/hits_functions.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py index 471c99c33..a5097d06d 100644 --- a/invisible_cities/reco/hits_functions.py +++ b/invisible_cities/reco/hits_functions.py @@ -6,8 +6,7 @@ from typing import List from sklearn.cluster import DBSCAN -from .. evm import event_model as evm -from .. types.ic_types import NN +from .. types.ic_types import NN EPSILON = np.finfo(np.float64).eps @@ -318,4 +317,4 @@ def cluster_tagger(df_hits : pd.DataFrame scale_xy = scale_xy, scale_z = scale_z) - return df_clustered.set_index(df_hits.index) \ No newline at end of file + return df_clustered.set_index(df_hits.index) From fe3d020c54a74d312b7d96ee9c94968e07a007ad Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 12 May 2026 15:14:35 +0200 Subject: [PATCH 19/23] DBSCAN eps value set to 1.8 to retain only neighbouring hits as a cluster. It is not a parameter function anymore. 
--- invisible_cities/cities/components.py | 15 +++---- invisible_cities/cities/sophronia.py | 7 ++- invisible_cities/cities/sophronia_test.py | 5 +-- invisible_cities/config/sophronia.conf | 5 +-- invisible_cities/conftest.py | 1 - invisible_cities/reco/hits_functions.py | 45 ++++++++++---------- invisible_cities/reco/hits_functions_test.py | 12 +++--- 7 files changed, 42 insertions(+), 48 deletions(-) diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py index 07fc816a4..fd4ff59ac 100644 --- a/invisible_cities/cities/components.py +++ b/invisible_cities/cities/components.py @@ -1719,8 +1719,7 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame: return correct @check_annotations -def hits_clusterizer( eps : float - , min_samples : int +def hits_clusterizer( min_samples : int , scale_xy : float , scale_z : float ) -> Callable: @@ -1729,10 +1728,9 @@ def hits_clusterizer( eps : float Parameters ---------- - eps : float - Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors. min_samples : int - Minimum number of samples required to form a dense region (cluster). This includes the point itself. + Minimum number of samples required to form a dense region (cluster). + This includes the point itself. scale_xy : float Scaling factor to apply to the (x, y) coordinates before clustering. scale_z : float @@ -1745,9 +1743,10 @@ def hits_clusterizer( eps : float with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN (-1 for noise). 
""" - return partial(cluster_tagger, - eps=eps, min_samples=min_samples, - scale_xy=scale_xy, scale_z=scale_z) + return partial( cluster_tagger + , min_samples = min_samples + , scale_xy = scale_xy + , scale_z = scale_z ) def identity(x : Any) -> Any: diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py index 97947cda9..e122865b7 100644 --- a/invisible_cities/cities/sophronia.py +++ b/invisible_cities/cities/sophronia.py @@ -141,13 +141,12 @@ def sophronia( files_in : OneOrManyFiles Normalization value in case of `norm_strat = NormStrategy.custom` clustering_params : dict - eps : float - Epsilon value for DBSCAN, defining the maximum distance between two samples for them to be considered neighbors. min_samples : int - Minimum number of samples required to form a dense region (cluster). This includes the point itself. + Minimum number of samples required to form a dense region (cluster). + This includes the point itself. scale_xy : float Scaling factor to apply to the (x, y) coordinates before clustering. - scale_z : float + scale_z : float Scaling factor to apply to the z coordinate before clustering. 
""" global_reco = compute_xy_position( detector_db diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py index 922f25492..a285a6989 100644 --- a/invisible_cities/cities/sophronia_test.py +++ b/invisible_cities/cities/sophronia_test.py @@ -171,10 +171,9 @@ def test_sophronia_clustering_integration(config_tmpdir, sophronia_config): # Clustering enabled clustering_params = dict( - eps = 3, min_samples = 5, - scale_xy = 14.55, - scale_z = 3.7 + scale_xy = 15.55, + scale_z = 4.0 ) config_with_cluster = dict(**sophronia_config) config_with_cluster.update(dict( file_out = path_out_with_cluster diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf index 56ca2eeb7..c1f27b0a8 100644 --- a/invisible_cities/config/sophronia.conf +++ b/invisible_cities/config/sophronia.conf @@ -65,8 +65,7 @@ corrections = dict( apply_z = False) clustering_params = dict( - eps = 3, - min_samples = 5, + min_samples = 5, scale_xy = 15.55, - scale_z = 4.0 + scale_z = 4.0 ) diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py index 1b2af207d..56e5a50f4 100644 --- a/invisible_cities/conftest.py +++ b/invisible_cities/conftest.py @@ -439,7 +439,6 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap): apply_temp = False, norm_strat = NormStrategy.kr) , clustering_params = dict( - eps = 3, min_samples = 5, scale_xy = 15.55, scale_z = 4.0) diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py index a5097d06d..4e300af2d 100644 --- a/invisible_cities/reco/hits_functions.py +++ b/invisible_cities/reco/hits_functions.py @@ -244,7 +244,6 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame: def tag_hits_in_event(event_hits : pd.DataFrame , * - , eps : float , min_samples : int , scale_xy : float , scale_z : float @@ -256,10 +255,15 @@ def tag_hits_in_event(event_hits : pd.DataFrame Parameters ---------- - event_hits : pd.DataFrame + event_hits : 
pd.DataFrame DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns. - eps, min_samples, scale_xy, scale_z : - Configuration parameters for DBSCAN and scaling. See `cluster_tagger` for details. + min_samples : int + Minimum number of samples required to form a dense region (cluster). + This includes the point itself. + scale_xy : float + Scaling factor to apply to the XY coordinates before clustering. + scale_z : float + Scaling factor to apply to the Z coordinate before clustering. Returns ------- @@ -267,40 +271,35 @@ def tag_hits_in_event(event_hits : pd.DataFrame The input DataFrame with a 'cluster' column added. """ coords = event_hits[['X', 'Y', 'Z']].to_numpy() + # A proper scaling leads to hits being separated + # by a distance of 1 in the DBSCAN metric space coords[:, :2] /= scale_xy coords[:, 2] /= scale_z - labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords) + # eps parameter is fixed to a value a bit higher than √3 + # to retain diagonal neighbours in the same cluster + labels = DBSCAN(eps=1.8, min_samples=min_samples).fit_predict(coords) event_hits['cluster'] = labels return event_hits def cluster_tagger(df_hits : pd.DataFrame , * - , eps : float , min_samples : int , scale_xy : float , scale_z : float ) -> pd.DataFrame: """ - Applies DBSCAN clustering to hits on an event-by-event basis using groupby.apply. This function groups the input DataFrame by 'event' and applies the `tag_hits_in_event` function to each event's group of hits. Parameters ---------- - df_hits : pd.DataFrame + df_hits : pd.DataFrame DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'. - eps : float - Maximum distance between two samples for them to be considered neighbors. - min_samples : int - Minimum number of samples required to form a dense region (cluster). - This includes the point itself. - scale_xy : float - Scaling factor to apply to the XY coordinates before clustering.
- scale_z : float - Scaling factor to apply to the Z coordinate before clustering. - + min_samples, scale_xy, scale_z : + See `tag_hits_in_event` + Returns ------- pd.DataFrame @@ -311,10 +310,10 @@ def cluster_tagger(df_hits : pd.DataFrame return df_hits.assign(cluster=pd.Series(dtype=int)) df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \ - .apply(tag_hits_in_event, - eps = eps, - min_samples = min_samples, - scale_xy = scale_xy, - scale_z = scale_z) + .apply( tag_hits_in_event + , min_samples = min_samples + , scale_xy = scale_xy + , scale_z = scale_z ) return df_clustered.set_index(df_hits.index) + \ No newline at end of file diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index 648c1d017..dde3f59c6 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -180,7 +180,7 @@ def test_cluster_tagger_output_shape(df): - Has the same number of rows as the input. - Contains exactly one new column named 'cluster'. """ - dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df.copy(), **dummy_params) assert len(df_result) == len(df), "Output DataFrame has different length than input." @@ -195,7 +195,7 @@ def test_cluster_tagger_original(df): - Does not modify any of the input information. - Preserves the input index and row order. 
""" - dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df.copy(), **dummy_params) pd.testing.assert_frame_equal( @@ -207,7 +207,7 @@ def test_cluster_tagger_original(df): @given(df=gen_cluster_df) def test_cluster_tagger_new_column_validity(df): - dummy_params = dict(eps=10.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df.copy(), **dummy_params) assert pd.api.types.is_integer_dtype(df_result.cluster), "'cluster' column is not of integer type." @@ -235,7 +235,7 @@ def test_cluster_tagger_row_alignment(): # to ensure that both hits close to (0,0,0) have same cluster label (0) df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy().drop(columns=['cluster']) - test_params = dict(eps=5.0, min_samples=1, scale_xy=1.0, scale_z=1.0) + test_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df_shuffled, **test_params) # Sorted final result must match original dataframe df_result = df_result.sort_index() @@ -261,7 +261,7 @@ def test_cluster_tagger_noise_identification(): } df = pd.DataFrame(data) - test_params = dict(eps=5.0, min_samples=3, scale_xy=1.0, scale_z=1.0) + test_params = dict(min_samples=3, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df.copy(), **test_params) cluster_labels = df_result.cluster.unique() @@ -288,7 +288,7 @@ def test_cluster_tagger_event_distinction(): } df = pd.DataFrame(data) - test_params = dict(eps=5.0, min_samples=2, scale_xy=1.0, scale_z=1.0) + test_params = dict(min_samples=2, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df.copy(), **test_params) event_0_clusters = df_result[df_result.event == 0].cluster.unique() From 9af71acc98031215524236bb3abff3ee2a082da0 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 12 May 2026 15:14:56 +0200 Subject: [PATCH 20/23] Update reference files for 
tests --- 228Th_10evt_hits.h5 | 3 +++ invisible_cities/database/test_data/228Th_10evt_deco.h5 | 4 ++-- .../database/test_data/228Th_10evt_deco_satellite.h5 | 4 ++-- .../database/test_data/228Th_10evt_deco_separate.h5 | 4 ++-- invisible_cities/database/test_data/228Th_10evt_hits.h5 | 4 ++-- 5 files changed, 11 insertions(+), 8 deletions(-) create mode 100644 228Th_10evt_hits.h5 diff --git a/228Th_10evt_hits.h5 b/228Th_10evt_hits.h5 new file mode 100644 index 000000000..4b8739c0d --- /dev/null +++ b/228Th_10evt_hits.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7016967f80568b698957649dab8dc581a7c2f3f7af330cf65c69391ad0ce675f +size 276410 diff --git a/invisible_cities/database/test_data/228Th_10evt_deco.h5 b/invisible_cities/database/test_data/228Th_10evt_deco.h5 index 47ff579ad..0ffee6179 100644 --- a/invisible_cities/database/test_data/228Th_10evt_deco.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_deco.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ed63a53e9a78cef39a4ff4d14ae1cfbdfc0f121d27fd9ee215c18bcec91ff2d -size 819282 +oid sha256:3032cc3428c6df55802181aa3f2c0a72d288cabac34ac0b753f4d0cff6ba31be +size 823118 diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 index 273064f6e..e9051848a 100644 --- a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:434f0bf46e4e326731dbb87e14d83d6649ea01eb0079654a42c9c3e3c41c7df1 -size 304668 +oid sha256:ee059274164e63cd00f91fd41938d57af47ce6ee7dbd9b08357947ddb883f701 +size 303387 diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 index 0f6cddf7d..0b4f7548e 100644 --- 
a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69175d759c8484973f9006911f08c8019cc84bc472481b7c8d3fafded16a3cc8 -size 819313 +oid sha256:3fb02b1e386e4a978267875acc49eceb97c2c07ce6a74e3579a2524683f39fc7 +size 823121 diff --git a/invisible_cities/database/test_data/228Th_10evt_hits.h5 b/invisible_cities/database/test_data/228Th_10evt_hits.h5 index 3aa32af53..4b8739c0d 100644 --- a/invisible_cities/database/test_data/228Th_10evt_hits.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_hits.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee809a8d3f69241048b4c37c156b452e28bcce05d45b3504c15163599e6c8dbb -size 274902 +oid sha256:7016967f80568b698957649dab8dc581a7c2f3f7af330cf65c69391ad0ce675f +size 276410 From 6009d782dea1d3967de5999e1b793bb336f3140b Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 12 May 2026 15:45:41 +0200 Subject: [PATCH 21/23] Removing deadline for tests that use hypothesis --- invisible_cities/reco/hits_functions_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py index dde3f59c6..954ae1e84 100644 --- a/invisible_cities/reco/hits_functions_test.py +++ b/invisible_cities/reco/hits_functions_test.py @@ -189,6 +189,7 @@ def test_cluster_tagger_output_shape(df): assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns." 
@given(df=gen_cluster_df) +@settings(deadline=None) def test_cluster_tagger_original(df): """ Verifies that cluster_tagger: @@ -206,6 +207,7 @@ def test_cluster_tagger_original(df): ) @given(df=gen_cluster_df) +@settings(deadline=None) def test_cluster_tagger_new_column_validity(df): dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) df_result = cluster_tagger(df.copy(), **dummy_params) From 995705d1e7b31e6c0b5ae97af72c831967be9751 Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 12 May 2026 15:46:01 +0200 Subject: [PATCH 22/23] Updating esmeralda reference file for tests --- invisible_cities/database/test_data/228Th_10evt_tracks.h5 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 b/invisible_cities/database/test_data/228Th_10evt_tracks.h5 index edc540196..3844b259b 100644 --- a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_tracks.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ccacb5209c035672cd6b147f54d816863a9a7828f5ad88dd1a7dff488a8e7fc -size 230547 +oid sha256:c7841bdae394479d350c3877a4f37778f3b3db528e4610f65c22f3b284d7f537 +size 230764 From 0772fbf71d66594148f570494bb5a35f161672ec Mon Sep 17 00:00:00 2001 From: Camilo Cortes Parra Date: Tue, 12 May 2026 16:45:06 +0200 Subject: [PATCH 23/23] Updating reference for esmeralda test --- invisible_cities/database/test_data/228Th_10evt_tracks.h5 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 b/invisible_cities/database/test_data/228Th_10evt_tracks.h5 index 3844b259b..304fcb987 100644 --- a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 +++ b/invisible_cities/database/test_data/228Th_10evt_tracks.h5 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7841bdae394479d350c3877a4f37778f3b3db528e4610f65c22f3b284d7f537 
-size 230764 +oid sha256:ecc3e2524de6c1c5e0e9d89cdf7c510fde37557277d85417890c16bbe9485902 +size 230750