diff --git a/228Th_10evt_hits.h5 b/228Th_10evt_hits.h5
new file mode 100644
index 000000000..4b8739c0d
--- /dev/null
+++ b/228Th_10evt_hits.h5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7016967f80568b698957649dab8dc581a7c2f3f7af330cf65c69391ad0ce675f
+size 276410
diff --git a/invisible_cities/cities/components.py b/invisible_cities/cities/components.py
index f94405e83..fd4ff59ac 100644
--- a/invisible_cities/cities/components.py
+++ b/invisible_cities/cities/components.py
@@ -59,6 +59,7 @@ from .. reco .corrections import get_df_to_z_converter
 from .. reco .xy_algorithms import corona
 from .. reco .xy_algorithms import barycenter
+from .. reco .hits_functions import cluster_tagger
 from .. filters.s1s2_filter import S12Selector
 from .. filters.s1s2_filter import S12SelectorOutput
 from .. filters.s1s2_filter import pmap_filter
@@ -1717,6 +1718,36 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:
     return correct
 
 
+@check_annotations
+def hits_clusterizer( min_samples : int
+                    , scale_xy : float
+                    , scale_z : float
+                    ) -> Callable:
+    """
+    Creates a callable for performing DBSCAN clustering on a DataFrame of hits.
+
+    Parameters
+    ----------
+    min_samples : int
+        Minimum number of samples required to form a dense region (cluster).
+        This includes the point itself.
+    scale_xy : float
+        Scaling factor to apply to the (x, y) coordinates before clustering.
+    scale_z : float
+        Scaling factor to apply to the z coordinate before clustering.
+
+    Returns
+    -------
+    Callable
+        A function that takes a DataFrame of hits and returns the same DataFrame
+        with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN
+        (-1 for noise).
+    """
+    return partial( cluster_tagger
+                  , min_samples = min_samples
+                  , scale_xy = scale_xy
+                  , scale_z = scale_z )
+
 
 def identity(x : Any) -> Any:
     return x
diff --git a/invisible_cities/cities/sophronia.py b/invisible_cities/cities/sophronia.py
index 625ac3032..e122865b7 100644
--- a/invisible_cities/cities/sophronia.py
+++ b/invisible_cities/cities/sophronia.py
@@ -64,6 +64,7 @@ from . components import collect
 from . components import build_pointlike_event as pointlike_event_builder
 from . components import hits_corrector
+from . components import hits_clusterizer
 from . components import identity
 
 from typing import Optional
@@ -93,6 +94,7 @@ def sophronia( files_in : OneOrManyFiles
              , sipm_charge_type : SiPMCharge
              , same_peak : bool
              , corrections : Optional[dict] = None
+             , clustering_params : Optional[dict] = None
              ):
     """
     drift_v : float
@@ -137,6 +139,15 @@ def sophronia( files_in : OneOrManyFiles
             Normalization strategy
         norm_value : float, optional
             Normalization value in case of `norm_strat = NormStrategy.custom`
+
+    clustering_params : dict, optional
+        min_samples : int
+            Minimum number of samples required to form a dense region (cluster).
+            This includes the point itself.
+        scale_xy : float
+            Scaling factor to apply to the (x, y) coordinates before clustering.
+        scale_z : float
+            Scaling factor to apply to the z coordinate before clustering.
""" global_reco = compute_xy_position( detector_db , run_number @@ -177,6 +188,9 @@ def sophronia( files_in : OneOrManyFiles correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity , item = "hits") + + cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity + , item = "hits") build_pointlike_event = df.map( pointlike_event_builder( detector_db , run_number @@ -202,7 +216,7 @@ def sophronia( files_in : OneOrManyFiles , args = "event_number enough_valid_hits".split()) hits_branch = ( make_hits, enough_valid_hits, df.branch(write_hits_filter) - , hits_select.filter, merge_nn_hits, correct_hits, write_hits) + , hits_select.filter, merge_nn_hits, correct_hits, cluster_hits, write_hits) kdst_branch = build_pointlike_event, write_pointlike_event collect_evt_numbers = "event_number", event_number_collector.sink diff --git a/invisible_cities/cities/sophronia_test.py b/invisible_cities/cities/sophronia_test.py index 96d7646e7..a285a6989 100644 --- a/invisible_cities/cities/sophronia_test.py +++ b/invisible_cities/cities/sophronia_test.py @@ -5,6 +5,8 @@ from pytest import mark +from .. io import dst_io as dio +from .. core.testing_utils import assert_dataframes_equal from .. core.testing_utils import assert_tables_equality from .. core.testing_utils import ignore_warning from .. core.system_of_units import pes @@ -147,3 +149,48 @@ def test_sophronia_keeps_hitless_events(config_tmpdir, sophronia_config): with tb.open_file(path_out) as output_file: assert len(output_file.root.Run.events) == 1 assert "RECO" not in output_file.root + + +@ignore_warning.no_config_group +def test_sophronia_clustering_integration(config_tmpdir, sophronia_config): + """ + Runs Sophronia twice (once disabled, once enabled) to verify: + 1. Backward compatibility: No 'cluster' column when disabled. + 2. Feature activation: 'cluster' column exists when enabled. + 3. Data consistency: Enabling clustering does NOT change any other data. + """ + path_out_no_cluster = os.path.join(config_tmpdir, 'test_sophronia_no_cluster.h5') + path_out_with_cluster = os.path.join(config_tmpdir, 'test_sophronia_with_cluster.h5') + + # Clustering disabled + config_no_cluster = dict(**sophronia_config) + config_no_cluster.update(dict( file_out = path_out_no_cluster + , event_range = 1 + , clustering_params = None)) + sophronia(**config_no_cluster) + + # Clustering enabled + clustering_params = dict( + min_samples = 5, + scale_xy = 15.55, + scale_z = 4.0 + ) + config_with_cluster = dict(**sophronia_config) + config_with_cluster.update(dict( file_out = path_out_with_cluster + , event_range = 1 + , clustering_params = clustering_params)) + sophronia(**config_with_cluster) + + # Load both outputs + df_no_cluster = dio.load_dst(path_out_no_cluster, "RECO", "Events") + df_with_cluster = dio.load_dst(path_out_with_cluster, "RECO", "Events") + + # ----- Assertions + assert not df_no_cluster.empty + assert not df_with_cluster.empty + assert 'cluster' not in df_no_cluster.columns, "'cluster' column should not exist when clustering is disabled." + assert 'cluster' in df_with_cluster.columns, "'cluster' column should exist when clustering is enabled." 
+
+    # Compare all columns except 'cluster' for equality
+    df_with_cluster_compare = df_with_cluster.drop(columns=['cluster'])
+    assert_dataframes_equal(df_no_cluster, df_with_cluster_compare)
\ No newline at end of file
diff --git a/invisible_cities/config/sophronia.conf b/invisible_cities/config/sophronia.conf
index 22613d646..c1f27b0a8 100644
--- a/invisible_cities/config/sophronia.conf
+++ b/invisible_cities/config/sophronia.conf
@@ -63,3 +63,9 @@ corrections = dict(
     apply_temp = True,
     norm_strat = kr,
     apply_z = False)
+
+clustering_params = dict(
+    min_samples = 5,
+    scale_xy = 15.55,
+    scale_z = 4.0
+)
diff --git a/invisible_cities/conftest.py b/invisible_cities/conftest.py
index 2476ae84f..56e5a50f4 100644
--- a/invisible_cities/conftest.py
+++ b/invisible_cities/conftest.py
@@ -438,6 +438,10 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
                                          filename = next100_mc_krmap,
                                          apply_temp = False,
                                          norm_strat = NormStrategy.kr)
+                 , clustering_params = dict(
+                       min_samples = 5,
+                       scale_xy = 15.55,
+                       scale_z = 4.0)
                  )
     return config
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco.h5 b/invisible_cities/database/test_data/228Th_10evt_deco.h5
index e4f1f6006..0ffee6179 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b40406d8530d0f7a6f6dfcd7b1d10bea83374ee49374f612183771f9ca5a9c5
-size 818438
+oid sha256:3032cc3428c6df55802181aa3f2c0a72d288cabac34ac0b753f4d0cff6ba31be
+size 823118
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
index be2713440..e9051848a 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco_satellite.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e1aaca525390738b4ee0f432d81f235ca2f37c38801f48f76210c92b91e0777b
-size 302572
+oid sha256:ee059274164e63cd00f91fd41938d57af47ce6ee7dbd9b08357947ddb883f701
+size 303387
diff --git a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5 b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
index 490271120..0b4f7548e 100644
--- a/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_deco_separate.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ae5b40acc034706762ad2de2209430b136001b1783c921e98c52731c6c2d101
-size 818469
+oid sha256:3fb02b1e386e4a978267875acc49eceb97c2c07ce6a74e3579a2524683f39fc7
+size 823121
diff --git a/invisible_cities/database/test_data/228Th_10evt_hits.h5 b/invisible_cities/database/test_data/228Th_10evt_hits.h5
index de4ac5337..4b8739c0d 100644
--- a/invisible_cities/database/test_data/228Th_10evt_hits.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_hits.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4ea23ce094795bfd779121b872ce7e7d1da60eb1e1992a8868e3fc7de73c4b8
-size 274703
+oid sha256:7016967f80568b698957649dab8dc581a7c2f3f7af330cf65c69391ad0ce675f
+size 276410
diff --git a/invisible_cities/database/test_data/228Th_10evt_tracks.h5 b/invisible_cities/database/test_data/228Th_10evt_tracks.h5
index edc540196..304fcb987 100644
--- a/invisible_cities/database/test_data/228Th_10evt_tracks.h5
+++ b/invisible_cities/database/test_data/228Th_10evt_tracks.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1ccacb5209c035672cd6b147f54d816863a9a7828f5ad88dd1a7dff488a8e7fc
-size 230547
+oid sha256:ecc3e2524de6c1c5e0e9d89cdf7c510fde37557277d85417890c16bbe9485902
+size 230750
diff --git a/invisible_cities/reco/hits_functions.py b/invisible_cities/reco/hits_functions.py
index 9bcdea2ef..4e300af2d 100644
--- a/invisible_cities/reco/hits_functions.py
+++ b/invisible_cities/reco/hits_functions.py
@@ -1,7 +1,12 @@
 import numpy as np
 import pandas as pd
 
-from .. types.ic_types import NN
+from itertools import compress
+from copy import deepcopy
+from typing import List
+from sklearn.cluster import DBSCAN
+
+from .. types.ic_types import NN
 
 EPSILON = np.finfo(np.float64).eps
@@ -64,8 +69,6 @@ def sipms_above_threshold(xys: np.ndarray, qs: np.ndarray, thr:float, energy: fl
     return xs, ys, qs, es
 
-
-
 def merge_NN_hits(hits: pd.DataFrame, same_peak: bool = True) -> pd.DataFrame:
     """
     Finds NN hits (defined as hits with Q=NN) and removes them without energy
@@ -238,3 +241,79 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
     if th <= 0: return hits
     return (hits.groupby("Z", as_index=False)
                 .apply(apply_threshold, th=th))
+
+def tag_hits_in_event(event_hits : pd.DataFrame
+                      , *
+                      , min_samples : int
+                      , scale_xy : float
+                      , scale_z : float
+                      ) -> pd.DataFrame:
+    """
+    Applies DBSCAN clustering to a DataFrame containing hits from a single event.
+    Hit coordinates are scaled to account for the anisotropy of the detector geometry.
+    A 'cluster' column is added to the group with the resulting labels.
+
+    Parameters
+    ----------
+    event_hits : pd.DataFrame
+        DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
+    min_samples : int
+        Minimum number of samples required to form a dense region (cluster).
+        This includes the point itself.
+    scale_xy : float
+        Scaling factor to apply to the XY coordinates before clustering.
+    scale_z : float
+        Scaling factor to apply to the Z coordinate before clustering.
+
+    Returns
+    -------
+    pd.DataFrame
+        The input DataFrame with a 'cluster' column added.
+    """
+    coords = event_hits[['X', 'Y', 'Z']].to_numpy()
+    # A proper scaling leads to hits being separated
+    # by a distance of 1 in the DBSCAN metric space
+    coords[:, :2] /= scale_xy
+    coords[:, 2] /= scale_z
+
+    # The eps parameter is fixed to a value slightly higher than √3
+    # to retain diagonal neighbours in the same cluster
+    labels = DBSCAN(eps=1.8, min_samples=min_samples).fit_predict(coords)
+    event_hits['cluster'] = labels
+
+    return event_hits
+
+def cluster_tagger(df_hits : pd.DataFrame
+                   , *
+                   , min_samples : int
+                   , scale_xy : float
+                   , scale_z : float
+                   ) -> pd.DataFrame:
+    """
+    Groups the input DataFrame by 'event' and applies the
+    `tag_hits_in_event` function to each event's group of hits.
+
+    Parameters
+    ----------
+    df_hits : pd.DataFrame
+        DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'.
+    min_samples, scale_xy, scale_z :
+        See `tag_hits_in_event`
+
+    Returns
+    -------
+    pd.DataFrame
+        The input DataFrame with an added 'cluster' column indicating the
+        cluster label for each hit (-1 for noise).
+    """
+    if df_hits.empty:
+        return df_hits.assign(cluster=pd.Series(dtype=int))
+
+    df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \
+                          .apply( tag_hits_in_event
+                                , min_samples = min_samples
+                                , scale_xy = scale_xy
+                                , scale_z = scale_z )
+
+    return df_clustered.set_index(df_hits.index)
+    
\ No newline at end of file
diff --git a/invisible_cities/reco/hits_functions_test.py b/invisible_cities/reco/hits_functions_test.py
index c92b7abaa..954ae1e84 100644
--- a/invisible_cities/reco/hits_functions_test.py
+++ b/invisible_cities/reco/hits_functions_test.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pandas as pd
 
+from pytest import mark
+from pytest import fixture
 from numpy.testing import assert_almost_equal
 
 from .. core.testing_utils import assert_dataframes_close
@@ -9,6 +11,7 @@ from . hits_functions import merge_NN_hits
 from . hits_functions import threshold_hits
 from . hits_functions import sipms_above_threshold
+from . hits_functions import cluster_tagger
 from hypothesis import given
 from hypothesis.strategies import lists
 from hypothesis.strategies import floats
@@ -16,7 +19,8 @@ from copy import deepcopy
 from hypothesis import assume
 from hypothesis.strategies import composite
-
+from hypothesis.extra.pandas import data_frames, column, range_indexes
+from hypothesis import settings
 
 event_numbers = integers(0, np.iinfo(np.int32).max)
@@ -158,3 +162,142 @@ def test_threshold_hits_all_larger_than_th(hits, th):
     hits_thresh = threshold_hits(hits, th)
     non_nn = hits_thresh.loc[hits_thresh.Q != NN]
     assert np.all(non_nn.Q >= th)
+
+gen_cluster_df = data_frames(index = range_indexes(min_size=1, max_size=50),
+                             columns = [
+                                 column('event', dtype=int  , elements=integers(min_value=0, max_value=10)),
+                                 column( 'X', dtype=float, elements=floats(min_value=-500, max_value=500)),
+                                 column( 'Y', dtype=float, elements=floats(min_value=-500, max_value=500)),
+                                 column( 'Z', dtype=float, elements=floats(min_value=0, max_value=1200)),
+                                 column( 'E', dtype=float, elements=floats(min_value=0.1, max_value=100))
+                             ])
+
+@given(df=gen_cluster_df)
+@settings(deadline=None)
+def test_cluster_tagger_output_shape(df):
+    """
+    Verifies that the output of cluster_tagger:
+    - Has the same number of rows as the input.
+    - Contains exactly one new column named 'cluster'.
+    """
+    dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0)
+    df_result = cluster_tagger(df.copy(), **dummy_params)
+
+    assert len(df_result) == len(df), "Output DataFrame has different length than input."
+    assert 'cluster' in df_result.columns, "Output DataFrame does not contain 'cluster' column."
+    expected_cols = set(df.columns) | {'cluster'}
+    assert set(df_result.columns) == expected_cols, "Output DataFrame has unexpected columns."
+
+@given(df=gen_cluster_df)
+@settings(deadline=None)
+def test_cluster_tagger_original(df):
+    """
+    Verifies that cluster_tagger:
+    - Does not modify any of the input information.
+    - Preserves the input index and row order.
+ """ + dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **dummy_params) + + pd.testing.assert_frame_equal( + df_result.drop(columns=['cluster']), + df, + check_dtype=True, + obj="Dataframe structure check" + ) + +@given(df=gen_cluster_df) +@settings(deadline=None) +def test_cluster_tagger_new_column_validity(df): + dummy_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **dummy_params) + + assert pd.api.types.is_integer_dtype(df_result.cluster), "'cluster' column is not of integer type." + assert not df_result.cluster.isna().any(), "'cluster' column contains NaN values." + +def test_cluster_tagger_row_alignment(): + """ + Verifies that the correct cluster label is assigned to the correct hit, + even if the input DataFrame is shuffled. + + Scenario: + - Event 0: + - Cluster A: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 + - Cluster B: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1 + """ + data = { + 'event' : [ 0, 0, 0, 0], + 'X' : [0., 1., 100., 101.], + 'Y' : [0., 1., 100., 101.], + 'Z' : [0., 0., 0., 0.], + 'cluster': [ 0, 0, 1, 1] + } + df = pd.DataFrame(data) + # Shuffled dataframe must start with the same hit as the original + # to ensure that both hits close to (0,0,0) have same cluster label (0) + df_shuffled = df.reindex(index=[0, 2, 1, 3]).copy().drop(columns=['cluster']) + + test_params = dict(min_samples=1, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df_shuffled, **test_params) + # Sorted final result must match original dataframe + df_result = df_result.sort_index() + + pd.testing.assert_frame_equal( + df_result, + df, + check_dtype=True, + obj="Dataframe structure check" + ) + +def test_cluster_tagger_noise_identification(): + """ + Scenario: + - 3 points very close together (0,0), (1,0), (0,1). They should form a cluster. + - 1 point very far away (100, 100). It has 0 neighbors. Should be noise. + """ + data = { + 'event': [ 0, 0, 0, 0], + 'X' : [0., 1., 0., 100.], + 'Y' : [0., 0., 1., 100.], + 'Z' : [0., 0., 0., 0.] + } + df = pd.DataFrame(data) + + test_params = dict(min_samples=3, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **test_params) + + cluster_labels = df_result.cluster.unique() + assert len(cluster_labels) == 2, "Expected exactly 2 unique cluster labels (one cluster + one noise)." + cluster_hits = df_result[df_result.cluster != -1] + assert len(cluster_hits) == 3, "Expected exactly 3 hits to be clustered together." + noise_hit = df_result[df_result.cluster == -1] + assert len(noise_hit) == 1, "Expected exactly 1 noise hit." + assert noise_hit['X'].iloc[0] == 100 and noise_hit['Y'].iloc[0] == 100, "The noise hit identified is NOT the distant one." + +def test_cluster_tagger_event_distinction(): + """ + Scenario: + - Event 0: 2 hits at (0,0,0) and (1,1,0) -> Should be Cluster 0 + - Event 1: 2 hits at (100,100,0) and (101,101,0) -> Should be Cluster 1 + and 1 hit at (0.5,0.5,0) -> Should be marked as noise (-1) + - We check that noise hit from Event 1 get a different cluster label than hits from Event 0, even if they are spatially close. 
+ """ + data = { + 'event': [ 0, 0, 1, 1, 1], + 'X' : [0., 1., 100., 101., 0.5], + 'Y' : [0., 1., 100., 101., 0.5], + 'Z' : [0., 0., 0., 0., 0.], + } + df = pd.DataFrame(data) + + test_params = dict(min_samples=2, scale_xy=1.0, scale_z=1.0) + df_result = cluster_tagger(df.copy(), **test_params) + + event_0_clusters = df_result[df_result.event == 0].cluster.unique() + event_1_clusters = df_result[df_result.event == 1].cluster.unique() + assert len(event_0_clusters) == 1, "For event 0: expected exactly 1 unique cluster label (one cluster)." + assert len(event_1_clusters) == 2, "For event 1: expected exactly 2 unique cluster labels (one cluster + one noise)." + event_0_hits = df_result[df_result.event == 0] + noise_1_hit = df_result[(df_result.X == 0.5)] + assert noise_1_hit.cluster.iloc[0] == -1, "The hit at (0.5,0.5,0) in event 1 should be marked as noise (-1)." + assert event_0_hits.cluster.iloc[0] != noise_1_hit.cluster.iloc[0], "Hits from event 0 and the noise hit from event 1 were assigned the same cluster label." \ No newline at end of file diff --git a/manage.sh b/manage.sh index 04d8b7a83..421370d10 100644 --- a/manage.sh +++ b/manage.sh @@ -73,7 +73,7 @@ function install_conda { fi } -CONDA_ENV_TAG=2026-03-05 +CONDA_ENV_TAG=2026-03-04 CONDA_ENV_NAME=IC-${PYTHON_VERSION}-${CONDA_ENV_TAG} function make_environment { @@ -109,6 +109,7 @@ dependencies: - scipy = 1.9.3 - seaborn = 0.11.2 - setuptools = 58.0.4 +- scikit-learn = 1.1.3 - sphinx = 4.2.0 - tornado = 6.1 - pip: