3 changes: 3 additions & 0 deletions 228Th_10evt_hits.h5
Git LFS file not shown
31 changes: 31 additions & 0 deletions invisible_cities/cities/components.py
@@ -59,6 +59,7 @@
from .. reco .corrections import get_df_to_z_converter
from .. reco .xy_algorithms import corona
from .. reco .xy_algorithms import barycenter
from .. reco .hits_functions import cluster_tagger
from .. filters.s1s2_filter import S12Selector
from .. filters.s1s2_filter import S12SelectorOutput
from .. filters.s1s2_filter import pmap_filter
@@ -1717,6 +1718,36 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:

return correct

@check_annotations
def hits_clusterizer( min_samples : int
, scale_xy : float
, scale_z : float
) -> Callable:
"""
    Creates a callable that performs DBSCAN clustering on a DataFrame of hits.

Parameters
----------
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the (x, y) coordinates before clustering.
scale_z : float
Scaling factor to apply to the z coordinate before clustering.

Returns
-------
Callable
A function that takes a DataFrame of hits and returns the same DataFrame
with an added 'cluster' column, which contains the cluster labels assigned by DBSCAN
(-1 for noise).
"""
return partial( cluster_tagger
, min_samples = min_samples
, scale_xy = scale_xy
, scale_z = scale_z )


def identity(x : Any) -> Any:
return x
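
A rough usage sketch of the new component (not part of the diff): the returned partial is applied directly to a hits table. The hit coordinates below are invented, and min_samples is lowered to 2 so the toy table can form clusters.

import pandas as pd
from invisible_cities.cities.components import hits_clusterizer

# Toy hits table with the columns the tagger expects: 'event', 'X', 'Y', 'Z' (mm).
hits = pd.DataFrame(dict( event = [0, 0, 0, 0]
                        , X     = [0.0, 15.55, 31.10, 500.0]
                        , Y     = [0.0,  0.00,  0.00,   0.0]
                        , Z     = [0.0,  4.00,  8.00, 300.0]))

tag_clusters = hits_clusterizer(min_samples = 2, scale_xy = 15.55, scale_z = 4.0)
tagged       = tag_clusters(hits)
print(tagged.cluster.tolist())   # e.g. [0, 0, 0, -1]: the three nearby hits form one cluster, the isolated hit is noise
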
16 changes: 15 additions & 1 deletion invisible_cities/cities/sophronia.py
@@ -64,6 +64,7 @@
from . components import collect
from . components import build_pointlike_event as pointlike_event_builder
from . components import hits_corrector
from . components import hits_clusterizer
from . components import identity

from typing import Optional
@@ -93,6 +94,7 @@ def sophronia( files_in : OneOrManyFiles
, sipm_charge_type : SiPMCharge
, same_peak : bool
, corrections : Optional[dict] = None
, clustering_params : Optional[dict] = None
):
"""
drift_v : float
@@ -137,6 +139,15 @@ def sophronia( files_in : OneOrManyFiles
Normalization strategy
norm_value : float, optional
Normalization value in case of `norm_strat = NormStrategy.custom`

    clustering_params : dict, optional
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the (x, y) coordinates before clustering.
scale_z : float
Scaling factor to apply to the z coordinate before clustering.
"""
global_reco = compute_xy_position( detector_db
, run_number
@@ -177,6 +188,9 @@ def sophronia( files_in : OneOrManyFiles

correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity
, item = "hits")

cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity
, item = "hits")

build_pointlike_event = df.map( pointlike_event_builder( detector_db
, run_number
, args = "event_number enough_valid_hits".split())

hits_branch = ( make_hits, enough_valid_hits, df.branch(write_hits_filter)
, hits_select.filter, merge_nn_hits, correct_hits, write_hits)
, hits_select.filter, merge_nn_hits, correct_hits, cluster_hits, write_hits)
kdst_branch = build_pointlike_event, write_pointlike_event
collect_evt_numbers = "event_number", event_number_collector.sink

47 changes: 47 additions & 0 deletions invisible_cities/cities/sophronia_test.py
@@ -5,6 +5,8 @@

from pytest import mark

from .. io import dst_io as dio
from .. core.testing_utils import assert_dataframes_equal
from .. core.testing_utils import assert_tables_equality
from .. core.testing_utils import ignore_warning
from .. core.system_of_units import pes
@@ -147,3 +149,48 @@ def test_sophronia_keeps_hitless_events(config_tmpdir, sophronia_config):
with tb.open_file(path_out) as output_file:
assert len(output_file.root.Run.events) == 1
assert "RECO" not in output_file.root


@ignore_warning.no_config_group
def test_sophronia_clustering_integration(config_tmpdir, sophronia_config):
"""
    Runs Sophronia twice (once with clustering disabled, once with it enabled) to verify:
1. Backward compatibility: No 'cluster' column when disabled.
2. Feature activation: 'cluster' column exists when enabled.
3. Data consistency: Enabling clustering does NOT change any other data.
"""
path_out_no_cluster = os.path.join(config_tmpdir, 'test_sophronia_no_cluster.h5')
path_out_with_cluster = os.path.join(config_tmpdir, 'test_sophronia_with_cluster.h5')

# Clustering disabled
config_no_cluster = dict(**sophronia_config)
config_no_cluster.update(dict( file_out = path_out_no_cluster
, event_range = 1
, clustering_params = None))
sophronia(**config_no_cluster)

# Clustering enabled
clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0
)
config_with_cluster = dict(**sophronia_config)
config_with_cluster.update(dict( file_out = path_out_with_cluster
, event_range = 1
, clustering_params = clustering_params))
sophronia(**config_with_cluster)

# Load both outputs
df_no_cluster = dio.load_dst(path_out_no_cluster, "RECO", "Events")
df_with_cluster = dio.load_dst(path_out_with_cluster, "RECO", "Events")

# ----- Assertions
assert not df_no_cluster.empty
assert not df_with_cluster.empty
assert 'cluster' not in df_no_cluster.columns, "'cluster' column should not exist when clustering is disabled."
assert 'cluster' in df_with_cluster.columns, "'cluster' column should exist when clustering is enabled."

# Compare all columns except 'cluster' for equality
df_with_cluster_compare = df_with_cluster.drop(columns=['cluster'])
assert_dataframes_equal(df_no_cluster, df_with_cluster_compare)
6 changes: 6 additions & 0 deletions invisible_cities/config/sophronia.conf
@@ -63,3 +63,9 @@ corrections = dict(
apply_temp = True,
norm_strat = kr,
apply_z = False)

clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0
)
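
Since clustering_params defaults to None in the city signature, the feature is opt-in. A sketch of the opt-out (assuming the config accepts None here the same way the function default does for corrections):

# Keep the previous behaviour: hits pass through the identity map untouched.
clustering_params = None
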
4 changes: 4 additions & 0 deletions invisible_cities/conftest.py
@@ -438,6 +438,10 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
filename = next100_mc_krmap,
apply_temp = False,
norm_strat = NormStrategy.kr)
, clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0)
)
return config

4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_deco.h5
Git LFS file not shown
4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_hits.h5
Git LFS file not shown
85 changes: 82 additions & 3 deletions invisible_cities/reco/hits_functions.py
@@ -1,7 +1,12 @@
import numpy as np
import pandas as pd

from .. types.ic_types import NN
from itertools import compress
from copy import deepcopy
from typing import List
from sklearn.cluster import DBSCAN

from .. types.ic_types import NN

EPSILON = np.finfo(np.float64).eps

@@ -64,8 +69,6 @@ def sipms_above_threshold(xys: np.ndarray, qs: np.ndarray, thr:float, energy: fl
return xs, ys, qs, es




def merge_NN_hits(hits: pd.DataFrame, same_peak: bool = True) -> pd.DataFrame:
"""
Finds NN hits (defined as hits with Q=NN) and removes them without energy
@@ -238,3 +241,79 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
if th <= 0: return hits
return (hits.groupby("Z", as_index=False)
.apply(apply_threshold, th=th))

def tag_hits_in_event(event_hits : pd.DataFrame
, *
, min_samples : int
, scale_xy : float
, scale_z : float
) -> pd.DataFrame:
"""
Applies DBSCAN clustering to a DataFrame containing hits from a single event.
    Hit coordinates are scaled to account for the anisotropy of the detector geometry.
    A 'cluster' column with the resulting labels is added to the DataFrame.

Parameters
----------
event_hits : pd.DataFrame
DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the XY coordinates before clustering.
scale_z : float
Scaling factor to apply to the Z coordinate before clustering.

Returns
-------
pd.DataFrame
The input DataFrame with a 'cluster' column added.
"""
coords = event_hits[['X', 'Y', 'Z']].to_numpy()
    # A proper scaling leads to neighbouring hits being separated
    # by a distance of 1 in the DBSCAN metric space
coords[:, :2] /= scale_xy
coords[:, 2] /= scale_z

    # The eps parameter is fixed to a value slightly above √3
    # so that diagonal neighbours are kept in the same cluster
labels = DBSCAN(eps=1.8, min_samples=min_samples).fit_predict(coords)
event_hits['cluster'] = labels

return event_hits

def cluster_tagger(df_hits : pd.DataFrame
, *
, min_samples : int
, scale_xy : float
, scale_z : float
) -> pd.DataFrame:
"""
    Groups the input DataFrame by 'event' and applies `tag_hits_in_event`
    to each event's group of hits.

Parameters
----------
df_hits : pd.DataFrame
DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'.
min_samples, scale_xy, scale_z :
See `tag_hits_in_event`

Returns
-------
pd.DataFrame
The input DataFrame with an added 'cluster' column indicating the
cluster label for each hit (-1 for noise).
"""
if df_hits.empty:
return df_hits.assign(cluster=pd.Series(dtype=int))

df_clustered = df_hits.groupby('event', as_index=False, group_keys=False) \
.apply( tag_hits_in_event
, min_samples = min_samples
, scale_xy = scale_xy
, scale_z = scale_z )

return df_clustered.set_index(df_hits.index)
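
A quick numerical check of the scaling and of the eps = 1.8 choice explained in the comments above (a sketch, not part of the diff; 15.55 mm and 4.0 mm are the scale values used in sophronia.conf):

import numpy as np

scale_xy, scale_z = 15.55, 4.0      # xy pitch-like and z-slice-like scales from the configuration

def scaled_distance(p, q):
    """Euclidean distance in the rescaled space that DBSCAN actually sees."""
    return np.linalg.norm((np.asarray(p) - np.asarray(q)) / np.array([scale_xy, scale_xy, scale_z]))

a = ( 0.0 ,  0.0 , 0.0)             # reference hit (x, y, z) in mm
b = (15.55, 15.55, 4.0)             # diagonal neighbour: one step in x, y and z
c = (31.10,  0.0 , 0.0)             # two steps away in x only

print(scaled_distance(a, b))        # ~1.73 = sqrt(3), below eps = 1.8 -> stays in the same neighbourhood
print(scaled_distance(a, c))        # 2.0, above eps = 1.8            -> not a direct neighbour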
