diff --git a/invisible_cities/reco/krmap_functions.py b/invisible_cities/reco/krmap_functions.py index a967041fd..8d0d57fec 100644 --- a/invisible_cities/reco/krmap_functions.py +++ b/invisible_cities/reco/krmap_functions.py @@ -1,9 +1,17 @@ -import numpy as np -import pandas as pd +import warnings +import itertools +import numpy as np +import pandas as pd +import scipy.stats as stats + + +from typing import Tuple, Optional from .. types.symbols import KrFitFunction from .. evm.ic_containers import FitFunction from .. core.fit_functions import polynom, expo +from ..core.core_functions import in_range, shift_to_bin_centers +from ..core.fit_functions import fit, get_chi2_and_pvalue def lin_seed(x : np.array, y : np.array): @@ -149,3 +157,326 @@ def transform_parameters(fit_output : FitFunction): return par, err, cov + +def create_df_kr_map(bins_x : np.array, + bins_y : np.array, + counts : np.array, + n_min : int, + r_max : float)->pd.DataFrame: + ''' + This function creates the dataframe in which the map parameters are stored. + + Parameters + ---------- + bins_x : np.array + Bins in x axis + bins_y : np.array + Bins in y axis + counts : np.array + Number of events falling into each bin + n_min : int + Min number of events per bin required to perform fits + r_max : float + Radius defining the active area of the detector + + Returns + ------- + kr_map : pd.DataFrame + Kr map dataframe with all the info prior to the fits: bin label, + events per bin, bin in/outside the active volume, bin position + (X, Y, R), etc. + ''' + n_xbins = len(bins_x)-1 + n_ybins = len(bins_y)-1 + b_center = [shift_to_bin_centers(axis) for axis in [bins_x, bins_y]] + + bin_index = range(n_xbins*n_ybins) + geom_comb = np.array(list(itertools.product(*b_center))) + r_values = np.sum(geom_comb**2, axis=1)**0.5 + + kr_map = pd.DataFrame(dict(bin = bin_index, + counts = counts, + X = geom_comb.T[0], + Y = geom_comb.T[1], + R = r_values, + in_active = r_values <= r_max, + has_min_counts = counts >= n_min, + fit_success = False, + valid = False, + e0 = np.nan, + ue0 = np.nan, + lt = np.nan, + ult = np.nan, + cov = np.nan, + res_std = np.nan, + chi2 = np.nan, + pval = np.nan)) + + return kr_map + + +def get_number_of_bins(nevents : Optional[int] = None, + thr : Optional[int] = 1e6, + n_bins : Optional[np.array] = None)->np.array: + ''' + Computes the number of XY bins to be used in the creation + of correction map regarding the number of selected events. + + Parameters + --------- + nevents: int (optional) + Total number of provided events for the map computation. + thr: int (optional) + Event threshold to use 50x50 or 100x100 binning. + n_bins: int (optional) + The number of bins to use can be chosen a priori. If given, + the returned number of bins is the one provided by the user. + However, if no number of bins is given in advance, this will + automatically select a value depending on the amount of events + contained in the dst and the threshold. + + Returns + ------- + n_bins: int + Number of bins in each direction (X,Y) (square map). + ''' + if n_bins is not None: return n_bins; + elif nevents < thr: return np.array([50, 50]); + else: return np.array([100, 100]); + + +def get_bin_counts_and_event_bin_id(dst : pd.DataFrame, + bins_x : np.array, + bins_y : np.array): + ''' + This function distributes all the events in the DST into the selected + bins. Given a certain binning, it computes which events fall into each + square of the grid formed by the bins arrays. binned_statistic_2d returns + counts (matrix here each matrix element is the number of events falling + into that specific bin) and bin_indexes (a 2-D array labeling each event + with the bin it belongs). Then counts is flattened into 1-D, bin_indexes + is transformed into 1-D using the number of bins on each axis. + + Parameters + ---------- + dst : pd.DataFrame + Krypton dataframe. + bins_x : np.array + Binning in x axis + bins_y : np.array + Binning in y axis + + Returns + ------- + counts : np.array + Total number of events falling into each bin + bin_labels : np.array + 1D bin label for each event + + Further info: + ------------- + Why set expand_binnumbers to True (2D binning) if then we transform it to 1D? + Because even though expand_binnumbers = False returns 1-D labels, it also adds + two additional bins (per axis), taking into account out-of-range events which + dont fall into the the binning passed to the binned_statistic_2d function. But + since the dst is going to be previously selected and filtered with the desired + binning, it's not convenient to use that. Maybe a visual example is more useful: + + 2x2 binning (4 bins), natural index values shown both as 1D and 2D: + + || 0 | 1 || || (0, 0) | (0, 1) || + || 2 | 3 || (1D) = || (1, 0) | (1, 1) || (2D) + + Using expand_binnumbers = False, the 1D index values instead of (0, ..., 3) + would be (0, ..., 15): + + || 0 | 1 | 2 | 3 || + || 4 | 5 | 6 | 7 || The bins that we "care about" (inner part) have indexes + || 8 | 9 |10 |11 || 5, 6, 9, 10 which I believe is not convenient at all. + ||12 |13 |14 |15 || This creates (nx+2)*(ny+2) bins. + ''' + n_xbins = len(bins_x)-1 + n_ybins = len(bins_y)-1 + + counts, _, _, bin_indexes = stats.binned_statistic_2d(x=dst.X, y=dst.Y, values=None, + bins=[bins_x, bins_y], statistic='count', + expand_binnumbers=True) + + counts = counts.flatten() + bin_indexes -= 1 + bin_labels = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins), + mode='clip', order = 'F') + + return counts, bin_labels + + +def calculate_residuals(x : np.array, + y : np.array, + fit_output : FitFunction): + ''' + Calculate residuals and their standard deviation for the fitted data. + + Parameters + ---------- + x : pd.array + Independent variable + y : pd.array + Dependent variable + fit_output : FitFunction + Container for IC's fit function result + + Returns + ------- + res : np.array + Residuals. + std : float + Standard deviation of residuals. + ''' + + res = y - fit_output.fn(x); std = res.std() + + return res, std + + +def calculate_pval(residuals : np.array): + ''' + Calculate the p-value for the Shapiro-Wilk normality test of residuals. + + Parameters + ---------- + residuals : np.array + Residuals from the fitted model. + + Returns + ------- + pval : float + p-value for the Shapiro-Wilk normality test. + ''' + + pval = stats.shapiro(residuals)[1] if (len(residuals) > 3) else 0. + + return pval + + +def valid_bin_counter(map_df : pd.DataFrame, + validity_parameter : Optional[float] = 0.9): + ''' + Count the number of valid bins in the map DataFrame and issue a warning + if the validity threshold is not met. + + Parameters + ---------- + map_df : pd.DataFrame + DataFrame containing map data. + validity_parameter : float + Threshold for the ratio of valid bins (default set to 0.9). + + Returns + ------- + valid_per : float + Percentage of valid bins. + ''' + + inside = np.count_nonzero(map_df.in_active) + valid = np.count_nonzero(map_df.in_active & map_df.valid) + + valid_per = valid / inside if inside else 0 + + if valid_per <= validity_parameter: + warnings.warn(f"{inside-valid} inner bins are not valid. {100*valid_per:.1f}% are successful.", UserWarning) + + return valid_per + + +def fit_and_fill_map(map_bin : pd.DataFrame, + dst : pd.DataFrame, + fittype : KrFitFunction): + ''' + This function is the core of the map computation. It's the one in charge of looping + over all the bins, performing the calculations of the lifetime fits and filling the + krypton map dataframe with all the obtained parameters. + + Basically, it's applied to the dataframe, and bin by bin, it checks if the conditions + for the map computation are met. If it's the case, it will call the corresponding fit + function specified in the fittype parameter, calculate all the different map values + and fill the dataframe with them. + + Parameters + ---------- + map_bin : pd.DataFrame + KrMap dataframe + dst : pd.DataFrame + kdst dataframe. + fittype : str + Type of fit to perform. + + Returns + ------- + kr_map : pd.DataFrame + DataFrame containing map parameters. + ''' + if not map_bin.in_active or not map_bin.has_min_counts: return map_bin + + dst_bin = dst.loc[dst.bin_index == map_bin.bin] + + fit_func, seed = get_function_and_seed_lt(fittype = fittype) + x, y = select_fit_variables (fittype = fittype, + dst = dst_bin) + + fit_output = fit(func = fit_func, + x = x, + y = y, + seed = seed(x, y)) + + par, err, cov = transform_parameters(fit_output = fit_output) + + res = y - fit_output.fn(x) + std = res.std() + chi2 = get_chi2_and_pvalue(y, fit_output.fn(x), len(x)-len(par), std)[0] + pval = stats.shapiro(res)[1] if (len(res) > 3) else 0. + + map_bin['e0'] = par[0] + map_bin['ue0'] = err[0] + map_bin['lt'] = par[1] + map_bin['ult'] = err[1] + map_bin['cov'] = cov + map_bin['res_std'] = std + map_bin['chi2'] = chi2 + map_bin['pval'] = pval + map_bin['fit_success'] = fit_output.ier in range(1, 5) + map_bin['valid'] = map_bin['fit_success'] and map_bin['has_min_counts'] and map_bin['in_active'] + + return map_bin + + +def regularize_map(maps : pd.DataFrame, + x2range : Tuple[float, float] = None): + ''' + Given a certain chi2 range, this function checks which bins are outside that + range and substitutes the (probably wrong) values of the map (e0, lt, etc) for + the mean value of the whole map. + + Parameters + ---------- + maps: pd.DataFrame + Map to check the outliers + x2range : Tuple[float, float] + Range for chi2 + + Returns + --------- + new_map: pd.DataFrame + Regularized map.''' + new_map = maps.copy() + outliers = new_map.in_active & ~new_map.valid + + if isinstance(x2range, Tuple): + outliers &= ~in_range(new_map.chi2, *x2range) + else: outliers = outliers + + new_map['e0'] [outliers] = np.nanmean(maps['e0']) + new_map['lt'] [outliers] = np.nanmean(maps['lt']) + new_map['ue0'][outliers] = np.nanmean(maps['ue0']) + new_map['ult'][outliers] = np.nanmean(maps['ult']) + + return new_map diff --git a/invisible_cities/reco/krmap_functions_test.py b/invisible_cities/reco/krmap_functions_test.py index fcb5cbd34..f7b5708db 100644 --- a/invisible_cities/reco/krmap_functions_test.py +++ b/invisible_cities/reco/krmap_functions_test.py @@ -5,6 +5,7 @@ import pandas as pd import scipy.optimize as so +from pytest import mark, warns from hypothesis import given, settings from hypothesis.strategies import floats, integers @@ -12,7 +13,8 @@ from ..types.symbols import KrFitFunction from .. evm.ic_containers import FitFunction from .. core.fit_functions import expo - +from .. core.core_functions import in_range +from .. conftest import KrMC_kdst @given(floats(min_value = 0, max_value = 10), floats(min_value = 10, max_value = 20), @@ -104,3 +106,133 @@ def test_transform_parameters(a, b): npt.assert_allclose(transformed_par, [ E0_expected, lt_expected]) npt.assert_allclose(transformed_err, [s_E0_expected, s_lt_expected]) assert np.isclose(transformed_cov, cov_expected) + + +@mark.parametrize('n_bins', [78, 125]) +def test_get_number_of_bins_performance_default_value(n_bins): + nevents = 1e7 + thr = 1e6 + bins = krf.get_number_of_bins(nevents, thr, n_bins) + + npt.assert_array_equal(bins, n_bins) + + +@given(integers(min_value = 0, max_value = 1e10), + integers(min_value = 0, max_value = 1e10)) +def test_get_number_of_bins_with_thresholds(nevents, thr): + bins = krf.get_number_of_bins(nevents, thr) + + if nevents < thr: + npt.assert_array_equal(bins, np.array([50, 50])) + if nevents >= thr: + npt.assert_array_equal(bins, np.array([100, 100])) + + +@mark.parametrize('n_bins n_min r_max'.split(), + (((10, 10), 50, 200), + ( (50, 50), 10, 350), + ((100, 50), 1, 500),)) +def test_create_df_kr_map_shape(n_bins, n_min, r_max): + + XYrange = (500, 500) + n_bins_x = n_bins[0] + n_bins_y = n_bins[1] + + bins_x = np.linspace(*XYrange, n_bins_x+1) + bins_y = np.linspace(*XYrange, n_bins_y+1) + counts = np.full(shape=(n_bins_x, n_bins_y), fill_value = 100, dtype=int).flatten() + + df = krf.create_df_kr_map(bins_x, bins_y, counts, n_min, r_max) + + columns = ['bin', 'counts', 'e0', 'ue0', 'lt', 'ult', 'cov', 'res_std', 'chi2', + 'pval', 'in_active', 'has_min_counts', 'fit_success', 'valid', 'R', 'X', 'Y'] + + assert all(element in columns for element in df.columns.values) + assert df.bin.nunique() == n_bins_x*n_bins_y + + +@mark.parametrize('bins_x bins_y'.split(), + ((np.linspace(-100, +100, 5), np.linspace(-100, +100, 10)), + ( np.linspace(-400, +400, 10), np.linspace(-400, +400, 10)), + )) +def test_get_bin_counts_and_event_bin_id(KrMC_kdst, bins_x, bins_y): + + kr_df = KrMC_kdst[0].read + inside = in_range(kr_df.X, min(bins_x), max(bins_x)) & in_range(kr_df.Y, min(bins_y), max(bins_y)) + + counts, bin_labels = krf.get_bin_counts_and_event_bin_id(dst = kr_df, bins_x = bins_x, bins_y = bins_y) + + assert counts.sum() == kr_df[inside].event.nunique() + assert len(bin_labels) == kr_df.event.nunique() + +@mark.parametrize('n', (0, 8, 50)) +def test_valid_bin_counter_all_in_active(n): + inside = np.ones(50, dtype = bool) + valid = inside.copy() + valid[:n//2] = False + valid[len(valid)-n//2:] = False + krmap = pd.DataFrame(dict(in_active = inside, + valid = valid)) + + assert krf.valid_bin_counter(krmap) == ((50-n) / 50) + + +def test_valid_bin_counter_valid_subset_of_active(): + inside = np.ones(50, dtype = bool) + inside[:3] = False + inside[-3:] = False + valid = inside.copy() + valid[:4] = False + valid[-4:] = False + krmap = pd.DataFrame(dict(in_active = inside, + valid = valid)) + + assert krf.valid_bin_counter(krmap) == ((50-6-2) / (50-6)) + +@mark.parametrize('ni', (0, 4, 25)) +@mark.parametrize('nv', (0, 3, 25)) +def test_valid_bin_counter_valid_outside_active(ni, nv): + inside = np.ones(50, dtype = bool) + inside[len(inside)-ni:] = False + valid = np.ones(50, dtype = bool) + valid[:nv] = False + krmap = pd.DataFrame(dict(in_active = inside, + valid = valid)) + + assert krf.valid_bin_counter(krmap) == ((50-ni-nv) / (50-ni)) + +@mark.parametrize('n_bins rmax validity_parameter'.split(), + ((( 5, 5), 200, 1.0), + (( 3, 3), 100, 0.2))) +def test_valid_bin_counter_warning(n_bins, rmax, validity_parameter): + counts = np.array(range(n_bins[0]*n_bins[1])) + krmap = krf.create_df_kr_map(bins_x = np.linspace(-rmax, +rmax, n_bins[0]+1), + bins_y = np.linspace(-rmax, +rmax, n_bins[1]+1), + counts = counts, + n_min = 0, + r_max = np.nextafter(np.sqrt(2)*rmax, np.inf)) + + krmap.valid.iloc[0 : 9] = True + + if validity_parameter == 1: + with warns(UserWarning, match = "inner bins are not valid."): + krf.valid_bin_counter(map_df = krmap, validity_parameter = validity_parameter) + + +def test_regularize_map_no_chi2range(): + + inside = np.ones(50, dtype = bool) + valid = inside.copy() + valid[20:30] = False + maps = pd.DataFrame(dict(in_active = inside, + valid = valid, + e0 = np.full(50, 10), + lt = np.full(50, 1000), + ue0 = np.full(50, 2), + ult = np.full(50, 100))) + + new_map = krf.regularize_map(maps = maps, x2range = None) + + for col in ['e0', 'lt', 'ue0', 'ult']: + mean_value = np.nanmean(maps[col]) + assert np.allclose(new_map[col][~valid], mean_value, rtol=0.01)