Skip to content

Commit

Permalink
Merge pull request #2 from IceCubeOpenSource/ReducedSummaryData
Browse files Browse the repository at this point in the history
New feature: reduced summary data
Improvement: runtime slightly improved due to container data only being set if written to frame
  • Loading branch information
mhuen authored Sep 8, 2020
2 parents 2f52714 + 8a8af11 commit ee25c8a
Show file tree
Hide file tree
Showing 5 changed files with 273 additions and 47 deletions.
17 changes: 13 additions & 4 deletions ic3_data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ def Configure(self):
self._dom_exclusions = self._container.config['dom_exclusions']
self._partial_exclusion = self._container.config['partial_exclusion']

if self._output_key is None:
self._write_to_frame = False
else:
self._write_to_frame = True

# initalize data fields of data container
self._container.initialize()
self._batch_index = 0
Expand Down Expand Up @@ -86,6 +91,8 @@ def Configure(self):
# Check if it is a data format that computes everything at once,
# or if it is one that computes the values for one DOM at a time
if self._config['data_format'] in [
'total_dom_charge',
'reduced_summary_statistics_data',
'cascade_classification_data',
'mc_tree_input_data',
]:
Expand Down Expand Up @@ -241,7 +248,7 @@ def Physics(self, frame):
self._container.runtime_batch[self._batch_index] = elapsed_time

# Write data to frame
if self._output_key is not None:
if self._write_to_frame:
frame[self._output_key + '_bin_exclusions'] = \
self._container.bin_exclusions
frame[self._output_key + '_bin_indices'] = \
Expand Down Expand Up @@ -316,8 +323,9 @@ def _fill_dnn_container(self, om_key, bin_values_list, bin_indices_list,

# Add bin values and indices if non-empty
if bin_values_list:
self._container.bin_values[om_key] = bin_values_list
self._container.bin_indices[om_key] = bin_indices_list
if self._write_to_frame:
self._container.bin_values[om_key] = bin_values_list
self._container.bin_indices[om_key] = bin_indices_list

for value, index in zip(bin_values_list, bin_indices_list):
if self._is_str_dom_format:
Expand All @@ -341,7 +349,8 @@ def _fill_dnn_container(self, om_key, bin_values_list, bin_indices_list,

# Add bin exclusions if non-empty
if bin_exclusions_list:
self._container.bin_exclusions[om_key] = bin_exclusions_list
if self._write_to_frame:
self._container.bin_exclusions[om_key] = bin_exclusions_list
for index in bin_exclusions_list:
if self._is_str_dom_format:
if index == -1:
Expand Down
117 changes: 117 additions & 0 deletions ic3_data/data_formats_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from ic3_data.ext_boost import get_cascade_classification_data
from ic3_data.ext_boost import get_mc_tree_input_data_dict
from ic3_data.ext_boost import get_reduced_summary_statistics_data

"""All data format functions must have the following signature:
Expand Down Expand Up @@ -49,6 +50,122 @@
"""


def total_dom_charge(frame, pulses, config, dom_exclusions,
partial_exclusion, *args, **kwargs):
"""Get the total DOM charge per DOM
Parameters
----------
frame : I3Frame
The current frame.
pulses : I3RecoPulseSeriesMap
The pulse series map from which to calculate the DNN input data.
config : dict
A dictionary that contains all configuration settings.
dom_exclusions : list of str, None
List of frame keys that define DOMs or TimeWindows that should be
excluded. Typical values for this are:
['BrightDOMs','SaturationWindows', 'BadDomsList', 'CalibrationErrata']
partial_exclusion : bool, None
If True, partially exclude DOMS, e.g. only omit pulses from
excluded TimeWindows defined in 'dom_exclusions'.
If False, all pulses from a DOM will be excluded if the omkey
exists in the dom_exclusions.
*args
Variable length argument list.
**kwargs
Arbitrary keyword arguments.
Returns
-------
float
Global time offset. Time variables will be shifted by this global
offset. E.g. if the network predicts a vertex time based on input
data relative to the vertex time, the predicted time will be shifted
by this global offset.
dict
A dictionary with the structure
I3OMKey: (list of float, list of int, list of int)
The first list of the tuple are the bin values and the second list
are the bin indices. The last list is a list of bin exclusions.
list
Bin values: values of non-zero data bins.
list
Bin indices: indices to which the returned values belong to.
list
Bin exclusions: indices which define bins that will be excluded.
"""

add_total_charge = True
add_t_first = False
add_t_std = False

global_time_offset, data_dict = get_reduced_summary_statistics_data(
pulses, add_total_charge, add_t_first, add_t_std)

return global_time_offset, data_dict


def reduced_summary_statistics_data(frame, pulses, config, dom_exclusions,
partial_exclusion, *args, **kwargs):
"""Get a reduced set of summary statistics per DOM
These include: total dom charge, time of first pulse, std. dev of pulse
times. The pulse times are calculated relative to the charge weighted
mean time of all pulses.
Parameters
----------
frame : I3Frame
The current frame.
pulses : I3RecoPulseSeriesMap
The pulse series map from which to calculate the DNN input data.
config : dict
A dictionary that contains all configuration settings.
dom_exclusions : list of str, None
List of frame keys that define DOMs or TimeWindows that should be
excluded. Typical values for this are:
['BrightDOMs','SaturationWindows', 'BadDomsList', 'CalibrationErrata']
partial_exclusion : bool, None
If True, partially exclude DOMS, e.g. only omit pulses from
excluded TimeWindows defined in 'dom_exclusions'.
If False, all pulses from a DOM will be excluded if the omkey
exists in the dom_exclusions.
*args
Variable length argument list.
**kwargs
Arbitrary keyword arguments.
Returns
-------
float
Global time offset. Time variables will be shifted by this global
offset. E.g. if the network predicts a vertex time based on input
data relative to the vertex time, the predicted time will be shifted
by this global offset.
dict
A dictionary with the structure
I3OMKey: (list of float, list of int, list of int)
The first list of the tuple are the bin values and the second list
are the bin indices. The last list is a list of bin exclusions.
list
Bin values: values of non-zero data bins.
list
Bin indices: indices to which the returned values belong to.
list
Bin exclusions: indices which define bins that will be excluded.
"""

add_total_charge = True
add_t_first = True
add_t_std = True

global_time_offset, data_dict = get_reduced_summary_statistics_data(
pulses, add_total_charge, add_t_first, add_t_std)

return global_time_offset, data_dict


def cascade_classification_data(frame, pulses, config, dom_exclusions,
partial_exclusion, *args, **kwargs):
"""Get pulses on DOMs in time bins relative to expected arrival times
Expand Down
98 changes: 97 additions & 1 deletion ic3_data_ext/ext_boost.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ wrapped with boost python.
#include <boost/version.hpp>
#include <boost/python.hpp>

#include "utils.cpp"

/*
Depending on the boost version, we need to use numpy differently.
Priot to Boost 1.63 (BOOST_VERSION == 106300) numpy was not directly
Prior to Boost 1.63 (BOOST_VERSION == 106300) numpy was not directly
included in boost/python
See answers and discussion provided here:
Expand Down Expand Up @@ -149,6 +151,97 @@ See answers and discussion provided here:
/******************************************************
Functions with pybinding for python-based usage
******************************************************/
template <typename T>
inline boost::python::tuple get_reduced_summary_statistics_data(
const boost::python::object& pulse_map_obj,
const bool add_total_charge,
const bool add_t_first,
const bool add_t_std
) {

// Get pulse map
I3RecoPulseSeriesMap& pulse_map = boost::python::extract<I3RecoPulseSeriesMap&>(pulse_map_obj);

// create a dict for the output data
boost::python::dict data_dict;
T global_offset_time = 0.;

// Iterate over pulses once to obtain global time offset
if (add_t_first){
MeanVarianceAccumulator<T> acc_total;
for (auto const& dom_pulses : pulse_map){
for (auto const& pulse : dom_pulses.second){
acc_total.add_element(pulse.GetTime(), pulse.GetCharge());
}
}
global_offset_time = acc_total.mean();
}

// now iterate over DOMs and pulses to fill data_dict
for (auto const& dom_pulses : pulse_map){

// check if pulses are present
unsigned int n_pulses = dom_pulses.second.size();
if (n_pulses == 0){
continue;
}

// create and initialize variables
T dom_charge_sum = 0.0;
MeanVarianceAccumulator<T> acc;

// loop through pulses
for (auto const& pulse : dom_pulses.second){

// total DOM charge
dom_charge_sum += pulse.GetCharge();

// weighted mean and std
if (add_t_std){
acc.add_element(pulse.GetTime(), pulse.GetCharge());
}
}

// add data
int counter = 0;
boost::python::list bin_exclusions_list; // empty dummy exclusions
boost::python::list bin_indices_list;
boost::python::list bin_values_list;

// Total DOM charge
if (add_total_charge){
bin_indices_list.append(counter);
bin_values_list.append(dom_charge_sum);
counter += 1;
}

// time of first pulse
if (add_t_first){
bin_indices_list.append(counter);
bin_values_list.append(
dom_pulses.second[0].GetTime() - global_offset_time);
counter += 1;
}

// time std deviation of pulses at DOM
if (add_t_std){
bin_indices_list.append(counter);
if (n_pulses == 1){
bin_values_list.append(0.);
} else{
bin_values_list.append(acc.std());
}
counter += 1;
}

// add to data_dict
data_dict[dom_pulses.first] = boost::python::make_tuple(
bin_values_list, bin_indices_list, bin_exclusions_list);
}

return boost::python::make_tuple(global_offset_time, data_dict);
}

template <typename T>
inline boost::python::dict get_cascade_classification_data(
boost::python::object& frame_obj,
Expand Down Expand Up @@ -819,6 +912,9 @@ BOOST_PYTHON_MODULE(ext_boost)
boost::python::def("get_time_bin_exclusions",
&get_time_bin_exclusions);

boost::python::def("get_reduced_summary_statistics_data",
&get_reduced_summary_statistics_data<double>);

boost::python::def("get_cascade_classification_data",
&get_cascade_classification_data<double>);

Expand Down
44 changes: 2 additions & 42 deletions ic3_data_ext/ext_pybind11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <string>
#include <math.h>

#include "utils.cpp"


/******************************************************
Time critical functions for the Deep Learning-based
Expand All @@ -17,48 +19,6 @@ wrapped with pybind11.
// Actual C++ snippets. See docstrings at the bottom for argument info.
namespace py = pybind11;


template <typename T>
class MeanVarianceAccumulator{
// adopted from:
// http://www.nowozin.net/sebastian/blog/streaming-mean-and-variance-computation.html
public:
T sumw = 0.0;
T wmean = 0.0;
T t = 0.0;
int n = 0;

void add_element(const T& value, const T& weight){
assert(weight >= 0.0);

T q = value - this->wmean;
T temp_sumw = this->sumw + weight;
T r = q*weight / temp_sumw;

this->wmean += r;
this->t += q*r*this->sumw;
this->sumw = temp_sumw;
this->n += 1;
}

T count(){
return this->n;
}

T mean(){
return this->wmean;
}

T var(){
return (this->t * this->n)/( this->sumw*( this->n - 1));
}

T std(){
return sqrt(this->var());
}
};


template <typename T>
inline py::list get_summary_data( const py::array_t<T> dom_charges,
const py::array_t<T> rel_dom_times,
Expand Down
Loading

0 comments on commit ee25c8a

Please sign in to comment.