Merge pull request #1737 from astrofrog/abstract-compute-statistic

Move computation of statistics to the Data object

astrofrog authored May 22, 2018
2 parents 63d21a4 + d29cf8b commit bab74b7
Showing 8 changed files with 255 additions and 150 deletions.
2 changes: 1 addition & 1 deletion doc/whatsnew/whatsnew.rst
@@ -163,7 +163,7 @@ Profile viewer
 
 Glue now features a new profile viewer that can be used to show data collapsed
 along all but one dimension using a variety of functions (mean, median, maximum,
-minimim, and so on). This new viewer replaces the previous 'spectrum' tool
+minimum, and so on). This new viewer replaces the previous 'spectrum' tool
 (which was restricted to 3 dimensions and mostly designed to work with
 astronomical data) and includes the same functionality to fit models to profiles
 or collapse data in an image viewer based on an interval selected in the profile
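As a concrete picture of the collapse the profile viewer performs, here is an illustrative NumPy sketch; the cube shape and axis choices are invented and this is not glue's implementation:

    import numpy as np

    # A hypothetical spectral cube with axes (velocity, y, x).
    cube = np.random.random((50, 128, 128))

    # Collapse over the two spatial axes to get a 1-d profile along velocity;
    # 'mean' could equally be median, maximum, minimum, and so on.
    profile = np.nanmean(cube, axis=(1, 2))
    print(profile.shape)  # (50,)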
106 changes: 105 additions & 1 deletion glue/core/data.py
@@ -16,14 +16,15 @@
 from glue.core.decorators import clear_cache
 from glue.core.util import split_component_view
 from glue.core.hub import Hub
-from glue.core.subset import Subset, SubsetState
+from glue.core.subset import Subset, SubsetState, SliceSubsetState
 from glue.core.component_id import ComponentIDList
 from glue.core.component_link import ComponentLink, CoordinateComponentLink
 from glue.core.exceptions import IncompatibleAttribute
 from glue.core.visual import VisualAttributes
 from glue.core.coordinates import Coordinates
 from glue.core.contracts import contract
 from glue.config import settings
+from glue.utils import compute_statistic, unbroadcast, iterate_chunks
 
 
 # Note: leave all the following imports for component and component_id since
@@ -34,6 +34,8 @@
 
 __all__ = ['Data']
 
+N_CHUNK_MAX = 40000000
+
 
 class Data(object):
 
@@ -1153,6 +1156,107 @@ def update_values_from_data(self, data):
         for subset in self.subsets:
             clear_cache(subset.subset_state.to_mask)
 
+    # The following are methods for accessing the data in various ways that
+    # can be overridden by subclasses that want to improve performance.
+
+    def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
+                          finite=True, positive=False, percentile=None, view=None,
+                          random_subset=None):
+        """
+        Compute a statistic for the data.
+
+        Parameters
+        ----------
+        statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
+            The statistic to compute
+        cid : `ComponentID` or str
+            The component ID to compute the statistic on - if given as a string
+            this will be assumed to be for the component belonging to the dataset
+            (not external links).
+        subset_state : `SubsetState`
+            If specified, the statistic will only include the values that are in
+            the subset specified by this subset state.
+        axis : None or int or tuple of int
+            If specified, the axis/axes to compute the statistic over.
+        finite : bool, optional
+            Whether to include only finite values in the statistic. This should
+            be `True` to ignore NaN/Inf values.
+        positive : bool, optional
+            Whether to include only (strictly) positive values in the statistic.
+            This is used for example when computing statistics of data shown in
+            log space.
+        percentile : float, optional
+            If ``statistic`` is ``'percentile'``, the ``percentile`` argument
+            should be given and specify the percentile to calculate in the
+            range [0:100].
+        random_subset : int, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic. This can only be used if ``axis`` is `None`.
+        """
+
+        # TODO: generalize chunking to more types of axis
+
+        if (view is None and
+                isinstance(axis, tuple) and
+                len(axis) == self.ndim - 1 and
+                self.size > N_CHUNK_MAX and
+                not isinstance(subset_state, SliceSubsetState)):
+
+            # We operate in chunks here to avoid memory issues.
+
+            # TODO: there are cases where the code below is not optimized
+            # because the mask may be computable for a single slice and
+            # broadcastable to all slices - normally ROISubsetState takes care
+            # of that but if we call it once per view it won't. In the future we
+            # could ask a SubsetState whether it is broadcasted along
+            # axis_index.
+
+            axis_index = [a for a in range(self.ndim) if a not in axis][0]
+
+            result = np.zeros(self.shape[axis_index])
+
+            chunk_shape = list(self.shape)
+
+            # Deliberately leave n_chunks as a float to avoid rounding twice
+            n_chunks = self.size / N_CHUNK_MAX
+
+            chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks))
+
+            for chunk_view in iterate_chunks(self.shape, chunk_shape=chunk_shape):
+                values = self.compute_statistic(statistic, cid, subset_state=subset_state,
+                                                axis=axis, finite=finite, positive=positive,
+                                                percentile=percentile, view=chunk_view)
+                result[chunk_view[axis_index]] = values
+
+            return result
+
+        if subset_state:
+            if isinstance(subset_state, SliceSubsetState) and view is None:
+                data = subset_state.to_array(self, cid)
+                mask = None
+            else:
+                data = self[cid]
+                mask = subset_state.to_mask(self, view)
+        else:
+            data = self[cid, view]
+            mask = None
+
+        if axis is None and mask is None:
+            # Since we are just finding overall statistics, not along axes, we
+            # can remove any broadcasted dimension since these should not affect
+            # the statistics.
+            data = unbroadcast(data)
+
+        if random_subset and data.size > random_subset:
+            if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size:
+                self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset))
+            data = data.ravel()[self._random_subset_indices[1]]
+            if mask is not None:
+                mask = mask.ravel()[self._random_subset_indices[1]]
+
+        return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
+                                 positive=positive, percentile=percentile)
+
+
 @contract(i=int, ndim=int)
 def pixel_label(i, ndim):
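For readers tracking the API change, here is a minimal usage sketch of the new method; the dataset and attribute name are made up, and everything else follows the signature above:

    import numpy as np
    from glue.core import Data

    d = Data(x=np.random.random((4, 128, 128)), label='example')

    # Overall mean of 'x', ignoring NaN/Inf values.
    mean_x = d.compute_statistic('mean', d.id['x'], finite=True)

    # Percentile limits as the state helpers now compute them: a 95% interval
    # excludes 2.5% at each end.
    lo = d.compute_statistic('percentile', d.id['x'], percentile=2.5)
    hi = d.compute_statistic('percentile', d.id['x'], percentile=97.5)

    # Collapse along all but the first axis, as the profile viewer does. For
    # datasets with more than N_CHUNK_MAX values this path is computed in
    # chunks along the kept axis to bound memory use.
    profile = d.compute_statistic('mean', d.id['x'], axis=(1, 2))
    print(profile.shape)  # (4,)

Since viewers and state helpers now funnel through this single method, a subclass backed by out-of-core or remote storage can, in principle, override compute_statistic alone to speed all of them up.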
114 changes: 35 additions & 79 deletions glue/core/state_objects.py
@@ -9,7 +9,6 @@
                                     HasCallbackProperties, CallbackList)
 from glue.core.state import saver, loader
 from glue.core.component_id import PixelComponentID
-from glue.utils import unbroadcast
 
 __all__ = ['State', 'StateAttributeCacheHelper',
            'StateAttributeLimitsHelper', 'StateAttributeSingleValueHelper']
@@ -141,21 +140,11 @@ def __init__(self, state, attribute, cache=None, **kwargs):
 
     @property
     def data_values(self):
-        # For subsets in 'data' mode, we want to compute the limits based on
-        # the full dataset, not just the subset.
-        if isinstance(self.data, Subset):
-            return self.data.data[self.component_id]
-        else:
-            return self.data[self.component_id]
+        return self.data[self.component_id]
 
     @property
     def data_component(self):
-        # For subsets in 'data' mode, we want to compute the limits based on
-        # the full dataset, not just the subset.
-        if isinstance(self.data, Subset):
-            return self.data.data.get_component(self.component_id)
-        else:
-            return self.data.get_component(self.component_id)
+        return self.data.get_component(self.component_id)
 
     def invalidate_cache(self):
         self._cache.clear()
@@ -165,7 +154,12 @@ def data(self):
         if self.attribute is None:
             return None
         else:
-            return self.attribute.parent
+            # For subsets in 'data' mode, we want to compute the limits based on
+            # the full dataset, not just the subset.
+            if isinstance(self.attribute.parent, Subset):
+                return self.attribute.parent.data
+            else:
+                return self.attribute.parent
 
     @property
     def component_id(self):
@@ -253,7 +247,7 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper):
     attribute : str
         The attribute name - this will be populated once a dataset is assigned
         to the helper.
-    percentile_subset : int
+    random_subset : int
         How many points to use at most for the percentile calculation (using all
         values is highly inefficient and not needed)
     margin : float
@@ -285,12 +279,12 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper):
     values_names = ('lower', 'upper')
     modifiers_names = ('log', 'percentile')
 
-    def __init__(self, state, attribute, percentile_subset=10000, margin=0, cache=None, **kwargs):
+    def __init__(self, state, attribute, random_subset=10000, margin=0, cache=None, **kwargs):
 
         super(StateAttributeLimitsHelper, self).__init__(state, attribute, cache=cache, **kwargs)
 
         self.margin = margin
-        self.percentile_subset = percentile_subset
+        self.random_subset = random_subset
         self.subset_indices = None
 
         if self.attribute is not None:
@@ -333,48 +327,28 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
 
             exclude = (100 - percentile) / 2.
 
-            data_values = self.data_values
-
-            # Since we are just finding overall statistics, not along axes, we
-            # can remove any broadcasted dimension since these should not affect
-            # the statistics.
-            data_values = unbroadcast(data_values)
-
-            if data_values.size > self.percentile_subset:
-                if self.subset_indices is None or self.subset_indices[0] != data_values.size:
-                    self.subset_indices = (data_values.size,
-                                           np.random.randint(0, data_values.size,
-                                                             self.percentile_subset))
-                data_values = data_values.ravel()[self.subset_indices[1]]
-
-            if log:
-                data_values = data_values[data_values > 0]
-                if len(data_values) == 0:
-                    self.set(lower=0.1, upper=1, percentile=percentile, log=log)
-                    return
-
-            # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
-            # they don't exclude inf/-inf
-            if data_values.dtype.kind != 'M':
-                data_values = data_values[np.isfinite(data_values)]
-
-            if data_values.size > 0:
-
-                if percentile == 100:
-
-                    if data_values.dtype.kind == 'M':
-                        lower = data_values.min()
-                        upper = data_values.max()
-                    else:
-                        lower = np.min(data_values)
-                        upper = np.max(data_values)
-
-                else:
-
-                    lower = np.percentile(data_values, exclude)
-                    upper = np.percentile(data_values, 100 - exclude)
+            data_component = self.data_component
+
+            if percentile == 100:
+                lower = self.data.compute_statistic('minimum', cid=self.component_id,
+                                                    finite=True, positive=log,
+                                                    random_subset=self.random_subset)
+                upper = self.data.compute_statistic('maximum', cid=self.component_id,
+                                                    finite=True, positive=log,
+                                                    random_subset=self.random_subset)
+            else:
+                lower = self.data.compute_statistic('percentile', cid=self.component_id,
+                                                    percentile=exclude, positive=log,
+                                                    random_subset=self.random_subset)
+                upper = self.data.compute_statistic('percentile', cid=self.component_id,
+                                                    percentile=100 - exclude, positive=log,
+                                                    random_subset=self.random_subset)
+
+            if not isinstance(lower, np.datetime64) and np.isnan(lower):
+                lower, upper = 0, 1
+            else:
 
-                if self.data_component.categorical:
+                if data_component.categorical:
                     lower = np.floor(lower - 0.5) + 0.5
                     upper = np.ceil(upper + 0.5) - 0.5
 
@@ -387,11 +361,6 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                 lower -= value_range * self.margin
                 upper += value_range * self.margin
 
-            else:
-
-                lower = 0.
-                upper = 1.
-
             self.set(lower=lower, upper=upper, percentile=percentile, log=log)
 
     def flip_limits(self):
@@ -492,24 +461,11 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
         else:
             n_bin = self._common_n_bin
 
-        data_values = self.data_values
-
-        # Since we are just finding overall statistics, not along axes, we
-        # can remove any broadcasted dimension since these should not affect
-        # the statistics.
-        data_values = unbroadcast(data_values)
-
-        # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
-        # they don't exclude inf/-inf
-        if data_values.dtype.kind != 'M':
-            data_values = data_values[np.isfinite(data_values)]
-
-        if data_values.size > 0:
-            lower = data_values.min()
-            upper = data_values.max()
-        else:
-            lower = 0.
-            upper = 1.
+        lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True)
+        upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True)
+
+        if not isinstance(lower, np.datetime64) and np.isnan(lower):
+            lower, upper = 0, 1
 
         self.set(lower=lower, upper=upper, n_bin=n_bin)
 
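Taken together, the state_objects.py changes mean the helpers no longer pull raw arrays, unbroadcast them, filter out non-finite or non-positive values, and subsample by hand; all of that is delegated to Data.compute_statistic. A self-contained sketch of the equivalent 95-percentile limit calculation (the dataset is made up; random_subset=10000 mirrors the helper's renamed default):

    import numpy as np
    from glue.core import Data

    data = Data(y=np.random.normal(size=1000), label='example')

    percentile = 95
    log = False  # when True, positive=log drops values <= 0, for log-scaled axes
    exclude = (100 - percentile) / 2.  # exclude 2.5% at each end

    lower = data.compute_statistic('percentile', cid=data.id['y'],
                                   percentile=exclude, positive=log,
                                   random_subset=10000)
    upper = data.compute_statistic('percentile', cid=data.id['y'],
                                   percentile=100 - exclude, positive=log,
                                   random_subset=10000)

    # As in the new helper code, a NaN result (e.g. from an all-NaN column)
    # falls back to the default range lower, upper = 0, 1.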
