Start moving computation of data statistics to the Data object [ci skip]
astrofrog committed May 22, 2018
1 parent 265af08 commit 6400f87
Showing 4 changed files with 194 additions and 82 deletions.
51 changes: 50 additions & 1 deletion glue/core/data.py
@@ -16,14 +16,15 @@
from glue.core.decorators import clear_cache
from glue.core.util import split_component_view
from glue.core.hub import Hub
from glue.core.subset import Subset, SubsetState
from glue.core.subset import Subset, SubsetState, SliceSubsetState
from glue.core.component_id import ComponentIDList
from glue.core.component_link import ComponentLink, CoordinateComponentLink
from glue.core.exceptions import IncompatibleAttribute
from glue.core.visual import VisualAttributes
from glue.core.coordinates import Coordinates
from glue.core.contracts import contract
from glue.config import settings
from glue.utils import compute_statistic


# Note: leave all the following imports for component and component_id since
@@ -1153,6 +1154,54 @@ def update_values_from_data(self, data):
for subset in self.subsets:
clear_cache(subset.subset_state.to_mask)

# The following are methods for accessing the data in various ways that
# can be overridden by subclasses that want to improve performance.

def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
finite=True, positive=False, percentile=None):
"""
Compute a statistic for the data.

Parameters
----------
statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
The statistic to compute
cid : `ComponentID` or str
The component ID to compute the statistic for. If given as a string,
this is assumed to refer to a component belonging to this dataset
(not an externally linked component).
subset_state : `SubsetState`
If specified, the statistic will only include the values that are in
the subset specified by this subset state.
axis : None or int or tuple of int
If specified, the axis/axes to compute the statistic over.
finite : bool, optional
Whether to include only finite values in the statistic. This should
be `True` to ignore NaN/Inf values.
positive : bool, optional
Whether to include only (strictly) positive values in the statistic.
This is used for example when computing statistics of data shown in
log space.
percentile : float, optional
If ``statistic`` is ``'percentile'``, the ``percentile`` argument
should be given and specify the percentile to calculate in the
range [0, 100].
"""

if subset_state:
if isinstance(subset_state, SliceSubsetState):
data = subset_state.to_array(self, cid)
mask = None
else:
data = self[cid]
mask = subset_state.to_mask(self)
else:
data = self[cid]
mask = None

return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
positive=positive, percentile=percentile)


@contract(i=int, ndim=int)
def pixel_label(i, ndim):
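For reference (not part of the diff): a minimal sketch of how the new Data.compute_statistic method could be called, assuming glue's standard Data API; the component name and values below are illustrative.

import numpy as np
from glue.core import Data

d = Data(x=np.array([1.0, 2.0, np.nan, 4.0]), label='example')

# Mean of the finite values only (finite=True is the default): (1 + 2 + 4) / 3
mean_x = d.compute_statistic('mean', d.id['x'])

# 50th percentile restricted to strictly positive values
median_x = d.compute_statistic('percentile', d.id['x'],
                               percentile=50, positive=True)
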
100 changes: 47 additions & 53 deletions glue/core/state_objects.py
@@ -140,21 +140,11 @@ def __init__(self, state, attribute, cache=None, **kwargs):

@property
def data_values(self):
# For subsets in 'data' mode, we want to compute the limits based on
# the full dataset, not just the subset.
if isinstance(self.data, Subset):
return self.data.data[self.component_id]
else:
return self.data[self.component_id]
return self.data[self.component_id]

@property
def data_component(self):
# For subsets in 'data' mode, we want to compute the limits based on
# the full dataset, not just the subset.
if isinstance(self.data, Subset):
return self.data.data.get_component(self.component_id)
else:
return self.data.get_component(self.component_id)
return self.data.get_component(self.component_id)

def invalidate_cache(self):
self._cache.clear()
@@ -164,7 +154,12 @@ def data(self):
if self.attribute is None:
return None
else:
return self.attribute.parent
# For subsets in 'data' mode, we want to compute the limits based on
# the full dataset, not just the subset.
if isinstance(self.attribute.parent, Subset):
return self.attribute.parent.data
else:
return self.attribute.parent

@property
def component_id(self):
@@ -332,43 +327,48 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):

exclude = (100 - percentile) / 2.

data_values = self.data_values
# data_values = self.data_values
data_component = self.data_component

if data_values.size > self.percentile_subset:
if self.subset_indices is None or self.subset_indices[0] != data_values.size:
self.subset_indices = (data_values.size,
np.random.randint(0, data_values.size,
self.percentile_subset))
data_values = data_values.ravel()[self.subset_indices[1]]
# NOTE: specific to issues with local data
# if data_component.size > self.percentile_subset:
# if self.subset_indices is None or self.subset_indices[0] != data_component.size:
# self.subset_indices = (data_component.size,
# np.random.randint(0, data_component.size,
# self.percentile_subset))
# data_values = data_values.ravel()[self.subset_indices[1]]

if log:
data_values = data_values[data_values > 0]
if len(data_values) == 0:
self.set(lower=0.1, upper=1, percentile=percentile, log=log)
return
if log and not data_component.any_positive:
self.set(lower=0.1, upper=1, percentile=percentile, log=log)
return

# NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
# they don't exclude inf/-inf
if data_values.dtype.kind != 'M':
data_values = data_values[np.isfinite(data_values)]
# if data_values.dtype.kind != 'M':
# data_values = data_values[np.isfinite(data_values)]

if data_values.size > 0:
if percentile == 100:

if percentile == 100:
# if data_values.dtype.kind == 'M':
# lower = data_values.min()
# upper = data_values.max()
# else:
# TODO: have a way to ask for the min/max of positive values
lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True, positive=log)
upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True, positive=log)

if data_values.dtype.kind == 'M':
lower = data_values.min()
upper = data_values.max()
else:
lower = np.min(data_values)
upper = np.max(data_values)
else:

else:
lower = self.data.compute_statistic('percentile', cid=self.component_id, percentile=exclude, positive=log)
upper = self.data.compute_statistic('percentile', cid=self.component_id, percentile=100 - exclude, positive=log)

if np.isnan(lower) or np.isnan(upper):

lower = np.percentile(data_values, exclude)
upper = np.percentile(data_values, 100 - exclude)
lower, upper = 0, 1

if self.data_component.categorical:
else:

if data_component.categorical:
lower = np.floor(lower - 0.5) + 0.5
upper = np.ceil(upper + 0.5) - 0.5

@@ -381,11 +381,6 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
lower -= value_range * self.margin
upper += value_range * self.margin

else:

lower = 0.
upper = 1.

self.set(lower=lower, upper=upper, percentile=percentile, log=log)

def flip_limits(self):
@@ -486,19 +481,18 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
else:
n_bin = self._common_n_bin

values = self.data_values
data_component = self.data_component

# NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
# they don't exclude inf/-inf
if values.dtype.kind != 'M':
values = values[np.isfinite(values)]
# if values.dtype.kind != 'M':
# values = values[np.isfinite(values)]

if values.size > 0:
lower = values.min()
upper = values.max()
else:
lower = 0.
upper = 1.
lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True)
upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True)

if np.isnan(lower) or np.isnan(upper):
lower, upper = 0, 1

self.set(lower=lower, upper=upper, n_bin=n_bin)

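For reference (not part of the diff): a sketch of the percentile-limit pattern that the state classes above now delegate to the data object; the component name, values, and percentile are illustrative.

import numpy as np
from glue.core import Data

data = Data(y=np.array([0.5, 1.0, 2.0, 3.0, 100.0]), label='example')
percentile = 95
exclude = (100 - percentile) / 2.  # trim 2.5% from each tail

lower = data.compute_statistic('percentile', data.id['y'], percentile=exclude)
upper = data.compute_statistic('percentile', data.id['y'], percentile=100 - exclude)

# If everything was excluded (e.g. all values non-finite), fall back to [0, 1]
if np.isnan(lower) or np.isnan(upper):
    lower, upper = 0, 1
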
84 changes: 84 additions & 0 deletions glue/utils/array.py
@@ -1,5 +1,7 @@
from __future__ import absolute_import, division, print_function

import warnings

import numpy as np
from numpy.lib.stride_tricks import as_strided

@@ -390,3 +392,85 @@ def format_minimal(values):
if len(strings) == len(set(strings)):
break
return fmt, strings



PLAIN_FUNCTIONS = {'minimum': np.min,
'maximum': np.max,
'mean': np.mean,
'median': np.median,
'sum': np.sum,
'percentile': np.percentile}

NAN_FUNCTIONS = {'minimum': nanmin,
'maximum': nanmax,
'mean': nanmean,
'median': nanmedian,
'sum': nansum,
'percentile': np.nanpercentile}


def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
positive=False, percentile=None):
"""
Compute a statistic for the data.

Parameters
----------
statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
The statistic to compute
data : `numpy.ndarray`
The data to compute the statistic for.
mask : `numpy.ndarray`
The mask to apply when computing the statistic.
axis : None or int or tuple of int
If specified, the axis/axes to compute the statistic over.
finite : bool, optional
Whether to include only finite values in the statistic. This should
be `True` to ignore NaN/Inf values.
positive : bool, optional
Whether to include only (strictly) positive values in the statistic.
This is used for example when computing statistics of data shown in
log space.
percentile : float, optional
If ``statistic`` is ``'percentile'``, the ``percentile`` argument
should be given and specify the percentile to calculate in the
range [0, 100].
"""

# NOTE: this function should not ever have to use glue-specific objects.
# The aim is to eventually use a fast C implementation of this function.

if statistic not in PLAIN_FUNCTIONS:
raise ValueError("Unrecognized statistic: {0}".format(statistic))

if finite or positive or mask is not None:

keep = np.ones(data.shape, dtype=bool)

if finite:
keep &= np.isfinite(data)

if positive:
keep &= data > 0

if mask is not None:
keep &= mask

if axis is None:
data = data[keep]
else:
# Cast to float so excluded values can be set to NaN without
# modifying the caller's array in place.
data = data.astype(float)
data[~keep] = np.nan

function = NAN_FUNCTIONS[statistic]

else:

function = PLAIN_FUNCTIONS[statistic]

with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
if statistic == 'percentile':
return function(data, percentile, axis=axis)
else:
return function(data, axis=axis)
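
For reference (not part of the diff): a minimal sketch of calling the low-level helper directly with an explicit mask, using the same import that glue/core/data.py adds above; the array values are illustrative.

import numpy as np
from glue.utils import compute_statistic

values = np.array([1.0, -2.0, np.inf, 4.0, np.nan])
mask = np.array([True, True, True, True, False])

# Keep the masked-in values that are also finite and strictly positive,
# i.e. [1.0, 4.0], so the mean is 2.5
result = compute_statistic('mean', values, mask=mask, finite=True, positive=True)
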
41 changes: 13 additions & 28 deletions glue/viewers/profile/state.py
@@ -21,11 +21,11 @@
__all__ = ['ProfileViewerState', 'ProfileLayerState']


FUNCTIONS = OrderedDict([(nanmax, 'Maximum'),
(nanmin, 'Minimum'),
(nanmean, 'Mean'),
(nanmedian, 'Median'),
(nansum, 'Sum')])
FUNCTIONS = OrderedDict([('maximum', 'Maximum'),
('minimum', 'Minimum'),
('mean', 'Mean'),
('median', 'Median'),
('sum', 'Sum')])


class ProfileViewerState(MatplotlibDataViewerState):
@@ -203,34 +203,19 @@ def _update_profile(self, *event):
# smaller than the data to just average the relevant 'spaxels' in the
# data rather than collapsing the whole cube.

if isinstance(self.layer, Data):
data = self.layer
subset_state = None
else:
data = self.layer.data
subset_state = self.layer.subset_state

try:
if isinstance(self.layer, Data):
data = self.layer
data_values = data[self.attribute]
else:
data = self.layer.data
if isinstance(self.layer.subset_state, SliceSubsetState):
data_values = self.layer.subset_state.to_array(self.layer.data, self.attribute)
else:
# We need to force a copy *and* convert to float just in case
data_values = np.array(data[self.attribute], dtype=float)
mask = self.layer.to_mask()
if np.sum(mask) == 0:
self._profile = [], []
return
data_values[~mask] = np.nan
profile_values = data.compute_statistic(self.viewer_state.function, self.attribute, axis=axes, subset_state=subset_state)
except IncompatibleAttribute:
self._profile = None, None
return

# Collapse along all dimensions except x_att
if self.layer.ndim > 1:
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
profile_values = self.viewer_state.function(data_values, axis=axes)
else:
profile_values = data_values

# Finally, we get the coordinate values for the requested axis
axis_view = [0] * data.ndim
axis_view[pix_cid.axis] = slice(None)
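For reference (not part of the diff): a sketch of the kind of collapse the profile viewer now delegates to Data.compute_statistic; the data shape is illustrative and 'mean' stands in for viewer_state.function.

import numpy as np
from glue.core import Data

cube = Data(flux=np.random.random((10, 20)), label='cube')

# Collapse every axis except the profile axis (axis 0 here); non-finite
# values are ignored because finite=True is the default.
profile = cube.compute_statistic('mean', cube.id['flux'], axis=1)
# profile has shape (10,)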
