From 3eb699f0242c9ef30a59782a05f570afa52de60b Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Wed, 9 May 2018 14:23:22 +0100 Subject: [PATCH 1/6] Start moving computation of data statistics to the Data object [ci skip] --- glue/core/data.py | 89 ++++++++++++++++++++++++++- glue/core/state_objects.py | 111 ++++++++++++++-------------------- glue/utils/array.py | 84 +++++++++++++++++++++++++ glue/viewers/profile/state.py | 70 +++++---------------- 4 files changed, 234 insertions(+), 120 deletions(-) diff --git a/glue/core/data.py b/glue/core/data.py index b404a8ee2..7afaa91f5 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -16,7 +16,7 @@ from glue.core.decorators import clear_cache from glue.core.util import split_component_view from glue.core.hub import Hub -from glue.core.subset import Subset, SubsetState +from glue.core.subset import Subset, SubsetState, SliceSubsetState from glue.core.component_id import ComponentIDList from glue.core.component_link import ComponentLink, CoordinateComponentLink from glue.core.exceptions import IncompatibleAttribute @@ -24,6 +24,7 @@ from glue.core.coordinates import Coordinates from glue.core.contracts import contract from glue.config import settings +from glue.utils import compute_statistic, unbroadcast # Note: leave all the following imports for component and component_id since @@ -34,6 +35,8 @@ __all__ = ['Data'] +N_CHUNK_MAX = 40000000 + class Data(object): @@ -1153,6 +1156,90 @@ def update_values_from_data(self, data): for subset in self.subsets: clear_cache(subset.subset_state.to_mask) + # The following are methods for accessing the data in various ways that + # can be overriden by subclasses that want to improve performance. + + def compute_statistic(self, statistic, cid, subset_state=None, axis=None, + finite=True, positive=False, percentile=None, view=None): + """ + Compute a statistic for the data. + + Parameters + ---------- + statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'} + The statistic to compute + cid : `ComponentID` or str + The component ID to compute the statistic on - if given as a string + this will be assumed to be for the component belonging to the dataset + (not external links). + subset_state : `SubsetState` + If specified, the statistic will only include the values that are in + the subset specified by this subset state. + axis : None or int or tuple of int + If specified, the axis/axes to compute the statistic over. + finite : bool, optional + Whether to include only finite values in the statistic. This should + be `True` to ignore NaN/Inf values + positive : bool, optional + Whether to include only (strictly) positive values in the statistic. + This is used for example when computing statistics of data shown in + log space. + percentile : float, optional + If ``statistic`` is ``'percentile'``, the ``percentile`` argument + should be given and specify the percentile to calculate in the + range [0:100] + """ + + # TODO: generalize chunking to tuple axis (not just int) + + if (view is None and isinstance(axis, int) and self.size > N_CHUNK_MAX and + not isinstance(subset_state, SliceSubsetState)): + + # We operate in chunks here to avoid memory issues. + + # TODO: there are cases where the code below is not optimized + # because the mask may be computable for a single slice and + # broadcastable to all slices - normally ROISubsetState takes care + # of that but if we call it once per view it won't. In the future we + # could ask a SubsetState whether it is broadcasted along + # axis_index. + + result = np.zeros(self.shape[axis]) + + chunk_shape = list(self.shape) + + # Deliberately leave n_chunks as float to not round twice + n_chunks = self.layer.size / N_CHUNK_MAX + + chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks)) + + for view in iterate_chunks(self.layer.shape, chunk_shape=chunk_shape): + result[view[axis]] = self.compute_statistic(statistic, cid, subset_state=subset_state, + axis=axis, finite=finite, positive=positive, + percentile=percentile, view=view) + + return result + + if subset_state: + if isinstance(subset_state, SliceSubsetState): + data = subset_state.to_array(self, cid, view) + mask = None + else: + data = self[cid] + mask = subset_state.to_mask(self, view) + else: + data = self[cid, view] + mask = None + + if axis is None and mask is None: + # Since we are just finding overall statistics, not along axes, we + # can remove any broadcasted dimension since these should not affect + # the statistics. + data = unbroadcast(data) + + return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite, + positive=positive, percentile=percentile) + @contract(i=int, ndim=int) def pixel_label(i, ndim): diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py index d9ab0362d..0e5adbfb3 100644 --- a/glue/core/state_objects.py +++ b/glue/core/state_objects.py @@ -9,7 +9,6 @@ HasCallbackProperties, CallbackList) from glue.core.state import saver, loader from glue.core.component_id import PixelComponentID -from glue.utils import unbroadcast __all__ = ['State', 'StateAttributeCacheHelper', 'StateAttributeLimitsHelper', 'StateAttributeSingleValueHelper'] @@ -141,21 +140,11 @@ def __init__(self, state, attribute, cache=None, **kwargs): @property def data_values(self): - # For subsets in 'data' mode, we want to compute the limits based on - # the full dataset, not just the subset. - if isinstance(self.data, Subset): - return self.data.data[self.component_id] - else: - return self.data[self.component_id] + return self.data[self.component_id] @property def data_component(self): - # For subsets in 'data' mode, we want to compute the limits based on - # the full dataset, not just the subset. - if isinstance(self.data, Subset): - return self.data.data.get_component(self.component_id) - else: - return self.data.get_component(self.component_id) + return self.data.get_component(self.component_id) def invalidate_cache(self): self._cache.clear() @@ -165,7 +154,12 @@ def data(self): if self.attribute is None: return None else: - return self.attribute.parent + # For subsets in 'data' mode, we want to compute the limits based on + # the full dataset, not just the subset. + if isinstance(self.attribute.parent, Subset): + return self.attribute.parent.data + else: + return self.attribute.parent @property def component_id(self): @@ -333,48 +327,48 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): exclude = (100 - percentile) / 2. - data_values = self.data_values - - # Since we are just finding overall statistics, not along axes, we - # can remove any broadcasted dimension since these should not affect - # the statistics. - data_values = unbroadcast(data_values) + # data_values = self.data_values + data_component = self.data_component - if data_values.size > self.percentile_subset: - if self.subset_indices is None or self.subset_indices[0] != data_values.size: - self.subset_indices = (data_values.size, - np.random.randint(0, data_values.size, - self.percentile_subset)) - data_values = data_values.ravel()[self.subset_indices[1]] + # NOTE: specific to issues with local data + # if data_component.size > self.percentile_subset: + # if self.subset_indices is None or self.subset_indices[0] != data_component.size: + # self.subset_indices = (data_component.size, + # np.random.randint(0, data_component.size, + # self.percentile_subset)) + # data_values = data_values.ravel()[self.subset_indices[1]] - if log: - data_values = data_values[data_values > 0] - if len(data_values) == 0: - self.set(lower=0.1, upper=1, percentile=percentile, log=log) - return + if log and not data_component.any_positive: + self.set(lower=0.1, upper=1, percentile=percentile, log=log) + return # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as # they don't exclude inf/-inf - if data_values.dtype.kind != 'M': - data_values = data_values[np.isfinite(data_values)] + # if data_values.dtype.kind != 'M': + # data_values = data_values[np.isfinite(data_values)] + + if percentile == 100: - if data_values.size > 0: + # if data_values.dtype.kind == 'M': + # lower = data_values.min() + # upper = data_values.max() + # else: + # TODO: have a way to ask for the min/max of positive values + lower = self.data.compute_statistic('min', cid=self.component_id, finite=True, positive=log) + upper = self.data.compute_statistic('max', cid=self.component_id, finite=True, positive=log) - if percentile == 100: + else: - if data_values.dtype.kind == 'M': - lower = data_values.min() - upper = data_values.max() - else: - lower = np.min(data_values) - upper = np.max(data_values) + lower = self.data.compute_statistic('percentile', cid=self.component_id, percentile=exclude, positive=log) + upper = self.data.compute_statistic('percentile', cid=self.component_id, percentile=100 - exclude, positive=log) - else: + if np.isnan(lower) or np.isnan(upper): - lower = np.percentile(data_values, exclude) - upper = np.percentile(data_values, 100 - exclude) + lower, upper = 0, 1 - if self.data_component.categorical: + else: + + if data_component.categorical: lower = np.floor(lower - 0.5) + 0.5 upper = np.ceil(upper + 0.5) - 0.5 @@ -387,11 +381,6 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): lower -= value_range * self.margin upper += value_range * self.margin - else: - - lower = 0. - upper = 1. - self.set(lower=lower, upper=upper, percentile=percentile, log=log) def flip_limits(self): @@ -492,24 +481,18 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): else: n_bin = self._common_n_bin - data_values = self.data_values - - # Since we are just finding overall statistics, not along axes, we - # can remove any broadcasted dimension since these should not affect - # the statistics. - data_values = unbroadcast(data_values) + data_component = self.data_component # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as # they don't exclude inf/-inf - if data_values.dtype.kind != 'M': - data_values = data_values[np.isfinite(data_values)] + # if values.dtype.kind != 'M': + # values = values[np.isfinite(values)] - if data_values.size > 0: - lower = data_values.min() - upper = data_values.max() - else: - lower = 0. - upper = 1. + lower = self.data.compute_statistic('min', cid=self.component_id, finite=True) + upper = self.data.compute_statistic('max', cid=self.component_id, finite=True) + + if np.isnan(lower) or np.isnan(upper): + lower, upper = 0, 1 self.set(lower=lower, upper=upper, n_bin=n_bin) diff --git a/glue/utils/array.py b/glue/utils/array.py index 0f6845f73..8ad4230f6 100644 --- a/glue/utils/array.py +++ b/glue/utils/array.py @@ -1,5 +1,7 @@ from __future__ import absolute_import, division, print_function +import warnings + import numpy as np from numpy.lib.stride_tricks import as_strided @@ -390,3 +392,85 @@ def format_minimal(values): if len(strings) == len(set(strings)): break return fmt, strings + + + +PLAIN_FUNCTIONS = {'minimum': np.min, + 'maximum': np.max, + 'mean': np.mean, + 'median': np.median, + 'sum': np.sum, + 'percentile': np.percentile} + +NAN_FUNCTIONS = {'minimum': nanmin, + 'maximum': nanmax, + 'mean': nanmean, + 'median': nanmedian, + 'sum': nansum, + 'percentile': np.nanpercentile} + + +def compute_statistic(statistic, data, mask=None, axis=None, finite=True, + positive=False, percentile=None): + """ + Compute a statistic for the data. + + Parameters + ---------- + statistic : {'count', 'min', 'max', 'mean', 'median', 'sum', 'percentile'} + The statistic to compute + data : `numpy.ndarray` + The data to compute the statistic for. + mask : `numpy.ndarray` + The mask to apply when computing the statistic. + axis : None or int or tuple of int + If specified, the axis/axes to compute the statistic over. + finite : bool, optional + Whether to include only finite values in the statistic. This should + be `True` to ignore NaN/Inf values + positive : bool, optional + Whether to include only (strictly) positive values in the statistic. + This is used for example when computing statistics of data shown in + log space. + percentile : float, optional + If ``statistic`` is ``'percentile'``, the ``percentile`` argument + should be given and specify the percentile to calculate in the + range [0:100] + """ + + # NOTE: this function should not ever have to use glue-specific objects. + # The aim is to eventually use a fast C implementation of this function. + + if statistic not in PLAIN_FUNCTIONS: + raise ValueError("Unrecognized statistic: {0}".format(statistic)) + + if finite or positive or mask: + + keep = np.ones(data.shape, dtype=bool) + + if finite: + keep &= np.isfinite(data) + + if positive: + keep &= data > 0 + + if mask: + keep &= mask + + if axis is None: + data = data[keep] + else: + data[keep] = np.nan + + function = NAN_FUNCTIONS[statistic] + + else: + + function = PLAIN_FUNCTIONS[statistic] + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + if statistic == 'percentile': + return function(data, percentile, axis=axis) + else: + return function(data, axis=axis) diff --git a/glue/viewers/profile/state.py b/glue/viewers/profile/state.py index 5b3ff91fd..eb40f55e0 100644 --- a/glue/viewers/profile/state.py +++ b/glue/viewers/profile/state.py @@ -5,7 +5,7 @@ import numpy as np -from glue.core import Data, Subset, Coordinates +from glue.core import Data, Coordinates from glue.external.echo import delay_callback from glue.viewers.matplotlib.state import (MatplotlibDataViewerState, MatplotlibLayerState, @@ -13,23 +13,19 @@ DeferredDrawSelectionCallbackProperty as DDSCProperty) from glue.core.state_objects import StateAttributeLimitsHelper from glue.core.data_combo_helper import ManualDataComboHelper, ComponentIDComboHelper -from glue.utils import defer_draw, nanmean, nanmedian, nansum, nanmin, nanmax, iterate_chunks +from glue.utils import defer_draw, nanmean, nanmedian, nansum, nanmin, nanmax from glue.core.link_manager import is_convertible_to_single_pixel_cid -from glue.core.exceptions import IncompatibleDataException +from glue.core.exceptions import IncompatibleAttribute, IncompatibleDataException from glue.core.subset import SliceSubsetState __all__ = ['ProfileViewerState', 'ProfileLayerState'] -FUNCTIONS = OrderedDict([(nanmax, 'Maximum'), - (nanmin, 'Minimum'), - (nanmean, 'Mean'), - (nanmedian, 'Median'), - (nansum, 'Sum')]) - -# Maximum number of elements in a chunk size used to compute the profile - this -# prevents the profile calculation from using up too much memory at a time. -N_CHUNK_MAX = 50000000 +FUNCTIONS = OrderedDict([('maximum', 'Maximum'), + ('minimum', 'Minimum'), + ('mean', 'Mean'), + ('median', 'Median'), + ('sum', 'Sum')]) class ProfileViewerState(MatplotlibDataViewerState): @@ -216,51 +212,15 @@ def update_profile(self, update_limits=True): # smaller than the data to just average the relevant 'spaxels' in the # data rather than collapsing the whole cube. - # We operate in chunks here to avoid memory issues - - axis_index = pix_cid.axis - - profile_values = np.zeros(self.layer.shape[axis_index]) - - if isinstance(self.layer, Subset) and isinstance(self.layer.subset_state, SliceSubsetState): - chunk_shape = self.layer.shape + if isinstance(self.layer, Data): + data = self.layer + subset_state = None else: - chunk_shape = list(self.layer.shape) - if self.layer.size > N_CHUNK_MAX: - # Deliberately leave n_chunks as float to not round twice - n_chunks = self.layer.size / N_CHUNK_MAX - chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks)) - - # TODO: there are cases where the code below is not optimized because - # the mask may be computable for a single slice and broadcastable to all - # slices - normally ROISubsetState takes care of that but if we call it - # once per view it won't. In the future we could ask a SubsetState - # whether it is broadcasted along axis_index. - - for view in iterate_chunks(self.layer.shape, chunk_shape=chunk_shape): - - if isinstance(self.layer, Data): - data = self.layer - data_values = data[self.attribute, view] - else: - data = self.layer.data - if isinstance(self.layer.subset_state, SliceSubsetState): - data_values = self.layer.subset_state.to_array(self.layer.data, self.attribute) - else: - # We need to force a copy *and* convert to float just in case - data_values = np.array(data[self.attribute, view], dtype=float) - mask = self.layer.to_mask(view=view) - data_values[~mask] = np.nan - - # Collapse along all dimensions except x_att - if self.layer.ndim > 1: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=RuntimeWarning) - profile_values[view[axis_index]] = self.viewer_state.function(data_values, axis=axes) - else: - profile_values[view[axis_index]] = data_values + data = self.layer.data + subset_state = self.layer.subset_state + + profile_values = data.compute_statistic(self.viewer_state.function, self.attribute, axis=axes, subset_state=subset_state) - # Finally, we get the coordinate values for the requested axis if np.all(np.isnan(profile_values)): self._profile_cache = [], [] else: From d73cbce60fa4a369aba78824cdd109f451063f81 Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 22 May 2018 18:40:48 +0100 Subject: [PATCH 2/6] Implemented random_subset in compute_statistic --- glue/core/data.py | 13 ++++++++++- glue/core/state_objects.py | 44 +++++++++++++------------------------- glue/utils/array.py | 2 +- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/glue/core/data.py b/glue/core/data.py index 7afaa91f5..9adc3282b 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -1160,7 +1160,8 @@ def update_values_from_data(self, data): # can be overriden by subclasses that want to improve performance. def compute_statistic(self, statistic, cid, subset_state=None, axis=None, - finite=True, positive=False, percentile=None, view=None): + finite=True, positive=False, percentile=None, view=None, + random_subset=None): """ Compute a statistic for the data. @@ -1188,6 +1189,9 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, If ``statistic`` is ``'percentile'``, the ``percentile`` argument should be given and specify the percentile to calculate in the range [0:100] + random_subset : int, optional + If specified, this should be an integer giving the number of values + to use for the statistic. This can only be used if ``axis`` is `None` """ # TODO: generalize chunking to tuple axis (not just int) @@ -1237,6 +1241,13 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, # the statistics. data = unbroadcast(data) + if random_subset and data.size > random_subset: + if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size: + self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset)) + data = data.ravel()[self._random_subset_indices[1]] + if mask is not None: + mask = mask.ravel()[self._random_subset_indices[1]] + return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite, positive=positive, percentile=percentile) diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py index 0e5adbfb3..12b17952a 100644 --- a/glue/core/state_objects.py +++ b/glue/core/state_objects.py @@ -247,7 +247,7 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper): attribute : str The attribute name - this will be populated once a dataset is assigned to the helper. - percentile_subset : int + random_subset : int How many points to use at most for the percentile calculation (using all values is highly inefficient and not needed) margin : float @@ -279,12 +279,12 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper): values_names = ('lower', 'upper') modifiers_names = ('log', 'percentile') - def __init__(self, state, attribute, percentile_subset=10000, margin=0, cache=None, **kwargs): + def __init__(self, state, attribute, random_subset=10000, margin=0, cache=None, **kwargs): super(StateAttributeLimitsHelper, self).__init__(state, attribute, cache=cache, **kwargs) self.margin = margin - self.percentile_subset = percentile_subset + self.random_subset = random_subset self.subset_indices = None if self.attribute is not None: @@ -327,40 +327,26 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): exclude = (100 - percentile) / 2. - # data_values = self.data_values data_component = self.data_component - # NOTE: specific to issues with local data - # if data_component.size > self.percentile_subset: - # if self.subset_indices is None or self.subset_indices[0] != data_component.size: - # self.subset_indices = (data_component.size, - # np.random.randint(0, data_component.size, - # self.percentile_subset)) - # data_values = data_values.ravel()[self.subset_indices[1]] - if log and not data_component.any_positive: self.set(lower=0.1, upper=1, percentile=percentile, log=log) return - # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as - # they don't exclude inf/-inf - # if data_values.dtype.kind != 'M': - # data_values = data_values[np.isfinite(data_values)] - if percentile == 100: - - # if data_values.dtype.kind == 'M': - # lower = data_values.min() - # upper = data_values.max() - # else: - # TODO: have a way to ask for the min/max of positive values - lower = self.data.compute_statistic('min', cid=self.component_id, finite=True, positive=log) - upper = self.data.compute_statistic('max', cid=self.component_id, finite=True, positive=log) - + lower = self.data.compute_statistic('min', cid=self.component_id, + finite=True, positive=log, + random_subset=self.random_subset) + upper = self.data.compute_statistic('max', cid=self.component_id, + finite=True, positive=log, + random_subset=self.random_subset) else: - - lower = self.data.compute_statistic('percentile', cid=self.component_id, percentile=exclude, positive=log) - upper = self.data.compute_statistic('percentile', cid=self.component_id, percentile=100 - exclude, positive=log) + lower = self.data.compute_statistic('percentile', cid=self.component_id, + percentile=exclude, positive=log, + random_subset=self.random_subset) + upper = self.data.compute_statistic('percentile', cid=self.component_id, + percentile=100 - exclude, positive=log, + random_subset=self.random_subset) if np.isnan(lower) or np.isnan(upper): diff --git a/glue/utils/array.py b/glue/utils/array.py index 8ad4230f6..f31ede613 100644 --- a/glue/utils/array.py +++ b/glue/utils/array.py @@ -448,7 +448,7 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True, keep = np.ones(data.shape, dtype=bool) - if finite: + if data.dtype.kind != 'M' and finite: keep &= np.isfinite(data) if positive: From f307c71fcbbbd5105611e9b7b0959722025484eb Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 22 May 2018 19:00:56 +0100 Subject: [PATCH 3/6] Clean up changes --- doc/whatsnew/whatsnew.rst | 2 +- glue/core/data.py | 24 ++++++++++++------- glue/core/state_objects.py | 21 ++++------------ glue/utils/array.py | 14 ++++++----- .../profile/qt/tests/test_data_viewer.py | 2 +- .../profile/qt/tests/test_profile_tools.py | 2 +- glue/viewers/profile/tests/test_state.py | 16 ++++++------- 7 files changed, 38 insertions(+), 43 deletions(-) diff --git a/doc/whatsnew/whatsnew.rst b/doc/whatsnew/whatsnew.rst index a6d2b0ce2..d9c21daa2 100644 --- a/doc/whatsnew/whatsnew.rst +++ b/doc/whatsnew/whatsnew.rst @@ -163,7 +163,7 @@ Profile viewer Glue now features a new profile viewer that can be used to show data collapsed along all but one dimension using a variety of functions (mean, median, maximum, -minimim, and so on). This new viewer replaces the previous 'spectrum' tool +minimum, and so on). This new viewer replaces the previous 'spectrum' tool (which was restricted to 3 dimensions and mostly designed to work with astronomical data) and includes the same functionality to fit models to profiles or collapse data in an image viewer based on an interval selected in the profile diff --git a/glue/core/data.py b/glue/core/data.py index 9adc3282b..7eb249dd3 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -24,7 +24,7 @@ from glue.core.coordinates import Coordinates from glue.core.contracts import contract from glue.config import settings -from glue.utils import compute_statistic, unbroadcast +from glue.utils import compute_statistic, unbroadcast, iterate_chunks # Note: leave all the following imports for component and component_id since @@ -1194,9 +1194,12 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, to use for the statistic. This can only be used if ``axis`` is `None` """ - # TODO: generalize chunking to tuple axis (not just int) + # TODO: generalize chunking to more types of axis - if (view is None and isinstance(axis, int) and self.size > N_CHUNK_MAX and + if (view is None and + isinstance(axis, tuple) and + len(axis) == self.ndim - 1 and + self.size > N_CHUNK_MAX and not isinstance(subset_state, SliceSubsetState)): # We operate in chunks here to avoid memory issues. @@ -1208,19 +1211,22 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, # could ask a SubsetState whether it is broadcasted along # axis_index. - result = np.zeros(self.shape[axis]) + axis_index = [a for a in range(self.ndim) if a not in axis][0] + + result = np.zeros(self.shape[axis_index]) chunk_shape = list(self.shape) # Deliberately leave n_chunks as float to not round twice - n_chunks = self.layer.size / N_CHUNK_MAX + n_chunks = self.size / N_CHUNK_MAX chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks)) - for view in iterate_chunks(self.layer.shape, chunk_shape=chunk_shape): - result[view[axis]] = self.compute_statistic(statistic, cid, subset_state=subset_state, - axis=axis, finite=finite, positive=positive, - percentile=percentile, view=view) + for chunk_view in iterate_chunks(self.shape, chunk_shape=chunk_shape): + values = self.compute_statistic(statistic, cid, subset_state=subset_state, + axis=axis, finite=finite, positive=positive, + percentile=percentile, view=chunk_view) + result[chunk_view[axis_index]] = values return result diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py index 12b17952a..bf7843912 100644 --- a/glue/core/state_objects.py +++ b/glue/core/state_objects.py @@ -329,15 +329,11 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): data_component = self.data_component - if log and not data_component.any_positive: - self.set(lower=0.1, upper=1, percentile=percentile, log=log) - return - if percentile == 100: - lower = self.data.compute_statistic('min', cid=self.component_id, + lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True, positive=log, random_subset=self.random_subset) - upper = self.data.compute_statistic('max', cid=self.component_id, + upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True, positive=log, random_subset=self.random_subset) else: @@ -349,9 +345,7 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): random_subset=self.random_subset) if np.isnan(lower) or np.isnan(upper): - lower, upper = 0, 1 - else: if data_component.categorical: @@ -467,15 +461,8 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): else: n_bin = self._common_n_bin - data_component = self.data_component - - # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as - # they don't exclude inf/-inf - # if values.dtype.kind != 'M': - # values = values[np.isfinite(values)] - - lower = self.data.compute_statistic('min', cid=self.component_id, finite=True) - upper = self.data.compute_statistic('max', cid=self.component_id, finite=True) + lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True) + upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True) if np.isnan(lower) or np.isnan(upper): lower, upper = 0, 1 diff --git a/glue/utils/array.py b/glue/utils/array.py index f31ede613..65e0c784e 100644 --- a/glue/utils/array.py +++ b/glue/utils/array.py @@ -15,7 +15,7 @@ __all__ = ['unique', 'shape_to_string', 'view_shape', 'stack_view', 'coerce_numeric', 'check_sorted', 'broadcast_to', 'unbroadcast', 'iterate_chunks', 'combine_slices', 'nanmean', 'nanmedian', 'nansum', - 'nanmin', 'nanmax', 'format_minimal'] + 'nanmin', 'nanmax', 'format_minimal', 'compute_statistic'] def unbroadcast(array): @@ -394,7 +394,6 @@ def format_minimal(values): return fmt, strings - PLAIN_FUNCTIONS = {'minimum': np.min, 'maximum': np.max, 'mean': np.mean, @@ -417,7 +416,7 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True, Parameters ---------- - statistic : {'count', 'min', 'max', 'mean', 'median', 'sum', 'percentile'} + statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'} The statistic to compute data : `numpy.ndarray` The data to compute the statistic for. @@ -444,7 +443,7 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True, if statistic not in PLAIN_FUNCTIONS: raise ValueError("Unrecognized statistic: {0}".format(statistic)) - if finite or positive or mask: + if finite or positive or mask is not None: keep = np.ones(data.shape, dtype=bool) @@ -454,13 +453,16 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True, if positive: keep &= data > 0 - if mask: + if mask is not None: keep &= mask if axis is None: data = data[keep] else: - data[keep] = np.nan + # We need to force a copy since we are editing the values and we + # might as well convert to float just in case + data = np.array(data, dtype=float) + data[~keep] = np.nan function = NAN_FUNCTIONS[statistic] diff --git a/glue/viewers/profile/qt/tests/test_data_viewer.py b/glue/viewers/profile/qt/tests/test_data_viewer.py index af733031e..e0c26eca4 100644 --- a/glue/viewers/profile/qt/tests/test_data_viewer.py +++ b/glue/viewers/profile/qt/tests/test_data_viewer.py @@ -62,7 +62,7 @@ def teardown_method(self, method): def test_functions(self): self.viewer.add_data(self.data) - self.viewer.state.function = nanmean + self.viewer.state.function = 'mean' assert len(self.viewer.layers) == 1 layer_artist = self.viewer.layers[0] layer_artist.wait() diff --git a/glue/viewers/profile/qt/tests/test_profile_tools.py b/glue/viewers/profile/qt/tests/test_profile_tools.py index 274e76207..3dad372ea 100644 --- a/glue/viewers/profile/qt/tests/test_profile_tools.py +++ b/glue/viewers/profile/qt/tests/test_profile_tools.py @@ -33,7 +33,7 @@ def setup_method(self, method): self.data_collection.append(self.data) self.viewer = self.app.new_data_viewer(ProfileViewer) - self.viewer.state.function = nanmean + self.viewer.state.function = 'mean' self.viewer.toolbar.active_tool = 'profile-analysis' diff --git a/glue/viewers/profile/tests/test_state.py b/glue/viewers/profile/tests/test_state.py index ddaf3f957..a8884c871 100644 --- a/glue/viewers/profile/tests/test_state.py +++ b/glue/viewers/profile/tests/test_state.py @@ -30,7 +30,7 @@ def setup_method(self, method): self.layer_state = ProfileLayerState(viewer_state=self.viewer_state, layer=self.data) self.viewer_state.layers.append(self.layer_state) - self.viewer_state.function = nanmean + self.viewer_state.function = 'mean' def test_basic(self): x, y = self.layer_state.profile @@ -62,23 +62,23 @@ def test_x_att(self): def test_function(self): - self.viewer_state.function = nanmean + self.viewer_state.function = 'mean' x, y = self.layer_state.profile assert_allclose(y, [3.5, 11.5, 19.5]) - self.viewer_state.function = nanmin + self.viewer_state.function = 'minimum' x, y = self.layer_state.profile assert_allclose(y, [0, 8, 16]) - self.viewer_state.function = nanmax + self.viewer_state.function = 'maximum' x, y = self.layer_state.profile assert_allclose(y, [7, 15, 23]) - self.viewer_state.function = nansum + self.viewer_state.function = 'sum' x, y = self.layer_state.profile assert_allclose(y, [28, 92, 156]) - self.viewer_state.function = nanmedian + self.viewer_state.function = 'median' x, y = self.layer_state.profile assert_allclose(y, [3.5, 11.5, 19.5]) @@ -105,7 +105,7 @@ def test_subset(self): def test_clone(self): self.viewer_state.x_att = self.data.pixel_component_ids[1] - self.viewer_state.function = nanmedian + self.viewer_state.function = 'median' self.layer_state.attribute = self.data.id['x'] self.layer_state.linewidth = 3 @@ -113,7 +113,7 @@ def test_clone(self): viewer_state_new = clone(self.viewer_state) assert viewer_state_new.x_att.label == 'Pixel Axis 1 [y]' - assert viewer_state_new.function is nanmedian + assert viewer_state_new.function == 'median' assert self.layer_state.attribute.label == 'x' assert self.layer_state.linewidth == 3 From df444b11f15e16229c1a92f4b10f6428688ae27f Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 22 May 2018 19:52:35 +0100 Subject: [PATCH 4/6] PEP8 and fix to to_array --- glue/core/data.py | 4 ++-- glue/viewers/profile/qt/tests/test_data_viewer.py | 1 - glue/viewers/profile/state.py | 6 ++---- glue/viewers/profile/tests/test_state.py | 1 - 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/glue/core/data.py b/glue/core/data.py index 7eb249dd3..997b5f6cd 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -1231,8 +1231,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, return result if subset_state: - if isinstance(subset_state, SliceSubsetState): - data = subset_state.to_array(self, cid, view) + if isinstance(subset_state, SliceSubsetState) and view is None: + data = subset_state.to_array(self, cid) mask = None else: data = self[cid] diff --git a/glue/viewers/profile/qt/tests/test_data_viewer.py b/glue/viewers/profile/qt/tests/test_data_viewer.py index e0c26eca4..836a1b339 100644 --- a/glue/viewers/profile/qt/tests/test_data_viewer.py +++ b/glue/viewers/profile/qt/tests/test_data_viewer.py @@ -16,7 +16,6 @@ from glue.core.component_link import ComponentLink from glue.viewers.matplotlib.qt.tests.test_data_viewer import BaseTestMatplotlibDataViewer from glue.viewers.profile.tests.test_state import SimpleCoordinates -from glue.utils import nanmean from glue.core.tests.test_state import clone from ..data_viewer import ProfileViewer diff --git a/glue/viewers/profile/state.py b/glue/viewers/profile/state.py index eb40f55e0..075cf3475 100644 --- a/glue/viewers/profile/state.py +++ b/glue/viewers/profile/state.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, print_function -import warnings from collections import OrderedDict import numpy as np @@ -13,10 +12,9 @@ DeferredDrawSelectionCallbackProperty as DDSCProperty) from glue.core.state_objects import StateAttributeLimitsHelper from glue.core.data_combo_helper import ManualDataComboHelper, ComponentIDComboHelper -from glue.utils import defer_draw, nanmean, nanmedian, nansum, nanmin, nanmax +from glue.utils import defer_draw, nanmin, nanmax from glue.core.link_manager import is_convertible_to_single_pixel_cid -from glue.core.exceptions import IncompatibleAttribute, IncompatibleDataException -from glue.core.subset import SliceSubsetState +from glue.core.exceptions import IncompatibleDataException __all__ = ['ProfileViewerState', 'ProfileLayerState'] diff --git a/glue/viewers/profile/tests/test_state.py b/glue/viewers/profile/tests/test_state.py index a8884c871..67d37b324 100644 --- a/glue/viewers/profile/tests/test_state.py +++ b/glue/viewers/profile/tests/test_state.py @@ -6,7 +6,6 @@ from glue.core import Data, Coordinates from glue.core.tests.test_state import clone -from glue.utils import nanmean, nanmedian, nansum, nanmin, nanmax from ..state import ProfileViewerState, ProfileLayerState From cb4109d5773bf14b4febc87832d8a0b0eb0a08cd Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 22 May 2018 20:02:54 +0100 Subject: [PATCH 5/6] Fix case when all values are NaN --- glue/utils/array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/glue/utils/array.py b/glue/utils/array.py index 65e0c784e..135e347fd 100644 --- a/glue/utils/array.py +++ b/glue/utils/array.py @@ -470,6 +470,9 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True, function = PLAIN_FUNCTIONS[statistic] + if data.size == 0: + return np.nan + with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) if statistic == 'percentile': From d29cf8b59e71d054aeed7de118b8de789ed5fab0 Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 22 May 2018 20:13:50 +0100 Subject: [PATCH 6/6] Fix tests --- glue/core/state_objects.py | 4 ++-- glue/utils/array.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py index bf7843912..da5ad18b2 100644 --- a/glue/core/state_objects.py +++ b/glue/core/state_objects.py @@ -344,7 +344,7 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): percentile=100 - exclude, positive=log, random_subset=self.random_subset) - if np.isnan(lower) or np.isnan(upper): + if not isinstance(lower, np.datetime64) and np.isnan(lower): lower, upper = 0, 1 else: @@ -464,7 +464,7 @@ def update_values(self, force=False, use_default_modifiers=False, **properties): lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True) upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True) - if np.isnan(lower) or np.isnan(upper): + if not isinstance(lower, np.datetime64) and np.isnan(lower): lower, upper = 0, 1 self.set(lower=lower, upper=upper, n_bin=n_bin) diff --git a/glue/utils/array.py b/glue/utils/array.py index 135e347fd..a9539c53d 100644 --- a/glue/utils/array.py +++ b/glue/utils/array.py @@ -443,11 +443,11 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True, if statistic not in PLAIN_FUNCTIONS: raise ValueError("Unrecognized statistic: {0}".format(statistic)) - if finite or positive or mask is not None: + if (finite or positive or mask is not None) and data.dtype.kind != 'M': keep = np.ones(data.shape, dtype=bool) - if data.dtype.kind != 'M' and finite: + if finite: keep &= np.isfinite(data) if positive: