Start moving computation of data statistics to the Data object [ci skip]
astrofrog committed May 22, 2018
1 parent 265af08 commit 6400f87
Showing 4 changed files with 194 additions and 82 deletions.
51 changes: 50 additions & 1 deletion glue/core/data.py
@@ -16,14 +16,15 @@
from glue.core.decorators import clear_cache
from glue.core.util import split_component_view
from glue.core.hub import Hub
from glue.core.subset import Subset, SubsetState
from glue.core.subset import Subset, SubsetState, SliceSubsetState
from glue.core.component_id import ComponentIDList
from glue.core.component_link import ComponentLink, CoordinateComponentLink
from glue.core.exceptions import IncompatibleAttribute
from glue.core.visual import VisualAttributes
from glue.core.coordinates import Coordinates
from glue.core.contracts import contract
from glue.config import settings
from glue.utils import compute_statistic


# Note: leave all the following imports for component and component_id since
@@ -1153,6 +1154,54 @@ def update_values_from_data(self, data):
for subset in self.subsets:
clear_cache(subset.subset_state.to_mask)

# The following are methods for accessing the data in various ways that
# can be overridden by subclasses that want to improve performance.

def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
finite=True, positive=False, percentile=None):
"""
Compute a statistic for the data.

Parameters
----------
statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
The statistic to compute
cid : `ComponentID` or str
The component ID to compute the statistic for. If given as a string,
this is assumed to refer to a component belonging to this dataset
(not an externally linked component).
subset_state : `SubsetState`
If specified, the statistic will only include the values that are in
the subset specified by this subset state.
axis : None or int or tuple of int
If specified, the axis/axes to compute the statistic over.
finite : bool, optional
Whether to include only finite values in the statistic. This should
be `True` to ignore NaN/Inf values.
positive : bool, optional
Whether to include only (strictly) positive values in the statistic.
This is used for example when computing statistics of data shown in
log space.
percentile : float, optional
If ``statistic`` is ``'percentile'``, the ``percentile`` argument
should be given and specify the percentile to calculate in the
range [0, 100].
"""

if subset_state:
if isinstance(subset_state, SliceSubsetState):
data = subset_state.to_array(self, cid)
mask = None
else:
data = self[cid]
mask = subset_state.to_mask(self)
else:
data = self[cid]
mask = None

return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
positive=positive, percentile=percentile)


@contract(i=int, ndim=int)
def pixel_label(i, ndim):
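For reference (not part of the diff): a minimal sketch of how the new Data.compute_statistic method could be called, assuming glue's standard Data API; the component name and values below are illustrative.

import numpy as np
from glue.core import Data

d = Data(x=np.array([1.0, 2.0, np.nan, 4.0]), label='example')

# Mean of the finite values only (finite=True is the default): (1 + 2 + 4) / 3
mean_x = d.compute_statistic('mean', d.id['x'])

# 50th percentile restricted to strictly positive values
median_x = d.compute_statistic('percentile', d.id['x'],
                               percentile=50, positive=True)
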
100 changes: 47 additions & 53 deletions glue/core/state_objects.py
@@ -140,21 +140,11 @@ def __init__(self, state, attribute, cache=None, **kwargs):

@property
def data_values(self):
# For subsets in 'data' mode, we want to compute the limits based on
# the full dataset, not just the subset.
if isinstance(self.data, Subset):
return self.data.data[self.component_id]
else:
return self.data[self.component_id]
return self.data[self.component_id]

@property
def data_component(self):
# For subsets in 'data' mode, we want to compute the limits based on
# the full dataset, not just the subset.
if isinstance(self.data, Subset):
return self.data.data.get_component(self.component_id)
else:
return self.data.get_component(self.component_id)
return self.data.get_component(self.component_id)

def invalidate_cache(self):
self._cache.clear()
@@ -164,7 +154,12 @@ def data(self):
if self.attribute is None:
return None
else:
return self.attribute.parent
# For subsets in 'data' mode, we want to compute the limits based on
# the full dataset, not just the subset.
if isinstance(self.attribute.parent, Subset):
return self.attribute.parent.data
else:
return self.attribute.parent

@property
def component_id(self):
@@ -332,43 +327,48 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):

exclude = (100 - percentile) / 2.

data_values = self.data_values
# data_values = self.data_values
data_component = self.data_component

if data_values.size > self.percentile_subset:
if self.subset_indices is None or self.subset_indices[0] != data_values.size:
self.subset_indices = (data_values.size,
np.random.randint(0, data_values.size,
self.percentile_subset))
data_values = data_values.ravel()[self.subset_indices[1]]
# NOTE: specific to issues with local data
# if data_component.size > self.percentile_subset:
# if self.subset_indices is None or self.subset_indices[0] != data_component.size:
# self.subset_indices = (data_component.size,
# np.random.randint(0, data_component.size,
# self.percentile_subset))
# data_values = data_values.ravel()[self.subset_indices[1]]

if log:
data_values = data_values[data_values > 0]
if len(data_values) == 0:
self.set(lower=0.1, upper=1, percentile=percentile, log=log)
return
if log and not data_component.any_positive:
self.set(lower=0.1, upper=1, percentile=percentile, log=log)
return

# NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
# they don't exclude inf/-inf
if data_values.dtype.kind != 'M':
data_values = data_values[np.isfinite(data_values)]
# if data_values.dtype.kind != 'M':
# data_values = data_values[np.isfinite(data_values)]

if data_values.size > 0:
if percentile == 100:

if percentile == 100:
# if data_values.dtype.kind == 'M':
# lower = data_values.min()
# upper = data_values.max()
# else:
# TODO: have a way to ask for the min/max of positive values
lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True, positive=log)
upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True, positive=log)

if data_values.dtype.kind == 'M':
lower = data_values.min()
upper = data_values.max()
else:
lower = np.min(data_values)
upper = np.max(data_values)
else:

else:
lower = self.data.compute_statistic('percentile', cid=self.component_id, percentile=exclude, positive=log)
upper = self.data.compute_statistic('percentile', cid=self.component_id, percentile=100 - exclude, positive=log)

if np.isnan(lower) or np.isnan(upper):

lower = np.percentile(data_values, exclude)
upper = np.percentile(data_values, 100 - exclude)
lower, upper = 0, 1

if self.data_component.categorical:
else:

if data_component.categorical:
lower = np.floor(lower - 0.5) + 0.5
upper = np.ceil(upper + 0.5) - 0.5

@@ -381,11 +381,6 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
lower -= value_range * self.margin
upper += value_range * self.margin

else:

lower = 0.
upper = 1.

self.set(lower=lower, upper=upper, percentile=percentile, log=log)

def flip_limits(self):
@@ -486,19 +481,18 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
else:
n_bin = self._common_n_bin

values = self.data_values
data_component = self.data_component

# NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
# they don't exclude inf/-inf
if values.dtype.kind != 'M':
values = values[np.isfinite(values)]
# if values.dtype.kind != 'M':
# values = values[np.isfinite(values)]

if values.size > 0:
lower = values.min()
upper = values.max()
else:
lower = 0.
upper = 1.
lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True)
upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True)

if np.isnan(lower) or np.isnan(upper):
lower, upper = 0, 1

self.set(lower=lower, upper=upper, n_bin=n_bin)

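For reference (not part of the diff): a sketch of the percentile-limit pattern that the state classes above now delegate to the data object; the component name, values, and percentile are illustrative.

import numpy as np
from glue.core import Data

data = Data(y=np.array([0.5, 1.0, 2.0, 3.0, 100.0]), label='example')
percentile = 95
exclude = (100 - percentile) / 2.  # trim 2.5% from each tail

lower = data.compute_statistic('percentile', data.id['y'], percentile=exclude)
upper = data.compute_statistic('percentile', data.id['y'], percentile=100 - exclude)

# If everything was excluded (e.g. all values non-finite), fall back to [0, 1]
if np.isnan(lower) or np.isnan(upper):
    lower, upper = 0, 1
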
84 changes: 84 additions & 0 deletions glue/utils/array.py
@@ -1,5 +1,7 @@
from __future__ import absolute_import, division, print_function

import warnings

import numpy as np
from numpy.lib.stride_tricks import as_strided

@@ -390,3 +392,85 @@ def format_minimal(values):
if len(strings) == len(set(strings)):
break
return fmt, strings



PLAIN_FUNCTIONS = {'minimum': np.min,
'maximum': np.max,
'mean': np.mean,
'median': np.median,
'sum': np.sum,
'percentile': np.percentile}

NAN_FUNCTIONS = {'minimum': nanmin,
'maximum': nanmax,
'mean': nanmean,
'median': nanmedian,
'sum': nansum,
'percentile': np.nanpercentile}


def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
positive=False, percentile=None):
"""
Compute a statistic for the data.

Parameters
----------
statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
The statistic to compute
data : `numpy.ndarray`
The data to compute the statistic for.
mask : `numpy.ndarray`
The mask to apply when computing the statistic.
axis : None or int or tuple of int
If specified, the axis/axes to compute the statistic over.
finite : bool, optional
Whether to include only finite values in the statistic. This should
be `True` to ignore NaN/Inf values.
positive : bool, optional
Whether to include only (strictly) positive values in the statistic.
This is used for example when computing statistics of data shown in
log space.
percentile : float, optional
If ``statistic`` is ``'percentile'``, the ``percentile`` argument
should be given and specify the percentile to calculate in the
range [0, 100].
"""

# NOTE: this function should not ever have to use glue-specific objects.
# The aim is to eventually use a fast C implementation of this function.

if statistic not in PLAIN_FUNCTIONS:
raise ValueError("Unrecognized statistic: {0}".format(statistic))

if finite or positive or mask is not None:

keep = np.ones(data.shape, dtype=bool)

if finite:
keep &= np.isfinite(data)

if positive:
keep &= data > 0

if mask is not None:
keep &= mask

if axis is None:
data = data[keep]
else:
# Cast to float so excluded values can be set to NaN without
# modifying the caller's array in place.
data = data.astype(float)
data[~keep] = np.nan

function = NAN_FUNCTIONS[statistic]

else:

function = PLAIN_FUNCTIONS[statistic]

with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
if statistic == 'percentile':
return function(data, percentile, axis=axis)
else:
return function(data, axis=axis)
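
For reference (not part of the diff): a minimal sketch of calling the low-level helper directly with an explicit mask, using the same import that glue/core/data.py adds above; the array values are illustrative.

import numpy as np
from glue.utils import compute_statistic

values = np.array([1.0, -2.0, np.inf, 4.0, np.nan])
mask = np.array([True, True, True, True, False])

# Keep the masked-in values that are also finite and strictly positive,
# i.e. [1.0, 4.0], so the mean is 2.5
result = compute_statistic('mean', values, mask=mask, finite=True, positive=True)
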
41 changes: 13 additions & 28 deletions glue/viewers/profile/state.py
@@ -21,11 +21,11 @@
__all__ = ['ProfileViewerState', 'ProfileLayerState']


FUNCTIONS = OrderedDict([(nanmax, 'Maximum'),
(nanmin, 'Minimum'),
(nanmean, 'Mean'),
(nanmedian, 'Median'),
(nansum, 'Sum')])
FUNCTIONS = OrderedDict([('maximum', 'Maximum'),
('minimum', 'Minimum'),
('mean', 'Mean'),
('median', 'Median'),
('sum', 'Sum')])


class ProfileViewerState(MatplotlibDataViewerState):
@@ -203,34 +203,19 @@ def _update_profile(self, *event):
# smaller than the data to just average the relevant 'spaxels' in the
# data rather than collapsing the whole cube.

if isinstance(self.layer, Data):
data = self.layer
subset_state = None
else:
data = self.layer.data
subset_state = self.layer.subset_state

try:
if isinstance(self.layer, Data):
data = self.layer
data_values = data[self.attribute]
else:
data = self.layer.data
if isinstance(self.layer.subset_state, SliceSubsetState):
data_values = self.layer.subset_state.to_array(self.layer.data, self.attribute)
else:
# We need to force a copy *and* convert to float just in case
data_values = np.array(data[self.attribute], dtype=float)
mask = self.layer.to_mask()
if np.sum(mask) == 0:
self._profile = [], []
return
data_values[~mask] = np.nan
profile_values = data.compute_statistic(self.viewer_state.function, self.attribute, axis=axes, subset_state=subset_state)
except IncompatibleAttribute:
self._profile = None, None
return

# Collapse along all dimensions except x_att
if self.layer.ndim > 1:
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
profile_values = self.viewer_state.function(data_values, axis=axes)
else:
profile_values = data_values

# Finally, we get the coordinate values for the requested axis
axis_view = [0] * data.ndim
axis_view[pix_cid.axis] = slice(None)
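For reference (not part of the diff): a sketch of the kind of collapse the profile viewer now delegates to Data.compute_statistic; the data shape is illustrative and 'mean' stands in for viewer_state.function.

import numpy as np
from glue.core import Data

cube = Data(flux=np.random.random((10, 20)), label='cube')

# Collapse every axis except the profile axis (axis 0 here); non-finite
# values are ignored because finite=True is the default.
profile = cube.compute_statistic('mean', cube.id['flux'], axis=1)
# profile has shape (10,)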
