Merge pull request #1737 from astrofrog/abstract-compute-statistic

Move computation of statistics to the Data object

astrofrog authored May 22, 2018
2 parents 63d21a4 + d29cf8b commit bab74b7
Showing 8 changed files with 255 additions and 150 deletions.
2 changes: 1 addition & 1 deletion doc/whatsnew/whatsnew.rst
@@ -163,7 +163,7 @@ Profile viewer
 
 Glue now features a new profile viewer that can be used to show data collapsed
 along all but one dimension using a variety of functions (mean, median, maximum,
-minimim, and so on). This new viewer replaces the previous 'spectrum' tool
+minimum, and so on). This new viewer replaces the previous 'spectrum' tool
 (which was restricted to 3 dimensions and mostly designed to work with
 astronomical data) and includes the same functionality to fit models to profiles
 or collapse data in an image viewer based on an interval selected in the profile
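As a concrete picture of the collapse the profile viewer performs, here is an illustrative NumPy sketch; the cube shape and axis choices are invented and this is not glue's implementation:

    import numpy as np

    # A hypothetical spectral cube with axes (velocity, y, x).
    cube = np.random.random((50, 128, 128))

    # Collapse over the two spatial axes to get a 1-d profile along velocity;
    # 'mean' could equally be median, maximum, minimum, and so on.
    profile = np.nanmean(cube, axis=(1, 2))
    print(profile.shape)  # (50,)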
106 changes: 105 additions & 1 deletion glue/core/data.py
@@ -16,14 +16,15 @@
 from glue.core.decorators import clear_cache
 from glue.core.util import split_component_view
 from glue.core.hub import Hub
-from glue.core.subset import Subset, SubsetState
+from glue.core.subset import Subset, SubsetState, SliceSubsetState
 from glue.core.component_id import ComponentIDList
 from glue.core.component_link import ComponentLink, CoordinateComponentLink
 from glue.core.exceptions import IncompatibleAttribute
 from glue.core.visual import VisualAttributes
 from glue.core.coordinates import Coordinates
 from glue.core.contracts import contract
 from glue.config import settings
+from glue.utils import compute_statistic, unbroadcast, iterate_chunks
 
 
 # Note: leave all the following imports for component and component_id since
@@ -34,6 +34,8 @@
 
 __all__ = ['Data']
 
+N_CHUNK_MAX = 40000000
+
 
 class Data(object):
 
@@ -1153,6 +1156,107 @@ def update_values_from_data(self, data):
         for subset in self.subsets:
             clear_cache(subset.subset_state.to_mask)
 
+    # The following are methods for accessing the data in various ways that
+    # can be overridden by subclasses that want to improve performance.
+
+    def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
+                          finite=True, positive=False, percentile=None, view=None,
+                          random_subset=None):
+        """
+        Compute a statistic for the data.
+
+        Parameters
+        ----------
+        statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
+            The statistic to compute
+        cid : `ComponentID` or str
+            The component ID to compute the statistic on - if given as a string
+            this will be assumed to be for the component belonging to the dataset
+            (not external links).
+        subset_state : `SubsetState`
+            If specified, the statistic will only include the values that are in
+            the subset specified by this subset state.
+        axis : None or int or tuple of int
+            If specified, the axis/axes to compute the statistic over.
+        finite : bool, optional
+            Whether to include only finite values in the statistic. This should
+            be `True` to ignore NaN/Inf values.
+        positive : bool, optional
+            Whether to include only (strictly) positive values in the statistic.
+            This is used for example when computing statistics of data shown in
+            log space.
+        percentile : float, optional
+            If ``statistic`` is ``'percentile'``, the ``percentile`` argument
+            should be given and specify the percentile to calculate in the
+            range [0:100].
+        random_subset : int, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic. This can only be used if ``axis`` is `None`.
+        """
+
+        # TODO: generalize chunking to more types of axis
+
+        if (view is None and
+                isinstance(axis, tuple) and
+                len(axis) == self.ndim - 1 and
+                self.size > N_CHUNK_MAX and
+                not isinstance(subset_state, SliceSubsetState)):
+
+            # We operate in chunks here to avoid memory issues.
+
+            # TODO: there are cases where the code below is not optimized
+            # because the mask may be computable for a single slice and
+            # broadcastable to all slices - normally ROISubsetState takes care
+            # of that but if we call it once per view it won't. In the future we
+            # could ask a SubsetState whether it is broadcasted along
+            # axis_index.
+
+            axis_index = [a for a in range(self.ndim) if a not in axis][0]
+
+            result = np.zeros(self.shape[axis_index])
+
+            chunk_shape = list(self.shape)
+
+            # Deliberately leave n_chunks as a float to avoid rounding twice
+            n_chunks = self.size / N_CHUNK_MAX
+
+            chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks))
+
+            for chunk_view in iterate_chunks(self.shape, chunk_shape=chunk_shape):
+                values = self.compute_statistic(statistic, cid, subset_state=subset_state,
+                                                axis=axis, finite=finite, positive=positive,
+                                                percentile=percentile, view=chunk_view)
+                result[chunk_view[axis_index]] = values
+
+            return result
+
+        if subset_state:
+            if isinstance(subset_state, SliceSubsetState) and view is None:
+                data = subset_state.to_array(self, cid)
+                mask = None
+            else:
+                data = self[cid]
+                mask = subset_state.to_mask(self, view)
+        else:
+            data = self[cid, view]
+            mask = None
+
+        if axis is None and mask is None:
+            # Since we are just finding overall statistics, not along axes, we
+            # can remove any broadcasted dimension since these should not affect
+            # the statistics.
+            data = unbroadcast(data)
+
+        if random_subset and data.size > random_subset:
+            if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size:
+                self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset))
+            data = data.ravel()[self._random_subset_indices[1]]
+            if mask is not None:
+                mask = mask.ravel()[self._random_subset_indices[1]]
+
+        return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
+                                 positive=positive, percentile=percentile)
+
+
 @contract(i=int, ndim=int)
 def pixel_label(i, ndim):
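For readers tracking the API change, here is a minimal usage sketch of the new method; the dataset and attribute name are made up, and everything else follows the signature above:

    import numpy as np
    from glue.core import Data

    d = Data(x=np.random.random((4, 128, 128)), label='example')

    # Overall mean of 'x', ignoring NaN/Inf values.
    mean_x = d.compute_statistic('mean', d.id['x'], finite=True)

    # Percentile limits as the state helpers now compute them: a 95% interval
    # excludes 2.5% at each end.
    lo = d.compute_statistic('percentile', d.id['x'], percentile=2.5)
    hi = d.compute_statistic('percentile', d.id['x'], percentile=97.5)

    # Collapse along all but the first axis, as the profile viewer does. For
    # datasets with more than N_CHUNK_MAX values this path is computed in
    # chunks along the kept axis to bound memory use.
    profile = d.compute_statistic('mean', d.id['x'], axis=(1, 2))
    print(profile.shape)  # (4,)

Since viewers and state helpers now funnel through this single method, a subclass backed by out-of-core or remote storage can, in principle, override compute_statistic alone to speed all of them up.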
114 changes: 35 additions & 79 deletions glue/core/state_objects.py
@@ -9,7 +9,6 @@
                                     HasCallbackProperties, CallbackList)
 from glue.core.state import saver, loader
 from glue.core.component_id import PixelComponentID
-from glue.utils import unbroadcast
 
 __all__ = ['State', 'StateAttributeCacheHelper',
            'StateAttributeLimitsHelper', 'StateAttributeSingleValueHelper']
@@ -141,21 +140,11 @@ def __init__(self, state, attribute, cache=None, **kwargs):
 
     @property
     def data_values(self):
-        # For subsets in 'data' mode, we want to compute the limits based on
-        # the full dataset, not just the subset.
-        if isinstance(self.data, Subset):
-            return self.data.data[self.component_id]
-        else:
-            return self.data[self.component_id]
+        return self.data[self.component_id]
 
     @property
     def data_component(self):
-        # For subsets in 'data' mode, we want to compute the limits based on
-        # the full dataset, not just the subset.
-        if isinstance(self.data, Subset):
-            return self.data.data.get_component(self.component_id)
-        else:
-            return self.data.get_component(self.component_id)
+        return self.data.get_component(self.component_id)
 
     def invalidate_cache(self):
         self._cache.clear()
@@ -165,7 +154,12 @@ def data(self):
         if self.attribute is None:
             return None
         else:
-            return self.attribute.parent
+            # For subsets in 'data' mode, we want to compute the limits based on
+            # the full dataset, not just the subset.
+            if isinstance(self.attribute.parent, Subset):
+                return self.attribute.parent.data
+            else:
+                return self.attribute.parent
 
     @property
     def component_id(self):
@@ -253,7 +247,7 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper):
     attribute : str
         The attribute name - this will be populated once a dataset is assigned
         to the helper.
-    percentile_subset : int
+    random_subset : int
         How many points to use at most for the percentile calculation (using all
         values is highly inefficient and not needed)
     margin : float
@@ -285,12 +279,12 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper):
     values_names = ('lower', 'upper')
     modifiers_names = ('log', 'percentile')
 
-    def __init__(self, state, attribute, percentile_subset=10000, margin=0, cache=None, **kwargs):
+    def __init__(self, state, attribute, random_subset=10000, margin=0, cache=None, **kwargs):
 
         super(StateAttributeLimitsHelper, self).__init__(state, attribute, cache=cache, **kwargs)
 
         self.margin = margin
-        self.percentile_subset = percentile_subset
+        self.random_subset = random_subset
         self.subset_indices = None
 
         if self.attribute is not None:
@@ -333,48 +327,28 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
 
             exclude = (100 - percentile) / 2.
 
-            data_values = self.data_values
-
-            # Since we are just finding overall statistics, not along axes, we
-            # can remove any broadcasted dimension since these should not affect
-            # the statistics.
-            data_values = unbroadcast(data_values)
-
-            if data_values.size > self.percentile_subset:
-                if self.subset_indices is None or self.subset_indices[0] != data_values.size:
-                    self.subset_indices = (data_values.size,
-                                           np.random.randint(0, data_values.size,
-                                                             self.percentile_subset))
-                data_values = data_values.ravel()[self.subset_indices[1]]
-
-            if log:
-                data_values = data_values[data_values > 0]
-                if len(data_values) == 0:
-                    self.set(lower=0.1, upper=1, percentile=percentile, log=log)
-                    return
-
-            # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
-            # they don't exclude inf/-inf
-            if data_values.dtype.kind != 'M':
-                data_values = data_values[np.isfinite(data_values)]
-
-            if data_values.size > 0:
-
-                if percentile == 100:
-
-                    if data_values.dtype.kind == 'M':
-                        lower = data_values.min()
-                        upper = data_values.max()
-                    else:
-                        lower = np.min(data_values)
-                        upper = np.max(data_values)
-
-                else:
-
-                    lower = np.percentile(data_values, exclude)
-                    upper = np.percentile(data_values, 100 - exclude)
+            data_component = self.data_component
+
+            if percentile == 100:
+                lower = self.data.compute_statistic('minimum', cid=self.component_id,
+                                                    finite=True, positive=log,
+                                                    random_subset=self.random_subset)
+                upper = self.data.compute_statistic('maximum', cid=self.component_id,
+                                                    finite=True, positive=log,
+                                                    random_subset=self.random_subset)
+            else:
+                lower = self.data.compute_statistic('percentile', cid=self.component_id,
+                                                    percentile=exclude, positive=log,
+                                                    random_subset=self.random_subset)
+                upper = self.data.compute_statistic('percentile', cid=self.component_id,
+                                                    percentile=100 - exclude, positive=log,
+                                                    random_subset=self.random_subset)
+
+            if not isinstance(lower, np.datetime64) and np.isnan(lower):
+                lower, upper = 0, 1
+            else:
 
-                if self.data_component.categorical:
+                if data_component.categorical:
                     lower = np.floor(lower - 0.5) + 0.5
                     upper = np.ceil(upper + 0.5) - 0.5
 
@@ -387,11 +361,6 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                 lower -= value_range * self.margin
                 upper += value_range * self.margin
 
-            else:
-
-                lower = 0.
-                upper = 1.
-
             self.set(lower=lower, upper=upper, percentile=percentile, log=log)
 
     def flip_limits(self):
@@ -492,24 +461,11 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
         else:
             n_bin = self._common_n_bin
 
-        data_values = self.data_values
-
-        # Since we are just finding overall statistics, not along axes, we
-        # can remove any broadcasted dimension since these should not affect
-        # the statistics.
-        data_values = unbroadcast(data_values)
-
-        # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
-        # they don't exclude inf/-inf
-        if data_values.dtype.kind != 'M':
-            data_values = data_values[np.isfinite(data_values)]
-
-        if data_values.size > 0:
-            lower = data_values.min()
-            upper = data_values.max()
-        else:
-            lower = 0.
-            upper = 1.
+        lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True)
+        upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True)
+
+        if not isinstance(lower, np.datetime64) and np.isnan(lower):
+            lower, upper = 0, 1
 
         self.set(lower=lower, upper=upper, n_bin=n_bin)
 
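Taken together, the state_objects.py changes mean the helpers no longer pull raw arrays, unbroadcast them, filter out non-finite or non-positive values, and subsample by hand; all of that is delegated to Data.compute_statistic. A self-contained sketch of the equivalent 95-percentile limit calculation (the dataset is made up; random_subset=10000 mirrors the helper's renamed default):

    import numpy as np
    from glue.core import Data

    data = Data(y=np.random.normal(size=1000), label='example')

    percentile = 95
    log = False  # when True, positive=log drops values <= 0, for log-scaled axes
    exclude = (100 - percentile) / 2.  # exclude 2.5% at each end

    lower = data.compute_statistic('percentile', cid=data.id['y'],
                                   percentile=exclude, positive=log,
                                   random_subset=10000)
    upper = data.compute_statistic('percentile', cid=data.id['y'],
                                   percentile=100 - exclude, positive=log,
                                   random_subset=10000)

    # As in the new helper code, a NaN result (e.g. from an all-NaN column)
    # falls back to the default range lower, upper = 0, 1.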
