From 3eb699f0242c9ef30a59782a05f570afa52de60b Mon Sep 17 00:00:00 2001
From: Thomas Robitaille <thomas.robitaille@gmail.com>
Date: Wed, 9 May 2018 14:23:22 +0100
Subject: [PATCH 1/6] Start moving computation of data statistics to the Data
 object [ci skip]

---
 glue/core/data.py             |  89 ++++++++++++++++++++++++++-
 glue/core/state_objects.py    | 111 ++++++++++++++--------------------
 glue/utils/array.py           |  84 +++++++++++++++++++++++++
 glue/viewers/profile/state.py |  70 +++++----------------
 4 files changed, 234 insertions(+), 120 deletions(-)

diff --git a/glue/core/data.py b/glue/core/data.py
index b404a8ee2..7afaa91f5 100644
--- a/glue/core/data.py
+++ b/glue/core/data.py
@@ -16,7 +16,7 @@
 from glue.core.decorators import clear_cache
 from glue.core.util import split_component_view
 from glue.core.hub import Hub
-from glue.core.subset import Subset, SubsetState
+from glue.core.subset import Subset, SubsetState, SliceSubsetState
 from glue.core.component_id import ComponentIDList
 from glue.core.component_link import ComponentLink, CoordinateComponentLink
 from glue.core.exceptions import IncompatibleAttribute
@@ -24,6 +24,7 @@
 from glue.core.coordinates import Coordinates
 from glue.core.contracts import contract
 from glue.config import settings
+from glue.utils import compute_statistic, unbroadcast
 
 
 # Note: leave all the following imports for component and component_id since
@@ -34,6 +35,8 @@
 
 __all__ = ['Data']
 
+N_CHUNK_MAX = 40000000
+
 
 class Data(object):
 
@@ -1153,6 +1156,90 @@ def update_values_from_data(self, data):
         for subset in self.subsets:
             clear_cache(subset.subset_state.to_mask)
 
+    # The following are methods for accessing the data in various ways that
+    # can be overriden by subclasses that want to improve performance.
+
+    def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
+                          finite=True, positive=False, percentile=None, view=None):
+        """
+        Compute a statistic for the data.
+
+        Parameters
+        ----------
+        statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
+            The statistic to compute
+        cid : `ComponentID` or str
+            The component ID to compute the statistic on - if given as a string
+            this will be assumed to be for the component belonging to the dataset
+            (not external links).
+        subset_state : `SubsetState`
+            If specified, the statistic will only include the values that are in
+            the subset specified by this subset state.
+        axis : None or int or tuple of int
+            If specified, the axis/axes to compute the statistic over.
+        finite : bool, optional
+            Whether to include only finite values in the statistic. This should
+            be `True` to ignore NaN/Inf values
+        positive : bool, optional
+            Whether to include only (strictly) positive values in the statistic.
+            This is used for example when computing statistics of data shown in
+            log space.
+        percentile : float, optional
+            If ``statistic`` is ``'percentile'``, the ``percentile`` argument
+            should be given and specify the percentile to calculate in the
+            range [0:100]
+        """
+
+        # TODO: generalize chunking to tuple axis (not just int)
+
+        if (view is None and isinstance(axis, int) and self.size > N_CHUNK_MAX and
+                not isinstance(subset_state, SliceSubsetState)):
+
+            # We operate in chunks here to avoid memory issues.
+
+            # TODO: there are cases where the code below is not optimized
+            # because the mask may be computable for a single slice and
+            # broadcastable to all slices - normally ROISubsetState takes care
+            # of that but if we call it once per view it won't. In the future we
+            # could ask a SubsetState whether it is broadcasted along
+            # axis_index.
+
+            result = np.zeros(self.shape[axis])
+
+            chunk_shape = list(self.shape)
+
+            # Deliberately leave n_chunks as float to not round twice
+            n_chunks = self.layer.size / N_CHUNK_MAX
+
+            chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks))
+
+            for view in iterate_chunks(self.layer.shape, chunk_shape=chunk_shape):
+                result[view[axis]] = self.compute_statistic(statistic, cid, subset_state=subset_state,
+                                                            axis=axis, finite=finite, positive=positive,
+                                                            percentile=percentile, view=view)
+
+            return result
+
+        if subset_state:
+            if isinstance(subset_state, SliceSubsetState):
+                data = subset_state.to_array(self, cid, view)
+                mask = None
+            else:
+                data = self[cid]
+                mask = subset_state.to_mask(self, view)
+        else:
+            data = self[cid, view]
+            mask = None
+
+        if axis is None and mask is None:
+            # Since we are just finding overall statistics, not along axes, we
+            # can remove any broadcasted dimension since these should not affect
+            # the statistics.
+            data = unbroadcast(data)
+
+        return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
+                                 positive=positive, percentile=percentile)
+
 
 @contract(i=int, ndim=int)
 def pixel_label(i, ndim):
diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py
index d9ab0362d..0e5adbfb3 100644
--- a/glue/core/state_objects.py
+++ b/glue/core/state_objects.py
@@ -9,7 +9,6 @@
                                 HasCallbackProperties, CallbackList)
 from glue.core.state import saver, loader
 from glue.core.component_id import PixelComponentID
-from glue.utils import unbroadcast
 
 __all__ = ['State', 'StateAttributeCacheHelper',
            'StateAttributeLimitsHelper', 'StateAttributeSingleValueHelper']
@@ -141,21 +140,11 @@ def __init__(self, state, attribute, cache=None, **kwargs):
 
     @property
     def data_values(self):
-        # For subsets in 'data' mode, we want to compute the limits based on
-        # the full dataset, not just the subset.
-        if isinstance(self.data, Subset):
-            return self.data.data[self.component_id]
-        else:
-            return self.data[self.component_id]
+        return self.data[self.component_id]
 
     @property
     def data_component(self):
-        # For subsets in 'data' mode, we want to compute the limits based on
-        # the full dataset, not just the subset.
-        if isinstance(self.data, Subset):
-            return self.data.data.get_component(self.component_id)
-        else:
-            return self.data.get_component(self.component_id)
+        return self.data.get_component(self.component_id)
 
     def invalidate_cache(self):
         self._cache.clear()
@@ -165,7 +154,12 @@ def data(self):
         if self.attribute is None:
             return None
         else:
-            return self.attribute.parent
+            # For subsets in 'data' mode, we want to compute the limits based on
+            # the full dataset, not just the subset.
+            if isinstance(self.attribute.parent, Subset):
+                return self.attribute.parent.data
+            else:
+                return self.attribute.parent
 
     @property
     def component_id(self):
@@ -333,48 +327,48 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
 
             exclude = (100 - percentile) / 2.
 
-            data_values = self.data_values
-
-            # Since we are just finding overall statistics, not along axes, we
-            # can remove any broadcasted dimension since these should not affect
-            # the statistics.
-            data_values = unbroadcast(data_values)
+            # data_values = self.data_values
+            data_component = self.data_component
 
-            if data_values.size > self.percentile_subset:
-                if self.subset_indices is None or self.subset_indices[0] != data_values.size:
-                    self.subset_indices = (data_values.size,
-                                           np.random.randint(0, data_values.size,
-                                                             self.percentile_subset))
-                data_values = data_values.ravel()[self.subset_indices[1]]
+            # NOTE: specific to issues with local data
+            # if data_component.size > self.percentile_subset:
+            #     if self.subset_indices is None or self.subset_indices[0] != data_component.size:
+            #         self.subset_indices = (data_component.size,
+            #                                np.random.randint(0, data_component.size,
+            #                                                  self.percentile_subset))
+            #     data_values = data_values.ravel()[self.subset_indices[1]]
 
-            if log:
-                data_values = data_values[data_values > 0]
-                if len(data_values) == 0:
-                    self.set(lower=0.1, upper=1, percentile=percentile, log=log)
-                    return
+            if log and not data_component.any_positive:
+                self.set(lower=0.1, upper=1, percentile=percentile, log=log)
+                return
 
             # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
             # they don't exclude inf/-inf
-            if data_values.dtype.kind != 'M':
-                data_values = data_values[np.isfinite(data_values)]
+            # if data_values.dtype.kind != 'M':
+            #     data_values = data_values[np.isfinite(data_values)]
+
+            if percentile == 100:
 
-            if data_values.size > 0:
+                # if data_values.dtype.kind == 'M':
+                #     lower = data_values.min()
+                #     upper = data_values.max()
+                # else:
+                # TODO: have a way to ask for the min/max of positive values
+                lower = self.data.compute_statistic('min', cid=self.component_id, finite=True, positive=log)
+                upper = self.data.compute_statistic('max', cid=self.component_id, finite=True, positive=log)
 
-                if percentile == 100:
+            else:
 
-                    if data_values.dtype.kind == 'M':
-                        lower = data_values.min()
-                        upper = data_values.max()
-                    else:
-                        lower = np.min(data_values)
-                        upper = np.max(data_values)
+                lower = self.data.compute_statistic('percentile', cid=self.component_id, percentile=exclude, positive=log)
+                upper = self.data.compute_statistic('percentile', cid=self.component_id, percentile=100 - exclude, positive=log)
 
-                else:
+            if np.isnan(lower) or np.isnan(upper):
 
-                    lower = np.percentile(data_values, exclude)
-                    upper = np.percentile(data_values, 100 - exclude)
+                lower, upper = 0, 1
 
-                if self.data_component.categorical:
+            else:
+
+                if data_component.categorical:
                     lower = np.floor(lower - 0.5) + 0.5
                     upper = np.ceil(upper + 0.5) - 0.5
 
@@ -387,11 +381,6 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                     lower -= value_range * self.margin
                     upper += value_range * self.margin
 
-            else:
-
-                lower = 0.
-                upper = 1.
-
             self.set(lower=lower, upper=upper, percentile=percentile, log=log)
 
     def flip_limits(self):
@@ -492,24 +481,18 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                 else:
                     n_bin = self._common_n_bin
 
-                data_values = self.data_values
-
-                # Since we are just finding overall statistics, not along axes, we
-                # can remove any broadcasted dimension since these should not affect
-                # the statistics.
-                data_values = unbroadcast(data_values)
+                data_component = self.data_component
 
                 # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
                 # they don't exclude inf/-inf
-                if data_values.dtype.kind != 'M':
-                    data_values = data_values[np.isfinite(data_values)]
+                # if values.dtype.kind != 'M':
+                #     values = values[np.isfinite(values)]
 
-                if data_values.size > 0:
-                    lower = data_values.min()
-                    upper = data_values.max()
-                else:
-                    lower = 0.
-                    upper = 1.
+                lower = self.data.compute_statistic('min', cid=self.component_id, finite=True)
+                upper = self.data.compute_statistic('max', cid=self.component_id, finite=True)
+
+                if np.isnan(lower) or np.isnan(upper):
+                    lower, upper = 0, 1
 
             self.set(lower=lower, upper=upper, n_bin=n_bin)
 
diff --git a/glue/utils/array.py b/glue/utils/array.py
index 0f6845f73..8ad4230f6 100644
--- a/glue/utils/array.py
+++ b/glue/utils/array.py
@@ -1,5 +1,7 @@
 from __future__ import absolute_import, division, print_function
 
+import warnings
+
 import numpy as np
 from numpy.lib.stride_tricks import as_strided
 
@@ -390,3 +392,85 @@ def format_minimal(values):
         if len(strings) == len(set(strings)):
             break
     return fmt, strings
+
+
+
+PLAIN_FUNCTIONS = {'minimum': np.min,
+                   'maximum': np.max,
+                   'mean': np.mean,
+                   'median': np.median,
+                   'sum': np.sum,
+                   'percentile': np.percentile}
+
+NAN_FUNCTIONS = {'minimum': nanmin,
+                 'maximum': nanmax,
+                 'mean': nanmean,
+                 'median': nanmedian,
+                 'sum': nansum,
+                 'percentile': np.nanpercentile}
+
+
+def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
+                      positive=False, percentile=None):
+    """
+    Compute a statistic for the data.
+
+    Parameters
+    ----------
+    statistic : {'count', 'min', 'max', 'mean', 'median', 'sum', 'percentile'}
+        The statistic to compute
+    data : `numpy.ndarray`
+        The data to compute the statistic for.
+    mask : `numpy.ndarray`
+        The mask to apply when computing the statistic.
+    axis : None or int or tuple of int
+        If specified, the axis/axes to compute the statistic over.
+    finite : bool, optional
+        Whether to include only finite values in the statistic. This should
+        be `True` to ignore NaN/Inf values
+    positive : bool, optional
+        Whether to include only (strictly) positive values in the statistic.
+        This is used for example when computing statistics of data shown in
+        log space.
+    percentile : float, optional
+        If ``statistic`` is ``'percentile'``, the ``percentile`` argument
+        should be given and specify the percentile to calculate in the
+        range [0:100]
+    """
+
+    # NOTE: this function should not ever have to use glue-specific objects.
+    # The aim is to eventually use a fast C implementation of this function.
+
+    if statistic not in PLAIN_FUNCTIONS:
+        raise ValueError("Unrecognized statistic: {0}".format(statistic))
+
+    if finite or positive or mask:
+
+        keep = np.ones(data.shape, dtype=bool)
+
+        if finite:
+            keep &= np.isfinite(data)
+
+        if positive:
+            keep &= data > 0
+
+        if mask:
+            keep &= mask
+
+        if axis is None:
+            data = data[keep]
+        else:
+            data[keep] = np.nan
+
+        function = NAN_FUNCTIONS[statistic]
+
+    else:
+
+        function = PLAIN_FUNCTIONS[statistic]
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        if statistic == 'percentile':
+            return function(data, percentile, axis=axis)
+        else:
+            return function(data, axis=axis)
diff --git a/glue/viewers/profile/state.py b/glue/viewers/profile/state.py
index 5b3ff91fd..eb40f55e0 100644
--- a/glue/viewers/profile/state.py
+++ b/glue/viewers/profile/state.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 
-from glue.core import Data, Subset, Coordinates
+from glue.core import Data, Coordinates
 from glue.external.echo import delay_callback
 from glue.viewers.matplotlib.state import (MatplotlibDataViewerState,
                                            MatplotlibLayerState,
@@ -13,23 +13,19 @@
                                            DeferredDrawSelectionCallbackProperty as DDSCProperty)
 from glue.core.state_objects import StateAttributeLimitsHelper
 from glue.core.data_combo_helper import ManualDataComboHelper, ComponentIDComboHelper
-from glue.utils import defer_draw, nanmean, nanmedian, nansum, nanmin, nanmax, iterate_chunks
+from glue.utils import defer_draw, nanmean, nanmedian, nansum, nanmin, nanmax
 from glue.core.link_manager import is_convertible_to_single_pixel_cid
-from glue.core.exceptions import IncompatibleDataException
+from glue.core.exceptions import IncompatibleAttribute, IncompatibleDataException
 from glue.core.subset import SliceSubsetState
 
 __all__ = ['ProfileViewerState', 'ProfileLayerState']
 
 
-FUNCTIONS = OrderedDict([(nanmax, 'Maximum'),
-                         (nanmin, 'Minimum'),
-                         (nanmean, 'Mean'),
-                         (nanmedian, 'Median'),
-                         (nansum, 'Sum')])
-
-# Maximum number of elements in a chunk size used to compute the profile - this
-# prevents the profile calculation from using up too much memory at a time.
-N_CHUNK_MAX = 50000000
+FUNCTIONS = OrderedDict([('maximum', 'Maximum'),
+                         ('minimum', 'Minimum'),
+                         ('mean', 'Mean'),
+                         ('median', 'Median'),
+                         ('sum', 'Sum')])
 
 
 class ProfileViewerState(MatplotlibDataViewerState):
@@ -216,51 +212,15 @@ def update_profile(self, update_limits=True):
         # smaller than the data to just average the relevant 'spaxels' in the
         # data rather than collapsing the whole cube.
 
-        # We operate in chunks here to avoid memory issues
-
-        axis_index = pix_cid.axis
-
-        profile_values = np.zeros(self.layer.shape[axis_index])
-
-        if isinstance(self.layer, Subset) and isinstance(self.layer.subset_state, SliceSubsetState):
-            chunk_shape = self.layer.shape
+        if isinstance(self.layer, Data):
+            data = self.layer
+            subset_state = None
         else:
-            chunk_shape = list(self.layer.shape)
-            if self.layer.size > N_CHUNK_MAX:
-                # Deliberately leave n_chunks as float to not round twice
-                n_chunks = self.layer.size / N_CHUNK_MAX
-                chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks))
-
-        # TODO: there are cases where the code below is not optimized because
-        # the mask may be computable for a single slice and broadcastable to all
-        # slices - normally ROISubsetState takes care of that but if we call it
-        # once per view it won't. In the future we could ask a SubsetState
-        # whether it is broadcasted along axis_index.
-
-        for view in iterate_chunks(self.layer.shape, chunk_shape=chunk_shape):
-
-            if isinstance(self.layer, Data):
-                data = self.layer
-                data_values = data[self.attribute, view]
-            else:
-                data = self.layer.data
-                if isinstance(self.layer.subset_state, SliceSubsetState):
-                    data_values = self.layer.subset_state.to_array(self.layer.data, self.attribute)
-                else:
-                    # We need to force a copy *and* convert to float just in case
-                    data_values = np.array(data[self.attribute, view], dtype=float)
-                    mask = self.layer.to_mask(view=view)
-                    data_values[~mask] = np.nan
-
-            # Collapse along all dimensions except x_att
-            if self.layer.ndim > 1:
-                with warnings.catch_warnings():
-                    warnings.simplefilter("ignore", category=RuntimeWarning)
-                    profile_values[view[axis_index]] = self.viewer_state.function(data_values, axis=axes)
-            else:
-                profile_values[view[axis_index]] = data_values
+            data = self.layer.data
+            subset_state = self.layer.subset_state
+
+        profile_values = data.compute_statistic(self.viewer_state.function, self.attribute, axis=axes, subset_state=subset_state)
 
-        # Finally, we get the coordinate values for the requested axis
         if np.all(np.isnan(profile_values)):
             self._profile_cache = [], []
         else:

From d73cbce60fa4a369aba78824cdd109f451063f81 Mon Sep 17 00:00:00 2001
From: Thomas Robitaille <thomas.robitaille@gmail.com>
Date: Tue, 22 May 2018 18:40:48 +0100
Subject: [PATCH 2/6] Implemented random_subset in compute_statistic

---
 glue/core/data.py          | 13 ++++++++++-
 glue/core/state_objects.py | 44 +++++++++++++-------------------------
 glue/utils/array.py        |  2 +-
 3 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/glue/core/data.py b/glue/core/data.py
index 7afaa91f5..9adc3282b 100644
--- a/glue/core/data.py
+++ b/glue/core/data.py
@@ -1160,7 +1160,8 @@ def update_values_from_data(self, data):
     # can be overriden by subclasses that want to improve performance.
 
     def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
-                          finite=True, positive=False, percentile=None, view=None):
+                          finite=True, positive=False, percentile=None, view=None,
+                          random_subset=None):
         """
         Compute a statistic for the data.
 
@@ -1188,6 +1189,9 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             If ``statistic`` is ``'percentile'``, the ``percentile`` argument
             should be given and specify the percentile to calculate in the
             range [0:100]
+        random_subset : int, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic. This can only be used if ``axis`` is `None`
         """
 
         # TODO: generalize chunking to tuple axis (not just int)
@@ -1237,6 +1241,13 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             # the statistics.
             data = unbroadcast(data)
 
+        if random_subset and data.size > random_subset:
+            if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size:
+                self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset))
+            data = data.ravel()[self._random_subset_indices[1]]
+            if mask is not None:
+                mask = mask.ravel()[self._random_subset_indices[1]]
+
         return compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
                                  positive=positive, percentile=percentile)
 
diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py
index 0e5adbfb3..12b17952a 100644
--- a/glue/core/state_objects.py
+++ b/glue/core/state_objects.py
@@ -247,7 +247,7 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper):
     attribute : str
         The attribute name - this will be populated once a dataset is assigned
         to the helper.
-    percentile_subset : int
+    random_subset : int
         How many points to use at most for the percentile calculation (using all
         values is highly inefficient and not needed)
     margin : float
@@ -279,12 +279,12 @@ class StateAttributeLimitsHelper(StateAttributeCacheHelper):
     values_names = ('lower', 'upper')
     modifiers_names = ('log', 'percentile')
 
-    def __init__(self, state, attribute, percentile_subset=10000, margin=0, cache=None, **kwargs):
+    def __init__(self, state, attribute, random_subset=10000, margin=0, cache=None, **kwargs):
 
         super(StateAttributeLimitsHelper, self).__init__(state, attribute, cache=cache, **kwargs)
 
         self.margin = margin
-        self.percentile_subset = percentile_subset
+        self.random_subset = random_subset
         self.subset_indices = None
 
         if self.attribute is not None:
@@ -327,40 +327,26 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
 
             exclude = (100 - percentile) / 2.
 
-            # data_values = self.data_values
             data_component = self.data_component
 
-            # NOTE: specific to issues with local data
-            # if data_component.size > self.percentile_subset:
-            #     if self.subset_indices is None or self.subset_indices[0] != data_component.size:
-            #         self.subset_indices = (data_component.size,
-            #                                np.random.randint(0, data_component.size,
-            #                                                  self.percentile_subset))
-            #     data_values = data_values.ravel()[self.subset_indices[1]]
-
             if log and not data_component.any_positive:
                 self.set(lower=0.1, upper=1, percentile=percentile, log=log)
                 return
 
-            # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
-            # they don't exclude inf/-inf
-            # if data_values.dtype.kind != 'M':
-            #     data_values = data_values[np.isfinite(data_values)]
-
             if percentile == 100:
-
-                # if data_values.dtype.kind == 'M':
-                #     lower = data_values.min()
-                #     upper = data_values.max()
-                # else:
-                # TODO: have a way to ask for the min/max of positive values
-                lower = self.data.compute_statistic('min', cid=self.component_id, finite=True, positive=log)
-                upper = self.data.compute_statistic('max', cid=self.component_id, finite=True, positive=log)
-
+                lower = self.data.compute_statistic('min', cid=self.component_id,
+                                                    finite=True, positive=log,
+                                                    random_subset=self.random_subset)
+                upper = self.data.compute_statistic('max', cid=self.component_id,
+                                                    finite=True, positive=log,
+                                                    random_subset=self.random_subset)
             else:
-
-                lower = self.data.compute_statistic('percentile', cid=self.component_id, percentile=exclude, positive=log)
-                upper = self.data.compute_statistic('percentile', cid=self.component_id, percentile=100 - exclude, positive=log)
+                lower = self.data.compute_statistic('percentile', cid=self.component_id,
+                                                    percentile=exclude, positive=log,
+                                                    random_subset=self.random_subset)
+                upper = self.data.compute_statistic('percentile', cid=self.component_id,
+                                                    percentile=100 - exclude, positive=log,
+                                                    random_subset=self.random_subset)
 
             if np.isnan(lower) or np.isnan(upper):
 
diff --git a/glue/utils/array.py b/glue/utils/array.py
index 8ad4230f6..f31ede613 100644
--- a/glue/utils/array.py
+++ b/glue/utils/array.py
@@ -448,7 +448,7 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
 
         keep = np.ones(data.shape, dtype=bool)
 
-        if finite:
+        if data.dtype.kind != 'M' and finite:
             keep &= np.isfinite(data)
 
         if positive:

From f307c71fcbbbd5105611e9b7b0959722025484eb Mon Sep 17 00:00:00 2001
From: Thomas Robitaille <thomas.robitaille@gmail.com>
Date: Tue, 22 May 2018 19:00:56 +0100
Subject: [PATCH 3/6] Clean up changes

---
 doc/whatsnew/whatsnew.rst                     |  2 +-
 glue/core/data.py                             | 24 ++++++++++++-------
 glue/core/state_objects.py                    | 21 ++++------------
 glue/utils/array.py                           | 14 ++++++-----
 .../profile/qt/tests/test_data_viewer.py      |  2 +-
 .../profile/qt/tests/test_profile_tools.py    |  2 +-
 glue/viewers/profile/tests/test_state.py      | 16 ++++++-------
 7 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/doc/whatsnew/whatsnew.rst b/doc/whatsnew/whatsnew.rst
index a6d2b0ce2..d9c21daa2 100644
--- a/doc/whatsnew/whatsnew.rst
+++ b/doc/whatsnew/whatsnew.rst
@@ -163,7 +163,7 @@ Profile viewer
 
 Glue now features a new profile viewer that can be used to show data collapsed
 along all but one dimension using a variety of functions (mean, median, maximum,
-minimim, and so on). This new viewer replaces the previous 'spectrum' tool
+minimum, and so on). This new viewer replaces the previous 'spectrum' tool
 (which was restricted to 3 dimensions and mostly designed to work with
 astronomical data) and includes the same functionality to fit models to profiles
 or collapse data in an image viewer based on an interval selected in the profile
diff --git a/glue/core/data.py b/glue/core/data.py
index 9adc3282b..7eb249dd3 100644
--- a/glue/core/data.py
+++ b/glue/core/data.py
@@ -24,7 +24,7 @@
 from glue.core.coordinates import Coordinates
 from glue.core.contracts import contract
 from glue.config import settings
-from glue.utils import compute_statistic, unbroadcast
+from glue.utils import compute_statistic, unbroadcast, iterate_chunks
 
 
 # Note: leave all the following imports for component and component_id since
@@ -1194,9 +1194,12 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             to use for the statistic. This can only be used if ``axis`` is `None`
         """
 
-        # TODO: generalize chunking to tuple axis (not just int)
+        # TODO: generalize chunking to more types of axis
 
-        if (view is None and isinstance(axis, int) and self.size > N_CHUNK_MAX and
+        if (view is None and
+                isinstance(axis, tuple) and
+                len(axis) == self.ndim - 1 and
+                self.size > N_CHUNK_MAX and
                 not isinstance(subset_state, SliceSubsetState)):
 
             # We operate in chunks here to avoid memory issues.
@@ -1208,19 +1211,22 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             # could ask a SubsetState whether it is broadcasted along
             # axis_index.
 
-            result = np.zeros(self.shape[axis])
+            axis_index = [a for a in range(self.ndim) if a not in axis][0]
+
+            result = np.zeros(self.shape[axis_index])
 
             chunk_shape = list(self.shape)
 
             # Deliberately leave n_chunks as float to not round twice
-            n_chunks = self.layer.size / N_CHUNK_MAX
+            n_chunks = self.size / N_CHUNK_MAX
 
             chunk_shape[axis_index] = max(1, int(chunk_shape[axis_index] / n_chunks))
 
-            for view in iterate_chunks(self.layer.shape, chunk_shape=chunk_shape):
-                result[view[axis]] = self.compute_statistic(statistic, cid, subset_state=subset_state,
-                                                            axis=axis, finite=finite, positive=positive,
-                                                            percentile=percentile, view=view)
+            for chunk_view in iterate_chunks(self.shape, chunk_shape=chunk_shape):
+                values = self.compute_statistic(statistic, cid, subset_state=subset_state,
+                                                axis=axis, finite=finite, positive=positive,
+                                                percentile=percentile, view=chunk_view)
+                result[chunk_view[axis_index]] = values
 
             return result
 
diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py
index 12b17952a..bf7843912 100644
--- a/glue/core/state_objects.py
+++ b/glue/core/state_objects.py
@@ -329,15 +329,11 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
 
             data_component = self.data_component
 
-            if log and not data_component.any_positive:
-                self.set(lower=0.1, upper=1, percentile=percentile, log=log)
-                return
-
             if percentile == 100:
-                lower = self.data.compute_statistic('min', cid=self.component_id,
+                lower = self.data.compute_statistic('minimum', cid=self.component_id,
                                                     finite=True, positive=log,
                                                     random_subset=self.random_subset)
-                upper = self.data.compute_statistic('max', cid=self.component_id,
+                upper = self.data.compute_statistic('maximum', cid=self.component_id,
                                                     finite=True, positive=log,
                                                     random_subset=self.random_subset)
             else:
@@ -349,9 +345,7 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                                                     random_subset=self.random_subset)
 
             if np.isnan(lower) or np.isnan(upper):
-
                 lower, upper = 0, 1
-
             else:
 
                 if data_component.categorical:
@@ -467,15 +461,8 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                 else:
                     n_bin = self._common_n_bin
 
-                data_component = self.data_component
-
-                # NOTE: we can't use np.nanmin/np.nanmax or nanpercentile below as
-                # they don't exclude inf/-inf
-                # if values.dtype.kind != 'M':
-                #     values = values[np.isfinite(values)]
-
-                lower = self.data.compute_statistic('min', cid=self.component_id, finite=True)
-                upper = self.data.compute_statistic('max', cid=self.component_id, finite=True)
+                lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True)
+                upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True)
 
                 if np.isnan(lower) or np.isnan(upper):
                     lower, upper = 0, 1
diff --git a/glue/utils/array.py b/glue/utils/array.py
index f31ede613..65e0c784e 100644
--- a/glue/utils/array.py
+++ b/glue/utils/array.py
@@ -15,7 +15,7 @@
 __all__ = ['unique', 'shape_to_string', 'view_shape', 'stack_view',
            'coerce_numeric', 'check_sorted', 'broadcast_to', 'unbroadcast',
            'iterate_chunks', 'combine_slices', 'nanmean', 'nanmedian', 'nansum',
-           'nanmin', 'nanmax', 'format_minimal']
+           'nanmin', 'nanmax', 'format_minimal', 'compute_statistic']
 
 
 def unbroadcast(array):
@@ -394,7 +394,6 @@ def format_minimal(values):
     return fmt, strings
 
 
-
 PLAIN_FUNCTIONS = {'minimum': np.min,
                    'maximum': np.max,
                    'mean': np.mean,
@@ -417,7 +416,7 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
 
     Parameters
     ----------
-    statistic : {'count', 'min', 'max', 'mean', 'median', 'sum', 'percentile'}
+    statistic : {'minimum', 'maximum', 'mean', 'median', 'sum', 'percentile'}
         The statistic to compute
     data : `numpy.ndarray`
         The data to compute the statistic for.
@@ -444,7 +443,7 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
     if statistic not in PLAIN_FUNCTIONS:
         raise ValueError("Unrecognized statistic: {0}".format(statistic))
 
-    if finite or positive or mask:
+    if finite or positive or mask is not None:
 
         keep = np.ones(data.shape, dtype=bool)
 
@@ -454,13 +453,16 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
         if positive:
             keep &= data > 0
 
-        if mask:
+        if mask is not None:
             keep &= mask
 
         if axis is None:
             data = data[keep]
         else:
-            data[keep] = np.nan
+            # Force a copy so the caller's array is not modified in place, and
+            # convert to float since integer arrays cannot store NaN
+            data = np.array(data, dtype=float)
+            data[~keep] = np.nan
 
         function = NAN_FUNCTIONS[statistic]
 
diff --git a/glue/viewers/profile/qt/tests/test_data_viewer.py b/glue/viewers/profile/qt/tests/test_data_viewer.py
index af733031e..e0c26eca4 100644
--- a/glue/viewers/profile/qt/tests/test_data_viewer.py
+++ b/glue/viewers/profile/qt/tests/test_data_viewer.py
@@ -62,7 +62,7 @@ def teardown_method(self, method):
 
     def test_functions(self):
         self.viewer.add_data(self.data)
-        self.viewer.state.function = nanmean
+        self.viewer.state.function = 'mean'
         assert len(self.viewer.layers) == 1
         layer_artist = self.viewer.layers[0]
         layer_artist.wait()
diff --git a/glue/viewers/profile/qt/tests/test_profile_tools.py b/glue/viewers/profile/qt/tests/test_profile_tools.py
index 274e76207..3dad372ea 100644
--- a/glue/viewers/profile/qt/tests/test_profile_tools.py
+++ b/glue/viewers/profile/qt/tests/test_profile_tools.py
@@ -33,7 +33,7 @@ def setup_method(self, method):
         self.data_collection.append(self.data)
 
         self.viewer = self.app.new_data_viewer(ProfileViewer)
-        self.viewer.state.function = nanmean
+        self.viewer.state.function = 'mean'
 
         self.viewer.toolbar.active_tool = 'profile-analysis'
 
diff --git a/glue/viewers/profile/tests/test_state.py b/glue/viewers/profile/tests/test_state.py
index ddaf3f957..a8884c871 100644
--- a/glue/viewers/profile/tests/test_state.py
+++ b/glue/viewers/profile/tests/test_state.py
@@ -30,7 +30,7 @@ def setup_method(self, method):
         self.layer_state = ProfileLayerState(viewer_state=self.viewer_state,
                                              layer=self.data)
         self.viewer_state.layers.append(self.layer_state)
-        self.viewer_state.function = nanmean
+        self.viewer_state.function = 'mean'
 
     def test_basic(self):
         x, y = self.layer_state.profile
@@ -62,23 +62,23 @@ def test_x_att(self):
 
     def test_function(self):
 
-        self.viewer_state.function = nanmean
+        self.viewer_state.function = 'mean'
         x, y = self.layer_state.profile
         assert_allclose(y, [3.5, 11.5, 19.5])
 
-        self.viewer_state.function = nanmin
+        self.viewer_state.function = 'minimum'
         x, y = self.layer_state.profile
         assert_allclose(y, [0, 8, 16])
 
-        self.viewer_state.function = nanmax
+        self.viewer_state.function = 'maximum'
         x, y = self.layer_state.profile
         assert_allclose(y, [7, 15, 23])
 
-        self.viewer_state.function = nansum
+        self.viewer_state.function = 'sum'
         x, y = self.layer_state.profile
         assert_allclose(y, [28, 92, 156])
 
-        self.viewer_state.function = nanmedian
+        self.viewer_state.function = 'median'
         x, y = self.layer_state.profile
         assert_allclose(y, [3.5, 11.5, 19.5])
 
@@ -105,7 +105,7 @@ def test_subset(self):
     def test_clone(self):
 
         self.viewer_state.x_att = self.data.pixel_component_ids[1]
-        self.viewer_state.function = nanmedian
+        self.viewer_state.function = 'median'
 
         self.layer_state.attribute = self.data.id['x']
         self.layer_state.linewidth = 3
@@ -113,7 +113,7 @@ def test_clone(self):
         viewer_state_new = clone(self.viewer_state)
 
         assert viewer_state_new.x_att.label == 'Pixel Axis 1 [y]'
-        assert viewer_state_new.function is nanmedian
+        assert viewer_state_new.function == 'median'
 
         assert self.layer_state.attribute.label == 'x'
         assert self.layer_state.linewidth == 3

From df444b11f15e16229c1a92f4b10f6428688ae27f Mon Sep 17 00:00:00 2001
From: Thomas Robitaille <thomas.robitaille@gmail.com>
Date: Tue, 22 May 2018 19:52:35 +0100
Subject: [PATCH 4/6] PEP8 and fix to to_array

---
 glue/core/data.py                                 | 4 ++--
 glue/viewers/profile/qt/tests/test_data_viewer.py | 1 -
 glue/viewers/profile/state.py                     | 6 ++----
 glue/viewers/profile/tests/test_state.py          | 1 -
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/glue/core/data.py b/glue/core/data.py
index 7eb249dd3..997b5f6cd 100644
--- a/glue/core/data.py
+++ b/glue/core/data.py
@@ -1231,8 +1231,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             return result
 
         if subset_state:
-            if isinstance(subset_state, SliceSubsetState):
-                data = subset_state.to_array(self, cid, view)
+            if isinstance(subset_state, SliceSubsetState) and view is None:
+                data = subset_state.to_array(self, cid)
                 mask = None
             else:
                 data = self[cid]
diff --git a/glue/viewers/profile/qt/tests/test_data_viewer.py b/glue/viewers/profile/qt/tests/test_data_viewer.py
index e0c26eca4..836a1b339 100644
--- a/glue/viewers/profile/qt/tests/test_data_viewer.py
+++ b/glue/viewers/profile/qt/tests/test_data_viewer.py
@@ -16,7 +16,6 @@
 from glue.core.component_link import ComponentLink
 from glue.viewers.matplotlib.qt.tests.test_data_viewer import BaseTestMatplotlibDataViewer
 from glue.viewers.profile.tests.test_state import SimpleCoordinates
-from glue.utils import nanmean
 from glue.core.tests.test_state import clone
 
 from ..data_viewer import ProfileViewer
diff --git a/glue/viewers/profile/state.py b/glue/viewers/profile/state.py
index eb40f55e0..075cf3475 100644
--- a/glue/viewers/profile/state.py
+++ b/glue/viewers/profile/state.py
@@ -1,6 +1,5 @@
 from __future__ import absolute_import, division, print_function
 
-import warnings
 from collections import OrderedDict
 
 import numpy as np
@@ -13,10 +12,9 @@
                                            DeferredDrawSelectionCallbackProperty as DDSCProperty)
 from glue.core.state_objects import StateAttributeLimitsHelper
 from glue.core.data_combo_helper import ManualDataComboHelper, ComponentIDComboHelper
-from glue.utils import defer_draw, nanmean, nanmedian, nansum, nanmin, nanmax
+from glue.utils import defer_draw, nanmin, nanmax
 from glue.core.link_manager import is_convertible_to_single_pixel_cid
-from glue.core.exceptions import IncompatibleAttribute, IncompatibleDataException
-from glue.core.subset import SliceSubsetState
+from glue.core.exceptions import IncompatibleDataException
 
 __all__ = ['ProfileViewerState', 'ProfileLayerState']
 
diff --git a/glue/viewers/profile/tests/test_state.py b/glue/viewers/profile/tests/test_state.py
index a8884c871..67d37b324 100644
--- a/glue/viewers/profile/tests/test_state.py
+++ b/glue/viewers/profile/tests/test_state.py
@@ -6,7 +6,6 @@
 
 from glue.core import Data, Coordinates
 from glue.core.tests.test_state import clone
-from glue.utils import nanmean, nanmedian, nansum, nanmin, nanmax
 
 from ..state import ProfileViewerState, ProfileLayerState
 

From cb4109d5773bf14b4febc87832d8a0b0eb0a08cd Mon Sep 17 00:00:00 2001
From: Thomas Robitaille <thomas.robitaille@gmail.com>
Date: Tue, 22 May 2018 20:02:54 +0100
Subject: [PATCH 5/6] Fix case when all values are NaN

---
 glue/utils/array.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/glue/utils/array.py b/glue/utils/array.py
index 65e0c784e..135e347fd 100644
--- a/glue/utils/array.py
+++ b/glue/utils/array.py
@@ -470,6 +470,9 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
 
         function = PLAIN_FUNCTIONS[statistic]
 
+    if data.size == 0:
+        return np.nan
+
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
         if statistic == 'percentile':

From d29cf8b59e71d054aeed7de118b8de789ed5fab0 Mon Sep 17 00:00:00 2001
From: Thomas Robitaille <thomas.robitaille@gmail.com>
Date: Tue, 22 May 2018 20:13:50 +0100
Subject: [PATCH 6/6] Fix tests

---
 glue/core/state_objects.py | 4 ++--
 glue/utils/array.py        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/glue/core/state_objects.py b/glue/core/state_objects.py
index bf7843912..da5ad18b2 100644
--- a/glue/core/state_objects.py
+++ b/glue/core/state_objects.py
@@ -344,7 +344,7 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                                                     percentile=100 - exclude, positive=log,
                                                     random_subset=self.random_subset)
 
-            if np.isnan(lower) or np.isnan(upper):
+            if not isinstance(lower, np.datetime64) and np.isnan(lower):
                 lower, upper = 0, 1
             else:
 
@@ -464,7 +464,7 @@ def update_values(self, force=False, use_default_modifiers=False, **properties):
                 lower = self.data.compute_statistic('minimum', cid=self.component_id, finite=True)
                 upper = self.data.compute_statistic('maximum', cid=self.component_id, finite=True)
 
-                if np.isnan(lower) or np.isnan(upper):
+                if not isinstance(lower, np.datetime64) and np.isnan(lower):
                     lower, upper = 0, 1
 
             self.set(lower=lower, upper=upper, n_bin=n_bin)
diff --git a/glue/utils/array.py b/glue/utils/array.py
index 135e347fd..a9539c53d 100644
--- a/glue/utils/array.py
+++ b/glue/utils/array.py
@@ -443,11 +443,11 @@ def compute_statistic(statistic, data, mask=None, axis=None, finite=True,
     if statistic not in PLAIN_FUNCTIONS:
         raise ValueError("Unrecognized statistic: {0}".format(statistic))
 
-    if finite or positive or mask is not None:
+    if (finite or positive or mask is not None) and data.dtype.kind != 'M':
 
         keep = np.ones(data.shape, dtype=bool)
 
-        if data.dtype.kind != 'M' and finite:
+        if finite:
             keep &= np.isfinite(data)
 
         if positive: