From ccb7a6d22132b2ff9cf2723fda8d10c79b5b7ea2 Mon Sep 17 00:00:00 2001
From: tdelteil <thomas.delteil1@gmail.com>
Date: Wed, 24 Oct 2018 00:13:20 +0000
Subject: [PATCH 1/3] update dec example

---
 example/deep-embedded-clustering/README.md |  9 +++++++++
 example/deep-embedded-clustering/dec.py    | 22 ++++++++++++----------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/example/deep-embedded-clustering/README.md b/example/deep-embedded-clustering/README.md
index 90803d2ed12d..eec89e55a73d 100644
--- a/example/deep-embedded-clustering/README.md
+++ b/example/deep-embedded-clustering/README.md
@@ -1,9 +1,18 @@
 # DEC Implementation
 This is based on the paper `Unsupervised deep embedding for clustering analysis` by  Junyuan Xie, Ross Girshick, and Ali Farhadi
 
+Abstract:
+
+Clustering is central to many data-driven application domains and has been studied extensively in terms of distance functions and grouping algorithms. Relatively little work has focused on learning representations for clustering. In this paper, we propose Deep Embedded Clustering (DEC), a method that simultaneously learns feature representations and cluster assignments using deep neural networks. DEC learns a mapping from the data space to a lower-dimensional feature space in which it iteratively optimizes a clustering objective. Our experimental evaluations on image and text corpora show significant improvement over state-of-the-art methods.
+
+
 ## Prerequisite
   - Install Scikit-learn: `python -m pip install --user sklearn`
   - Install SciPy: `python -m pip install --user scipy`
 
+## Data
+
+The script is using MNIST dataset
+
 ## Usage
 run `python dec.py`
\ No newline at end of file
diff --git a/example/deep-embedded-clustering/dec.py b/example/deep-embedded-clustering/dec.py
index 44e582d9f94d..e1f76583a1bd 100644
--- a/example/deep-embedded-clustering/dec.py
+++ b/example/deep-embedded-clustering/dec.py
@@ -33,14 +33,14 @@
 import logging
 
 def cluster_acc(Y_pred, Y):
-  from sklearn.utils.linear_assignment_ import linear_assignment
-  assert Y_pred.size == Y.size
-  D = max(Y_pred.max(), Y.max())+1
-  w = np.zeros((D,D), dtype=np.int64)
-  for i in range(Y_pred.size):
-    w[Y_pred[i], int(Y[i])] += 1
-  ind = linear_assignment(w.max() - w)
-  return sum([w[i,j] for i,j in ind])*1.0/Y_pred.size, w
+    from sklearn.utils.linear_assignment_ import linear_assignment
+    assert Y_pred.size == Y.size
+    D = max(Y_pred.max(), Y.max())+1
+    w = np.zeros((D,D), dtype=np.int64)
+    for i in range(Y_pred.size):
+        w[Y_pred[i], int(Y[i])] += 1
+    ind = linear_assignment(w.max() - w)
+    return sum([w[i,j] for i,j in ind])*1.0/Y_pred.size, w
 
 class DECModel(model.MXModel):
     class DECLoss(mx.operator.NumpyOp):
@@ -87,9 +87,9 @@ def setup(self, X, num_centers, alpha, save_to='dec_model'):
         ae_model = AutoEncoderModel(self.xpu, [X.shape[1],500,500,2000,10], pt_dropout=0.2)
         if not os.path.exists(save_to+'_pt.arg'):
             ae_model.layerwise_pretrain(X_train, 256, 50000, 'sgd', l_rate=0.1, decay=0.0,
-                                        lr_scheduler=mx.misc.FactorScheduler(20000,0.1))
+                                        lr_scheduler=mx.lr_scheduler.FactorScheduler(20000,0.1))
             ae_model.finetune(X_train, 256, 100000, 'sgd', l_rate=0.1, decay=0.0,
-                              lr_scheduler=mx.misc.FactorScheduler(20000,0.1))
+                              lr_scheduler=mx.lr_scheduler.FactorScheduler(20000,0.1))
             ae_model.save(save_to+'_pt.arg')
             logging.log(logging.INFO, "Autoencoder Training error: %f"%ae_model.eval(X_train))
             logging.log(logging.INFO, "Autoencoder Validation error: %f"%ae_model.eval(X_val))
@@ -160,6 +160,8 @@ def refresh(i):
 
 def mnist_exp(xpu):
     X, Y = data.get_mnist()
+    if not os.path.isdir('data'):
+        os.makedirs('data')
     dec_model = DECModel(xpu, X, 10, 1.0, 'data/mnist')
     acc = []
     for i in [10*(2**j) for j in range(9)]:

From 956e52cf82c6a1963b125619330f6ce6311557e8 Mon Sep 17 00:00:00 2001
From: Thomas Delteil <thomas.delteil1@gmail.com>
Date: Wed, 24 Oct 2018 15:17:46 -0700
Subject: [PATCH 2/3] trigger CI

---
 example/deep-embedded-clustering/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/deep-embedded-clustering/README.md b/example/deep-embedded-clustering/README.md
index eec89e55a73d..3972f90bda4a 100644
--- a/example/deep-embedded-clustering/README.md
+++ b/example/deep-embedded-clustering/README.md
@@ -12,7 +12,7 @@ Clustering is central to many data-driven application domains and has been studi
 
 ## Data
 
-The script is using MNIST dataset
+The script uses the MNIST dataset.
 
 ## Usage
-run `python dec.py`
\ No newline at end of file
+Run `python dec.py`

From 5a40b3d24d4aa5283986b5b2108dd5a3913bbb56 Mon Sep 17 00:00:00 2001
From: tdelteil <thomas.delteil1@gmail.com>
Date: Mon, 5 Nov 2018 19:12:23 +0000
Subject: [PATCH 3/3] update to remove dependency on sklearn data

---
 example/deep-embedded-clustering/data.py | 15 ++++++++-------
 example/deep-embedded-clustering/dec.py  |  3 ---
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/example/deep-embedded-clustering/data.py b/example/deep-embedded-clustering/data.py
index 9fd472e6a8b1..3649990dbbb9 100644
--- a/example/deep-embedded-clustering/data.py
+++ b/example/deep-embedded-clustering/data.py
@@ -19,20 +19,21 @@
 from __future__ import print_function
 
 import os
+
+import mxnet as mx
 import numpy as np
-from sklearn.datasets import fetch_mldata
 
 
 def get_mnist():
     """ Gets MNIST dataset """
 
     np.random.seed(1234) # set seed for deterministic ordering
-    data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    data_path = os.path.join(data_path, '../../data')
-    mnist = fetch_mldata('MNIST original', data_home=data_path)
-    p = np.random.permutation(mnist.data.shape[0])
-    X = mnist.data[p].astype(np.float32)*0.02
-    Y = mnist.target[p]
+    mnist_data = mx.test_utils.get_mnist()
+    X = np.concatenate([mnist_data['train_data'], mnist_data['test_data']])
+    Y = np.concatenate([mnist_data['train_label'], mnist_data['test_label']])
+    p = np.random.permutation(X.shape[0])
+    X = X[p].reshape((X.shape[0], -1)).astype(np.float32)*5
+    Y = Y[p]
     return X, Y
 
 
diff --git a/example/deep-embedded-clustering/dec.py b/example/deep-embedded-clustering/dec.py
index e1f76583a1bd..d7594703cfc3 100644
--- a/example/deep-embedded-clustering/dec.py
+++ b/example/deep-embedded-clustering/dec.py
@@ -19,9 +19,6 @@
 from __future__ import print_function
 import sys
 import os
-# code to automatically download dataset
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path = [os.path.join(curr_path, "../autoencoder")] + sys.path
 import mxnet as mx
 import numpy as np
 import data