From ccb7a6d22132b2ff9cf2723fda8d10c79b5b7ea2 Mon Sep 17 00:00:00 2001 From: tdelteil Date: Wed, 24 Oct 2018 00:13:20 +0000 Subject: [PATCH 1/3] update dec example --- example/deep-embedded-clustering/README.md | 9 +++++++++ example/deep-embedded-clustering/dec.py | 22 ++++++++++++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/example/deep-embedded-clustering/README.md b/example/deep-embedded-clustering/README.md index 90803d2ed12d..eec89e55a73d 100644 --- a/example/deep-embedded-clustering/README.md +++ b/example/deep-embedded-clustering/README.md @@ -1,9 +1,18 @@ # DEC Implementation This is based on the paper `Unsupervised deep embedding for clustering analysis` by Junyuan Xie, Ross Girshick, and Ali Farhadi +Abstract: + +Clustering is central to many data-driven application domains and has been studied extensively in terms of distance functions and grouping algorithms. Relatively little work has focused on learning representations for clustering. In this paper, we propose Deep Embedded Clustering (DEC), a method that simultaneously learns feature representations and cluster assignments using deep neural networks. DEC learns a mapping from the data space to a lower-dimensional feature space in which it iteratively optimizes a clustering objective. Our experimental evaluations on image and text corpora show significant improvement over state-of-the-art methods. + + ## Prerequisite - Install Scikit-learn: `python -m pip install --user sklearn` - Install SciPy: `python -m pip install --user scipy` +## Data + +The script is using MNIST dataset + ## Usage run `python dec.py` \ No newline at end of file diff --git a/example/deep-embedded-clustering/dec.py b/example/deep-embedded-clustering/dec.py index 44e582d9f94d..e1f76583a1bd 100644 --- a/example/deep-embedded-clustering/dec.py +++ b/example/deep-embedded-clustering/dec.py @@ -33,14 +33,14 @@ import logging def cluster_acc(Y_pred, Y): - from sklearn.utils.linear_assignment_ import linear_assignment - assert Y_pred.size == Y.size - D = max(Y_pred.max(), Y.max())+1 - w = np.zeros((D,D), dtype=np.int64) - for i in range(Y_pred.size): - w[Y_pred[i], int(Y[i])] += 1 - ind = linear_assignment(w.max() - w) - return sum([w[i,j] for i,j in ind])*1.0/Y_pred.size, w + from sklearn.utils.linear_assignment_ import linear_assignment + assert Y_pred.size == Y.size + D = max(Y_pred.max(), Y.max())+1 + w = np.zeros((D,D), dtype=np.int64) + for i in range(Y_pred.size): + w[Y_pred[i], int(Y[i])] += 1 + ind = linear_assignment(w.max() - w) + return sum([w[i,j] for i,j in ind])*1.0/Y_pred.size, w class DECModel(model.MXModel): class DECLoss(mx.operator.NumpyOp): @@ -87,9 +87,9 @@ def setup(self, X, num_centers, alpha, save_to='dec_model'): ae_model = AutoEncoderModel(self.xpu, [X.shape[1],500,500,2000,10], pt_dropout=0.2) if not os.path.exists(save_to+'_pt.arg'): ae_model.layerwise_pretrain(X_train, 256, 50000, 'sgd', l_rate=0.1, decay=0.0, - lr_scheduler=mx.misc.FactorScheduler(20000,0.1)) + lr_scheduler=mx.lr_scheduler.FactorScheduler(20000,0.1)) ae_model.finetune(X_train, 256, 100000, 'sgd', l_rate=0.1, decay=0.0, - lr_scheduler=mx.misc.FactorScheduler(20000,0.1)) + lr_scheduler=mx.lr_scheduler.FactorScheduler(20000,0.1)) ae_model.save(save_to+'_pt.arg') logging.log(logging.INFO, "Autoencoder Training error: %f"%ae_model.eval(X_train)) logging.log(logging.INFO, "Autoencoder Validation error: %f"%ae_model.eval(X_val)) @@ -160,6 +160,8 @@ def refresh(i): def mnist_exp(xpu): X, Y = data.get_mnist() + if not os.path.isdir('data'): + os.makedirs('data') dec_model = DECModel(xpu, X, 10, 1.0, 'data/mnist') acc = [] for i in [10*(2**j) for j in range(9)]: From 956e52cf82c6a1963b125619330f6ce6311557e8 Mon Sep 17 00:00:00 2001 From: Thomas Delteil Date: Wed, 24 Oct 2018 15:17:46 -0700 Subject: [PATCH 2/3] trigger CI --- example/deep-embedded-clustering/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/deep-embedded-clustering/README.md b/example/deep-embedded-clustering/README.md index eec89e55a73d..3972f90bda4a 100644 --- a/example/deep-embedded-clustering/README.md +++ b/example/deep-embedded-clustering/README.md @@ -12,7 +12,7 @@ Clustering is central to many data-driven application domains and has been studi ## Data -The script is using MNIST dataset +The script is using MNIST dataset. ## Usage -run `python dec.py` \ No newline at end of file +run `python dec.py` From 5a40b3d24d4aa5283986b5b2108dd5a3913bbb56 Mon Sep 17 00:00:00 2001 From: tdelteil Date: Mon, 5 Nov 2018 19:12:23 +0000 Subject: [PATCH 3/3] update to remove dependency on sklearn data --- example/deep-embedded-clustering/data.py | 15 ++++++++------- example/deep-embedded-clustering/dec.py | 3 --- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/example/deep-embedded-clustering/data.py b/example/deep-embedded-clustering/data.py index 9fd472e6a8b1..3649990dbbb9 100644 --- a/example/deep-embedded-clustering/data.py +++ b/example/deep-embedded-clustering/data.py @@ -19,20 +19,21 @@ from __future__ import print_function import os + +import mxnet as mx import numpy as np -from sklearn.datasets import fetch_mldata def get_mnist(): """ Gets MNIST dataset """ np.random.seed(1234) # set seed for deterministic ordering - data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - data_path = os.path.join(data_path, '../../data') - mnist = fetch_mldata('MNIST original', data_home=data_path) - p = np.random.permutation(mnist.data.shape[0]) - X = mnist.data[p].astype(np.float32)*0.02 - Y = mnist.target[p] + mnist_data = mx.test_utils.get_mnist() + X = np.concatenate([mnist_data['train_data'], mnist_data['test_data']]) + Y = np.concatenate([mnist_data['train_label'], mnist_data['test_label']]) + p = np.random.permutation(X.shape[0]) + X = X[p].reshape((X.shape[0], -1)).astype(np.float32)*5 + Y = Y[p] return X, Y diff --git a/example/deep-embedded-clustering/dec.py b/example/deep-embedded-clustering/dec.py index e1f76583a1bd..d7594703cfc3 100644 --- a/example/deep-embedded-clustering/dec.py +++ b/example/deep-embedded-clustering/dec.py @@ -19,9 +19,6 @@ from __future__ import print_function import sys import os -# code to automatically download dataset -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path = [os.path.join(curr_path, "../autoencoder")] + sys.path import mxnet as mx import numpy as np import data