From 72acf517cbb436a2df4279cc33e8339554138721 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Fri, 19 Oct 2018 15:59:05 -0700
Subject: [PATCH 01/16] initial draft gluon tutorial

---
 .../gluon_from_experiment_to_deploymen.md | 211 ++++++++++++++++++
 1 file changed, 211 insertions(+)
 create mode 100644 docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
new file mode 100644
index 000000000000..9cf9694eaf83
--- /dev/null
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
@@ -0,0 +1,211 @@
# Gluon: from experiment to deployment, an end to end example

## Overview

MXNet Gluon API comes with a lot of great features and it can provide you everything you need from experiment to deployment of the model.
In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.

Let's say you want to build a service that provides flower species recognition. A common situation is that you don't have enough data to train a good model like ResNet50.
What you can do is utilize a pre-trained model from Gluon, tweak the model according to your need, fine-tune the model on your small dataset, and deploy the model to integrate with your service.

We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/) as an example to show you the steps.

## Prepare training data

You can use this [script](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102/blob/master/bootstrap.py) to download and organize your data into train, test, and validation sets. Simply run:
```python
python bootstrap.py
```

Now your data will be organized into the following format, where all the images belonging to the same category are put together:
```
data
├── train
│   ├── 0
│   │   ├── image_06736.jpg
│   │   ├── image_06741.jpg
...
│   ├── 1
│   │   ├── image_06755.jpg
│   │   ├── image_06899.jpg
...
├── test
│   ├── 0
│   │   ├── image_00731.jpg
│   │   ├── image_0002.jpg
...
│   ├── 1
│   │   ├── image_00036.jpg
│   │   ├── image_05011.jpg

```

## Define Hyper-parameters
Now let's first import necessary packages:
```python
import mxnet as mx
import numpy as np
import os, time

from mxnet import gluon, init
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from gluoncv.model_zoo import get_model
```

and define the hyper-parameters we will use for fine-tuning:
```python
classes = 102

epochs = 1
lr = 0.001
per_device_batch_size = 32
momentum = 0.9
wd = 0.0001

lr_factor = 0.75
lr_steps = [10, 20, 30, np.inf]

num_gpus = 0
num_workers = 1
ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
batch_size = per_device_batch_size * max(num_gpus, 1)
```

## Data pre-processing

We can use the Gluon DataSet API, DataLoader API, and Transform API to load the images and apply data augmentation:
```python
jitter_param = 0.4
lighting_param = 0.1

transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomFlipLeftRight(),
    transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param,
                                 saturation=jitter_param),
    transforms.RandomLighting(lighting_param),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


path = './data'
train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
test_path = os.path.join(path, 'test')

train_data = gluon.data.DataLoader(
    gluon.data.vision.ImageFolderDataset(train_path).transform_first(transform_train),
    batch_size=batch_size, shuffle=True, num_workers=num_workers)

val_data = gluon.data.DataLoader(
    gluon.data.vision.ImageFolderDataset(val_path).transform_first(transform_test),
    batch_size=batch_size, shuffle=False, num_workers=num_workers)

test_data = gluon.data.DataLoader(
    gluon.data.vision.ImageFolderDataset(test_path).transform_first(transform_test),
    batch_size=batch_size, shuffle=False, num_workers=num_workers)
```


## Loading pre-trained model

We will use pre-trained ResNet50_v2 model, all you need to do is re-define the last softmax layer for your case. Specify the number of classes in your data and initialize the weights.
You can also add layers to the network according to your needs.

Before we go to training, one important step is to hybridize your model: it converts your imperative code to an MXNet symbolic graph. It's much more efficient to train a symbolic model,
and you can also serialize and save the network architecture and parameters for inference.

```python
model_name = 'ResNet50_v2'
finetune_net = get_model(model_name, pretrained=True)
with finetune_net.name_scope():
    finetune_net.output = nn.Dense(classes)
finetune_net.output.initialize(init.Xavier(), ctx=ctx)
finetune_net.collect_params().reset_ctx(ctx)
finetune_net.hybridize()

trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', {
    'learning_rate': lr, 'momentum': momentum, 'wd': wd})
metric = mx.metric.Accuracy()
L = gluon.loss.SoftmaxCrossEntropyLoss()
```

## Fine-tuning model on your custom dataset

Now let's define the test metrics and start fine-tuning.

```python
def test(net, val_data, ctx):
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)

    return metric.get()


lr_counter = 0
num_batch = len(train_data)

for epoch in range(epochs):
    if epoch == lr_steps[lr_counter]:
        trainer.set_learning_rate(trainer.learning_rate*lr_factor)
        lr_counter += 1

    tic = time.time()
    train_loss = 0
    metric.reset()

    for i, batch in enumerate(train_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        with ag.record():
            outputs = [finetune_net(X) for X in data]
            loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
        for l in loss:
            l.backward()

        trainer.step(batch_size)
        train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)

        metric.update(label, outputs)

    _, train_acc = metric.get()
    train_loss /= num_batch

    _, val_acc = test(finetune_net, val_data, ctx)

    print('[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f' %
             (epoch, train_acc, train_loss, val_acc, time.time() - tic))

_, test_acc = test(finetune_net, test_data, ctx)
print('[Finished] Test-acc: %.3f' % (test_acc))
```

Note we are able to reach a test accuracy of 93% with only 20 epochs in less than 20 minutes. This is fast because we used the
pre-trained weights from ResNet50, which was trained on the much larger ImageNet dataset, so it works really well to capture features on our small dataset.

## Save fine-tuned model

We have now trained our custom model. This can be exported into files using the export function. The export function will export the model architecture into a .json file and the model parameters into a .params file.

```python
net.export("flower-recognition", epoch=1)
```
export in this case creates flower-recognition-symbol.json and flower-recognition-0001.params in the current directory.
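
To sanity check the exported files, you can load them back into a Gluon block in Python. The snippet below is a minimal sketch, not part of the original example: it assumes the file names produced by the export call above, and uses `SymbolBlock.imports` as one way to restore a serialized model.

```python
import mxnet as mx
from mxnet import gluon

# restore the network architecture and the trained weights from the exported files
deserialized_net = gluon.nn.SymbolBlock.imports("flower-recognition-symbol.json", ['data'],
                                                "flower-recognition-0001.params", ctx=mx.cpu())
# a dummy forward pass confirms the model was restored correctly
out = deserialized_net(mx.nd.ones((1, 3, 224, 224)))
print(out.shape)  # expect (1, 102)
```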

## Load and inference using C API

(WIP)
\ No newline at end of file
From e3e211fbf2a0c9ebab014251b9e6a8a80178f327 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Fri, 26 Oct 2018 10:21:24 -0700
Subject: [PATCH 02/16] add reference

---
 docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
index 9cf9694eaf83..0d2b41a1ac68 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
@@ -208,4 +208,8 @@ export in this case creates flower-recognition-symbol.json and flower-recognition-
 
 ## Load and inference using C API
 
-(WIP)
\ No newline at end of file
+(WIP)
+
+## References
+
+https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html
From 9b005bee85ee92cda311a3f63285c2432ff93d5a Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Wed, 21 Nov 2018 16:49:29 -0800
Subject: [PATCH 03/16] add cpp inference

---
 .../gluon_from_experiment_to_deploymen.md | 205 +++++++++++++++++-
 1 file changed, 195 insertions(+), 10 deletions(-)

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
index 0d2b41a1ac68..d6b18ad1ebb3 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
@@ -40,7 +40,9 @@ data
 ```
 
-## Define Hyper-parameters
+
+# Training using Gluon
+### Define Hyper-parameters
 Now let's first import necessary packages:
 ```python
 import mxnet as mx
@@ -73,7 +75,7 @@ ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
 batch_size = per_device_batch_size * max(num_gpus, 1)
 ```
 
-## Data pre-processing
+### Data pre-processing
 
 We can use the Gluon DataSet API, DataLoader API, and Transform API to load the images and apply data augmentation:
@@ -117,7 +119,7 @@ test_data = gluon.data.DataLoader(
 ```
 
-## Loading pre-trained model
+### Loading pre-trained model
 
 We will use pre-trained ResNet50_v2 model, all you need to do is re-define the last softmax layer for your case. Specify the number of classes in your data and initialize the weights.
@@ -140,7 +142,7 @@ metric = mx.metric.Accuracy()
 L = gluon.loss.SoftmaxCrossEntropyLoss()
 ```
 
-## Fine-tuning model on your custom dataset
+### Fine-tuning model on your custom dataset
 
 Now let's define the test metrics and start fine-tuning.
@@ -197,19 +199,202 @@ print('[Finished] Test-acc: %.3f' % (test_acc))
 
 Note we are able to reach a test accuracy of 93% with only 20 epochs in less than 20 minutes. This is fast because we used the
 pre-trained weights from ResNet50, which was trained on the much larger ImageNet dataset, so it works really well to capture features on our small dataset.
 
-## Save fine-tuned model
+### Save fine-tuned model
 
 We have now trained our custom model. This can be exported into files using the export function. The export function will export the model architecture into a .json file and the model parameters into a .params file.
 
 ```python
-net.export("flower-recognition", epoch=1)
+net.export("flower-recognition", epoch=20)
 ```
-export in this case creates flower-recognition-symbol.json and flower-recognition-0001.params in the current directory.
+export in this case creates `flower-recognition-symbol.json` and `flower-recognition-0020.params` in the current directory.
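
Before we move on to C++, it is worth verifying the serialized model end to end in Python. This is a quick sketch, not part of the original patch: it assumes the two files created above and reuses the `transform_test` pipeline defined earlier in this tutorial.

```python
import mxnet as mx
from mxnet import gluon, image

# reload the serialized model from the exported json and params files
net = gluon.nn.SymbolBlock.imports("flower-recognition-symbol.json", ['data'],
                                   "flower-recognition-0020.params", ctx=mx.cpu())

# load one test image and apply the same transforms used for validation
img = image.imread('./data/test/0/image_06736.jpg')
img = transform_test(img).expand_dims(axis=0)

# forward pass, then report the most likely class index
pred = net(img)
print('predicted class index:', int(pred.argmax(axis=1).asscalar()))
```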

## Load and inference using C++ API

### Setup MXNet C++ API
TODO: replace link with official link after PR merged.
MXNet provides several language bindings for inference, for example C++, Scala, and Java. In this tutorial we will focus on using C++ for inference. The code is modified from the
MXNet [C++ Inference example](/~https://github.com/leleamol/incubator-mxnet/tree/inception-example/cpp-package/example/inference).
To use the C++ API in MXNet, you need to build MXNet from source with the C++ package. Please follow the [build from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html) and the [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
to enable the C++ API.
In summary, you just need to build MXNet from source with the `USE_CPP_PACKAGE` flag set to 1 using `make -j USE_CPP_PACKAGE=1`.

### Write Predictor in C++
Now let's write the prediction code in C++. We will use a Predictor class to do the following jobs:
1. Load the pre-trained model,
2. Load the parameters of the pre-trained model,
3. Load the image to be classified into an NDArray.
4. Run the forward pass and predict the input image.

```cpp
class Predictor {
 public:
  Predictor() {}
  Predictor(const std::string& model_json,
            const std::string& model_params,
            const Shape& input_shape,
            bool gpu_context_type = false,
            const std::string& synset_file = "",
            const std::string& mean_image_file = "");
  void PredictImage(const std::string& image_file);
  ~Predictor();

 private:
  void LoadModel(const std::string& model_json_file);
  void LoadParameters(const std::string& model_parameters_file);
  void LoadSynset(const std::string& synset_file);
  NDArray LoadInputImage(const std::string& image_file);
  void LoadMeanImageData();
  void NormalizeInput(const std::string& mean_image_file);

  NDArray mean_img;
  map<std::string, NDArray> args_map;
  map<std::string, NDArray> aux_map;
  std::vector<std::string> output_labels;
  Symbol net;
  Executor *executor;
  Shape input_shape;
  NDArray mean_image_data;
  Context global_ctx = Context::cpu();
  string mean_image_file;
};
```

### Load network symbol and parameters

In the Predictor constructor, you need a few information including paths to saved json and param files. After that add the following two methods to load the netowrk and its parameters.
```cpp
void Predictor::LoadModel(const std::string& model_json_file) {
  LG << "Loading the model from " << model_json_file << std::endl;
  net = Symbol::Load(model_json_file);
}


/*
 * The following function loads the model parameters.
 */
void Predictor::LoadParameters(const std::string& model_parameters_file) {
  LG << "Loading the model parameters from " << model_parameters_file << std::endl;
  map<std::string, NDArray> paramters;
  NDArray::Load(model_parameters_file, 0, &paramters);
  for (const auto &k : paramters) {
    if (k.first.substr(0, 4) == "aux:") {
      auto name = k.first.substr(4, k.first.size() - 4);
      aux_map[name] = k.second.Copy(global_ctx);
    }
    if (k.first.substr(0, 4) == "arg:") {
      auto name = k.first.substr(4, k.first.size() - 4);
      args_map[name] = k.second.Copy(global_ctx);
    }
  }
  /* WaitAll is needed when we copy data between the GPU and main memory */
  NDArray::WaitAll();
}
```

### Load Input Image

Now let's add a method to load the input image we want to predict and convert it to an NDArray for prediction.
```cpp
NDArray Predictor::LoadInputImage(const std::string& image_file) {
  LG << "Loading the image " << image_file << std::endl;
  vector<float> array;
  cv::Mat mat = cv::imread(image_file);
  /* resize pictures to (224, 224) according to the pretrained model */
  int height = input_shape[2];
  int width = input_shape[3];
  int channels = input_shape[1];
  cv::resize(mat, mat, cv::Size(height, width));
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        array.push_back(static_cast<float>(mat.data[(i * height + j) * 3 + c]));
      }
    }
  }
  NDArray image_data = NDArray(input_shape, global_ctx, false);
  image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
  NDArray::WaitAll();
  return image_data;
}
```

### Run inference

Finally, let's run the inference. It's basically using the MXNet executor to do a forward pass.

```cpp
void Predictor::PredictImage(const std::string& image_file) {
  // Load the input image
  NDArray image_data = LoadInputImage(image_file);

  // Normalize the image
  if (!mean_image_file.empty()) {
    image_data.Slice(0, 1) -= mean_image_data;
  }

  LG << "Running the forward pass on model to predict the image";
  /*
   * The executor->arg_arrays represent the arguments to the model.
   *
   * Copying the image_data that contains the NDArray of input image
   * to the arg map of the executor. The input is stored with the key "data" in the map.
   *
   */
  image_data.CopyTo(&(executor->arg_dict()["data"]));
  NDArray::WaitAll();

  // Run the forward pass.
  executor->Forward(false);

  // The output is available in executor->outputs.
  auto array = executor->outputs[0].Copy(global_ctx);
  NDArray::WaitAll();

  float best_accuracy = 0.0;
  std::size_t best_idx = 0;

  // Find out the maximum accuracy and the index associated with that accuracy.
  for (std::size_t i = 0; i < array.Size(); ++i) {
    if (array.At(0, i) > best_accuracy) {
      best_accuracy = array.At(0, i);
      best_idx = i;
    }
  }

  if (output_labels.empty()) {
    LG << "The model predicts the highest accuracy of " << best_accuracy << " at index "
       << best_idx;
  } else {
    LG << "The model predicts the input image to be a [" << output_labels[best_idx]
       << " ] with Accuracy = " << array.At(0, best_idx) << std::endl;
  }
}
```

### Compile and Run Inference Code

You can find the full code [here](/~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/inception_inference.cpp), and to compile it use this [Makefile](/~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/Makefile).

Now you will be able to compile and run the inference code: just do `make all` and pass the parameters as follows

```bash
make all
LD_LIBRARY_PATH=../incubator-mxnet/lib/ ./inception_inference --symbol "flower-recognition-symbol.json" --params "flower-recognition-0020.params" --image ./data/test/0/image_06736.jpg
```

Then it will predict your image

```bash
[22:26:49] inception_inference.cpp:128: Loading the model from flower-recognition-symbol.json

[22:26:49] inception_inference.cpp:137: Loading the model parameters from flower-recognition-0020.params

[22:26:50] inception_inference.cpp:179: Loading the image ./data/test/0/image_06736.jpg

[22:26:50] inception_inference.cpp:230: Running the forward pass on model to predict the image
[22:26:50] inception_inference.cpp:260: The model predicts the highest accuracy of 7.17001 at index 3
```

## References

1. https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html
2. /~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/
From 8b13dbfcba9fcefce9a5b50574405e1e080c0003 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Tue, 27 Nov 2018 09:37:48 -0800
Subject: [PATCH 04/16] improve wording

---
 docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
index d6b18ad1ebb3..9d71e6b209a3 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
@@ -4,8 +4,9 @@
 
 MXNet Gluon API comes with a lot of great features and it can provide you everything you need from experiment to deployment of the model.
 In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.
+We will keep each section short; please follow the links or references if you want to know more about each topic covered.
 
-Let's say you want to build a service that provides flower species recognition. A common situation is that you don't have enough data to train a good model like ResNet50.
+Now let's say you want to build a service that provides flower species recognition. A common situation is that you don't have enough data to train a good model like ResNet50.
 What you can do is utilize a pre-trained model from Gluon, tweak the model according to your need, fine-tune the model on your small dataset, and deploy the model to integrate with your service.
 
 We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/) as an example to show you the steps.
@@ -396,5 +397,7 @@
 ## References
 
+1. /~https://github.com/Arsey/keras-transfer-learning-for-oxford102
 1. https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html
 2. /~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/
+3. https://gluon-crash-course.mxnet.io/
From b839fabb88bf16773098f4cd69d097b4df8d7253 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Sun, 9 Dec 2018 16:57:49 -0800
Subject: [PATCH 05/16] address pr comments

---
 .../gluon_from_experiment_to_deploymen.md | 446 +++++++++++++-----
 1 file changed, 335 insertions(+), 111 deletions(-)

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
index 9d71e6b209a3..691a94921cbe 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md
@@ -1,25 +1,48 @@

# Gluon: from experiment to deployment, an end to end example

## Overview

MXNet Gluon API comes with a lot of great features and it can provide you everything you need from experiment to deployment of the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.

Let's say you need to build a service that provides flower species recognition. A common situation is that you don't have enough data to train a good model. In such cases we use a technique called Transfer Learning.
In Transfer Learning we make use of a pre-trained model that solves a related task but is trained on a very large standard dataset, such as ImageNet, from a different domain; we utilize the knowledge in this pre-trained model to perform the new task at hand.

Gluon provides State of the Art models for many of the standard tasks such as Classifcation, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset, this model achieves 77.11% top-1 accuracy on ImageNet, we seek to transfer as much knowledge as possible for our task of recognizing different species of Flowers.

In this tutorial we will show you the steps to load pre-trained model from Gluon, tweak the model according to your neeed, fine-tune the model on your small dataset, and finally deploy the trained model to integrate with your service.


## Prerequisites

To complete this tutorial, you need:

- [Build MXNet from source](https://mxnet.incubator.apache.org/install/ubuntu_setup.html#build-mxnet-from-source) with Python(Gluon) and C++ Packages
- Learn the basics about Gluon with [A 60-minute Gluon Crash Course](https://gluon-crash-course.mxnet.io/)
- Learn the basics about [MXNet C++ API](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package)


## The Data

We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/) as an example to show you the steps. You can use this [script](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102/blob/master/bootstrap.py) to download and organize your data into train, test, and validation sets. Simply import it and run:


```python
data_util_file = "oxford_102_flower_dataset.py"
# TODO change base_url to apache/incubator-mxnet, used fork url for testing in CI
base_url = "https://raw.githubusercontent.com/roywei/incubator-mxnet/master/docs/tutorial_utils/data/{}?raw=true"
#mx.test_utils.download(base_url.format(data_util_file), fname=data_util_file)
import oxford_102_flower_dataset

# download and move data to train, test, valid folders
path = './data'
oxford_102_flower_dataset.get_data(path)
```

Now your data will be organized into the following format, where all the images belonging to the same category are put together:
```bash
data
├── train
│   ├── 0
│   │   ├── image_06736.jpg
│   │   ├── image_06741.jpg
...
│   ├── 1
│   │   ├── image_06755.jpg
│   │   ├── image_06899.jpg
...
├── test
│   ├── 0
│   │   ├── image_00731.jpg
│   │   ├── image_0002.jpg
...
│   ├── 1
│   │   ├── image_00036.jpg
│   │   ├── image_05011.jpg

```

## Training using Gluon

### Define Hyper-parameters

Now let's first import necessary packages:


```python
import math
import os
import time
from multiprocessing import cpu_count

import mxnet as mx
from mxnet import autograd
from mxnet import gluon, init
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from mxnet.gluon.model_zoo.vision import resnet50_v2
```

and define the hyper-parameters we will use for fine-tuning; we will use the [MXNet learning rate scheduler](https://mxnet.incubator.apache.org/tutorials/gluon/learning_rate_schedules.html) to adjust learning rates during training:


```python
classes = 102
epochs = 40
lr = 0.001
per_device_batch_size = 32
momentum = 0.9
wd = 0.0001

lr_factor = 0.75
# learning rate change at following epochs
lr_epochs = [10, 20, 30]

num_gpus = mx.context.num_gpus()
num_workers = cpu_count()
ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
batch_size = per_device_batch_size * max(num_gpus, 1)
```

Before the training we will apply data augmentations on the training images: making minor, random alterations that the model will treat as distinct images. This can be very useful for fine-tuning on a relatively small dataset and helps improve the model. We can use the Gluon [DataSet API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), [DataLoader API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), and [Transform API](https://mxnet.incubator.apache.org/tutorials/gluon/data_augmentation.html) to load the images and apply the following data augmentation:
1. Randomly crop the image and resize it to 224x224
2. Randomly flip the image horizontally
3. Randomly jitter color and add noise
4. Transpose the data from height*width*num_channels to num_channels*height*width, and map values from [0, 255] to [0, 1]
5. Normalize with the mean and standard deviation from the ImageNet dataset.


```python
jitter_param = 0.4
lighting_param = 0.1

training_transformer = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomFlipLeftRight(),
    transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param,
                                 saturation=jitter_param),
    transforms.RandomLighting(lighting_param),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

validation_transformer = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
test_path = os.path.join(path, 'test')

# load the data and apply pre-processing (transforms) on the images
train_data = gluon.data.DataLoader(
    gluon.data.vision.ImageFolderDataset(train_path).transform_first(training_transformer),
    batch_size=batch_size, shuffle=True, num_workers=num_workers)

val_data = gluon.data.DataLoader(
    gluon.data.vision.ImageFolderDataset(val_path).transform_first(validation_transformer),
    batch_size=batch_size, shuffle=False, num_workers=num_workers)

test_data = gluon.data.DataLoader(
    gluon.data.vision.ImageFolderDataset(test_path).transform_first(validation_transformer),
    batch_size=batch_size, shuffle=False, num_workers=num_workers)
```

### Loading pre-trained model

We will use the pre-trained ResNet50_v2 model, which was trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. All you need to do is re-define the last softmax layer, specify the number of classes (102 in our case), and initialize the parameters. You can also add layers to the network according to your needs.

Before we go to training, one unique feature Gluon offers is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as calling `net.hybridize()`; a toy illustration follows below. To know more about Gluon hybridization, please follow our [tutorials](https://mxnet.incubator.apache.org/tutorials/gluon/hybrid.html).

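To make the idea concrete, here is a small sketch of the hybridize-then-export workflow on a throwaway network; the layer sizes and the file name are made up for illustration only:

```python
from mxnet import nd
from mxnet.gluon import nn

toy_net = nn.HybridSequential()
with toy_net.name_scope():
    toy_net.add(nn.Dense(10, activation='relu'))
    toy_net.add(nn.Dense(2))
toy_net.initialize()
toy_net.hybridize()

toy_net(nd.ones((1, 4)))   # the first forward pass triggers graph construction
toy_net.export('toy-net')  # writes toy-net-symbol.json and toy-net-0000.params
```

We now apply the same two calls, `hybridize()` during training and `export()` after training, to our fine-tuned network.
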
+ + ```python -model_name = 'ResNet50_v2' -finetune_net = get_model(model_name, pretrained=True) +# load pre-trained resnet50_v2 from model zoo +finetune_net = resnet50_v2(pretrained=True, ctx=ctx) + +# change last softmax layer since number of classes are different with finetune_net.name_scope(): finetune_net.output = nn.Dense(classes) -finetune_net.output.initialize(init.Xavier(), ctx = ctx) -finetune_net.collect_params().reset_ctx(ctx) +finetune_net.output.initialize(init.Xavier(), ctx=ctx) +# hybridize for better performance finetune_net.hybridize() trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', { - 'learning_rate': lr, 'momentum': momentum, 'wd': wd}) + 'learning_rate': lr, 'momentum': momentum, 'wd': wd}) metric = mx.metric.Accuracy() -L = gluon.loss.SoftmaxCrossEntropyLoss() +softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() ``` ### Fine-tuning model on your custom dataset Now let's define the test metrics and start fine-tuning. + + ```python def test(net, val_data, ctx): metric = mx.metric.Accuracy() - for i, batch in enumerate(val_data): - data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False) - label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False) - outputs = [net(X) for X in data] + for i, (data, label) in enumerate(val_data): + data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) + label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) + outputs = [net(x) for x in data] metric.update(label, outputs) - return metric.get() -lr_counter = 0 num_batch = len(train_data) +iteration_idx = 1 + +# setup learning rate scheduler +iterations_per_epoch = math.ceil(num_batch) +# learning rate change at following steps +lr_steps = [epoch * iterations_per_epoch for epoch in lr_epochs] +schedule = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=lr_factor, base_lr=lr) -for epoch in range(epochs): - if epoch == lr_steps[lr_counter]: - trainer.set_learning_rate(trainer.learning_rate*lr_factor) - lr_counter += 1 +# start with epoch 1 for easier learning rate calculation +for epoch in range(1, epochs + 1): tic = time.time() train_loss = 0 metric.reset() - for i, batch in enumerate(train_data): - data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False) - label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False) - with ag.record(): - outputs = [finetune_net(X) for X in data] - loss = [L(yhat, y) for yhat, y in zip(outputs, label)] + for i, (data, label) in enumerate(train_data): + # get the images and labels + data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) + label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) + with autograd.record(): + outputs = [finetune_net(x) for x in data] + loss = [softmax_cross_entropy(yhat, y) for yhat, y in zip(outputs, label)] for l in loss: l.backward() + lr = schedule(iteration_idx) + trainer.set_learning_rate(lr) trainer.step(batch_size) train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) - metric.update(label, outputs) + iteration_idx += 1 _, train_acc = metric.get() train_loss /= num_batch - _, val_acc = test(finetune_net, val_data, ctx) - print('[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f' % - (epoch, train_acc, train_loss, val_acc, time.time() - tic)) + print('[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | learning-rate: %.3E | time: %.1f' % + (epoch, train_acc, 
train_loss, val_acc, trainer.learning_rate, time.time() - tic))

_, test_acc = test(finetune_net, test_data, ctx)
print('[Finished] Test-acc: %.3f' % (test_acc))
```
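
As a side note, for longer runs it can be handy to checkpoint the weights as you go, so training can be resumed; this one-liner is a sketch and not part of the original script (the file name pattern is arbitrary):

```python
# e.g. at the end of each epoch inside the loop above; reload later with load_parameters()
finetune_net.save_parameters('flower-recognition-epoch-%d.params' % epoch)
```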
"/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv + chunk = read(handle, remaining) + KeyboardInterrupt + Traceback (most recent call last): + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed + send_bytes(obj) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes + self._send_bytes(m[offset:offset + size]) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes + self._send(header + buf) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send + n = write(self._handle, buf) + + + + --------------------------------------------------------------------------- + + KeyboardInterrupt Traceback (most recent call last) + + in () + 38 trainer.set_learning_rate(lr) + 39 trainer.step(batch_size) + ---> 40 train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) + 41 metric.update(label, outputs) + 42 iteration_idx += 1 + + + in (.0) + 38 trainer.set_learning_rate(lr) + 39 trainer.step(batch_size) + ---> 40 train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) + 41 metric.update(label, outputs) + 42 iteration_idx += 1 + + + ~/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/ndarray/ndarray.py in asscalar(self) + 1996 if self.shape != (1,): + 1997 raise ValueError("The current array is not a scalar") + -> 1998 return self.asnumpy()[0] + 1999 + 2000 def astype(self, dtype, copy=True): + + + ~/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/ndarray/ndarray.py in asnumpy(self) + 1978 self.handle, + 1979 data.ctypes.data_as(ctypes.c_void_p), + -> 1980 ctypes.c_size_t(data.size))) + 1981 return data + 1982 + + + KeyboardInterrupt: + + + BrokenPipeError: [Errno 32] Broken pipe + Traceback (most recent call last): + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed + send_bytes(obj) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes + self._send_bytes(m[offset:offset + size]) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes + self._send(header + buf) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send + n = write(self._handle, buf) + BrokenPipeError: [Errno 32] Broken pipe + Traceback (most recent call last): + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed + send_bytes(obj) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes + self._send_bytes(m[offset:offset + size]) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes + self._send(header + buf) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send + n = write(self._handle, buf) + BrokenPipeError: [Errno 32] Broken pipe + Traceback (most recent call last): + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed + send_bytes(obj) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes + self._send_bytes(m[offset:offset + size]) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes + self._send(header + buf) + File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send + n = 

Following is the training result:
```bash
[Epoch 40] Train-acc: 0.945, loss: 0.354 | Val-acc: 0.955 | learning-rate: 4.219E-04 | time: 17.8
[Finished] Test-acc: 0.952
```
We trained the model using a [AWS P3.8XLarge instance](https://aws.amazon.com/ec2/instance-types/p3/) with 4 Tesla V100 GPUs. We were able to reach a test accuracy of 95.2% with 40 epochs in around 12 minutes. This was really fast because our model was pre-trained on a much larget dataset, ImageNet, with around 1.3 million images. It worked really well to capture features on our small dataset.


### Save fine-tuned model


We have now trained our custom model. This can be serialized into model files using the export function. The export function will export the model architecture into a `.json` file and the model parameters into a `.params` file.


```python
finetune_net.export("flower-recognition", epoch=epochs)
```

export in this case creates `flower-recognition-symbol.json` and `flower-recognition-0020.params` in the current directory. They can be used for model deployment in the next section.


## Load and inference using C++ API

MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. In this tutorial, we will focus on the C++ API; for more details, please refer to the [C++ Inference Example](/~https://github.com/leleamol/incubator-mxnet/tree/inception-example/cpp-package/example/inference).


### Setup MXNet C++ API
To use the C++ API in MXNet, you need to build MXNet from source with the C++ package. Please follow the [build from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html) and the [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
to enable the C++ API.
In summary, you just need to build MXNet from source with the `USE_CPP_PACKAGE` flag set to 1 using `make -j USE_CPP_PACKAGE=1`.

### Write Predictor in C++
Now let's write the prediction code in C++. We will use a Predictor class to do the following jobs:
1. Load the pre-trained model,
2. Load the parameters of the pre-trained model,
3. Load the image to be classified into an NDArray.
4. Run the forward pass and predict the input image.

```cpp
class Predictor {
 public:
  Predictor() {}
  Predictor(const std::string& model_json_file,
            const std::string& model_params_file,
            const Shape& input_shape,
            bool gpu_context_type = false,
            const std::string& synset_file = "",
            const std::string& mean_image_file = "");
  void PredictImage(const std::string& image_file);
  ~Predictor();

 private:
  void LoadModel(const std::string& model_json_file);
  void LoadParameters(const std::string& model_parameters_file);
  void LoadSynset(const std::string& synset_file);
  NDArray LoadInputImage(const std::string& image_file);
  void LoadMeanImageData();
  void LoadDefaultMeanImageData();
  void NormalizeInput(const std::string& mean_image_file);
  inline bool FileExists(const std::string& name) {
    struct stat buffer;
    return (stat(name.c_str(), &buffer) == 0);
  }
  NDArray mean_img;
  std::map<std::string, NDArray> args_map;
  std::map<std::string, NDArray> aux_map;
  std::vector<std::string> output_labels;
  Symbol net;
  Executor *executor;
  Shape input_shape;
  NDArray mean_image_data;
  NDArray std_dev_image_data;
  Context global_ctx = Context::cpu();
  std::string mean_image_file;
};
```

### Load network symbol and parameters

In the Predictor constructor, you need a few information including paths to saved json and param files. After that add the following two methods to load the netowrk and its parameters.

```cpp
/*
 * The following function loads the model from the json file.
 */
void Predictor::LoadModel(const std::string& model_json_file) {
  if (!FileExists(model_json_file)) {
    LG << "Model file " << model_json_file << " does not exist";
    throw std::runtime_error("Model file does not exist");
  }
  LG << "Loading the model from " << model_json_file << std::endl;
  net = Symbol::Load(model_json_file);
}


/*
 * The following function loads the model parameters.
 */
void Predictor::LoadParameters(const std::string& model_parameters_file) {
  if (!FileExists(model_parameters_file)) {
    LG << "Parameter file " << model_parameters_file << " does not exist";
    throw std::runtime_error("Model parameters does not exist");
  }
  LG << "Loading the model parameters from " << model_parameters_file << std::endl;
  std::map<std::string, NDArray> parameters;
  NDArray::Load(model_parameters_file, 0, &parameters);
  for (const auto &k : parameters) {
    if (k.first.substr(0, 4) == "aux:") {
      auto name = k.first.substr(4, k.first.size() - 4);
      aux_map[name] = k.second.Copy(global_ctx);
    }
    if (k.first.substr(0, 4) == "arg:") {
      auto name = k.first.substr(4, k.first.size() - 4);
      args_map[name] = k.second.Copy(global_ctx);
    }
  }
  /* WaitAll is needed when we copy data between the GPU and main memory */
  NDArray::WaitAll();
}
```

### Load Input Image

Now let's add a method to load the input image we want to predict and convert it to an NDArray for prediction.
```cpp
NDArray Predictor::LoadInputImage(const std::string& image_file) {
  if (!FileExists(image_file)) {
    LG << "Image file " << image_file << " does not exist";
    throw std::runtime_error("Image file does not exist");
  }
  LG << "Loading the image " << image_file << std::endl;
  std::vector<float> array;
  cv::Mat mat = cv::imread(image_file);
  /* resize pictures to (224, 224) according to the pretrained model */
  int height = input_shape[2];
  int width = input_shape[3];
  int channels = input_shape[1];
  cv::resize(mat, mat, cv::Size(height, width));
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        array.push_back(static_cast<float>(mat.data[(i * height + j) * 3 + c]));
      }
    }
  }
  NDArray image_data = NDArray(input_shape, global_ctx, false);
  image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
  NDArray::WaitAll();
  return image_data;
}
```
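
One step this walkthrough does not show is how `executor` gets created. In the full example that happens once in the Predictor constructor; the snippet below is a rough sketch of that step, assuming the member names above are in scope, and using `SimpleBind` to let MXNet allocate the argument arrays:

```cpp
// sketch: done once in the constructor, after LoadModel() and LoadParameters()
args_map["data"] = NDArray(input_shape, global_ctx, false);
executor = net.SimpleBind(global_ctx, args_map, std::map<std::string, NDArray>(),
                          std::map<std::string, OpReqType>(), aux_map);
```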
### Run inference

Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe, calling it in multi-threaded enviroments was not tested. To utilize mult-threaded prediction, you need to use the C predict API, please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp).

```cpp
void Predictor::PredictImage(const std::string& image_file) {
  // Load the input image
  NDArray image_data = LoadInputImage(image_file);

  // Normalize the image
  image_data.Slice(0, 1) -= mean_image_data;

  LG << "Running the forward pass on model to predict the image";
  /*
   * The executor->arg_arrays represent the arguments to the model.
   *
   * Copying the image_data that contains the NDArray of input image
   * to the arg map of the executor. The input is stored with the key "data" in the map.
   */
  image_data.CopyTo(&(executor->arg_dict()["data"]));
  NDArray::WaitAll();

  // Run the forward pass.
  executor->Forward(false);

  // The output is available in executor->outputs.
  auto array = executor->outputs[0].Copy(global_ctx);
  NDArray::WaitAll();

  /*
   * Find out the maximum accuracy and the index associated with that accuracy.
   * This is done by using the argmax operator on NDArray.
   */
  auto predicted = array.ArgmaxChannel();
  NDArray::WaitAll();

  int best_idx = predicted.At(0, 0);
  float best_accuracy = array.At(0, best_idx);

  if (output_labels.empty()) {
    LG << "The model predicts the highest accuracy of " << best_accuracy << " at index "
       << best_idx;
  } else {
    LG << "The model predicts the input image to be a [" << output_labels[best_idx]
       << " ] with Accuracy = " << best_accuracy << std::endl;
  }
}
```

### Compile and Run Inference Code

You can find the full code [here](/~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/inception_inference.cpp), and to compile it use this [Makefile](/~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/Makefile).

Now you will be able to compile and run the inference code: just do `make all` and pass the parameters as follows

```bash
make all
LD_LIBRARY_PATH=../incubator-mxnet/lib/ ./inception_inference --symbol "flower-recognition-symbol.json" --params "flower-recognition-0020.params" --image ./data/test/0/image_06736.jpg
```

Then it will predict your image

```bash
[22:26:49] inception_inference.cpp:128: Loading the model from flower-recognition-symbol.json

[22:26:49] inception_inference.cpp:137: Loading the model parameters from flower-recognition-0020.params

[22:26:50] inception_inference.cpp:179: Loading the image ./data/test/0/image_06736.jpg

[22:26:50] inception_inference.cpp:230: Running the forward pass on model to predict the image
[22:26:50] inception_inference.cpp:260: The model predicts the highest accuracy of 7.17001 at index 3
```


## What's next

You can find more ways to run inference and examples here:
1. [Java Inference examples](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/)
3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html)

## References

1. /~https://github.com/Arsey/keras-transfer-learning-for-oxford102
1. https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html
2. /~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/
3. https://gluon-crash-course.mxnet.io/
From 5a780a4eb9677ef60ae1081fbd5efc9b47b16a9e Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Sun, 9 Dec 2018 16:59:26 -0800
Subject: [PATCH 06/16] add util functions on dataset

---
 .../gluon/oxford_102_flower_dataset.py | 102 ++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 docs/tutorials/gluon/oxford_102_flower_dataset.py

diff --git a/docs/tutorials/gluon/oxford_102_flower_dataset.py b/docs/tutorials/gluon/oxford_102_flower_dataset.py
new file mode 100644
index 000000000000..573078ea0bb4
--- /dev/null
+++ b/docs/tutorials/gluon/oxford_102_flower_dataset.py
@@ -0,0 +1,102 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


"""
This script downloads and prepares the Oxford 102 Category Flower Dataset for training
Dataset is from: http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
Script is modified from: /~https://github.com/Arsey/keras-transfer-learning-for-oxford102
"""

import glob
import os
import tarfile
from shutil import copyfile

import numpy as np
from mxnet import gluon
from scipy.io import loadmat


def download_data():
    data_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/'
    image_file_name = '102flowers.tgz'
    label_file_name = 'imagelabels.mat'
    setid_file_name = 'setid.mat'

    global data_path, image_path, label_path, setid_path
    image_path = os.path.join(data_path, image_file_name)
    label_path = os.path.join(data_path, label_file_name)
    setid_path = os.path.join(data_path, setid_file_name)
    # download the dataset into current directory
    if not os.path.exists(data_path):
        os.mkdir(data_path)
    if not os.path.isfile(image_path):
        gluon.utils.download(url=data_url + image_file_name, path=data_path)
        print("Extracting downloaded dataset...")
        tarfile.open(image_path).extractall(path=data_path)
    if not os.path.isfile(label_path):
        gluon.utils.download(url=data_url + label_file_name, path=data_path)
    if not os.path.isfile(setid_path):
        gluon.utils.download(url=data_url + setid_file_name, path=data_path)


def prepare_data():
    # Read .mat file containing training, testing, and validation sets.
    global data_path, image_path, label_path, setid_path
    setid = loadmat(setid_path)

    idx_train = setid['trnid'][0] - 1
    idx_test = setid['tstid'][0] - 1
    idx_valid = setid['valid'][0] - 1

    # Read .mat file containing image labels.
    image_labels = loadmat(label_path)['labels'][0]

    # Subtract one to get 0-based labels
    image_labels -= 1

    # extracted images are stored in folder 'jpg'
    files = sorted(glob.glob(os.path.join(data_path, 'jpg', '*.jpg')))
    file_label_pairs = np.array([i for i in zip(files, image_labels)])

    # move files from extracted folder to train, test, valid
    # NOTE: the larger 'tstid' split is deliberately used for training and the
    # smaller 'trnid' split for testing, following the original bootstrap script
    move_files('train', file_label_pairs[idx_test, :])
    move_files('test', file_label_pairs[idx_train, :])
    move_files('valid', file_label_pairs[idx_valid, :])


def move_files(dir_name, file_label_pairs):
    data_segment_dir = os.path.join(data_path, dir_name)
    if not os.path.exists(data_segment_dir):
        os.mkdir(data_segment_dir)

    for i in range(0, 102):
        class_dir = os.path.join(data_segment_dir, str(i))
        if not os.path.exists(class_dir):
            os.mkdir(class_dir)

    for file, label in file_label_pairs:
        src = str(file)
        dst = os.path.join(data_path, dir_name, label, src.split(os.sep)[-1])
        copyfile(src, dst)


def get_data(dir_name):
    global data_path
    data_path = dir_name
    download_data()
    prepare_data()
From 1cf584db7e2e080cc5b9db808ec47f55d2ce7a35 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Sun, 9 Dec 2018 17:05:07 -0800
Subject: [PATCH 07/16] move util file

---
 .../gluon => tutorial_utils/data}/oxford_102_flower_dataset.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename docs/{tutorials/gluon => tutorial_utils/data}/oxford_102_flower_dataset.py (100%)

diff --git a/docs/tutorials/gluon/oxford_102_flower_dataset.py b/docs/tutorial_utils/data/oxford_102_flower_dataset.py
similarity index 100%
rename from docs/tutorials/gluon/oxford_102_flower_dataset.py
rename to docs/tutorial_utils/data/oxford_102_flower_dataset.py
From ab4a724f686cb0dfa51383f7fe2f98fcc1331fbd Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Sun, 9 Dec 2018 17:10:01 -0800
Subject: [PATCH 08/16] update link --- docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md index 691a94921cbe..e6ba71662730 100644 --- a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md +++ b/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md @@ -31,9 +31,8 @@ We will use the [Oxford 102 Category Flower Dateset](http://www.robots.ox.ac.uk/ ```python data_util_file = "oxford_102_flower_dataset.py" -# TODO change base_url to apache/incubator-mxnet, used fork url for testing in CI -base_url = "https://raw.githubusercontent.com/roywei/incubator-mxnet/master/docs/tutorial_utils/data/{}?raw=true" -#mx.test_utils.download(base_url.format(data_util_file), fname=data_util_file) +base_url = "https://raw.githubusercontent.com/roywei/incubator-mxnet/gluon_tutorial/docs/tutorial_utils/data/{}?raw=true" +mx.test_utils.download(base_url.format(data_util_file), fname=data_util_file) import oxford_102_flower_dataset # download and move data to train, test, valid folders From a7d041bb0bfa0803af306c23623c518cabf2a774 Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Sun, 9 Dec 2018 17:33:56 -0800 Subject: [PATCH 09/16] fix typo, add test --- ...men.md => gluon_from_experiment_to_deployment.md} | 12 ++++++------ tests/tutorials/test_tutorials.py | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) rename docs/tutorials/gluon/{gluon_from_experiment_to_deploymen.md => gluon_from_experiment_to_deployment.md} (96%) diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md similarity index 96% rename from docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md rename to docs/tutorials/gluon/gluon_from_experiment_to_deployment.md index e6ba71662730..51df2ac8f255 100644 --- a/docs/tutorials/gluon/gluon_from_experiment_to_deploymen.md +++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md @@ -7,9 +7,9 @@ MXNet Gluon API comes with a lot of great features and it can provide you everyt Let's say you need to build a service that provides flower species recognition. A common use case is, you don't have enough data to train a good model. In such cases we use a technique called Transfer Learning. In Transfer Learning we make use of a pre-trained model that solves a related task but is trained on a very large standard dataset such as ImageNet from a different domain, we utilize the knowledge in this pre-trained model to perform a new task at hand. -Gluon provides State of the Art models for many of the standard tasks such as Classifcation, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset, this model achieves 77.11% top-1 accuracy on ImageNet, we seek to transfer as much knowledge as possible for our task of recognizing different species of Flowers. +Gluon provides State of the Art models for many of the standard tasks such as Classification, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset, this model achieves 77.11% top-1 accuracy on ImageNet, we seek to transfer as much knowledge as possible for our task of recognizing different species of Flowers. 
-In this tutorial we will show you the steps to load pre-trained model from Gluon, tweak the model according to your neeed, fine-tune the model on your small dataset, and finally deploy the trained model to integrate with your service. +In this tutorial we will show you the steps to load pre-trained model from Gluon, tweak the model according to your need, fine-tune the model on your small dataset, and finally deploy the trained model to integrate with your service. @@ -65,7 +65,7 @@ data ## Training using Gluon -### Define Hyper-paramerters +### Define Hyper-parameters Now let's first import necessary packages: @@ -386,7 +386,7 @@ Following is the training result: [Epoch 40] Train-acc: 0.945, loss: 0.354 | Val-acc: 0.955 | learning-rate: 4.219E-04 | time: 17.8 [Finished] Test-acc: 0.952 ``` -We trained the model using a [AWS P3.8XLarge instance](https://aws.amazon.com/ec2/instance-types/p3/) with 4 Tesla V100 GPUs. We were able to reach a test accuracy of 95.2% with 40 epochs in around 12 minutes. This was really fast because our model was pre-trained on a much larget dataset, ImageNet, with around 1.3 million images. It worked really well to capture features on our small dataset. +We trained the model using a [AWS P3.8XLarge instance](https://aws.amazon.com/ec2/instance-types/p3/) with 4 Tesla V100 GPUs. We were able to reach a test accuracy of 95.2% with 40 epochs in around 12 minutes. This was really fast because our model was pre-trained on a much larger dataset, ImageNet, with around 1.3 million images. It worked really well to capture features on our small dataset. ### Save fine-tuned model @@ -462,7 +462,7 @@ class Predictor { ### Load network symbol and parameters -In the Predictor constructor, you need a few information including paths to saved json and param files. After that add the following two methods to load the netowrk and its parameters. +In the Predictor constructor, you need a few information including paths to saved json and param files. After that add the following two methods to load the network and its parameters. ```cpp /* @@ -537,7 +537,7 @@ NDArray Predictor::LoadInputImage(const std::string& image_file) { ### Run inference -Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe, calling it in multi-threaded enviroments was not tested. To utilize mult-threaded prediction, you need to use the C predict API, please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp). +Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe, calling it in multi-threaded environments was not tested. To utilize multi-threaded prediction, you need to use the C predict API, please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp). 
```cpp
void Predictor::PredictImage(const std::string& image_file) {
diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py
index 8d8ef398d708..37ba9918fb70 100644
--- a/tests/tutorials/test_tutorials.py
+++ b/tests/tutorials/test_tutorials.py
@@ -151,6 +151,9 @@ def test_python_logistic_regression() :
 def test_python_numpy_gotchas() :
     assert _test_tutorial_nb('gluon/gotchas_numpy_in_mxnet')
 
+def test_gluon_end_to_end():
+    assert _test_tutorial_nb('gluon/gluon_from_experiment_to_deployment')
+
 def test_python_mnist():
     assert _test_tutorial_nb('python/mnist')
 

From 60a8cd780725bf0cec19d5caf53b093c04dc5cc4 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Sun, 9 Dec 2018 20:14:59 -0800
Subject: [PATCH 10/16] allow download

---
 docs/tutorials/gluon/gluon_from_experiment_to_deployment.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
index 51df2ac8f255..43b9651389c0 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
@@ -624,3 +624,5 @@ You can find more ways to run inference and examples here:
 1. https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html
 2. /~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/
 3. https://gluon-crash-course.mxnet.io/
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file

From 81d6a16e1bf73ac46821d457ca138f0678c089a8 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Tue, 18 Dec 2018 09:35:12 -0800
Subject: [PATCH 11/16] update wording

---
 .../gluon_from_experiment_to_deployment.md    | 176 ++----------------
 1 file changed, 18 insertions(+), 158 deletions(-)

diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
index 43b9651389c0..be6552ee1145 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
@@ -5,7 +5,7 @@
 MXNet Gluon API comes with a lot of great features and it can provide you everything you need from experiment to deploy the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.
 
 Let's say you need to build a service that provides flower species recognition. A common use case is, you don't have enough data to train a good model. In such cases we use a technique called Transfer Learning.
-In Transfer Learning we make use of a pre-trained model that solves a related task but is trained on a very large standard dataset such as ImageNet from a different domain, we utilize the knowledge in this pre-trained model to perform a new task at hand. 
+In Transfer Learning we make use of a pre-trained model that solves a related task but is trained on a very large standard dataset such as ImageNet from a different domain, we utilize the knowledge in this pre-trained model to perform a new task at hand.
 
 Gluon provides State of the Art models for many of the standard tasks such as Classification, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset, this model achieves 77.11% top-1 accuracy on ImageNet, we seek to transfer as much knowledge as possible for our task of recognizing different species of Flowers.
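The transfer-learning recipe described in the overview above reduces to only a few lines of Gluon code. The following is a minimal sketch for illustration only (the training sections of this tutorial implement the same pattern in full, with context handling and fine-tuning):

```python
from mxnet import gluon, init
from mxnet.gluon.model_zoo.vision import resnet50_v2

# Reuse the ImageNet-trained feature extractor; replace only the classifier.
net = resnet50_v2(pretrained=True)      # downloads ImageNet weights
with net.name_scope():
    net.output = gluon.nn.Dense(102)    # new output layer for 102 flower classes
net.output.initialize(init.Xavier())    # only the new layer starts from random weights
```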
@@ -30,7 +30,7 @@ We will use the [Oxford 102 Category Flower Dateset](http://www.robots.ox.ac.uk/ ```python -data_util_file = "oxford_102_flower_dataset.py" +data_util_file = "oxford_102_flower_dataset.py" base_url = "https://raw.githubusercontent.com/roywei/incubator-mxnet/gluon_tutorial/docs/tutorial_utils/data/{}?raw=true" mx.test_utils.download(base_url.format(data_util_file), fname=data_util_file) import oxford_102_flower_dataset @@ -43,23 +43,23 @@ oxford_102_flower_dataset.get_data(path) Now your data will be organized into the following format, all the images belong to the same category will be put together ```bash data -├── train -│   ├── 0 -│   │   ├── image_06736.jpg -│   │   ├── image_06741.jpg +|--train +| |-- 0 +| | |-- image_06736.jpg +| | |-- image_06741.jpg ... -│   ├── 1 -│   │   ├── image_06755.jpg -│   │   ├── image_06899.jpg +| |-- 1 +| | |-- image_06755.jpg +| | |-- image_06899.jpg ... -├── test -│   ├── 0 -│   │   ├── image_00731.jpg -│   │   ├── image_0002.jpg +|-- test +| |-- 0 +| | |-- image_00731.jpg +| | |-- image_0002.jpg ... -│   ├── 1 -│   │   ├── image_00036.jpg -│   │   ├── image_05011.jpg +| |-- 1 +| | |-- image_00036.jpg +| | |-- image_05011.jpg ``` @@ -241,146 +241,6 @@ _, test_acc = test(finetune_net, test_data, ctx) print('[Finished] Test-acc: %.3f' % (test_acc)) ``` - Process Process-7: - Process Process-5: - Process Process-6: - Process Process-8: - Traceback (most recent call last): - Traceback (most recent call last): - Traceback (most recent call last): - Traceback (most recent call last): - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap - self.run() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap - self.run() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap - self.run() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap - self.run() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run - self._target(*self._args, **self._kwargs) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run - self._target(*self._args, **self._kwargs) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run - self._target(*self._args, **self._kwargs) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run - self._target(*self._args, **self._kwargs) - File "/Users/lawei/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/gluon/data/dataloader.py", line 186, in worker_loop - idx, samples = key_queue.get() - File "/Users/lawei/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/gluon/data/dataloader.py", line 186, in worker_loop - idx, samples = key_queue.get() - File "/Users/lawei/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/gluon/data/dataloader.py", line 186, in worker_loop - idx, samples = key_queue.get() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 93, in get - with self._rlock: - File "/Users/lawei/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/gluon/data/dataloader.py", line 186, in worker_loop - idx, samples = key_queue.get() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 93, in get - with self._rlock: - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 93, in get - with self._rlock: - File 
"/Users/lawei/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__ - return self._semlock.__enter__() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__ - return self._semlock.__enter__() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 94, in get - res = self._recv_bytes() - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__ - return self._semlock.__enter__() - KeyboardInterrupt - KeyboardInterrupt - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - KeyboardInterrupt - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv - chunk = read(handle, remaining) - KeyboardInterrupt - Traceback (most recent call last): - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed - send_bytes(obj) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes - self._send_bytes(m[offset:offset + size]) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes - self._send(header + buf) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send - n = write(self._handle, buf) - - - - --------------------------------------------------------------------------- - - KeyboardInterrupt Traceback (most recent call last) - - in () - 38 trainer.set_learning_rate(lr) - 39 trainer.step(batch_size) - ---> 40 train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) - 41 metric.update(label, outputs) - 42 iteration_idx += 1 - - - in (.0) - 38 trainer.set_learning_rate(lr) - 39 trainer.step(batch_size) - ---> 40 train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) - 41 metric.update(label, outputs) - 42 iteration_idx += 1 - - - ~/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/ndarray/ndarray.py in asscalar(self) - 1996 if self.shape != (1,): - 1997 raise ValueError("The current array is not a scalar") - -> 1998 return self.asnumpy()[0] - 1999 - 2000 def astype(self, dtype, copy=True): - - - ~/Documents/Workspace/roywei/incubator-mxnet/python/mxnet/ndarray/ndarray.py in asnumpy(self) - 1978 self.handle, - 1979 data.ctypes.data_as(ctypes.c_void_p), - -> 1980 ctypes.c_size_t(data.size))) - 1981 return data - 1982 - - - KeyboardInterrupt: - - - BrokenPipeError: [Errno 32] Broken pipe - Traceback (most recent call last): - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed - send_bytes(obj) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes - self._send_bytes(m[offset:offset + size]) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes - self._send(header + buf) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send - n = write(self._handle, buf) - BrokenPipeError: [Errno 32] Broken pipe - Traceback (most recent call last): - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed - send_bytes(obj) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes - self._send_bytes(m[offset:offset + 
size]) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes - self._send(header + buf) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send - n = write(self._handle, buf) - BrokenPipeError: [Errno 32] Broken pipe - Traceback (most recent call last): - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed - send_bytes(obj) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes - self._send_bytes(m[offset:offset + size]) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes - self._send(header + buf) - File "/Users/lawei/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send - n = write(self._handle, buf) - BrokenPipeError: [Errno 32] Broken pipe - - Following is the training result: ```bash [Epoch 40] Train-acc: 0.945, loss: 0.354 | Val-acc: 0.955 | learning-rate: 4.219E-04 | time: 17.8 @@ -408,7 +268,7 @@ export in this case creates `flower-recognition-symbol.json` and `flower-recogni MXNet provide various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. In this tutorial, we will focus on the C++ API, for more details, please refer to the [C++ Inference Example](/~https://github.com/leleamol/incubator-mxnet/tree/inception-example/cpp-package/example/inference). - + ### Setup MXNet C++ API To use C++ API in MXNet, you need to build MXNet from source with C++ package. Please follow the [built from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html), and [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) to enable C++ API. @@ -616,7 +476,7 @@ Then it will predict your iamge You can find more ways to run inference and examples here: 1. [Java Inference examples](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) 2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/) -3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) +3. 
[ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) ## References From 05cf55ea514111ca3fac113e44d8fae673c2dc3a Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Tue, 18 Dec 2018 09:45:30 -0800 Subject: [PATCH 12/16] update links --- .../gluon/gluon_from_experiment_to_deployment.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md index be6552ee1145..5cd92ce130a8 100644 --- a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md +++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md @@ -447,8 +447,8 @@ void Predictor::PredictImage(const std::string& image_file) { ### Compile and Run Inference Code -You can find the full code [here](/~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/inception_inference.cpp) -, and to compile it use this [Makefile](/~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/Makefile) +You can find the full code [here](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) +, and to compile it use this [Makefile](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile) Now you will be able to compile the run inference, just do `make all` and pass the parameters as follows @@ -473,16 +473,17 @@ Then it will predict your iamge ## What's next -You can find more ways to run inference and examples here: +You can find more ways to run inference and deploy your models here: 1. [Java Inference examples](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) 2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/) 3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) +4. [MXNet Model Server Examples](/~https://github.com/awslabs/mxnet-model-server/tree/master/examples) ## References -1. /~https://github.com/Arsey/keras-transfer-learning-for-oxford102 -1. https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html -2. /~https://github.com/leleamol/incubator-mxnet/blob/inception-example/cpp-package/example/inference/ -3. https://gluon-crash-course.mxnet.io/ +1. [Trasnfer Learning for Oxford102 Flower Dataset](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102) +2. [Gluon tutorial on fine-tuning](https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html) +3. [Gluon crash course](https://gluon-crash-course.mxnet.io/) +4. 
[Gluon CPP inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/)
\ No newline at end of file

From 832ef1896732dd1a3712cc9d349913f2782a5f4e Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Tue, 8 Jan 2019 15:51:09 -0800
Subject: [PATCH 13/16] address comments

---
 .../data/oxford_102_flower_dataset.py         |  2 +-
 .../gluon_from_experiment_to_deployment.md    | 72 +++++++++----------
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/docs/tutorial_utils/data/oxford_102_flower_dataset.py b/docs/tutorial_utils/data/oxford_102_flower_dataset.py
index 573078ea0bb4..0bc1c2f9ac7a 100644
--- a/docs/tutorial_utils/data/oxford_102_flower_dataset.py
+++ b/docs/tutorial_utils/data/oxford_102_flower_dataset.py
@@ -17,7 +17,7 @@
 
 
 """
-This scripts downloads and prepare the Oxford 102 Category Flower Dataset for training
+This script downloads and prepares the Oxford 102 Category Flower Dataset for training
 Dataset is from: http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
 Script is modified from: /~https://github.com/Arsey/keras-transfer-learning-for-oxford102
 """
diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
index 5cd92ce130a8..9d5dc43c93f1 100644
--- a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
+++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md
@@ -2,14 +2,12 @@
 # Gluon: from experiment to deployment, an end to end example
 
 ## Overview
-MXNet Gluon API comes with a lot of great features and it can provide you everything you need from experiment to deploy the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.
+MXNet Gluon API comes with a lot of great features, and it can provide you everything you need: from experimentation to deploying the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.
 
-Let's say you need to build a service that provides flower species recognition. A common use case is, you don't have enough data to train a good model. In such cases we use a technique called Transfer Learning.
-In Transfer Learning we make use of a pre-trained model that solves a related task but is trained on a very large standard dataset such as ImageNet from a different domain, we utilize the knowledge in this pre-trained model to perform a new task at hand.
+Let's say you need to build a service that provides flower species recognition. A common problem is that you don't have enough data to train a good model. In such cases, a technique called Transfer Learning can be used to make a more robust model.
+In Transfer Learning we make use of a pre-trained model that solves a related task, and was trained on a very large standard dataset, such as ImageNet. ImageNet is from a different domain, but we can utilize the knowledge in this pre-trained model to perform the new task at hand.
 
-Gluon provides State of the Art models for many of the standard tasks such as Classification, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset, this model achieves 77.11% top-1 accuracy on ImageNet, we seek to transfer as much knowledge as possible for our task of recognizing different species of Flowers.
+Gluon provides State of the Art models for many of the standard tasks such as Classification, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset. This model achieves 77.11% top-1 accuracy on ImageNet. We seek to transfer as much knowledge as possible for our task of recognizing different species of flowers.
- -In this tutorial we will show you the steps to load pre-trained model from Gluon, tweak the model according to your need, fine-tune the model on your small dataset, and finally deploy the trained model to integrate with your service. +Gluon provides State of the Art models for many of the standard tasks such as Classification, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset. This model achieves 77.11% top-1 accuracy on ImageNet. We seek to transfer as much knowledge as possible for our task of recognizing different species of flowers. @@ -25,8 +23,8 @@ To complete this tutorial, you need: ## The Data -We will use the [Oxford 102 Category Flower Dateset](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/) as an example to show you the steps. You can use this [script](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102/blob/master/bootstrap.py) to download and organize your data into train, test, and validation sets. Simply import it and run: - +We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/) as an example to show you the steps. +We have prepared a utility file to help you download and organize your data into train, test, and validation sets. Run the following Python code to download and prepare the data: ```python @@ -40,7 +38,7 @@ path = './data' oxford_102_flower_dataset.get_data(path) ``` -Now your data will be organized into the following format, all the images belong to the same category will be put together +Now your data will be organized into the following format, all the images belong to the same category will be put together in the following pattern: ```bash data |--train @@ -84,7 +82,7 @@ from mxnet.gluon.data.vision import transforms from mxnet.gluon.model_zoo.vision import resnet50_v2 ``` -and define the hyper-parameters we will use for fine-tuning, we will use [MXNet learning rate scheduler](https://mxnet.incubator.apache.org/tutorials/gluon/learning_rate_schedules.html) to adjust learning rates during training. +Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](https://mxnet.incubator.apache.org/tutorials/gluon/learning_rate_schedules.html) to adjust learning rates during training. ```python @@ -105,7 +103,7 @@ ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] batch_size = per_device_batch_size * max(num_gpus, 1) ``` -Before the training we will apply data augmentations on training images. It's making minor alterations on training images and our model will consider them as distinct images. This can be very useful for finetuning on relatively small dataset and help improve the model. We can use Gluon [DataSet API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), [DataLoader API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), and [Transform API](https://mxnet.incubator.apache.org/tutorials/gluon/data_augmentation.html) to load the images and apply the follwing data augmentation: +Now we will apply data augmentations on training images. This makes minor alterations on the training images, and our model will consider them as distinct images. This can be very useful for fine-tuning on a relatively small dataset, and it will help improve the model. 
We can use the Gluon [DataSet API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), [DataLoader API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), and [Transform API](https://mxnet.incubator.apache.org/tutorials/gluon/data_augmentation.html) to load the images and apply the following data augmentations:
1. Randomly crop the image and resize it to 224x224
2. Randomly flip the image horizontally
3. Randomly jitter color and add noise
@@ -157,9 +155,9 @@ test_data = gluon.data.DataLoader(
 
 ### Loading pre-trained model
 
-We will use pre-trained ResNet50_v2 model which was pre-trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. All you need to do is re-define the last softmax layer and specify the number of classes to be 102 in our case and initialize the parameters. You can also add layers to the network according to your needs.
+We will use the ResNet50_v2 model, which was pre-trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. To match the classes in the Flower dataset, we must redefine the last softmax (output) layer to be 102, then initialize the parameters.
 
-Before we go to training, one unique feature Gluon offers is hybridization. It allows you to convert your imperative code to static symbolic graph which is much more efficient to execute. There are two main benefit of hybridizing your model: better performance and easier serialization for deployment. The best part is it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow our [tutorials](https://mxnet.incubator.apache.org/tutorials/gluon/hybrid.html).
+Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](https://mxnet.incubator.apache.org/tutorials/gluon/hybrid.html).
 
@@ -246,10 +244,10 @@
 Following is the training result:
 ```bash
 [Epoch 40] Train-acc: 0.945, loss: 0.354 | Val-acc: 0.955 | learning-rate: 4.219E-04 | time: 17.8
 [Finished] Test-acc: 0.952
 ```
-We trained the model using a [AWS P3.8XLarge instance](https://aws.amazon.com/ec2/instance-types/p3/) with 4 Tesla V100 GPUs. We were able to reach a test accuracy of 95.2% with 40 epochs in around 12 minutes. This was really fast because our model was pre-trained on a much larger dataset, ImageNet, with around 1.3 million images. It worked really well to capture features on our small dataset.
+In the previous example output, we trained the model using an [AWS p3.8xlarge instance](https://aws.amazon.com/ec2/instance-types/p3/) with 4 Tesla V100 GPUs. We were able to reach a test accuracy of 95.2% with 40 epochs in around 12 minutes. This was really fast because our model was pre-trained on a much larger dataset, ImageNet, with around 1.3 million images. It worked really well to capture features on our small dataset.
 
-### Save fine-tuned model
+### Save the fine-tuned model
 
 We have now trained our custom model. This can be serialized into model files using the export function. The export function will export the model architecture into a `.json` file and model parameters into a `.params` file.
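Since the hunks that follow keep referring to `export`, a short illustration of the round trip may help. The two files written by `export` can be loaded back in Python with `SymbolBlock` for a quick sanity check. This is a sketch only; the file names assume the 20-epoch export discussed below:

```python
import mxnet as mx
from mxnet import gluon

# Load the serialized architecture and parameters back for Python inference.
net = gluon.nn.SymbolBlock.imports(
    'flower-recognition-symbol.json',    # network architecture saved by export()
    ['data'],                            # name of the input symbol
    'flower-recognition-0020.params')    # trained weights saved by export()

# Forward pass on a dummy 224x224 RGB batch to verify the round trip.
x = mx.nd.random.uniform(shape=(1, 3, 224, 224))
probs = net(x).softmax()
print(probs.argmax(axis=1))
```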
@@ -261,25 +259,25 @@ finetune_net.export("flower-recognition", epoch=epochs) ``` -export in this case creates `flower-recognition-symbol.json` and `flower-recognition-0020.params` in the current directory. They can be used for model deployment in the next section. +`export` creates `flower-recognition-symbol.json` and `flower-recognition-0020.params` (`0020` is for 20 epochs we ran) in the current directory. These files can be used for model deployment in the next section. -## Load and inference using C++ API +## Load the model and run inference using the MXNet C++ API -MXNet provide various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. In this tutorial, we will focus on the C++ API, for more details, please refer to the [C++ Inference Example](/~https://github.com/leleamol/incubator-mxnet/tree/inception-example/cpp-package/example/inference). +MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. In this tutorial, we will focus on the MXNet C++ API. For more details, please refer to the [C++ Inference Example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference). -### Setup MXNet C++ API -To use C++ API in MXNet, you need to build MXNet from source with C++ package. Please follow the [built from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html), and [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) -to enable C++ API. -In summary you just need to build MXNet from source with `USE_CPP_PACKAGE` flag set to 1 using `make -j USE_CPP_PACKAGE=1`. +### Setup the MXNet C++ API +To use the C++ API in MXNet, you need to build MXNet from source with C++ package. Please follow the [built from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html), and [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) +to enable the C++ API. +The summary of those two documents is that you need to build MXNet from source with `USE_CPP_PACKAGE` flag set to 1. For example: `make -j USE_CPP_PACKAGE=1`. -### Write Predictor in C++ -Now let's write prediction code in C++. We will use a Predictor Class to do the following jobs: -1. Load the pre-trained model, -2. Load the parameters of pre-trained model, -3. Load the image to be classified in to NDArray. -4. Run the forward pass and predict the input image. +### Write a predictor using the MXNet C++ API +Now let's add a method to load the input image and convert it to NDArray for prediction. +1. 
Load the pre-trained model +2. Load the parameters of pre-trained model +3. Load the image to be classified in to NDArray +4. Run the forward pass and predict the class of the input image ```cpp class Predictor { @@ -322,7 +320,7 @@ class Predictor { ### Load network symbol and parameters -In the Predictor constructor, you need a few information including paths to saved json and param files. After that add the following two methods to load the network and its parameters. +In the Predictor constructor, you need to provide paths to saved json and param files. After that, add the following two methods to load the network and its parameters. ```cpp /* @@ -395,9 +393,9 @@ NDArray Predictor::LoadInputImage(const std::string& image_file) { } ``` -### Run inference +### Create a predict image class -Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe, calling it in multi-threaded environments was not tested. To utilize multi-threaded prediction, you need to use the C predict API, please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp). +Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe. Calling it in multi-threaded environments was not tested. To utilize multi-threaded prediction, you need to use the C predict API. Please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp). ```cpp void Predictor::PredictImage(const std::string& image_file) { @@ -445,19 +443,19 @@ void Predictor::PredictImage(const std::string& image_file) { } ``` -### Compile and Run Inference Code +### Compile and run the inference code -You can find the full code [here](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) -, and to compile it use this [Makefile](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile) +You can find the [full code for the inference example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) in the `cpp-package` folder of the project +, and to compile it use this [Makefile](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile). -Now you will be able to compile the run inference, just do `make all` and pass the parameters as follows +Now you will be able to compile the run inference. Run `make all`. Once this is complete, run inference with the the following parameters: ```bash make all LD_LIBRARY_PATH=../incubator-mxnet/lib/ ./inception_inference --symbol "flower-recognition-symbol.json" --params "flower-recognition-0020.params" --image ./data/test/0/image_06736.jpg ``` -Then it will predict your iamge +Then it will predict your image: ```bash [22:26:49] inception_inference.cpp:128: Loading the model from flower-recognition-symbol.json @@ -481,7 +479,7 @@ You can find more ways to run inference and deploy your models here: ## References -1. [Trasnfer Learning for Oxford102 Flower Dataset](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102) +1. 
[Transfer Learning for Oxford102 Flower Dataset](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102) 2. [Gluon tutorial on fine-tuning](https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html) 3. [Gluon crash course](https://gluon-crash-course.mxnet.io/) 4. [Gluon CPP inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) From 0def9740a4a769816788d60411b5d938f91dfe3d Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Mon, 14 Jan 2019 23:40:31 -0800 Subject: [PATCH 14/16] use lr scheduler with optimizer --- .../gluon_from_experiment_to_deployment.md | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md index 9d5dc43c93f1..b70af7f29e1b 100644 --- a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md +++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md @@ -172,8 +172,16 @@ finetune_net.output.initialize(init.Xavier(), ctx=ctx) # hybridize for better performance finetune_net.hybridize() -trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', { - 'learning_rate': lr, 'momentum': momentum, 'wd': wd}) +num_batch = len(train_data) + +# setup learning rate scheduler +iterations_per_epoch = math.ceil(num_batch) +# learning rate change at following steps +lr_steps = [epoch * iterations_per_epoch for epoch in lr_epochs] +schedule = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=lr_factor, base_lr=lr) + +# setup optimizer with learning rate scheduler, metric, and loss function +sgd_optimizer = mx.optimizer.SGD(learning_rate=lr, lr_scheduler=schedule, momentum=momentum, wd=wd) metric = mx.metric.Accuracy() softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() ``` @@ -194,15 +202,7 @@ def test(net, val_data, ctx): metric.update(label, outputs) return metric.get() - -num_batch = len(train_data) -iteration_idx = 1 - -# setup learning rate scheduler -iterations_per_epoch = math.ceil(num_batch) -# learning rate change at following steps -lr_steps = [epoch * iterations_per_epoch for epoch in lr_epochs] -schedule = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=lr_factor, base_lr=lr) +trainer = gluon.Trainer(finetune_net.collect_params(), optimizer=sgd_optimizer) # start with epoch 1 for easier learning rate calculation for epoch in range(1, epochs + 1): @@ -221,12 +221,9 @@ for epoch in range(1, epochs + 1): for l in loss: l.backward() - lr = schedule(iteration_idx) - trainer.set_learning_rate(lr) trainer.step(batch_size) train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) metric.update(label, outputs) - iteration_idx += 1 _, train_acc = metric.get() train_loss /= num_batch From c5c001e976b251979cc7911ce73471344451a9bf Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Wed, 16 Jan 2019 10:22:30 -0800 Subject: [PATCH 15/16] separate into 2 tutorials --- .../data/oxford_102_flower_dataset.py | 123 +++++++- .../c++/mxnet_cpp_inference_tutorial.md | 267 +++++++++++++++++ .../gluon_from_experiment_to_deployment.md | 282 ++++-------------- 3 files changed, 453 insertions(+), 219 deletions(-) create mode 100644 docs/tutorials/c++/mxnet_cpp_inference_tutorial.md diff --git a/docs/tutorial_utils/data/oxford_102_flower_dataset.py b/docs/tutorial_utils/data/oxford_102_flower_dataset.py index 0bc1c2f9ac7a..0dcae2209c6e 100644 --- a/docs/tutorial_utils/data/oxford_102_flower_dataset.py +++ 
b/docs/tutorial_utils/data/oxford_102_flower_dataset.py @@ -31,6 +31,110 @@ from mxnet import gluon from scipy.io import loadmat +label_names = [ + 'pink primrose', + 'hard-leaved pocket orchid', + 'canterbury bells', + 'sweet pea', + 'english marigold', + 'tiger lily', + 'moon orchid', + 'bird of paradise', + 'monkshood', + 'globe thistle', + 'snapdragon', + "colt's foot", + 'king protea', + 'spear thistle', + 'yellow iris', + 'globe-flower', + 'purple coneflower', + 'peruvian lily', + 'balloon flower', + 'giant white arum lily', + 'fire lily', + 'pincushion flower', + 'fritillary', + 'red ginger', + 'grape hyacinth', + 'corn poppy', + 'prince of wales feathers', + 'stemless gentian', + 'artichoke', + 'sweet william', + 'carnation', + 'garden phlox', + 'love in the mist', + 'mexican aster', + 'alpine sea holly', + 'ruby-lipped cattleya', + 'cape flower', + 'great masterwort', + 'siam tulip', + 'lenten rose', + 'barbeton daisy', + 'daffodil', + 'sword lily', + 'poinsettia', + 'bolero deep blue', + 'wallflower', + 'marigold', + 'buttercup', + 'oxeye daisy', + 'common dandelion', + 'petunia', + 'wild pansy', + 'primula', + 'sunflower', + 'pelargonium', + 'bishop of llandaff', + 'gaura', + 'geranium', + 'orange dahlia', + 'pink-yellow dahlia?', + 'cautleya spicata', + 'japanese anemone', + 'black-eyed susan', + 'silverbush', + 'californian poppy', + 'osteospermum', + 'spring crocus', + 'bearded iris', + 'windflower', + 'tree poppy', + 'gazania', + 'azalea', + 'water lily', + 'rose', + 'thorn apple', + 'morning glory', + 'passion flower', + 'lotus', + 'toad lily', + 'anthurium', + 'frangipani', + 'clematis', + 'hibiscus', + 'columbine', + 'desert-rose', + 'tree mallow', + 'magnolia', + 'cyclamen', + 'watercress', + 'canna lily', + 'hippeastrum ', + 'bee balm', + 'ball moss', + 'foxglove', + 'bougainvillea', + 'camellia', + 'mallow', + 'mexican petunia', + 'bromelia', + 'blanket flower', + 'trumpet creeper', + 'blackberry lily' +] def download_data(): data_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/' @@ -47,6 +151,7 @@ def download_data(): os.mkdir(data_path) if not os.path.isfile(image_path): gluon.utils.download(url=data_url + image_file_name, path=data_path) + if not os.path.exists(os.path.join(data_path, 'jpg')): print("Extracting downloaded dataset...") tarfile.open(image_path).extractall(path=data_path) if not os.path.isfile(label_path): @@ -57,7 +162,7 @@ def download_data(): def prepare_data(): # Read .mat file containing training, testing, and validation sets. 
-    global data_path, image_path, label_path, setid_path
+    global data_path, image_path, label_path, setid_path, label_names
     setid = loadmat(setid_path)
 
     idx_train = setid['trnid'][0] - 1
     idx_test = setid['tstid'][0] - 1
     idx_valid = setid['valid'][0] - 1
 
@@ -70,6 +175,8 @@ def prepare_data():
 
     # Subtract one to get 0-based labels
     image_labels -= 1
+    # convert label from number to flower names
+    image_labels = [label_names[i] for i in image_labels]
 
     # extracted images are stored in folder 'jpg'
     files = sorted(glob.glob(os.path.join(data_path, 'jpg', '*.jpg')))
     file_label_pairs = np.array([i for i in zip(files, image_labels)])
@@ -79,13 +186,14 @@ def prepare_data():
     move_files('test', file_label_pairs[idx_train, :])
     move_files('valid', file_label_pairs[idx_valid, :])
 
+
 def move_files(dir_name, file_label_pairs):
     data_segment_dir = os.path.join(data_path, dir_name)
     if not os.path.exists(data_segment_dir):
         os.mkdir(data_segment_dir)
 
-    for i in range(0, 102):
-        class_dir = os.path.join(data_segment_dir, str(i))
+    for label in label_names:
+        class_dir = os.path.join(data_segment_dir, label)
         if not os.path.exists(class_dir):
             os.mkdir(class_dir)
 
@@ -95,8 +203,17 @@ def move_files(dir_name, file_label_pairs):
             copyfile(src, dst)
 
 
+def generate_synset():
+    # the file is closed automatically by the `with` block
+    with open('synset.txt', 'w') as f:
+        # Gluon Dataset API will load synset in sorted order
+        for label in sorted(label_names):
+            f.write(label.strip() + '\n')
+
+
 def get_data(dir_name):
     global data_path
     data_path = dir_name
     download_data()
     prepare_data()
+    generate_synset()
diff --git a/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md b/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md
new file mode 100644
index 000000000000..e55e7c97dfee
--- /dev/null
+++ b/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md
@@ -0,0 +1,267 @@
+# MXNet C++ API inference tutorial
+
+## Overview
+MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily.
+Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](https://mxnet.incubator.apache.org/api/python/module/module.html), [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs.
+
+This tutorial is a continuation of the [Gluon end to end tutorial](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md). Here we will focus on the MXNet C++ API. We have slightly modified the code in the [C++ Inference Example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) for our use case.
+
+## Prerequisites
+
+To complete this tutorial, you need:
+- Complete the training part of the [Gluon end to end tutorial](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md)
+- Learn the basics about [MXNet C++ API](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
+
+
+## Setup the MXNet C++ API
+To use the C++ API in MXNet, you need to build MXNet from source with the C++ package. Please follow the [built from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html), and [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package)
+to enable the C++ API.
+The summary of those two documents is that you need to build MXNet from source with the `USE_CPP_PACKAGE` flag set to 1. For example: `make -j USE_CPP_PACKAGE=1`.
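To make the build step concrete, a typical sequence looks like the sketch below. This is an illustration only: the flags shown are the commonly used ones, and your platform may need extra dependencies as described in the guides above.

```bash
# Sketch of a from-source build with the C++ package enabled
# (install OpenCV and a BLAS library first; see the install guide above).
git clone --recursive /~https://github.com/apache/incubator-mxnet
cd incubator-mxnet
make -j $(nproc) USE_CPP_PACKAGE=1 USE_OPENCV=1
```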
+
+## Load the model and run inference
+
+After you complete [the previous tutorial](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md), you will get the following output files:
+1. Model Architecture stored in `flower-recognition-symbol.json`
+2. Model parameter values stored in `flower-recognition-0040.params` (`0040` is for 40 epochs we ran)
+3. Label names stored in `synset.txt`
+4. Mean and standard deviation values stored in `mean_std_224.nd` for image normalization.
+
+
+Now we need to write the C++ code to load them and run prediction on a test image.
+The full code is available in the [C++ Inference Example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference). We will walk you through it and point out the necessary changes to make for our use case.
+
+
+
+### Write a predictor using the MXNet C++ API
+
+In general, the C++ inference code should follow the 4 steps below. We can do that using a Predictor class.
+1. Load the pre-trained model
+2. Load the parameters of the pre-trained model
+3. Load the image to be classified into an NDArray and apply the same image transformations we used in training
+4. Run the forward pass and predict the class of the input image
+
+```cpp
+class Predictor {
+ public:
+    Predictor() {}
+    Predictor(const std::string& model_json_file,
+              const std::string& model_params_file,
+              const Shape& input_shape,
+              bool gpu_context_type = false,
+              const std::string& synset_file = "",
+              const std::string& mean_image_file = "");
+    void PredictImage(const std::string& image_file);
+    ~Predictor();
+
+ private:
+    void LoadModel(const std::string& model_json_file);
+    void LoadParameters(const std::string& model_parameters_file);
+    void LoadSynset(const std::string& synset_file);
+    NDArray LoadInputImage(const std::string& image_file);
+    void LoadMeanImageData();
+    void LoadDefaultMeanImageData();
+    void NormalizeInput(const std::string& mean_image_file);
+    inline bool FileExists(const std::string& name) {
+        struct stat buffer;
+        return (stat(name.c_str(), &buffer) == 0);
+    }
+    NDArray mean_img;
+    std::map<std::string, NDArray> args_map;
+    std::map<std::string, NDArray> aux_map;
+    std::vector<std::string> output_labels;
+    Symbol net;
+    Executor *executor;
+    Shape input_shape;
+    NDArray mean_image_data;
+    NDArray std_dev_image_data;
+    Context global_ctx = Context::cpu();
+    std::string mean_image_file;
+};
+```
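Before implementing the individual methods, it may help to see how the class is meant to be driven. The following is a minimal, hypothetical driver (the full example wraps this in command-line flag parsing instead); the file names match the training outputs listed above:

```cpp
// Hypothetical driver for the Predictor declared above (sketch only).
// The input shape matches the 1 x 3 x 224 x 224 batches used in training.
int main() {
    Shape input_shape(1, 3, 224, 224);
    Predictor predictor("flower-recognition-symbol.json",
                        "flower-recognition-0040.params",
                        input_shape,
                        false,              // false = run on the CPU context
                        "synset.txt",
                        "mean_std_224.nd");
    predictor.PredictImage("./data/test/lotus/image_01832.jpg");
    return 0;
}
```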
+
+### Load the model, synset file, and normalization values
+
+In the Predictor constructor, you need to provide paths to saved json and param files. After that, add the following methods `LoadModel` and `LoadParameters` to load the network and its parameters. This part is the same as [the example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/inception_inference.cpp).
+
+Next, we need to load the synset file and the normalization values. We have made the following change since our synset file contains flower names and we used both mean and standard deviation for image normalization.
+
+```c++
+/*
+ * The following function loads the synset file.
+ * This information will be used later to report the label of input image.
+ */
+void Predictor::LoadSynset(const std::string& synset_file) {
+  if (!FileExists(synset_file)) {
+    LG << "Synset file " << synset_file << " does not exist";
+    throw std::runtime_error("Synset file does not exist");
+  }
+  LG << "Loading the synset file.";
+  std::ifstream fi(synset_file.c_str());
+  if (!fi.is_open()) {
+    std::cerr << "Error opening synset file " << synset_file << std::endl;
+    throw std::runtime_error("Error in opening the synset file.");
+  }
+  std::string lemma;
+  while (getline(fi, lemma)) {
+    output_labels.push_back(lemma);
+  }
+  fi.close();
+}
+
+/*
+ * The following function loads the mean and standard deviation values.
+ * This data will be used for normalizing the image before running the forward
+ * pass.
+ * The output data has the same shape as that of the input image data.
+ */
+void Predictor::LoadMeanImageData() {
+  LG << "Load the mean image data that will be used to normalize "
+     << "the image before running forward pass.";
+  mean_image_data = NDArray(input_shape, global_ctx, false);
+  mean_image_data.SyncCopyFromCPU(
+      NDArray::LoadToMap(mean_image_file)["mean_img"].GetData(),
+      input_shape.Size());
+  NDArray::WaitAll();
+  std_dev_image_data = NDArray(input_shape, global_ctx, false);
+  std_dev_image_data.SyncCopyFromCPU(
+      NDArray::LoadToMap(mean_image_file)["std_img"].GetData(),
+      input_shape.Size());
+  NDArray::WaitAll();
+}
+```
+
+
+
+### Load input image
+
+Now let's add a method to load the input image we want to predict and convert it to an NDArray for prediction.
+```cpp
+NDArray Predictor::LoadInputImage(const std::string& image_file) {
+  if (!FileExists(image_file)) {
+    LG << "Image file " << image_file << " does not exist";
+    throw std::runtime_error("Image file does not exist");
+  }
+  LG << "Loading the image " << image_file << std::endl;
+  std::vector<float> array;
+  cv::Mat mat = cv::imread(image_file);
+  /* resize pictures to (224, 224) according to the pretrained model */
+  int height = input_shape[2];
+  int width = input_shape[3];
+  int channels = input_shape[1];
+  cv::resize(mat, mat, cv::Size(height, width));
+  for (int c = 0; c < channels; ++c) {
+    for (int i = 0; i < height; ++i) {
+      for (int j = 0; j < width; ++j) {
+        array.push_back(static_cast<float>(mat.data[(i * height + j) * 3 + c]));
+      }
+    }
+  }
+  NDArray image_data = NDArray(input_shape, global_ctx, false);
+  image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
+  NDArray::WaitAll();
+  return image_data;
+}
+```
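One detail to be aware of in `LoadInputImage`: `cv::imread` returns pixels in BGR channel order, while the Gluon data pipeline in the training tutorial produced RGB images. The example copies the channels as-is; if you want the channel order to match training exactly, a one-line conversion after `cv::imread` (not present in the original example) would be:

```cpp
// Convert OpenCV's default BGR layout to the RGB layout used in training.
cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
```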
+
+### Predict the image
+
+Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe. Calling it in multi-threaded environments was not tested. To utilize multi-threaded prediction, you need to use the C predict API. Please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp).
+
+An additional step is to normalize the image NDArray's values to `(0, 1)` and apply the mean and standard deviation we just loaded.
+
+```cpp
+/*
+ * The following function runs the forward pass on the model.
+ * The executor is created in the constructor.
+ */
+void Predictor::PredictImage(const std::string& image_file) {
+  // Load the input image
+  NDArray image_data = LoadInputImage(image_file);
+
+  // Normalize the image
+  image_data.Slice(0, 1) /= 255.0;
+  image_data -= mean_image_data;
+  image_data /= std_dev_image_data;
+
+  LG << "Running the forward pass on model to predict the image";
+  /*
+   * The executor->arg_arrays represent the arguments to the model.
+   *
+   * Copying the image_data that contains the NDArray of input image
+   * to the arg map of the executor. The input is stored with the key "data" in the map.
+   */
+  image_data.CopyTo(&(executor->arg_dict()["data"]));
+  NDArray::WaitAll();
+
+  // Run the forward pass.
+  executor->Forward(false);
+
+  // The output is available in executor->outputs.
+  auto array = executor->outputs[0].Copy(global_ctx);
+  NDArray::WaitAll();
+
+  /*
+   * Find out the maximum accuracy and the index associated with that accuracy.
+   * This is done by using the argmax operator on NDArray.
+   */
+  auto predicted = array.ArgmaxChannel();
+  NDArray::WaitAll();
+
+  int best_idx = predicted.At(0, 0);
+  float best_accuracy = array.At(0, best_idx);
+
+  if (output_labels.empty()) {
+    LG << "The model predicts the highest accuracy of " << best_accuracy << " at index "
+       << best_idx;
+  } else {
+    LG << "The model predicts the input image to be a [" << output_labels[best_idx]
+       << " ] with Accuracy = " << best_accuracy << std::endl;
+  }
+}
+```
+
+### Compile and run the inference code
+
+You can find the [full code for the inference example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) in the `cpp-package` folder of the project, and to compile it use this [Makefile](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile).
+
+Make a copy of the example code, rename it to `flower_inference`, and apply the changes we mentioned above. Now you will be able to compile and run inference. Run `make all`. Once this is complete, run inference with the following parameters. Remember to set your `LD_LIBRARY_PATH` to point to the MXNet library if you have not done so.
+
+```bash
+make all
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:path/to/incubator-mxnet/lib
+./flower_inference --symbol flower-recognition-symbol.json --params flower-recognition-0040.params --synset synset.txt --mean mean_std_224.nd --image ./data/test/lotus/image_01832.jpg
+```
+
+Then it will predict your image:
+
+```bash
+[17:38:51] resnet.cpp:150: Loading the model from flower-recognition-symbol.json
+
+[17:38:51] resnet.cpp:163: Loading the model parameters from flower-recognition-0040.params
+
+[17:38:52] resnet.cpp:190: Loading the synset file.
+[17:38:52] resnet.cpp:211: Load the mean image data that will be used to normalize the image before running forward pass.
+[17:38:52] resnet.cpp:263: Loading the image ./data/test/lotus/image_01832.jpg
+
+[17:38:52] resnet.cpp:299: Running the forward pass on model to predict the image
+[17:38:52] resnet.cpp:331: The model predicts the input image to be a [lotus ] with Accuracy = 8.63046
+```
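Note that the value logged as `Accuracy` above is the raw output of the network's last dense layer (a logit), not a probability: the exported symbol ends in a `Dense` layer, and softmax was only applied inside the training loss. If you prefer a normalized confidence, one way to compute it (a sketch, assuming `<cmath>` is available and `array`/`best_idx` as in `PredictImage`) is:

```cpp
// Turn the winning logit into a softmax probability over all 102 classes.
float max_score = array.At(0, best_idx);
float sum = 0.0f;
for (std::size_t i = 0; i < array.Size(); ++i) {
  sum += std::exp(array.At(0, i) - max_score);  // shift by max for stability
}
float best_probability = 1.0f / sum;  // exp(0) divided by the shifted sum
LG << "Winning class probability = " << best_probability;
```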
[ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) +4. [MXNet Model Server Examples](/~https://github.com/awslabs/mxnet-model-server/tree/master/examples) + +## References + +1. [Gluon end to end tutorial](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/gluon/end_to_end_tutorial_training.md) +2. [Gluon C++ inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) +3. [Gluon C++ package](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) \ No newline at end of file diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md index b70af7f29e1b..87e6f24ae254 100644 --- a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md +++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md @@ -1,8 +1,9 @@ -# Gluon: from experiment to deployment, an end to end example +# Gluon: from experiment to deployment, an end to end tutorial ## Overview MXNet Gluon API comes with a lot of great features, and it can provide you everything you need: from experimentation to deploying the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference. +This tutorial covers training and inference in Python, please continue to [C++ inference part](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md) after you finish. Let's say you need to build a service that provides flower species recognition. A common problem is that you don't have enough data to train a good model. In such cases, a technique called Transfer Learning can be used to make a more robust model. In Transfer Learning we make use of a pre-trained model that solves a related task, and was trained on a very large standard dataset, such as ImageNet. ImageNet is from a different domain, but we can utilize the knowledge in this pre-trained model to perform the new task at hand. @@ -18,7 +19,6 @@ To complete this tutorial, you need: - [Build MXNet from source](https://mxnet.incubator.apache.org/install/ubuntu_setup.html#build-mxnet-from-source) with Python(Gluon) and C++ Packages - Learn the basics about Gluon with [A 60-minute Gluon Crash Course](https://gluon-crash-course.mxnet.io/) -- Learn the basics about [MXNet C++ API](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) ## The Data @@ -28,6 +28,7 @@ We have prepared a utility file to help you download and organize your data into ```python +import mxnet as mx data_util_file = "oxford_102_flower_dataset.py" base_url = "https://raw.githubusercontent.com/roywei/incubator-mxnet/gluon_tutorial/docs/tutorial_utils/data/{}?raw=true" mx.test_utils.download(base_url.format(data_util_file), fname=data_util_file) @@ -42,20 +43,20 @@ Now your data will be organized into the following format, all the images belong ```bash data |--train -| |-- 0 +| |-- class0 | | |-- image_06736.jpg | | |-- image_06741.jpg ... -| |-- 1 +| |-- class1 | | |-- image_06755.jpg | | |-- image_06899.jpg ... |-- test -| |-- 0 +| |-- class0 | | |-- image_00731.jpg | | |-- image_0002.jpg ... 
-| |-- 1 +| |-- class1 | | |-- image_00036.jpg | | |-- image_05011.jpg @@ -74,7 +75,6 @@ import os import time from multiprocessing import cpu_count -import mxnet as mx from mxnet import autograd from mxnet import gluon, init from mxnet.gluon import nn @@ -107,15 +107,19 @@ Now we will apply data augmentations on training images. This makes minor altera 1. Randomly crop the image and resize it to 224x224 2. Randomly flip the image horizontally 3. Randomly jitter color and add noise -4. Transpose the data from height*width*num_channels to num_channels*height*width, and map values from [0, 255] to [0, 1] +4. Transpose the data from `[height, width, num_channels]` to `[num_channels, height, width]`, and map values from [0, 255] to [0, 1] 5. Normalize with the mean and standard deviation from the ImageNet dataset. - +For validation and inference, we only need to apply step 1, 4, and 5. We also need to save the mean and standard deviation values for [inference using C++](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md). ```python jitter_param = 0.4 lighting_param = 0.1 +# mean and std for normalizing image value in range (0,1) +mean = [0.485, 0.456, 0.406] +std = [0.229, 0.224, 0.225] + training_transformer = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomFlipLeftRight(), @@ -123,16 +127,20 @@ training_transformer = transforms.Compose([ saturation=jitter_param), transforms.RandomLighting(lighting_param), transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + transforms.Normalize(mean, std) ]) validation_transformer = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + transforms.Normalize(mean, std) ]) +# save mean and std NDArray values for inference +mean_img = mx.nd.stack(*[mx.nd.full((224, 224), m) for m in mean]) +std_img = mx.nd.stack(*[mx.nd.full((224, 224), s) for s in std]) +mx.nd.save('mean_std_224.nd', {"mean_img": mean_img, "std_img": std_img}) train_path = os.path.join(path, 'train') val_path = os.path.join(path, 'valid') @@ -256,219 +264,60 @@ finetune_net.export("flower-recognition", epoch=epochs) ``` -`export` creates `flower-recognition-symbol.json` and `flower-recognition-0020.params` (`0020` is for 20 epochs we ran) in the current directory. These files can be used for model deployment in the next section. - - -## Load the model and run inference using the MXNet C++ API - -MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. In this tutorial, we will focus on the MXNet C++ API. For more details, please refer to the [C++ Inference Example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference). - - -### Setup the MXNet C++ API -To use the C++ API in MXNet, you need to build MXNet from source with C++ package. 
Please follow the [built from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html), and [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) -to enable the C++ API. -The summary of those two documents is that you need to build MXNet from source with `USE_CPP_PACKAGE` flag set to 1. For example: `make -j USE_CPP_PACKAGE=1`. - -### Write a predictor using the MXNet C++ API -Now let's add a method to load the input image and convert it to NDArray for prediction. -1. Load the pre-trained model -2. Load the parameters of pre-trained model -3. Load the image to be classified in to NDArray -4. Run the forward pass and predict the class of the input image - -```cpp -class Predictor { - public: - Predictor() {} - Predictor(const std::string& model_json_file, - const std::string& model_params_file, - const Shape& input_shape, - bool gpu_context_type = false, - const std::string& synset_file = "", - const std::string& mean_image_file = ""); - void PredictImage(const std::string& image_file); - ~Predictor(); - - private: - void LoadModel(const std::string& model_json_file); - void LoadParameters(const std::string& model_parameters_file); - void LoadSynset(const std::string& synset_file); - NDArray LoadInputImage(const std::string& image_file); - void LoadMeanImageData(); - void LoadDefaultMeanImageData(); - void NormalizeInput(const std::string& mean_image_file); - inline bool FileExists(const std::string& name) { - struct stat buffer; - return (stat(name.c_str(), &buffer) == 0); - } - NDArray mean_img; - std::map args_map; - std::map aux_map; - std::vector output_labels; - Symbol net; - Executor *executor; - Shape input_shape; - NDArray mean_image_data; - NDArray std_dev_image_data; - Context global_ctx = Context::cpu(); - std::string mean_image_file; -}; -``` - -### Load network symbol and parameters - -In the Predictor constructor, you need to provide paths to saved json and param files. After that, add the following two methods to load the network and its parameters. - -```cpp -/* - * The following function loads the model from json file. - */ -void Predictor::LoadModel(const std::string& model_json_file) { - if (!FileExists(model_json_file)) { - LG << "Model file " << model_json_file << " does not exist"; - throw std::runtime_error("Model file does not exist"); - } - LG << "Loading the model from " << model_json_file << std::endl; - net = Symbol::Load(model_json_file); -} - - -/* - * The following function loads the model parameters. - */ -void Predictor::LoadParameters(const std::string& model_parameters_file) { - if (!FileExists(model_parameters_file)) { - LG << "Parameter file " << model_parameters_file << " does not exist"; - throw std::runtime_error("Model parameters does not exist"); - } - LG << "Loading the model parameters from " << model_parameters_file << std::endl; - std::map parameters; - NDArray::Load(model_parameters_file, 0, ¶meters); - for (const auto &k : parameters) { - if (k.first.substr(0, 4) == "aux:") { - auto name = k.first.substr(4, k.first.size() - 4); - aux_map[name] = k.second.Copy(global_ctx); - } - if (k.first.substr(0, 4) == "arg:") { - auto name = k.first.substr(4, k.first.size() - 4); - args_map[name] = k.second.Copy(global_ctx); - } - } - /*WaitAll is need when we copy data between GPU and the main memory*/ - NDArray::WaitAll(); -} -``` +`export` creates `flower-recognition-symbol.json` and `flower-recognition-0040.params` (`0040` is for 40 epochs we ran) in the current directory. 
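As a quick sanity check, you can reload the exported files into Gluon with `SymbolBlock.imports` before moving on. The following is a minimal sketch; it assumes the two files above were written to the current working directory by the `export` call:

```python
import mxnet as mx

# Re-create the network from the saved symbol and parameter files.
# "data" is the input name used throughout this tutorial.
net = mx.gluon.SymbolBlock.imports("flower-recognition-symbol.json", ["data"],
                                   "flower-recognition-0040.params", ctx=mx.cpu())
```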
These files can be used for model deployment in the next section. -### Load Input Image - -Now let's add a method to load the input image we want to predict and converts it to NDArray for prediction. -```cpp -NDArray Predictor::LoadInputImage(const std::string& image_file) { - if (!FileExists(image_file)) { - LG << "Image file " << image_file << " does not exist"; - throw std::runtime_error("Image file does not exist"); - } - LG << "Loading the image " << image_file << std::endl; - std::vector array; - cv::Mat mat = cv::imread(image_file); - /*resize pictures to (224, 224) according to the pretrained model*/ - int height = input_shape[2]; - int width = input_shape[3]; - int channels = input_shape[1]; - cv::resize(mat, mat, cv::Size(height, width)); - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - array.push_back(static_cast(mat.data[(i * height + j) * 3 + c])); - } - } - } - NDArray image_data = NDArray(input_shape, global_ctx, false); - image_data.SyncCopyFromCPU(array.data(), input_shape.Size()); - NDArray::WaitAll(); - return image_data; -} -``` - -### Create a predict image class - -Finally, let's run the inference. It's basically using MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images in a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe. Calling it in multi-threaded environments was not tested. To utilize multi-threaded prediction, you need to use the C predict API. Please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp). - -```cpp -void Predictor::PredictImage(const std::string& image_file) { - // Load the input image - NDArray image_data = LoadInputImage(image_file); - - // Normalize the image - image_data.Slice(0, 1) -= mean_image_data; - - LG << "Running the forward pass on model to predict the image"; - /* - * The executor->arg_arrays represent the arguments to the model. - * - * Copying the image_data that contains the NDArray of input image - * to the arg map of the executor. The input is stored with the key "data" in the map. - * - */ - image_data.CopyTo(&(executor->arg_dict()["data"])); - NDArray::WaitAll(); - - // Run the forward pass. - executor->Forward(false); - - // The output is available in executor->outputs. - auto array = executor->outputs[0].Copy(global_ctx); - NDArray::WaitAll(); - - /* - * Find out the maximum accuracy and the index associated with that accuracy. - * This is done by using the argmax operator on NDArray. - */ - auto predicted = array.ArgmaxChannel(); - NDArray::WaitAll(); - - int best_idx = predicted.At(0, 0); - float best_accuracy = array.At(0, best_idx); - - if (output_labels.empty()) { - LG << "The model predicts the highest accuracy of " << best_accuracy << " at index " - << best_idx; - } else { - LG << "The model predicts the input image to be a [" << output_labels[best_idx] - << " ] with Accuracy = " << best_accuracy << std::endl; - } -} -``` +## Load the model and run inference using the MXNet Module API -### Compile and run the inference code +MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. +Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. 
We provide [Python](https://mxnet.incubator.apache.org/api/python/module/module.html), [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs.

-You can find the [full code for the inference example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) in the `cpp-package` folder of the project
-, and to compile it use this [Makefile](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile).
+Here we will briefly introduce how to run inference using the Module API in Python. A more detailed explanation is available in the [Predict Image Tutorial](https://mxnet.incubator.apache.org/tutorials/python/predict_image.html).
+In general, prediction consists of the following steps:
+1. Load the model architecture (symbol file) and trained parameter values (params file)
+2. Load the synset file for label names
+3. Load the image and apply the same transformation we used on the validation dataset during training
+4. Run a forward pass on the image data
+5. Convert the output probabilities to a predicted label name

-Now you will be able to compile the run inference. Run `make all`. Once this is complete, run inference with the the following parameters:
-
-```bash
-make all
-LD_LIBRARY_PATH=../incubator-mxnet/lib/ ./inception_inference --symbol "flower-recognition-symbol.json" --params "flower-recognition-0020.params" --image ./data/test/0/image_06736.jpg
+```python
+import numpy as np
+from collections import namedtuple
+
+ctx = mx.cpu()
+# load model symbol and params
+sym, arg_params, aux_params = mx.model.load_checkpoint('flower-recognition', epochs)
+mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
+mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))], label_shapes=mod._label_shapes)
+mod.set_params(arg_params, aux_params, allow_missing=True)
+
+# load synset for label names
+with open('synset.txt', 'r') as f:
+    labels = [l.rstrip() for l in f]
+
+# load an image for prediction
+img = mx.image.imread('./data/test/lotus/image_01832.jpg')
+# apply transform we did during training
+img = validation_transformer(img)
+# batchify
+img = img.expand_dims(axis=0)
+Batch = namedtuple('Batch', ['data'])
+mod.forward(Batch([img]))
+prob = mod.get_outputs()[0].asnumpy()
+prob = np.squeeze(prob)
+idx = np.argmax(prob)
+print('probability=%f, class=%s' % (prob[idx], labels[idx]))
 ```

-Then it will predict your image:
-
+Following is the output; you can see the image has been correctly classified as lotus. (The reported value is the raw score of the final dense layer rather than a normalized probability, which is why it can exceed 1.)
 ```bash
-[22:26:49] inception_inference.cpp:128: Loading the model from flower-recognition-symbol.json
-
-[22:26:49] inception_inference.cpp:137: Loading the model parameters from flower-recognition-0020.params
-
-[22:26:50] inception_inference.cpp:179: Loading the image ./data/test/0/image_06736.jpg
-
-[22:26:50] inception_inference.cpp:230: Running the forward pass on model to predict the image
-[22:26:50] inception_inference.cpp:260: The model predicts the highest accuracy of 7.17001 at index 3
+probability=9.798435, class=lotus
 ```

-
 ## What's next

-You can find more ways to run inference and deploy your models here:
+You can continue to the [next tutorial](/~https://github.com/apache/incubator-mxnet/tree/master/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md) to learn how to load the model we just trained and run inference using the MXNet C++ API.
+ +You can also find more ways to run inference and deploy your models here: 1. [Java Inference examples](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) 2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/) 3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) @@ -477,8 +326,9 @@ You can find more ways to run inference and deploy your models here: ## References 1. [Transfer Learning for Oxford102 Flower Dataset](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102) -2. [Gluon tutorial on fine-tuning](https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html) -3. [Gluon crash course](https://gluon-crash-course.mxnet.io/) -4. [Gluon CPP inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) +2. [Gluon book on fine-tuning](https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html) +3. [Gluon CV transfer learning tutorial](https://gluon-cv.mxnet.io/build/examples_classification/transfer_learning_minc.html) +4. [Gluon crash course](https://gluon-crash-course.mxnet.io/) +5. [Gluon CPP inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) \ No newline at end of file From f17d7da80f97859e65b00df12daf3c800f08e9d4 Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Wed, 16 Jan 2019 10:26:00 -0800 Subject: [PATCH 16/16] add c++ tutorial to test whitelist --- tests/tutorials/test_sanity_tutorials.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py index 644a6118333f..429527db2000 100644 --- a/tests/tutorials/test_sanity_tutorials.py +++ b/tests/tutorials/test_sanity_tutorials.py @@ -28,6 +28,7 @@ 'c++/basics.md', 'c++/index.md', 'c++/subgraphAPI.md', + 'c++/mxnet_cpp_inference_tutorial.md', 'control_flow/index.md', 'embedded/index.md', 'embedded/wine_detector.md',