diff --git a/docs/api/python/symbol/symbol.md b/docs/api/python/symbol/symbol.md index 9cab2c59e862..fea746bb02f4 100644 --- a/docs/api/python/symbol/symbol.md +++ b/docs/api/python/symbol/symbol.md @@ -337,6 +337,7 @@ Composite multiple symbols into a new one by an operator. :nosignatures: Symbol.infer_type + Symbol.infer_type_partial Symbol.infer_shape Symbol.infer_shape_partial ``` diff --git a/docs/install/index.md b/docs/install/index.md index 76f22e744075..2b7e0457e2b4 100644 --- a/docs/install/index.md +++ b/docs/install/index.md @@ -1188,7 +1188,7 @@ MXNet should work on any cloud provider's CPU-only instances. Follow the Python
-MXNet supports the Debian based Raspbian ARM based operating system so you can run MXNet on Raspberry Pi Devices. +MXNet supports the Debian based Raspbian ARM based operating system so you can run MXNet on Raspberry Pi 3B devices. These instructions will walk through how to build MXNet for the Raspberry Pi and install the Python bindings for the library. @@ -1196,6 +1196,9 @@ You can do a dockerized cross compilation build on your local machine or a nativ The complete MXNet library and its requirements can take almost 200MB of RAM, and loading large models with the library can take over 1GB of RAM. Because of this, we recommend running MXNet on the Raspberry Pi 3 or an equivalent device that has more than 1 GB of RAM and a Secure Digital (SD) card that has at least 4 GB of free memory. +## Quick installation +You can use this [pre-built Python wheel](https://mxnet-public.s3.amazonaws.com/install/raspbian/mxnet-1.5.0-py2.py3-none-any.whl) on a Raspberry Pi 3B with Stretch. You will likely need to install several dependencies to get MXNet to work. Refer to the following **Build** section for details. + **Cross compilation build (Experimental)** ## Docker installation @@ -1222,11 +1225,48 @@ ci/build.py -p armv7 ## Install -Create a virtualenv and install the package we created previously. +Your Pi will need several dependencies.
+ +Install MXNet dependencies with the following: +``` +sudo apt-get update +sudo apt-get install -y \ + apt-transport-https \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + git \ + libatlas-base-dev \ + libcurl4-openssl-dev \ + libjemalloc-dev \ + liblapack-dev \ + libopenblas-dev \ + libopencv-dev \ + libzmq3-dev \ + ninja-build \ + python-dev \ + software-properties-common \ + sudo \ + unzip \ + virtualenv \ + wget +``` +Install virtualenv with: +``` +sudo pip install virtualenv +``` +Create a Python 2.7 environment for MXNet with: +``` +virtualenv -p `which python` mxnet_py27 +``` +You may use Python 3, however the [wine bottle detection example](https://mxnet.incubator.apache.org/versions/master/tutorials/embedded/wine_detector.html) for the Pi with camera requires Python 2.7. + +Create a virtualenv and install the wheel we created previously, or the wheel that you downloaded. ``` -virtualenv -p `which python3` mxnet_py3 -source mxnet_py3/bin/activate +virtualenv -p `which python3` mxnet_py27 +source mxnet_py27/bin/activate pip install mxnet-x.x.x-py2.py3-none-any.whl ``` @@ -1257,7 +1297,7 @@ Install these dependencies using the following commands in any directory: ``` sudo apt-get update - sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 c++-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev virtualenv + sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 c++-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev python-dev virtualenv ``` Clone the MXNet source code repository using the following `git` command in your home directory: diff --git a/docs/install/raspbian_setup.md b/docs/install/raspbian_setup.md index a432d4815662..896d4721370b 100644 --- a/docs/install/raspbian_setup.md +++ b/docs/install/raspbian_setup.md @@ -17,9 +17,9 @@ - +

- + This content is moved to a new MXNet install page. Redirecting...

diff --git a/docs/install/scala_setup.md b/docs/install/scala_setup.md index bc069a14e6b3..15a2def1ef38 100644 --- a/docs/install/scala_setup.md +++ b/docs/install/scala_setup.md @@ -20,6 +20,7 @@ The following instructions are provided for macOS and Ubuntu. Windows is not yet available. **Note:** If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Scala on IntelliJ tutorial](../tutorials/scala/mxnet_scala_on_intellij.html) instead of these instructions. +**Note:** Currently, only Scala 2.11 is supported.
@@ -114,6 +115,33 @@ mvn install
+## Interpreter + +To run the Scala interpreter, first download and install Scala 2.11.x (run `scala -version` to make sure you have the right version installed). + +### Installing the Interpreter + +**Ubuntu** + +``` +sudo apt-get install scala +``` + +**macOS** + +``` +brew install scala@2.11 +``` + +Then, add Scala to your path by following the instructions output by Homebrew. + +### Running the Interpreter + +To run the interpreter, download the appropriate MXNet jar from [the Maven repository](https://search.maven.org/search?q=g:org.apache.mxnet) or build from source following the instructions above. + +Then, run `scala -cp {path/to/mxnet-full_2.11-os-version.jar}` to start it. +If you receive a "NumberFormatException" when running the interpreter, run `export TERM=xterm-color` before starting the interpreter. + ## Documentation Scaladocs are generated as part of the docs build pipeline. You can find them published in the [Scala API](http://mxnet.incubator.apache.org/api/scala/index.html) section of the website or by going to the [scaladocs output](https://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.package) directly. diff --git a/docs/tutorials/scala/char_lstm.md b/docs/tutorials/scala/char_lstm.md index 972661bc81ef..aca08dc79920 100644 --- a/docs/tutorials/scala/char_lstm.md +++ b/docs/tutorials/scala/char_lstm.md @@ -71,11 +71,7 @@ In this tutorial, you will accomplish the following: ## Prerequisites -To complete this tutorial, you need: - -- MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html) -- [Scala 2.11.8](https://www.scala-lang.org/download/2.11.8.html) -- [Maven 3](https://maven.apache.org/install.html) +To complete this tutorial, set up and run the Scala interpreter by following the [instructions](https://mxnet.incubator.apache.org/install/scala_setup.html#interpreter).
## Download the Data diff --git a/example/captcha/mxnet_captcha.R b/example/captcha/mxnet_captcha.R index 8988d25104d5..43e819f8c264 100644 --- a/example/captcha/mxnet_captcha.R +++ b/example/captcha/mxnet_captcha.R @@ -39,9 +39,9 @@ label <- mx.symbol.Reshape(data = label, target_shape = c(0)) captcha_net <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = "softmax") mx.metric.acc2 <- mx.metric.custom("accuracy", function(label, pred) { - ypred <- max.col(t(pred)) - 1 + ypred <- max.col(t(data.matrix(pred))) - 1 ypred <- matrix(ypred, nrow = nrow(label), ncol = ncol(label), byrow = TRUE) - return(sum(colSums(label == ypred) == 4) / ncol(label)) + return(sum(colSums(data.matrix(label) == ypred) == 4) / ncol(label)) }) data.shape <- c(80, 30, 3) diff --git a/example/gan/CGAN_mnist_R/README.md b/example/gan/CGAN_mnist_R/README.md new file mode 100644 index 000000000000..bf0bb08b1147 --- /dev/null +++ b/example/gan/CGAN_mnist_R/README.md @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + +# Conditional Generative Adversarial Network with MXNet R package + +This tutorial shows how to build and train a Conditional Generative Adversarial Network (CGAN) on MNIST images. + +## How GAN works +A Generative Adversarial Model simultaneously trains two models: a generator that learns to output fake samples from an unknown distribution and a discriminator that learns to distinguish fake from real samples. + +The CGAN is a conditional variation of the GAN where the generator is instructed to generate a real sample having specific characteristics rather than a generic sample from full distribution. 
Such a condition could be the label associated with an image like in this tutorial or a more detailed tag as shown in the example below: + +![Image credit: Scott Reed, https://github.com/reedscot/icml2016](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/dcgan_network.jpg) + +## Initial setup + +The following packages are needed to run the tutorial: + +``` +require("imager") +require("dplyr") +require("readr") +require("mxnet") +``` + +The full demo is comprised of the two following scripts: + +```CGAN_mnist_setup.R```: prepare data and define the model structure +```CGAN_train.R```: execute the training + +## Data preparation + +The MNIST dataset is available [here](https://www.kaggle.com/c/digit-recognizer/data). Once train.csv is downloaded into the data/ folder, we can import it into R. + +```train <- read_csv('data/train.csv') +train <- data.matrix(train) + +train_data <- train[,-1] +train_data <- t(train_data/255*2-1) +train_label <- as.integer(train[,1]) + +dim(train_data) <- c(28, 28, 1, ncol(train_data)) +``` +Custom iterators are defined in ```iterators.R``` and imported by ```CGAN_mnist_setup.R``` + +## Generator +The generator is a network that creates novel samples (MNIST images) from 2 inputs: + +- Noise vector +- Labels defining the object condition (which digit to produce) + +The noise vector provides the building blocks to the Generator model, which learns how to structure that noise into a sample. The mx.symbol.Deconvolution operator is used to upsample the initial input from a 1x1 shape up to a 28x28 image. + +The information on the label for which to generate a fake sample is provided by a one-hot encoding of the label indices that is appended to the random noise. For MNIST, the 0-9 indices are therefore converted into a binary vector of length 10. More complex applications would require embeddings rather than simple one-hot to encode the condition.
+ +## Discriminator +The discriminator attempts to distinguish between fake samples produced by the generator and real ones sampled from MNIST training data. + +In a conditional GAN, the labels associated with the samples are also provided to the Discriminator. In this demo, this information is again provided as a one-hot encoding of the label that is broadcast to match the image dimensions (10 -> 28x28x10). + +## Training logic +The training process of the discriminator is most obvious: the loss is simply a binary TRUE/FALSE response and that loss is propagated back into the CNN network. It can therefore be understood as a simple binary classification problem. + +```### Train loop on fake +mx.exec.update.arg.arrays(exec_D, arg.arrays = + list(data=D_data_fake, digit=D_digit_fake, label=mx.nd.array(rep(0, batch_size))), + match.name=TRUE) +mx.exec.forward(exec_D, is.train=T) +mx.exec.backward(exec_D) +update_args_D<- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays) +mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null=TRUE) + +### Train loop on real +mx.exec.update.arg.arrays(exec_D, arg.arrays = + list(data=D_data_real, digit=D_digit_real, label=mx.nd.array(rep(1, batch_size))), + match.name=TRUE) +mx.exec.forward(exec_D, is.train=T) +mx.exec.backward(exec_D) +update_args_D<- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays) +mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null=TRUE) +``` + +The generator loss comes from the backpropagation of the discriminator loss into its generated output. By faking the generator labels to be real samples into the discriminator, the discriminator back-propagated loss provides the generator with the information on how to best adapt its parameters to trick the discriminator into believing the fake samples are real.
+ +This requires to backpropagate the gradients up to the input data of the discriminator (whereas this input gradient is typically ignored in vanilla feedforward network). + +```### Update Generator weights - use a seperate executor for writing data gradients +exec_D_back <- mxnet:::mx.symbol.bind(symbol = D_sym, + arg.arrays = exec_D$arg.arrays, + aux.arrays = exec_D$aux.arrays, grad.reqs = rep("write", length(exec_D$arg.arrays)), + ctx = devices) + +mx.exec.update.arg.arrays(exec_D_back, arg.arrays = + list(data=D_data_fake, digit=D_digit_fake, label=mx.nd.array(rep(1, batch_size))), + match.name=TRUE) +mx.exec.forward(exec_D_back, is.train=T) +mx.exec.backward(exec_D_back) +D_grads <- exec_D_back$ref.grad.arrays$data +mx.exec.backward(exec_G, out_grads=D_grads) + +update_args_G <- updater_G(weight = exec_G$ref.arg.arrays, grad = exec_G$ref.grad.arrays) +mx.exec.update.arg.arrays(exec_G, update_args_G, skip.null=TRUE) +``` + +The above training steps are executed in the ```CGAN_train.R``` script. + +## Monitor the training + +During training, the [imager](http://dahtah.github.io/imager/) package facilitates the visual quality assessment of the fake samples. + +```if (iteration==1 | iteration %% 100==0){ + par(mfrow=c(3,3), mar=c(0.1,0.1,0.1,0.1)) + for (i in 1:9) { + img <- as.array(exec_G$ref.outputs$G_sym_output)[,,,i] + plot(as.cimg(img), axes=F) + } +} +``` +Below are samples obtained at different stage of the training. 
+ +Starting from noise: + +![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_1.png) + +Slowly getting it - iteration 200: + +![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_200.png) + +Generate specified digit images on demand - iteration 2400: + +![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_2400.png) + +## Inference + +Once the model is trained, synthetic images of the desired digit can be produced by feeding the generator with fixed labels rather than the randomly generated ones used during the training. + +Here we will generate fake ```9```: + +```digit <- mx.nd.array(rep(9, times=batch_size)) +data <- mx.nd.one.hot(indices = digit, depth = 10) +data <- mx.nd.reshape(data = data, shape = c(1,1,-1, batch_size)) + +exec_G <- mx.simple.bind(symbol = G_sym, data=data_shape_G, ctx = devices, grad.req = "null") +mx.exec.update.arg.arrays(exec_G, G_arg_params, match.name=TRUE) +mx.exec.update.arg.arrays(exec_G, list(data=data), match.name=TRUE) +mx.exec.update.aux.arrays(exec_G, G_aux_params, match.name=TRUE) + +mx.exec.forward(exec_G, is.train=F) +``` +![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_infer_9.png) + +Further details of the CGAN methodology can be found in the paper [Generative Adversarial Text to Image Synthesis](https://arxiv.org/abs/1605.05396). + + diff --git a/example/gluon/actor_critic/README.md b/example/gluon/actor_critic/README.md new file mode 100644 index 000000000000..7f3a6a73e972 --- /dev/null +++ b/example/gluon/actor_critic/README.md @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + +# Actor Critic Model + +This example shows an actor critic model that consists of a critic that measures how good an action taken is and an actor that controls the agent's behavior. 
+In our example actor and critic use the same model: + +``` +class Policy(gluon.Block): + def __init__(self, **kwargs): + super(Policy, self).__init__(**kwargs) + with self.name_scope(): + self.dense = nn.Dense(16, in_units=4, activation='relu') + self.action_pred = nn.Dense(2, in_units=16) + self.value_pred = nn.Dense(1, in_units=16) + + def forward(self, x): + x = self.dense(x) + probs = self.action_pred(x) + values = self.value_pred(x) + return F.softmax(probs), values +``` +The example uses [Gym](https://gym.openai.com/docs/), which is a toolkit for developing and comparing reinforcement learning algorithms. The model is running an instance of [CartPole-v0](https://gym.openai.com/envs/CartPole-v0/) that simulates a pole that is attached by an un-actuated joint to a cart, which moves along a frictionless track. The goal is to prevent it from falling over. + + +The example provides the following commandline options: +``` +MXNet actor-critic example + +optional arguments: + -h, --help show this help message and exit + --gamma G discount factor (default: 0.99) + --seed N random seed (default: 1) + --render render the environment + --log-interval N interval between training status logs (default: 10) + +``` + +To run the model execute, type +``` +python actor_critic.py --render +``` + +You will get an output like the following: +![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gluon/actor_critic/actor_critic.gif) + diff --git a/example/gluon/audio/README.md b/example/gluon/audio/README.md new file mode 100644 index 000000000000..cb2b53eb3b83 --- /dev/null +++ b/example/gluon/audio/README.md @@ -0,0 +1,115 @@ + + + + + + + + + + + + + + + + + +# Urban Sounds Classification in MXNet Gluon + +This example provides an end-to-end pipeline for a common datahack competition - [Urban Sounds Classification Example](https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/). 
+ +After logging in, the data set can be downloaded. +The details of the dataset and the link to download it are given below: + + +## Urban Sounds Dataset: +### Description + The dataset contains 8732 wav files which are audio samples(<= 4s)) of street sounds like engine_idling, car_horn, children_playing, dog_barking and so on. + The task is to classify these audio samples into one of the following 10 labels: + ``` + siren, + street_music, + drilling, + dog_bark, + children_playing, + gun_shot, + engine_idling, + air_conditioner, + jackhammer, + car_horn + ``` + +To be able to run this example: + +1. `pip install -r requirements.txt` + + If you are in the directory where the requirements.txt file lies, + this step installs the required libraries to run the example. + The main dependency that is required is: Librosa. + The version used to test the example is: `0.6.2` + For more details, refer here: +https://librosa.github.io/librosa/install.html + +2. Download the dataset(train.zip, test.zip) required for this example from the location: +https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU + +3. Extract both the zip archives into the **current directory** - after unzipping you would get 2 new folders namely, + **Train** and **Test** and two csv files - **train.csv**, **test.csv** + + Assuming you are in a directory *"UrbanSounds"*, after downloading and extracting train.zip, the folder structure should be: + + ``` + UrbanSounds + - Train + - 0.wav, 1.wav ... + - train.csv + - train.py + - predict.py ... + ``` + +4. Apache MXNet is installed on the machine. For instructions, go to the link: https://mxnet.incubator.apache.org/install/ + + + +For information on the current design of how the AudioFolderDataset is implemented, refer below: +https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio + +### Usage + +For training: + +- Arguments + - train : The folder/directory that contains the audio(wav) files locally. 
Default = "./Train" + - csv: The file name of the csv file that contains audio file name to label mapping. Default = "train.csv" + - epochs : Number of epochs to train the model. Default = 30 + - batch_size : The batch size for training. Default = 32 + + +###### To use the default arguments, use: +``` +python train.py +``` +or + +###### To pass command-line arguments for training data directory, epochs, batch_size, csv file name, use : +``` +python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30 +``` + +For prediction: + +- Arguments + - pred : The folder/directory that contains the audio(wav) files which are to be classified. Default = "./Test" + + +###### To use the default arguments, use: +``` +python predict.py +``` +or + +###### To pass command-line arguments for test data directory, use : +``` +python predict.py --pred ./Test +``` diff --git a/example/gluon/house_prices/README.md b/example/gluon/house_prices/README.md new file mode 100644 index 000000000000..1393a0e3869d --- /dev/null +++ b/example/gluon/house_prices/README.md @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + +# House Prices: Advanced Regression Techniques + +This example shows how to predict house prices and it is based on the [House Price Kaggle challenge](https://www.kaggle.com/c/house-prices-advanced-regression-techniques#description) + +First you need to download train and test data set from here: +``` +https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv +https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv +``` +Afterwards you can execute the script with ```python kaggle_k_fold_cross_validation.py``` + +For a detailed explanation of the code, you can check out this [chapter](http://d2l.ai/chapter_deep-learning-basics/kaggle-house-price.html) of the Dive into Deep Learning book. 
diff --git a/example/gluon/lstm_crf/README.md b/example/gluon/lstm_crf/README.md new file mode 100644 index 000000000000..519c3b89f9fd --- /dev/null +++ b/example/gluon/lstm_crf/README.md @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + +# BiLSTM CRF model +This example demonstrates how a [BiLSTM-CRF model](https://arxiv.org/pdf/1508.01991v1.pdf) can be implemented in Gluon to perform noun-phrase chunking as a sequence labeling task. In this example we define the following training sample: +``` +georgia tech is a university in georgia +B I O O O O B +``` +The second line is the IOB representation of the above sentence that is learnt by the model. **I** stands for inside a chunk, **O** for outside of a chunk and **B** for the beginning of a chunk. + +The model consists of an LSTM layer with 2 hidden units and a CRF layer. The CRF layer has a state transition matrix which allows it to take past and future tags into account when predicting the current tag. The bidirectional LSTM reads the word sequence from beginning to end and vice versa. It produces a vector representation for the words. The following image is taken from https://arxiv.org/pdf/1508.01991v1.pdf and shows the model architecture: + +![Image taken from https://arxiv.org/pdf/1508.01991v1.pdf](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gluon/lstm_crf/bi-lstm_crf.png) + +You can run the example by executing +``` +python lstm_crf.py +``` +The example code does not take any commandline arguments. If you want to change the number of hidden units or the size of the embedding vectors, then you need to change the variables ```EMBEDDING_DIM``` and ```HIDDEN_DIM```.
+ + diff --git a/example/gluon/mnist/README.md b/example/gluon/mnist/README.md new file mode 100644 index 000000000000..c053364fad3c --- /dev/null +++ b/example/gluon/mnist/README.md @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + +# MNIST classification example + +This script shows a simple example how to do image classification with Gluon. +The model is trained on MNIST digits image dataset and the goal is to classify the digits ```0-9```. The model has the following layout: +``` +net = nn.Sequential() +with net.name_scope(): + net.add(nn.Dense(128, activation='relu')) + net.add(nn.Dense(64, activation='relu')) + net.add(nn.Dense(10)) +``` + +The script provides the following commandline arguments: + + +``` +MXNet Gluon MNIST Example + +optional arguments: + -h, --help show this help message and exit + --batch-size BATCH_SIZE + batch size for training and testing (default: 100) + --epochs EPOCHS number of epochs to train (default: 10) + --lr LR learning rate (default: 0.1) + --momentum MOMENTUM SGD momentum (default: 0.9) + --cuda Train on GPU with CUDA + --log-interval N how many batches to wait before logging training + status +``` + +After one epoch we get the following output vector for the given test image: + + + +[-5.461655 -4.745 -1.8203478 -0.5705207 8.923972 -2.2358544 -3.3020825 -2.409004 4.0074944 10.362008] + +As we can see the highest activation is 10.362 which corresponds to label `9`. + diff --git a/example/gluon/super_resolution/README.md b/example/gluon/super_resolution/README.md new file mode 100644 index 000000000000..ddcbe8b0a202 --- /dev/null +++ b/example/gluon/super_resolution/README.md @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + +# Superresolution + +This example trains a convolutional neural network to enhance the resolution of images (also known as superresolution). +The script takes the following commandline arguments: + +``` +Super-resolution using an efficient sub-pixel convolution neural network. 
+ +optional arguments: + -h, --help show this help message and exit + --upscale_factor UPSCALE_FACTOR + super resolution upscale factor. default is 3. + --batch_size BATCH_SIZE + training batch size, per device. default is 4. + --test_batch_size TEST_BATCH_SIZE + test batch size + --epochs EPOCHS number of training epochs + --lr LR learning Rate. default is 0.001. + --use-gpu whether to use GPU. + --seed SEED random seed to use. Default=123 + --resolve_img RESOLVE_IMG + input image to use +``` + +Once the network is trained you can use the following command to increase the resolution of your image: +``` +python super_resolution.py --resolve_img myimage.jpg +``` diff --git a/example/gluon/data.py b/example/gluon/super_resolution/data.py similarity index 100% rename from example/gluon/data.py rename to example/gluon/super_resolution/data.py diff --git a/example/model-parallel/README.md b/example/model-parallel/README.md new file mode 100644 index 000000000000..537562070a62 --- /dev/null +++ b/example/model-parallel/README.md @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + +# Run parts of a model on different devices + +This folder contains the example [matrix_factorization](https://github.com/apache/incubator-mxnet/tree/master/example/model-parallel/matrix_factorization) that demonstrates the basic usage of `group2ctxs`.
diff --git a/example/sparse/README.md b/example/sparse/README.md new file mode 100644 index 000000000000..8f1302950d22 --- /dev/null +++ b/example/sparse/README.md @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + +# Examples using Sparse Symbol API +This folder contains examples that demonstrate the usage of [Sparse Symbol API](https://mxnet.incubator.apache.org/api/python/symbol/sparse.html) +- [Factorization Machine](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/factorization_machine) uses sparse weights +- [Linear Classification Using Sparse Matrix Multiplication](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/linear_classification) shows how to use a sparse data loader, sparse dot operator and sparse gradient updaters +- [Matrix Factorization w/ Sparse Embedding](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/matrix_factorization) uses sparse weights +- [Wide and Deep Learning](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/wide_deep) shows how to run sparse wide and deep classification + diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 13ee903407b3..76a4995d15c0 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1562,6 +1562,38 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym, const int **aux_type_data, int *complete); +/*! + * \brief partially infer type of unknown input types given the known one. + * + * Return partially inferred results if not all types could be inferred. + * The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data + * The call will be treated as a kwargs call if keys != nullptr or num_args==0, otherwise it is positional. + * + * \param sym symbol handle + * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional) + * \param arg_type_data the content of the CSR + * \param in_type_size sizeof the returning array of in_types + * \param in_type_data returning array of pointers to head of the input type. + * \param out_type_size sizeof the returning array of out_types + * \param out_type_data returning array of pointers to head of the output type. + * \param aux_type_size sizeof the returning array of aux_types + * \param aux_type_data returning array of pointers to head of the auxiliary type. + * \param complete whether infer type completes or more information is needed. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXSymbolInferTypePartial(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const int *arg_type_data, + mx_uint *in_type_size, + const int **in_type_data, + mx_uint *out_type_size, + const int **out_type_data, + mx_uint *aux_type_size, + const int **aux_type_data, + int *complete); + /*! * \brief Convert a symbol into a quantized symbol where FP32 operators are replaced with INT8 * \param sym_handle symbol to be converted diff --git a/perl-package/AI-MXNet/Changes b/perl-package/AI-MXNet/Changes index 8bd43f3be205..e67fe39b49ab 100644 --- a/perl-package/AI-MXNet/Changes +++ b/perl-package/AI-MXNet/Changes @@ -1,5 +1,11 @@ Revision history for Perl extension AI::MXNet +1.4 Mon Feb 18 11:54:07 PST 2019 + - Two more gluon loss classes + - Visualization fixes + - Gluon rnn rework, including hybridization + - Exposed GPU memory info to perl level. + 1.33 Thu Oct 4 13:25:56 PDT 2018 - Added randn function. - Internal SELU function on C++ layer.
diff --git a/perl-package/AI-MXNet/META.json b/perl-package/AI-MXNet/META.json index bbbea734ccf8..37c573c279f5 100644 --- a/perl-package/AI-MXNet/META.json +++ b/perl-package/AI-MXNet/META.json @@ -30,7 +30,7 @@ }, "runtime" : { "requires" : { - "AI::MXNetCAPI" : "1.33", + "AI::MXNetCAPI" : "1.4", "AI::NNVMCAPI" : "1.3", "Function::Parameters" : "1.0705", "Hash::Ordered" : "0.012", @@ -45,5 +45,5 @@ } }, "release_status" : "stable", - "version" : "1.33" + "version" : "1.4" } diff --git a/perl-package/AI-MXNet/META.yml b/perl-package/AI-MXNet/META.yml index 26e37b572600..692ca0307948 100644 --- a/perl-package/AI-MXNet/META.yml +++ b/perl-package/AI-MXNet/META.yml @@ -34,7 +34,7 @@ no_index: - t - inc requires: - AI::MXNetCAPI: '1.33' + AI::MXNetCAPI: '1.4' AI::NNVMCAPI: '1.3' Function::Parameters: '1.0705' Hash::Ordered: '0.012' @@ -42,4 +42,4 @@ requires: Mouse: v2.1.0 PDL: '2.007' PDL::CCS: '1.23.4' -version: '1.33' +version: '1.4' diff --git a/perl-package/AI-MXNet/Makefile.PL b/perl-package/AI-MXNet/Makefile.PL index 6d70b21344c2..19aba3fee4a5 100644 --- a/perl-package/AI-MXNet/Makefile.PL +++ b/perl-package/AI-MXNet/Makefile.PL @@ -36,7 +36,7 @@ my %WriteMakefileArgs = ( "LICENSE" => "apache_2_0", "NAME" => "AI::MXNet", "PREREQ_PM" => { - "AI::MXNetCAPI" => "1.33", + "AI::MXNetCAPI" => "1.4", "AI::NNVMCAPI" => "1.3", "Function::Parameters" => "1.0705", "Hash::Ordered" => "0.012", @@ -46,7 +46,7 @@ my %WriteMakefileArgs = ( "GraphViz" => "2.14" }, "TEST_REQUIRES" => {}, - "VERSION" => "1.33", + "VERSION" => "1.4", "test" => { "TESTS" => "t/*.t" } diff --git a/perl-package/AI-MXNet/README b/perl-package/AI-MXNet/README index f370db3804e9..4935b6384071 100644 --- a/perl-package/AI-MXNet/README +++ b/perl-package/AI-MXNet/README @@ -1,5 +1,5 @@ This archive contains the distribution AI-MXNet, -version 1.33: +version 1.4: Perl interface to MXNet machine learning library diff --git a/perl-package/AI-MXNet/lib/AI/MXNet.pm b/perl-package/AI-MXNet/lib/AI/MXNet.pm index 
6a559a394a9f..80699b14311c 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet.pm @@ -51,7 +51,7 @@ use AI::MXNet::Gluon; use AI::MXNet::NDArray::Sparse; use AI::MXNet::Symbol::Sparse; use AI::MXNet::Engine; -our $VERSION = '1.33'; +our $VERSION = '1.4'; sub import { diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm index 826e7baf905b..7ae99be7b99e 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm @@ -190,6 +190,30 @@ method num_gpus() return scalar(check_call(AI::MXNetCAPI::GetGPUCount())); } +=head2 gpu_memory_info + + Query CUDA for the free and total bytes of GPU global memory. + + Parameters + ---------- + $device_id=0 : int, optional + The device id of the GPU device. + + Raises + ------ + Will raise an exception on any CUDA error. + + Returns + ------- + ($free, $total) : (int, int) + Free and total memory in bytes. +=cut + +method gpu_memory_info($device_id=0) +{ + return check_call(AI::MXNetCAPI::GetGPUMemoryInformation64($device_id)); +} + method current_ctx() { return $AI::MXNet::current_ctx; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm index 7dea68ffa16d..3eb62eb5a2ef 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm @@ -824,4 +824,175 @@ method hybrid_forward( __PACKAGE__->register('AI::MXNet::Gluon::Loss'); +package AI::MXNet::Gluon::PoissonNLLLoss; +use AI::MXNet::Gluon::Mouse; +extends 'AI::MXNet::Gluon::Loss'; +has 'from_logits' => (is => 'ro', isa => 'Bool', default => 1); +has 'compute_full' => (is => 'ro', isa => 'Bool', default => 0); + +=head1 NAME + + AI::MXNet::Gluon::PoissonNLLLoss +=cut + +=head1 DESCRIPTION + + For a target (Random Variable) in a Poisson distribution, the function calculates the Negative + Log likelihood loss. 
+ PoissonNLLLoss measures the loss accrued from a Poisson regression prediction made by the model. + + .. math:: + L = \text{pred} - \text{target} * \log(\text{pred}) + \log(\text{target}!) + + `pred`, `target` can have arbitrary shape as long as they have the same number of elements. + + Parameters + ---------- + from_logits : boolean, default True + indicating whether log(predicted) value has already been computed. If True, the loss is computed as + :math:`\exp(\text{pred}) - \text{target} * \text{pred}`, and if False, then loss is computed as + :math:`\text{pred} - \text{target} * \log(\text{pred}+\text{epsilon})`. The default value is True. + weight : float or None + Global scalar weight for loss. + batch_axis : int, default 0 + The axis that represents mini-batch. + compute_full: boolean, default False + Indicates whether to add an approximation (Stirling factor) for the factorial term in the formula for the loss. + The Stirling factor is: + :math:`\text{target} * \log(\text{target}) - \text{target} + 0.5 * \log(2 * \pi * \text{target})` + epsilon: float, default 1e-08 + This is to avoid calculating log(0) which is not defined. + + + Inputs: + - **pred**: Predicted value + - **target**: Random variable (count or number) which belongs to a Poisson distribution. + - **sample_weight**: element-wise weighting tensor. Must be broadcastable + to the same shape as pred. For example, if pred has shape (64, 10) + and you want to weigh each sample in the batch separately, + sample_weight should have shape (64, 1). + + Outputs: + - **loss**: Average loss (shape=(1,1)) of the loss tensor with shape (batch_size,).
+=cut
+
+method hybrid_forward(
+    GluonClass $F, GluonInput $pred, GluonInput $target,
+    Maybe[GluonInput] $sample_weight=, Maybe[Num] $epsilon=1e-08
+)
+{
+    $target = __PACKAGE__->_reshape_like($F, $target, $pred);
+    my $loss;
+    if($self->from_logits)
+    {
+        $loss = $F->exp($pred) - $target * $pred; # $pred is taken to be log(rate) already
+    }
+    else
+    {
+        $loss = $pred - $target * $F->log($pred + $epsilon); # $epsilon keeps the argument of log() away from 0
+    }
+    if($self->compute_full) # the Stirling term depends only on $target, so it is added for either from_logits setting
+    {
+        my $stirling_factor = $target * $F->log($target) - $target + 0.5 * $F->log(2 * $target * 3.1415926);
+        $stirling_factor *= ($target > 1); # keep the approximation only where $target > 1
+        $loss += $stirling_factor;
+    }
+    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight); # weight/sample_weight must apply to both branches, not only from_logits == 0
+    return $F->mean($loss);
+}
+
+__PACKAGE__->register('AI::MXNet::Gluon::Loss');
+
+package AI::MXNet::Gluon::CosineEmbeddingLoss;
+use AI::MXNet::Gluon::Mouse;
+extends 'AI::MXNet::Gluon::Loss';
+has 'margin' => (is => 'rw', isa => 'Num', default => 0);
+
+=head1 NAME
+
+    AI::MXNet::Gluon::CosineEmbeddingLoss
+=cut
+
+=head1 DESCRIPTION
+
+    For a target label 1 or -1, vectors input1 and input2, the function computes the cosine distance
+    between the vectors. This can be interpreted as how similar/dissimilar two input vectors are.
+
+    .. math::
+
+        L = \sum_i \begin{cases} 1 - {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = 1\\
+                         {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = -1 \end{cases}\\
+        cos\_sim(input1, input2) = \frac{{input1}_i.{input2}_i}{||{input1}_i||.||{input2}_i||}
+
+    `input1`, `input2` can have arbitrary shape as long as they have the same number of elements.
+
+    Parameters
+    ----------
+    weight : float or None
+        Global scalar weight for loss.
+    batch_axis : int, default 0
+        The axis that represents mini-batch.
+    margin : float
+        Margin of separation between correct and incorrect pair.
+
+
+    Inputs:
+        - **input1**: a tensor with arbitrary shape
+        - **input2**: another tensor with same shape as input1 to which input1 is
+          compared for similarity and loss calculation
+        - **label**: A 1-D tensor indicating for each pair input1 and input2, target label is 1 or -1
+        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
+          to the same shape as input1. For example, if input1 has shape (64, 10)
+          and you want to weigh each sample in the batch separately,
+          sample_weight should have shape (64, 1).
+
+    Outputs:
+        - **loss**: The loss tensor with shape (batch_size,).
+=cut
+
+method hybrid_forward(
+    GluonClass $F, GluonInput $input1, GluonInput $input2, GluonInput $label, Maybe[GluonInput] $sample_weight=
+)
+{
+    $input1 = __PACKAGE__->_reshape_like($F, $input1, $input2); # presumably gives $input1 the shape of $input2 (helper is defined outside this hunk)
+    $label = $label->reshape([-1, 1]); # same [-1, 1] layout that _cosine_similarity returns
+    my $cos_sim = $self->_cosine_similarity($F, $input1, $input2);
+    my $y_1 = $label == 1; # mask of the pairs labelled 1
+    my $y_minus_1 = $label == -1; # mask of the pairs labelled -1
+    my $cos_sim_a = (1 - $cos_sim) * $y_1; # 1 - cos_sim, kept only where the label is 1
+
+    my $z_array; # zero constant for the maximum below, created differently for NDArray and for other $F
+    if($F eq 'AI::MXNet::NDArray')
+    {
+        $z_array = $F->array([0]);
+    }
+    else
+    {
+        $z_array = $F->zeros([1, 1]);
+    }
+    my $cos_sim_b = $F->broadcast_maximum($z_array, $y_minus_1 * ($cos_sim - $self->margin), { axis=>1 }); # max(0, cos_sim - margin) where the label is -1, 0 elsewhere
+    my $loss = $cos_sim_a + $cos_sim_b;
+    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
+    return $loss;
+}
+
+method _cosine_similarity($F, $x, $y, $axis=-1)
+{
+    my $x_norm = $F->norm($x, axis=>$axis)->reshape([-1, 1]);
+    my $y_norm = $F->norm($y, axis=>$axis)->reshape([-1, 1]);
+    my $x_dot_y = $F->sum($x*$y, axis=>$axis)->reshape([-1, 1]);
+    my $eps_arr; # lower bound (1e-12) for the denominator, created differently for NDArray and for other $F
+    if($F eq 'AI::MXNet::NDArray')
+    {
+        $eps_arr = $F->array([1e-12]);
+    }
+    else
+    {
+        $eps_arr = $F->full([1, 1], 1e-12);
+    }
+    return ($x_dot_y / $F->broadcast_maximum($x_norm * $y_norm, $eps_arr)); # dot product over the product of norms, with the denominator clamped to at least 1e-12
+}
+
+__PACKAGE__->register('AI::MXNet::Gluon::Loss');
+
 1;
\ No newline at end of file
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm index c14b792e77d7..89493c7b8bfb 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm @@ -57,6 +57,7 @@ method _get_begin_state(GluonClass $F, $begin_state, GluonInput $inputs, $batch_ return $begin_state; } + method _format_sequence($length, $inputs, $layout, $merge, $in_layout=) { assert( @@ -118,7 +119,7 @@ method _format_sequence($length, $inputs, $layout, $merge, $in_layout=) if($merge) { $inputs = [map { $F->expand_dims($_, axis => $axis) } @{ $inputs }]; - $inputs = $F->concat(@{ $inputs }, dim => $axis); + $inputs = $F->stack(@{ $inputs }, axis => $axis); $in_axis = $axis; } } @@ -129,6 +130,54 @@ method _format_sequence($length, $inputs, $layout, $merge, $in_layout=) return ($inputs, $axis, $F, $batch_size); } +method _mask_sequence_variable_length($F, $data, $length, $valid_length, $time_axis, $merge) +{ + assert(defined $valid_length); + if(not blessed $data) + { + $data = $F->stack(@$data, axis=>$time_axis); + } + my $outputs = $F->SequenceMask($data, { sequence_length=>$valid_length, use_sequence_length=>1, + axis=>$time_axis}); + if(not $merge) + { + $outputs = $F->split($outputs, { num_outputs=>$length, axis=>$time_axis, + squeeze_axis=>1}); + if(not ref $outputs eq 'ARRAY') + { + $outputs = [$outputs]; + } + } + return $outputs; +} + +method _reverse_sequences($sequences, $unroll_step, $valid_length=) +{ + my $F; + if($sequences->[0]->isa('AI::MXNet::Symbol')) + { + $F = 'AI::MXNet::Symbol'; + } + else + { + $F = 'AI::MXNet::NDArray'; + } + + my $reversed_sequences; + if(not defined $valid_length) + { + $reversed_sequences = [reverse(@$sequences)]; + } + else + { + $reversed_sequences = $F->SequenceReverse($F->stack(@$sequences, axis=>0), + {sequence_length=>$valid_length, + use_sequence_length=>1}); + $reversed_sequences = $F->split($reversed_sequences, {axis=>0, num_outputs=>$unroll_step, squeeze_axis=>1}); + } + return 
$reversed_sequences; +} + =head1 NAME AI::MXNet::Gluon::RNN::RecurrentCell @@ -280,21 +329,39 @@ method unroll( Maybe[GluonInput] $inputs, Maybe[GluonInput] :$begin_state=, Str :$layout='NTC', - Maybe[Bool] :$merge_outputs= + Maybe[Bool] :$merge_outputs=, + Maybe[Bool] :$valid_length= ) { $self->reset(); - my ($F, $batch_size); - ($inputs, undef, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0); + my ($F, $batch_size, $axis); + ($inputs, $axis, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0); $begin_state //= $self->_get_begin_state($F, $begin_state, $inputs, $batch_size); my $states = $begin_state; my $outputs = []; + my $all_states = []; for my $i (0..$length-1) { my $output; ($output, $states) = $self->($inputs->[$i], $states); push @$outputs, $output; + if(defined $valid_length) + { + push @$all_states, $states; + } + } + if(defined $valid_length) + { + $states = []; + for(zip(@$all_states)) + { + push @$states, $F->SequenceLast($F->stack(@$_, axis=>0), + sequence_length=>$valid_length, + use_sequence_length=>1, + axis=>0); + } + $outputs = $self->_mask_sequence_variable_length($F, $outputs, $length, $valid_length, $axis, 1); } ($outputs) = $self->_format_sequence($length, $outputs, $layout, $merge_outputs); return ($outputs, $states); @@ -304,8 +371,17 @@ method _get_activation(GluonClass $F, GluonInput $inputs, Activation $activation { if(not blessed $activation) { + my %act = map { $_ => 1 } qw(tanh relu sigmoid softsign); + if(exists $act{$activation}) + { + return $F->$activation($inputs, %kwargs) + } return $F->Activation($inputs, act_type=>$activation, %kwargs); } + elsif(ref($activation) =~ /LeakyReLU/) + { + return $F->LeakyReLU($inputs, act_type=>'leaky', slope => $activation->alpha, %kwargs); + } else { return $activation->($inputs, %kwargs); @@ -430,7 +506,7 @@ has [qw/ method python_constructor_arguments() { [qw/ - hidden_size activation + hidden_size activation i2h_weight_initializer 
h2h_weight_initializer i2h_bias_initializer h2h_bias_initializer input_size @@ -476,16 +552,17 @@ method hybrid_forward( { my $prefix = "t${\ $self->counter}_"; my $i2h = $F->FullyConnected( - $inputs, $i2h_weight, $i2h_bias, + data => $inputs, weight => $i2h_weight, bias => $i2h_bias, num_hidden => $self->hidden_size, name => "${prefix}i2h" ); my $h2h = $F->FullyConnected( - $states->[0], $h2h_weight, $h2h_bias, + data => $states->[0], weight => $h2h_weight, bias => $h2h_bias, num_hidden => $self->hidden_size, name => "${prefix}h2h" ); - my $output = $self->_get_activation($F, $i2h + $h2h, $self->activation, name => "${prefix}out"); + my $i2h_plus_h2h = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0"); + my $output = $self->_get_activation($F, $i2h_plus_h2h, $self->activation, name => "${prefix}out"); return ($output, [$output]); } @@ -555,6 +632,7 @@ method python_constructor_arguments() /]; } + sub BUILD { my $self = shift; @@ -606,14 +684,18 @@ method hybrid_forward( num_hidden => $self->hidden_size*4, name => "${prefix}h2h" ); - my $gates = $i2h + $h2h; + my $gates = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0"); my @slice_gates = @{ $F->SliceChannel($gates, num_outputs => 4, name => "${prefix}slice") }; my $in_gate = $F->Activation($slice_gates[0], act_type=>"sigmoid", name => "${prefix}i"); my $forget_gate = $F->Activation($slice_gates[1], act_type=>"sigmoid", name => "${prefix}f"); my $in_transform = $F->Activation($slice_gates[2], act_type=>"tanh", name => "${prefix}c"); my $out_gate = $F->Activation($slice_gates[3], act_type=>"sigmoid", name => "${prefix}o"); - my $next_c = $F->_plus($forget_gate * $states->[1], $in_gate * $in_transform, name => "${prefix}state"); - my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh"), name => "${prefix}out"); + my $next_c = $F->_plus( + $F->elemwise_mul($forget_gate, $states->[1], name => "${prefix}mul0"), + $F->elemwise_mul($in_gate, $in_transform, name => "${prefix}mul1"), + name => 
"${prefix}state" + ); + my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh", name => "${prefix}activation0"), name => "${prefix}out"); return ($next_h, [$next_h, $next_c]); } @@ -735,10 +817,29 @@ method hybrid_forward( my ($i2h_r, $i2h_z, $h2h_r, $h2h_z); ($i2h_r, $i2h_z, $i2h) = @{ $F->SliceChannel($i2h, num_outputs => 3, name => "${prefix}i2h_slice") }; ($h2h_r, $h2h_z, $h2h) = @{ $F->SliceChannel($h2h, num_outputs => 3, name => "${prefix}h2h_slice") }; - my $reset_gate = $F->Activation($i2h_r + $h2h_r, act_type=>"sigmoid", name => "${prefix}r_act"); - my $update_gate = $F->Activation($i2h_z + $h2h_z, act_type=>"sigmoid", name => "${prefix}z_act"); - my $next_h_tmp = $F->Activation($i2h + $reset_gate * $h2h, act_type => "tanh", name => "${prefix}h_act"); - my $next_h = $F->_plus((1 - $update_gate) * $next_h_tmp, $update_gate * $prev_state_h, name => "${prefix}out"); + my $reset_gate = $F->Activation($F->elemwise_add($i2h_r, $h2h_r, name => "${prefix}plus0"), act_type=>"sigmoid", name => "${prefix}r_act"); + my $update_gate = $F->Activation($F->elemwise_add($i2h_z, $h2h_z, name => "${prefix}plus1"), act_type=>"sigmoid", name => "${prefix}z_act"); + my $next_h_tmp = $F->Activation( + $F->elemwise_add( + $i2h, + $F->elemwise_mul( + $reset_gate, $h2h, name => "${prefix}mul0" + ), + name => "${prefix}plus2" + ), + act_type => "tanh", + name => "${prefix}h_act" + ); + my $ones = $F->ones_like($update_gate, name => "${prefix}ones_like0"); + my $next_h = $F->_plus( + $F->elemwise_mul( + $F->elemwise_sub($ones, $update_gate, name => "${prefix}minus0"), + $next_h_tmp, + name => "${prefix}mul1" + ), + $F->elemwise_mul($update_gate, $prev_state_h, name => "${prefix}mul2"), + name => "${prefix}out" + ); return ($next_h, [$next_h]); } diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm index 2b6e8a5bdae4..08212ab20f6d 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm 
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm @@ -21,7 +21,7 @@ package AI::MXNet::Gluon::RNN::Layer; use AI::MXNet::Function::Parameters; use AI::MXNet::Gluon::Mouse; use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::Block'; +extends 'AI::MXNet::Gluon::HybridBlock'; has 'hidden_size' => (is => 'rw', isa => 'Int'); has 'num_layers' => (is => 'rw', isa => 'Int'); @@ -29,18 +29,19 @@ has 'layout' => (is => 'rw', isa => 'Str'); has 'dropout' => (is => 'rw', isa => 'Num'); has 'bidirectional' => (is => 'rw', isa => 'Bool'); has 'input_size' => (is => 'rw', isa => 'Int', default => 0); +has 'projection_size' => (is => 'rw', isa => 'Maybe[Int]'); +has [qw/lstm_state_clip_min + lstm_state_clip_max/] => (is => 'rw', isa => 'Maybe[Num]'); +has 'lstm_state_clip_nan' => (is => 'rw', isa => 'Bool', default => 0); has [qw/ i2h_weight_initializer h2h_weight_initializer i2h_bias_initializer h2h_bias_initializer + h2r_weight_initializer /] => (is => 'rw', isa => 'Maybe[Initializer]'); has 'mode' => (is => 'rw', isa => 'Str'); has [qw/dir gates - i2h_weight - h2h_weight - i2h_bias - h2h_bias unfused/] => (is => 'rw', init_arg => undef); method python_constructor_arguments() @@ -50,7 +51,8 @@ method python_constructor_arguments() dropout bidirectional input_size i2h_weight_initializer h2h_weight_initializer i2h_bias_initializer h2h_bias_initializer - mode + mode projection_size h2r_weight_initializer + lstm_state_clip_min lstm_state_clip_max lstm_state_clip_nan /]; } @@ -61,41 +63,76 @@ sub BUILD ($self->layout eq 'TNC' or $self->layout eq 'NTC'), "Invalid layout [${\ $self->layout }]; must be one of ['TNC' or 'NTC']" ); - $self->i2h_weight([]); - $self->h2h_weight([]); - $self->i2h_bias([]); - $self->h2h_bias([]); $self->dir($self->bidirectional ? 
2 : 1); $self->gates({qw/rnn_relu 1 rnn_tanh 1 lstm 4 gru 3/}->{$self->mode}); my ($ng, $ni, $nh) = ($self->gates, $self->input_size, $self->hidden_size); - for my $i (0..$self->num_layers-1) + if(not $self->projection_size) { - for my $j ($self->dir == 2 ? ('l', 'r') : ('l')) + for my $i (0..$self->num_layers-1) { - push @{ $self->i2h_weight }, $self->params->get( - "$j${i}_i2h_weight", shape=>[$ng*$nh, $ni], - init => $self->i2h_weight_initializer, - allow_deferred_init => 1 - ); - push @{ $self->h2h_weight }, $self->params->get( - "$j${i}_h2h_weight", shape=>[$ng*$nh, $nh], - init => $self->h2h_weight_initializer, - allow_deferred_init => 1 - ); - push @{ $self->i2h_bias }, $self->params->get( - "$j${i}_i2h_bias", shape=>[$ng*$nh], - init => $self->i2h_bias_initializer, - allow_deferred_init => 1 - ); - push @{ $self->h2h_bias }, $self->params->get( - "$j${i}_h2h_bias", shape=>[$ng*$nh], - init => $self->h2h_bias_initializer, - allow_deferred_init => 1 - ); + for my $j ($self->dir == 2 ? ('l', 'r') : ('l')) + { + $self->_register_param( + "$j${i}_i2h_weight", [$ng*$nh, $ni], + $self->i2h_weight_initializer + ); + $self->_register_param( + "$j${i}_h2h_weight", [$ng*$nh, $nh], + $self->h2h_weight_initializer + ); + $self->_register_param( + "$j${i}_i2h_bias", [$ng*$nh], + $self->i2h_bias_initializer, + ); + $self->_register_param( + "$j${i}_h2h_bias", [$ng*$nh], + $self->h2h_bias_initializer, + ); + } + $ni = $nh * $self->dir; + } + } + else + { + my $np = $self->projection_size; + for my $i (0..$self->num_layers-1) + { + for my $j ($self->dir == 2 ? 
('l', 'r') : ('l')) + { + $self->_register_param( + "$j${i}_i2h_weight", [$ng*$nh, $ni], + $self->i2h_weight_initializer + ); + $self->_register_param( + "$j${i}_h2h_weight", [$ng*$nh, $np], + $self->h2h_weight_initializer + ); + $self->_register_param( + "$j${i}_i2h_bias", [$ng*$nh], + $self->i2h_bias_initializer, + ); + $self->_register_param( + "$j${i}_h2h_bias", [$ng*$nh], + $self->h2h_bias_initializer, + ); + $self->_register_param( + "$j${i}_h2r_weight", [$np, $nh], + $self->h2r_weight_initializer, + ); + } + $ni = $np * $self->dir; } - $ni = $nh * $self->dir; } - $self->unfused($self->_unfuse()); +} + +method _register_param($name, $shape, $init) +{ + my $p = $self->params->get( + $name, shape=>$shape, init=>$init, + allow_deferred_init=>1 + ); + $self->$name($p); + return $p; } use overload '""' => sub { @@ -119,15 +156,55 @@ use overload '""' => sub { return $s; }; +method _collect_params_with_prefix(Str $prefix='') +{ + $prefix .= '.' if($prefix); + my $pattern = qr/(l|r)(\d+)_(i2h|h2h)_(weight|bias)$/; + my $convert_key = sub { my ($m, $bidirectional) = @_; + my ($d, $l, $g, $t) = @$m; + if($bidirectional) + { + return "_unfused.$l.${d}_cell.${g}_$t"; + } + else + { + return "_unfused.$l.${g}_$t"; + } + }; + my $bidirectional = 0; + my %params = %{ $self->_reg_params }; + for my $k (keys %params) + { + $k =~ $pattern; + $bidirectional = 1 if $1 and $1 eq 'r'; + } + my %ret; + for my $k (keys %params) + { + $k =~ $pattern; + $ret{ $prefix . $convert_key->([$1, $2, $3, $4], $bidirectional) } = $params{$k}; + } + my $iter = $self->_children->iterator; + while(my ($name, $child) = $iter->()) + { + %ret = (%ret, %{ $child->_collect_params_with_prefix("$prefix$name") }); + } + return \%ret; +} + method state_info($batch_size=0) { confess('NotImplementedError'); } -# Unfuses the fused RNN in to a stack of rnn cells. 
method _unfuse() { + assert((not $self->projection_size), "_unfuse does not support projection layer yet!"); + assert( + (not $self->lstm_state_clip_min and not $self->lstm_state_clip_max), + "_unfuse does not support state clipping yet!" + ); my $get_cell = { rnn_relu => sub { my %kwargs = @_; @@ -218,89 +295,105 @@ method begin_state( } use Data::Dumper; -method forward(GluonInput $inputs, Maybe[GluonInput] $states=) +method hybrid_forward(GluonClass $F, GluonInput $inputs, @args) { - my $batch_size = $inputs->shape->[index($self->layout, 'N')]; - my $skip_states = not defined $states; - if($skip_states) + my $states; + if(@args) { - $states = $self->begin_state($batch_size, ctx=>$inputs->context); + if(not defined $args[0] or ref $args[0]) + { + $states = shift(@args); + undef $states if(ref $states eq 'ARRAY' and not @$states); + } } - if(blessed $states and $states->isa('AI::MXNet::NDArray')) + use Data::Dumper; + + my $batch_size; + if($F eq 'AI::MXNet::NDArray') { - $states = [$states]; + $batch_size = $inputs->shape->[index($self->layout, 'N')]; } - for(zip($states, $self->state_info($batch_size))) { - my ($state, $info) = @$_; - if(Dumper($state->shape) ne Dumper($info->{shape})) + my $skip_states = not defined $states; + if($skip_states) + { + if($F eq 'AI::MXNet::NDArray') { - my @state_shape = @{ $state->shape }; - confess("Invalid recurrent state shape. 
Expecting @{$info->{shape}}, got @state_shape."); + $states = $self->begin_state($batch_size, ctx=>$inputs->context, dtype=>$inputs->dtype); } - } - if($self->input_size == 0) - { - for my $i (0..$self->dir-1) + else { - $self->i2h_weight->[$i]->shape([$self->gates*$self->hidden_size, $inputs->shape->[2]]); - $self->i2h_weight->[$i]->_finish_deferred_init(); + $states = $self->begin_state(0, func=>sub { return AI::MXNet::Symbol->zeros(@_) }); } } - my $out; - if($inputs->context->device_type eq 'gpu') + if(blessed $states and ($states->isa('AI::MXNet::NDArray') or $states->isa('AI::MXNet::Symbol'))) { - $out = $self->_forward_gpu($inputs, $states); + $states = [$states]; } - else + if($F eq 'AI::MXNet::NDArray') { - $out = $self->_forward_cpu($inputs, $states); + for(zip($states, $self->state_info($batch_size))) + { + my ($state, $info) = @$_; + if(Dumper($state->shape) ne Dumper($info->{shape})) + { + my @state_shape = @{ $state->shape }; + confess("Invalid recurrent state shape. Expecting @{$info->{shape}}, got @state_shape."); + } + } } - - # out is (output, state) + my $out = $self->_forward_kernel($F, $inputs, $states, @args); return $skip_states ? $out->[0] : $out; } -method _forward_cpu($inputs, $states) +method _forward_kernel($F, $inputs, $states, %kwargs) { - my $ns = @{ $states }; - my $axis = index($self->layout, 'T'); - $states = [map { @{$_} } @{ $states }]; - my $outputs; - ($outputs, $states) = $self->unfused->unroll( - $inputs->shape->[$axis], $inputs, begin_state => $states, - layout => $self->layout, merge_outputs => 1 - ); - my @new_states; - for my $i (0..$ns-1) + if($self->layout eq 'NTC') { - my @tmp; - for (my $j = $i; $j < @{ $states }; $j += $ns) + $inputs = $F->swapaxes($inputs, dim1=>0, dim2=>1); + } + my @params; + if(not defined $self->projection_size) + { + for my $t ('weight', 'bias') { - push @tmp, $states->[$j]; + for my $l (0..$self->num_layers-1) + { + for my $d ($self->dir == 2 ? 
('l', 'r') : ('l')) + { + for my $g ('i2h', 'h2h') + { + push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1]); + } + } + } } - my $state = AI::MXNet::NDArray->concat((map { $_->reshape([1, @{ $_->shape }]) } @tmp), dim => 0); - push @new_states, $state; } - return [$outputs, \@new_states]; -} - -method _forward_gpu($inputs, $states) -{ - if($self->layout eq 'NTC') + else { - $inputs = $inputs->swapaxes(dim1 => 0, dim2 => 1); + for my $t ('weight', 'bias') + { + for my $l (0..$self->num_layers-1) + { + for my $d ($self->dir == 2 ? ('l', 'r') : ('l')) + { + for my $g ('i2h', 'h2h', 'h2r') + { + push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1]) + unless($g eq 'h2r' and $t eq 'bias'); + } + } + } + } } - my $ctx = $inputs->context; - my @params = map { $_->data($ctx)->reshape([-1]) } map { @{ $_ } } ( - $self->i2h_weight, $self->h2h_weight, - $self->i2h_bias, $self->h2h_bias - ); - my $params = AI::MXNet::NDArray->concat(@params, dim => 0); - my $rnn = AI::MXNet::NDArray->RNN( - $inputs, $params, @{ $states }, state_size => $self->hidden_size, + my $params = $F->_rnn_param_concat(@params, dim=>0); + my $rnn = $F->RNN( + $inputs, $params, @{ $states }, { state_size => $self->hidden_size, num_layers => $self->num_layers, bidirectional => $self->dir == 2 ? 1 : 0, - p => $self->dropout, state_outputs => 1, mode => $self->mode - ); + p => $self->dropout, state_outputs => 1, mode => $self->mode, + (defined $self->lstm_state_clip_min ? (lstm_state_clip_min=>$self->lstm_state_clip_min) : ()), + (defined $self->lstm_state_clip_max ? (lstm_state_clip_max=>$self->lstm_state_clip_max) : ()), + (defined $self->lstm_state_clip_nan ? 
(lstm_state_clip_nan=>$self->lstm_state_clip_nan) : ()) + }); my $outputs; my @rnn = @{$rnn}; if($self->mode eq 'lstm') @@ -318,7 +411,6 @@ method _forward_gpu($inputs, $states) return [$outputs, $states]; } - package AI::MXNet::Gluon::RNN::RNN; =head1 NAME @@ -552,7 +644,10 @@ method state_info(DimSize $batch_size=0) { return [ { - shape => [$self->num_layers * $self->dir, $batch_size, $self->hidden_size], + shape => [ + $self->num_layers * $self->dir, $batch_size, + defined $self->projection_size ? $self->projection_size : $self->hidden_size + ], __layout__ => 'LNC' }, { diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm index 0359cc3640d4..75c8b1e3dad1 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm @@ -191,6 +191,16 @@ method call(Str|AI::MXNet::InitDesc $desc, AI::MXNet::NDArray $arr) $self->$method($desc, $arr); $self->_verbose_print($desc, $1, $arr); } + elsif($desc =~ /min$/) + { + $self->_init_zero($desc, $arr); + $self->_verbose_print($desc, 'min', $arr); + } + elsif($desc =~ /max$/) + { + $self->_init_one($desc, $arr); + $self->_verbose_print($desc, 'max', $arr); + } else { $self->_init_default($desc, $arr) @@ -250,6 +260,14 @@ method _legacy_init(Str $name, AI::MXNet::NDArray $arr) { $self->_init_zero($name, $arr); } + elsif($name =~ /min$/) + { + $self->_init_zero($name, $arr); + } + elsif($name =~ /max$/) + { + $self->_init_one($name, $arr); + } else { $self->_init_default($name, $arr); diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm index 3a7b6bab2e2c..72f6cc772178 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm @@ -1226,6 +1226,9 @@ method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$alway :$repeat=1 : number, optional The repeating time of all elements. 
E.g repeat=3, the element a will be repeated three times --> a, a, a. + :$infer_range=0 : Bool + When set to 1, infer stop position from start, step, repeat, and + output tensor size. :$ctx : Context, optional The context of the NDArray, defaultw to current default context. :$dtype : data type, optional @@ -1237,7 +1240,7 @@ method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$alway The created NDArray =cut -method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1, +method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1, Bool :$infer_range=0, AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, Dtype :$dtype='float32') { return __PACKAGE__->_arange({ @@ -1246,6 +1249,7 @@ method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$rep step => $step, repeat => $repeat, dtype => $dtype, + infer_range => $infer_range, ctx => "$ctx" }); } diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm index 57bfdf1d977c..04dd1cbfc441 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm @@ -1411,16 +1411,19 @@ method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[St Parameters ---------- - start : number + :$start=0 : number Start of interval. The interval includes this value. The default start value is 0. - stop : number, optional + :$stop= : number, optional End of interval. The interval does not include this value. - step : number, optional + :$step=1.0 : number, optional Spacing between values - repeat : int, optional + :$repeat=1 : int, optional "The repeating time of all elements. E.g repeat=3, the element a will be repeated three times --> a, a, a. - dtype : type, optional + :$infer_range=0 : Bool + When set to 1, infer stop position from start, step, repeat, and + output tensor size. 
+ :$dtype='float32' : type, optional The value type of the NDArray, default to np.float32 Returns @@ -1429,11 +1432,12 @@ method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[St The created Symbol =cut -method arange(Index :$start=0, Index :$stop=, Num :$step=1.0, Index :$repeat=1, Maybe[Str] :$name=, Dtype :$dtype='float32') +method arange(Index :$start=0, Index :$stop=, Num :$step=1.0, Index :$repeat=1, Bool :$infer_range=0, Maybe[Str] :$name=, Dtype :$dtype='float32') { return __PACKAGE__->_arange({ start => $start, (defined $stop ? (stop => $stop) : ()), - step => $step, repeat => $repeat, name => $name, dtype => $dtype + step => $step, repeat => $repeat, name => $name, dtype => $dtype, + infer_range => $infer_range }); } diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm index 20811f10fedf..1574ea58307f 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm @@ -172,6 +172,10 @@ method print_summary( $cur_param = $num_filter * 2; } } + elsif($op eq 'Embedding') + { + $cur_param = $node->{attrs}{input_dim} * $node->{attrs}{output_dim}; + } my $first_connection; if(not $pre_node) { diff --git a/perl-package/AI-MXNet/t/test_gluon_rnn.t b/perl-package/AI-MXNet/t/test_gluon_rnn.t index 83b294d110ce..51e6ad53e171 100644 --- a/perl-package/AI-MXNet/t/test_gluon_rnn.t +++ b/perl-package/AI-MXNet/t/test_gluon_rnn.t @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+ use strict; use warnings; use Test::More tests => 77; @@ -276,14 +277,15 @@ sub check_rnn_layer_forward $inputs->attach_grad; my $out; mx->autograd->record(sub { - $out = $layer->($inputs, $states); if(defined $states) { + $out = $layer->($inputs, $states); ok(@$out == 2); $out = $out->[0]; } else { + $out = $layer->($inputs); ok(blessed $out and $out->isa('AI::MXNet::NDArray')); } $out->backward(); @@ -292,21 +294,19 @@ sub check_rnn_layer_forward my $pdl_out = $out->aspdl; my $pdl_dx = $inputs->grad->aspdl; $layer->hybridize; - mx->autograd->record(sub { - $out = $layer->($inputs, $states); if(defined $states) { - ok(@$out == 2); - $out = $out->[0] + ($out, $states) = $layer->($inputs, $states); + ok(blessed $out and $out->isa('AI::MXNet::NDArray')); } else { + $out = $layer->($inputs, $states); ok(blessed $out and $out->isa('AI::MXNet::NDArray')); } $out->backward(); }); - ok(almost_equal($pdl_out, $out->aspdl, 1e-3)); ok(almost_equal($pdl_dx, $inputs->grad->aspdl, 1e-3)); } @@ -314,21 +314,12 @@ sub check_rnn_layer_forward sub test_rnn_layers { check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20])); - check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20]), mx->nd->ones([2, 3, 10])); + check_rnn_layer_forward(gluon->rnn->RNN(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10])); check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20])); - check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20]), [mx->nd->ones([2, 3, 10]), mx->nd->ones([2, 3, 10])]); + check_rnn_layer_forward(gluon->rnn->LSTM(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), [mx->nd->ones([4, 3, 10]), mx->nd->ones([4, 3, 10])]); check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20])); - check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20]), mx->nd->ones([2, 3, 10])); - -# my $net = gluon->nn->Sequential(); -# $net->add(gluon->rnn->LSTM(10, 2, bidirectional=>1)); 
-# $net->add(gluon->nn->BatchNorm(axis=>2)); -# $net->add(gluon->nn->Flatten()); -# $net->add(gluon->nn->Dense(3, activation=>'relu')); -# $net->collect_params()->initialize(); -# mx->autograd->record(sub { -# $net->(mx->nd->ones([2, 3, 10]))->backward(); -# }); + check_rnn_layer_forward(gluon->rnn->GRU(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10])); } test_rnn_layers(); + diff --git a/perl-package/AI-MXNet/t/test_loss.t b/perl-package/AI-MXNet/t/test_loss.t index 7fc7ee81d0de..5a9e413bbfaf 100644 --- a/perl-package/AI-MXNet/t/test_loss.t +++ b/perl-package/AI-MXNet/t/test_loss.t @@ -17,7 +17,7 @@ use strict; use warnings; -use Test::More tests => 30; +use Test::More tests => 32; use AI::MXNet 'mx'; use AI::MXNet::Gluon 'gluon'; use AI::MXNet::TestUtils 'almost_equal'; @@ -435,3 +435,47 @@ sub test_triplet_loss test_triplet_loss(); +sub test_cosine_loss +{ + my $input1 = mx->nd->random->randn(3, 2); + my $input2 = mx->nd->random->randn(3, 2); + my $label = mx->nd->sign(mx->nd->random->randn($input1->shape->[0])); + + my $Loss = gluon->loss->CosineEmbeddingLoss(); + my $loss = $Loss->($input1, $input2, $label); + + my $numerator = mx->nd->sum($input1 * $input2, keepdims => 1, axis => 1); + my $denominator = mx->nd->sqrt(mx->nd->sum($input1**2, axis=>1, keepdims=>1)) + * + mx->nd->sqrt(mx->nd->sum($input2**2, axis=>1, keepdims=>1)); + my $pdl_loss = mx->nd->where( + ($label == 1), 1-$numerator/$denominator, + mx->nd->broadcast_maximum(mx->nd->array([0]), $numerator/$denominator, { axis=>1 }) + ); + ok(almost_equal($loss->aspdl, $pdl_loss->aspdl)); +} + +test_cosine_loss(); + +sub test_poisson_nllloss +{ + my $N = 1000; + mx->random->seed(1234); + srand(1234); + my $data = mx->random->poisson(shape=>[$N, 2]); + my $label = mx->random->poisson(lam=>4, shape=>[$N, 1]); + my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>20, label_name=>'label', shuffle=>1); + my $output = mx->sym->exp(get_net(1)); + my $l = 
mx->symbol->Variable('label'); + my $Loss = gluon->loss->PoissonNLLLoss(from_logits=>0); + my $loss = $Loss->($output, $l); + $loss = mx->sym->make_loss($loss); + my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); + local($AI::MXNet::Logging::silent) = 1; + $mod->fit($data_iter, num_epoch=>20, optimizer_params=>{learning_rate => 0.01}, + initializer=>mx->init->Normal(sigma=>0.1), eval_metric=>mx->metric->Loss(), + optimizer=>'adam'); + ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); +} + +test_poisson_nllloss; diff --git a/perl-package/AI-MXNetCAPI/Changes b/perl-package/AI-MXNetCAPI/Changes index 08ad085abce9..cdbbdab57cdf 100644 --- a/perl-package/AI-MXNetCAPI/Changes +++ b/perl-package/AI-MXNetCAPI/Changes @@ -1,5 +1,8 @@ Revision history for Perl extension AI::MXNetCAPI +1.4 Mon Feb 18 11:54:07 PST 2019 + - Support for 64bit integers + 1.33 Thu Oct 4 13:25:56 PDT 2018 - Gluon: Better sparse support for KVStore. - Gpu memory info via mxnet api call. diff --git a/perl-package/AI-MXNetCAPI/META.json b/perl-package/AI-MXNetCAPI/META.json index 1849e6b3bc18..82bee1ace8f8 100644 --- a/perl-package/AI-MXNetCAPI/META.json +++ b/perl-package/AI-MXNetCAPI/META.json @@ -37,5 +37,5 @@ } }, "release_status" : "stable", - "version" : "1.33" + "version" : "1.4" } diff --git a/perl-package/AI-MXNetCAPI/META.yml b/perl-package/AI-MXNetCAPI/META.yml index d870f05fbe52..bd4af4047378 100644 --- a/perl-package/AI-MXNetCAPI/META.yml +++ b/perl-package/AI-MXNetCAPI/META.yml @@ -36,4 +36,4 @@ no_index: - inc requires: Test::More: '0' -version: '1.33' +version: '1.4' diff --git a/perl-package/AI-MXNetCAPI/README b/perl-package/AI-MXNetCAPI/README index 67b77ccd1614..848b4d03ab21 100644 --- a/perl-package/AI-MXNetCAPI/README +++ b/perl-package/AI-MXNetCAPI/README @@ -1,4 +1,4 @@ -AI-MXNetCAPI version 1.33 +AI-MXNetCAPI version 1.4 ===================== Swig interface to MXNet c api. 
diff --git a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm index bc7676047d76..e3b71f8efc92 100644 --- a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm +++ b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm @@ -18,7 +18,7 @@ package AI::MXNetCAPI; use base qw(DynaLoader); bootstrap AI::MXNetCAPI; -our $VERSION = '1.33'; +our $VERSION = '1.4'; 1; __END__ diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i index 68e11ca74e1a..50296c2aaba5 100644 --- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i +++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i @@ -115,7 +115,7 @@ } } -%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp) +%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp), (uint64_t *out) (uint64_t temp) { temp = 0; $1 = &temp; @@ -131,6 +131,17 @@ } } +%typemap(argout) (uint64_t *out) +{ + if(!result) + { + $result = newSVnv((double)(*$1)); + sv_2mortal($result); + argvi++; + } +} + + %typemap(in,numinputs=0) (const int **out_stypes) (int* temp) { temp = NULL; diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py index a7cef7674496..1a8d2cea9cd6 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py @@ -62,48 +62,22 @@ def sample_multinomial(attrs, inputs, proto_obj): new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(attrs.get('dtype', 6))] return 'sample_multinomial', new_attrs, inputs - # Arithmetic Operations def add(attrs, inputs, proto_obj): """Adding two tensors""" - new_attr = {} - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_add', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_add', new_attr, inputs + return 
translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_add') def subtract(attrs, inputs, proto_obj): """Subtracting two tensors""" - new_attr = {} - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_sub', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_sub', new_attr, inputs - + return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_sub') def multiply(attrs, inputs, proto_obj): """Multiply two tensors""" - new_attr = {} - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_mul', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_mul', new_attr, inputs + return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_mul') def divide(attrs, inputs, proto_obj): """Divide two tensors""" - new_attr = {} - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_div', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_div', new_attr, inputs + return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_div') def mean(attrs, inputs, proto_obj): """Mean of all the input tensors.""" diff --git a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py index 6fd52665ca31..0c6730513d4b 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py @@ -221,7 +221,7 @@ def get_input_shape(sym, proto_obj): model_input_shape = [data[1] for data in proto_obj.model_metadata.get('input_tensor_data')] data_names = [data[0] for data in 
proto_obj.model_metadata.get('input_tensor_data')] - #creating dummy inputs + # creating dummy inputs inputs = [] for in_shape in model_input_shape: inputs.append(nd.ones(shape=in_shape)) @@ -245,3 +245,17 @@ def get_input_shape(sym, proto_obj): result = mod.get_outputs()[0].asnumpy() return result.shape + +def broadcast_arithmetic_helper(attrs, inputs, proto_obj, current_op_name): + """Helper function for broadcast arithmetic ops.""" + new_attr = {} + op_names = ['batchnorm', 'convolution', 'deconvolution'] + if 'broadcast' in attrs and attrs['broadcast'] == 1: + broadcast_axis = attrs['axis'] + for op_name in op_names: + # if input is bias which comes after conv, deconv, batchnorm operators + # then only reshape bias term + if inputs[0].name.startswith(op_name): + op_value = _fix_broadcast(current_op_name, inputs, broadcast_axis, proto_obj) + return op_value, new_attr, inputs + return current_op_name, new_attr, inputs diff --git a/python/mxnet/model.py b/python/mxnet/model.py index c08077cc65f4..efb51096c368 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -181,8 +181,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, w, g = p updates[k].append((index*num_device+k, g, w)) for dev_updates in updates: - i, w, g = zip(*dev_updates) - updater(i, w, g) + # update params if param_arrays and grad_arrays are not empty + if dev_updates: + i, w, g = zip(*dev_updates) + updater(i, w, g) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 43de0c9d7535..3e3e79ed59f7 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -882,6 +882,81 @@ def infer_type(self, *args, **kwargs): List of auxiliary state types. The order is same as the order of list_auxiliary_states(). 
""" + try: + res = self._infer_type_impl(False, *args, **kwargs) + if res[1] is None: + arg_shapes, _, _ = self._infer_type_impl(True, *args, **kwargs) + arg_names = self.list_arguments() + unknowns = [] + for name, dtype in zip(arg_names, arg_shapes): + if not dtype: + if len(unknowns) >= 10: + unknowns.append('...') + break + unknowns.append('%s: %s' % (name, str(dtype))) + warnings.warn( + "Cannot decide type for the following arguments. " + + "Consider providing them as input:\n\t" + + "\n\t".join(unknowns), stacklevel=2) + return res + except MXNetError: + print("infer_type error. Arguments:") + for i, arg in enumerate(args): + print(" #%d: %s" % (i, arg)) + for k, v in kwargs.items(): + print(" %s: %s" % (k, v)) + raise + + def infer_type_partial(self, *args, **kwargs): + """Infers the type partially. + + This functions works the same way as `infer_type`, + except that this function can return partial results. + + In the following example, information about fc2 is not available. So, `infer_shape` + will return a tuple of `None` values but `infer_shape_partial` will return partial values. + + Example + ------- + >>> data = mx.sym.Variable('data') + >>> prev = mx.sym.Variable('prev') + >>> casted_prev = mx.sym.cast(prev, dtype='float32') + >>> out = mx.sym.Activation(data=mx.sym.elemwise_add(data, casted_prev), act_type='relu') + >>> out.list_arguments() + ['data', 'prev'] + >>> out.infer_type(data='float32') + (None, None, None) + >>> out.infer_type_partial(data='float32') + ([numpy.float32, None], [numpy.float32], []) + >>> # infers type if you give information about prev + >>> out.infer_type(data='float32', prev='float16') + ([numpy.float32, numpy.float16], [numpy.float32], []) + + Parameters + ---------- + *args : + Type of known arguments in a positional way. + Unknown type can be marked as None. + + **kwargs : + Keyword arguments of known types. + + Returns + ------- + arg_types : list of numpy.dtype or None + List of argument types. 
+ The order is same as the order of list_arguments(). + out_types : list of numpy.dtype or None + List of output types. + The order is same as the order of list_outputs(). + aux_types : list of numpy.dtype or None + List of auxiliary state types. + The order is same as the order of list_auxiliary_states(). + """ + return self._infer_type_impl(True, *args, **kwargs) + + def _infer_type_impl(self, partial, *args, **kwargs): + """The actual implementation for calling type inference API.""" # pylint: disable=too-many-locals if len(args) != 0 and len(kwargs) != 0: raise ValueError('Can only specify known argument \ @@ -912,7 +987,11 @@ def infer_type(self, *args, **kwargs): aux_type_size = mx_uint() aux_type_data = ctypes.POINTER(ctypes.c_int)() complete = ctypes.c_int() - check_call(_LIB.MXSymbolInferType( + if partial: + infer_func = _LIB.MXSymbolInferTypePartial + else: + infer_func = _LIB.MXSymbolInferType + check_call(infer_func( self.handle, mx_uint(len(sdata)), keys, diff --git a/scala-package/README.md b/scala-package/README.md index 8322ab2a237f..c7d0cecf15ac 100644 --- a/scala-package/README.md +++ b/scala-package/README.md @@ -179,6 +179,37 @@ mvn deploy -Pstaging Examples & Usage ------- +Assuming you use `mvn install`, you can find the `mxnet-full_scala_version-INTERNAL.jar` e.g. `mxnet-full_2.11-INTERNAL.jar` under the path `incubator-mxnet/scala-package/assembly/target`. 
+ +Add the following configuration to `pom.xml`: +```HTML +<dependency> + <groupId>org.apache.mxnet</groupId> + <artifactId>mxnet-full_2.11-INTERNAL</artifactId> + <version>1.5.0</version> + <scope>system</scope> + <systemPath>path_to_jar/mxnet-full_2.11-INTERNAL.jar</systemPath> +</dependency> +``` +If you see the following error message: +``` +Error: A JNI error has occurred, please check your installation and try again +Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/mxnet/NDArray + at java.lang.Class.getDeclaredMethods0(Native Method) + at java.lang.Class.privateGetDeclaredMethods(Class.java:2701) + at java.lang.Class.privateGetMethodRecursive(Class.java:3048) + at java.lang.Class.getMethod0(Class.java:3018) + at java.lang.Class.getMethod(Class.java:1784) + at sun.launcher.LauncherHelper.validateMainClass(LauncherHelper.java:544) + at sun.launcher.LauncherHelper.checkAndLoadMain(LauncherHelper.java:526) +Caused by: java.lang.ClassNotFoundException: org.apache.mxnet.NDArray + at java.net.URLClassLoader.findClass(URLClassLoader.java:381) + at java.lang.ClassLoader.loadClass(ClassLoader.java:424) + at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331) + at java.lang.ClassLoader.loadClass(ClassLoader.java:357) +``` +Please make sure your $CLASSPATH is able to find `mxnet-full_scala_version-INTERNAL.jar`. + - To set up the Scala Project using IntelliJ IDE on macOS follow the instructions [here](https://mxnet.incubator.apache.org/tutorials/scala/mxnet_scala_on_intellij.html). 
- Several examples on using the Scala APIs are provided in the [Scala Examples Folder](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 32b63c11dd9a..9f0d2834fcce 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -638,6 +638,27 @@ int MXSymbolInferType(SymbolHandle sym, API_END(); } +int MXSymbolInferTypePartial(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const int *arg_type_data, + mx_uint *in_type_size, + const int **in_type_data, + mx_uint *out_type_size, + const int **out_type_data, + mx_uint *aux_type_size, + const int **aux_type_data, + int *complete) { + int succ; + *complete = 1; + return MXSymbolInferType(sym, num_args, keys, + arg_type_data, + in_type_size, in_type_data, + out_type_size, out_type_data, + aux_type_size, aux_type_data, + &succ); +} + int MXSymbolGrad(SymbolHandle sym, mx_uint num_wrt, const char** wrt, SymbolHandle* out) { API_BEGIN(); LOG(FATAL) << "not implemented"; diff --git a/src/operator/contrib/adamw-inl.h b/src/operator/contrib/adamw-inl.h index 3d76b33ae765..66bd4f3f3ba4 100644 --- a/src/operator/contrib/adamw-inl.h +++ b/src/operator/contrib/adamw-inl.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "../operator_common.h" #include "../mshadow_op.h" #include "../elemwise_op_common.h" @@ -48,7 +49,6 @@ struct AdamWParam : public dmlc::Parameter { float epsilon; float wd; float eta; - float rescale_grad; float clip_gradient; DMLC_DECLARE_PARAMETER(AdamWParam) { DMLC_DECLARE_FIELD(lr) @@ -69,9 +69,6 @@ struct AdamWParam : public dmlc::Parameter { "The penalty scales with the square of the magnitude of each weight."); DMLC_DECLARE_FIELD(eta) .describe("Learning rate schedule multiplier"); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); DMLC_DECLARE_FIELD(clip_gradient) .set_default(-1.0f) 
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " @@ -80,44 +77,138 @@ struct AdamWParam : public dmlc::Parameter { } }; +// rescale_grad is a reserved argument at position -1. Example: +// n_in = 2: weight, grad (fp16) +// n_out = 1: weight (fp16) +// total_in = 6: weight, grad, mean, var, weight32, rescale_grad (fp32) +template +inline bool MPUpdateInferShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), static_cast(total_in)) << " in operator " << attrs.name; + CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << attrs.name; + // rescale_grad.shape = (1,) + SHAPE_ASSIGN_CHECK(*in_attrs, total_in - 1, mshadow::Shape1(1)); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, TShape()); +} + +// rescale_grad is a reserved argument at position -1. Example: +// n_in = 2: weight, grad (fp16) +// n_out = 1: weight (fp16) +// total_in = 6: weight, grad, mean, var, weight32, rescale_grad (fp32) +template +inline bool MPUpdateInferType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), static_cast(total_in)) << " in operator " << attrs.name; + CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << attrs.name; + for (int i = n_in; i < total_in; ++i) { + TYPE_ASSIGN_CHECK(*in_attrs, i, mshadow::kFloat32); + } + return ElemwiseAttr( + attrs, in_attrs, out_attrs, -1); +} + +template +struct MPAdamWKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, float* mean_data, + float* var_data, const DType* weight_data, const DType* grad_data, float* weight32, + const float param_clip_gradient, const float param_beta1, const float param_beta2, + const float param_eta, const float param_lr, const float param_wd, + const float param_rescale_grad, const float param_epsilon) { + float w = weight32[i]; + float mean = mean_data[i]; + float var = var_data[i]; + float scaled_grad = 
param_rescale_grad*static_cast(grad_data[i]); + if (param_clip_gradient >= 0.0f) { + mean = param_beta1 * mean + + (1 - param_beta1) * mshadow_op::clip::Map(scaled_grad, param_clip_gradient); + var = param_beta2 * var + (1 - param_beta2) * + mshadow_op::square::Map(mshadow_op::clip::Map(scaled_grad, param_clip_gradient)); + } else { + mean = param_beta1 * mean + (1 - param_beta1) * scaled_grad; + var = param_beta2 * var + (1 - param_beta2) * mshadow_op::square::Map(scaled_grad); + } + mean_data[i] = mean; + var_data[i] = var; + w = w - param_eta * (param_lr * mean / (mshadow_op::square_root::Map(var) + param_epsilon) + + param_wd * w); + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, w); + } +}; + + +template +struct MPAdamWUpdate { + static inline void Forward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs, + const float rescale_grad) { + using namespace mxnet_op; + AdamWParam param = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + Tensor weight = inputs[0].FlatTo2D(s); + Tensor grad = inputs[1].FlatTo2D(s); + Tensor mean = inputs[2].FlatTo2D(s); + Tensor var = inputs[3].FlatTo2D(s); + Tensor weight32 = inputs[4].FlatTo2D(s); + Tensor out = outputs[0].FlatTo2D(s); + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch(s, weight.shape_.Size(), out.dptr_, mean.dptr_, + var.dptr_, weight.dptr_, grad.dptr_, weight32.dptr_, param.clip_gradient, param.beta1, + param.beta2, param.eta, param.lr, param.wd, rescale_grad, param.epsilon); + }); + }); + } +}; + /* * \brief adam_w update. 
*/ template -inline void AdamWUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mshadow; - using namespace mshadow::expr; - using namespace mshadow_op; - const AdamWParam& param = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Tensor weight = inputs[0].FlatTo2D(s); - Tensor grad = inputs[1].FlatTo2D(s); - Tensor mean = inputs[2].FlatTo2D(s); - Tensor var = inputs[3].FlatTo2D(s); - Tensor out = outputs[0].FlatTo2D(s); +struct AdamWUpdate { + static inline void Forward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs, + const float rescale_grad) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + const AdamWParam& param = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + Tensor weight = inputs[0].FlatTo2D(s); + Tensor grad = inputs[1].FlatTo2D(s); + Tensor mean = inputs[2].FlatTo2D(s); + Tensor var = inputs[3].FlatTo2D(s); + Tensor out = outputs[0].FlatTo2D(s); - grad = scalar(param.rescale_grad) * grad; - if (param.clip_gradient >= 0.0f) { - mean = scalar(param.beta1)*mean + scalar(1.f-param.beta1) * - F(grad, DType(param.clip_gradient)); - var = scalar(param.beta2)*var + scalar(1.f-param.beta2)*F( - F(grad, DType(param.clip_gradient))); - } else { - mean = scalar(param.beta1)*mean + scalar(1.f-param.beta1) * grad; - var = scalar(param.beta2)*var + scalar(1.f-param.beta2) * F(grad); - } - Assign(out, req[0], - weight - - scalar(param.eta) * (scalar(param.lr) * - mean / (F(var) + scalar(param.epsilon)) + - (scalar(param.wd) * weight))); - }); -} + grad = scalar(rescale_grad) * grad; + if (param.clip_gradient >= 0.0f) { + mean = scalar(param.beta1)*mean + scalar(1.f-param.beta1) * + F(grad, 
DType(param.clip_gradient)); + var = scalar(param.beta2)*var + scalar(1.f-param.beta2)*F( + F(grad, DType(param.clip_gradient))); + } else { + mean = scalar(param.beta1)*mean + scalar(1.f-param.beta1) * grad; + var = scalar(param.beta2)*var + scalar(1.f-param.beta2) * F(grad); + } + Assign(out, req[0], + weight - + scalar(param.eta) * (scalar(param.lr) * + mean / (F(var) + scalar(param.epsilon)) + + (scalar(param.wd) * weight))); + }); + } +}; } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/adamw.cc b/src/operator/contrib/adamw.cc index 94623fe08a9e..2fbc39743c93 100644 --- a/src/operator/contrib/adamw.cc +++ b/src/operator/contrib/adamw.cc @@ -24,12 +24,76 @@ * \author Haibin Lin */ #include "./adamw-inl.h" +#include "../optimizer_op-inl.h" namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(AdamWParam); +template