From 118420b1723f29899c7368981164696459410074 Mon Sep 17 00:00:00 2001
From: Sergey Kolychev
Date: Tue, 15 Jan 2019 15:45:00 -0800
Subject: [PATCH] Two more gluon loss classes. Visualization fixes. Gluon rnn
 rework, including hybridization. Exposed GPU memory info to Perl level.
---
 perl-package/AI-MXNet/Changes                 |   6 +
 perl-package/AI-MXNet/META.json               |   4 +-
 perl-package/AI-MXNet/META.yml                |   4 +-
 perl-package/AI-MXNet/Makefile.PL             |   4 +-
 perl-package/AI-MXNet/README                  |   2 +-
 perl-package/AI-MXNet/lib/AI/MXNet.pm         |   2 +-
 perl-package/AI-MXNet/lib/AI/MXNet/Context.pm |  24 ++
 .../AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm       | 171 +++++++++++
 .../AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm   | 131 +++++++-
 .../AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm  | 285 ++++++++++++------
 .../AI-MXNet/lib/AI/MXNet/Initializer.pm      |  18 ++
 perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm |   6 +-
 perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm  |  18 +-
 .../AI-MXNet/lib/AI/MXNet/Visualization.pm    |   4 +
 perl-package/AI-MXNet/t/test_gluon_rnn.t      |  29 +-
 perl-package/AI-MXNet/t/test_loss.t           |  46 ++-
 perl-package/AI-MXNetCAPI/Changes             |   3 +
 perl-package/AI-MXNetCAPI/META.json           |   2 +-
 perl-package/AI-MXNetCAPI/META.yml            |   2 +-
 perl-package/AI-MXNetCAPI/README              |   2 +-
 perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm |   2 +-
 perl-package/AI-MXNetCAPI/mxnet_typemaps.i    |  13 +-
 22 files changed, 627 insertions(+), 151 deletions(-)

diff --git a/perl-package/AI-MXNet/Changes b/perl-package/AI-MXNet/Changes
index 8bd43f3be205..e67fe39b49ab 100644
--- a/perl-package/AI-MXNet/Changes
+++ b/perl-package/AI-MXNet/Changes
@@ -1,5 +1,11 @@
 Revision history for Perl extension AI::MXNet

+1.4  Mon Feb 18 11:54:07 PST 2019
+    - Two more gluon loss classes.
+    - Visualization fixes.
+    - Gluon rnn rework, including hybridization.
+    - Exposed GPU memory info to Perl level.
+
 1.33 Thu Oct 4 13:25:56 PDT 2018
     - Added randn function.
     - Internal SELU function on C++ layer.
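The headline change here is that the fused Gluon RNN layers now extend HybridBlock instead of Block, so they can be hybridized like any other Gluon block. A minimal sketch of what that enables (shapes are illustrative; these layers default to TNC layout, as exercised by the tests further down):

    use AI::MXNet 'mx';
    use AI::MXNet::Gluon 'gluon';

    my $lstm = gluon->rnn->LSTM(10, 2, bidirectional => 1);
    $lstm->collect_params->initialize;
    $lstm->hybridize;                              # compiles to a cached symbolic graph
    my $out = $lstm->(mx->nd->ones([8, 3, 20]));   # T=8, N=3, C=20; output only when no states are passed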
diff --git a/perl-package/AI-MXNet/META.json b/perl-package/AI-MXNet/META.json
index bbbea734ccf8..37c573c279f5 100644
--- a/perl-package/AI-MXNet/META.json
+++ b/perl-package/AI-MXNet/META.json
@@ -30,7 +30,7 @@
       },
       "runtime" : {
          "requires" : {
-            "AI::MXNetCAPI" : "1.33",
+            "AI::MXNetCAPI" : "1.4",
             "AI::NNVMCAPI" : "1.3",
             "Function::Parameters" : "1.0705",
             "Hash::Ordered" : "0.012",
@@ -45,5 +45,5 @@
       }
    },
    "release_status" : "stable",
-   "version" : "1.33"
+   "version" : "1.4"
 }
diff --git a/perl-package/AI-MXNet/META.yml b/perl-package/AI-MXNet/META.yml
index 26e37b572600..692ca0307948 100644
--- a/perl-package/AI-MXNet/META.yml
+++ b/perl-package/AI-MXNet/META.yml
@@ -34,7 +34,7 @@ no_index:
     - t
     - inc
 requires:
-  AI::MXNetCAPI: '1.33'
+  AI::MXNetCAPI: '1.4'
   AI::NNVMCAPI: '1.3'
   Function::Parameters: '1.0705'
   Hash::Ordered: '0.012'
@@ -42,4 +42,4 @@ requires:
   Mouse: v2.1.0
   PDL: '2.007'
   PDL::CCS: '1.23.4'
-version: '1.33'
+version: '1.4'
diff --git a/perl-package/AI-MXNet/Makefile.PL b/perl-package/AI-MXNet/Makefile.PL
index 6d70b21344c2..19aba3fee4a5 100644
--- a/perl-package/AI-MXNet/Makefile.PL
+++ b/perl-package/AI-MXNet/Makefile.PL
@@ -36,7 +36,7 @@ my %WriteMakefileArgs = (
   "LICENSE" => "apache_2_0",
   "NAME" => "AI::MXNet",
   "PREREQ_PM" => {
-    "AI::MXNetCAPI" => "1.33",
+    "AI::MXNetCAPI" => "1.4",
     "AI::NNVMCAPI" => "1.3",
     "Function::Parameters" => "1.0705",
     "Hash::Ordered" => "0.012",
@@ -46,7 +46,7 @@ my %WriteMakefileArgs = (
     "GraphViz" => "2.14"
   },
   "TEST_REQUIRES" => {},
-  "VERSION" => "1.33",
+  "VERSION" => "1.4",
   "test" => {
     "TESTS" => "t/*.t"
   }
diff --git a/perl-package/AI-MXNet/README b/perl-package/AI-MXNet/README
index f370db3804e9..4935b6384071 100644
--- a/perl-package/AI-MXNet/README
+++ b/perl-package/AI-MXNet/README
@@ -1,5 +1,5 @@
 This archive contains the distribution AI-MXNet,
-version 1.33:
+version 1.4:

     Perl interface to MXNet machine learning library

diff --git a/perl-package/AI-MXNet/lib/AI/MXNet.pm b/perl-package/AI-MXNet/lib/AI/MXNet.pm
index 6a559a394a9f..80699b14311c 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet.pm
@@ -51,7 +51,7 @@ use AI::MXNet::Gluon;
 use AI::MXNet::NDArray::Sparse;
 use AI::MXNet::Symbol::Sparse;
 use AI::MXNet::Engine;
-our $VERSION = '1.33';
+our $VERSION = '1.4';

 sub import
 {
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm
index 826e7baf905b..7ae99be7b99e 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm
@@ -190,6 +190,30 @@ method num_gpus()
     return scalar(check_call(AI::MXNetCAPI::GetGPUCount()));
 }

+=head2 gpu_memory_info
+
+    Query CUDA for the free and total bytes of GPU global memory.
+
+    Parameters
+    ----------
+    $device_id=0 : int, optional
+        The device id of the GPU device.
+
+    Raises
+    ------
+    Will raise an exception on any CUDA error.
+
+    Returns
+    -------
+    ($free, $total) : (int, int)
+        Free and total memory in bytes.
+=cut
+
+method gpu_memory_info($device_id=0)
+{
+    return check_call(AI::MXNetCAPI::GetGPUMemoryInformation64($device_id));
+}
+
 method current_ctx()
 {
     return $AI::MXNet::current_ctx;
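A quick usage sketch for the new call (assumes a CUDA-enabled build with at least one visible GPU; like the other Context helpers it can be invoked as a class method):

    use AI::MXNet 'mx';
    # free and total bytes of global memory on device 0, as reported by CUDA
    my ($free, $total) = AI::MXNet::Context->gpu_memory_info(0);
    printf("GPU 0: %.2f of %.2f GiB free\n", $free / 2**30, $total / 2**30);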
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm
index 7dea68ffa16d..3eb62eb5a2ef 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm
@@ -824,4 +824,175 @@ method hybrid_forward(

 __PACKAGE__->register('AI::MXNet::Gluon::Loss');

+package AI::MXNet::Gluon::PoissonNLLLoss;
+use AI::MXNet::Gluon::Mouse;
+extends 'AI::MXNet::Gluon::Loss';
+has 'from_logits'  => (is => 'ro', isa => 'Bool', default => 1);
+has 'compute_full' => (is => 'ro', isa => 'Bool', default => 0);
+
+=head1 NAME
+
+    AI::MXNet::Gluon::PoissonNLLLoss
+=cut
+
+=head1 DESCRIPTION
+
+    For a target (a random variable) drawn from a Poisson distribution, this function
+    calculates the negative log likelihood loss.
+    PoissonNLLLoss measures the loss accrued from a Poisson regression prediction made by the model.
+
+    .. math::
+        L = \text{pred} - \text{target} * \log(\text{pred}) + \log(\text{target!})
+
+    `pred`, `target` can have arbitrary shape as long as they have the same number of elements.
+
+    Parameters
+    ----------
+    from_logits : boolean, default True
+        Indicates whether the log(predicted) value has already been computed. If True, the loss is
+        computed as :math:`\exp(\text{pred}) - \text{target} * \text{pred}`, and if False, then the
+        loss is computed as :math:`\text{pred} - \text{target} * \log(\text{pred}+\text{epsilon})`.
+        The default is True.
+    weight : float or None
+        Global scalar weight for loss.
+    batch_axis : int, default 0
+        The axis that represents the mini-batch.
+    compute_full : boolean, default False
+        Indicates whether to add an approximation (Stirling factor) for the factorial term in the
+        formula for the loss. The Stirling factor is:
+        :math:`\text{target} * \log(\text{target}) - \text{target} + 0.5 * \log(2 * \pi * \text{target})`
+    epsilon : float, default 1e-08
+        This is to avoid calculating log(0), which is not defined.
+
+
+    Inputs:
+        - **pred**: Predicted value.
+        - **target**: Random variable (count or number) which belongs to a Poisson distribution.
+        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
+          to the same shape as pred. For example, if pred has shape (64, 10)
+          and you want to weigh each sample in the batch separately,
+          sample_weight should have shape (64, 1).
+
+    Outputs:
+        - **loss**: Average of the loss tensor of shape (batch_size,).
+=cut
+
+method hybrid_forward(
+    GluonClass $F, GluonInput $pred, GluonInput $target,
+    Maybe[GluonInput] $sample_weight=, Maybe[Num] $epsilon=1e-08
+)
+{
+    $target = __PACKAGE__->_reshape_like($F, $target, $pred);
+    my $loss;
+    if($self->from_logits)
+    {
+        $loss = $F->exp($pred) - $target * $pred;
+    }
+    else
+    {
+        $loss = $pred - $target * $F->log($pred + $epsilon);
+        if($self->compute_full)
+        {
+            my $stirling_factor = $target * $F->log($target) - $target + 0.5 * $F->log(2 * $target * 3.1415926);
+            $stirling_factor *= ($target > 1);
+            $loss += $stirling_factor;
+        }
+    }
+    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
+    return $F->mean($loss);
+}
+
+__PACKAGE__->register('AI::MXNet::Gluon::Loss');
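A short imperative sketch of the new loss (it mirrors the test added to t/test_loss.t below; the rate and shapes are illustrative):

    use AI::MXNet 'mx';
    use AI::MXNet::Gluon 'gluon';

    my $loss_fn = gluon->loss->PoissonNLLLoss(from_logits => 0);
    my $pred    = mx->nd->ones([10, 1]) * 4;                       # predicted rates
    my $target  = mx->random->poisson(lam => 4, shape => [10, 1]);
    my $loss    = $loss_fn->($pred, $target);                      # mean NLL over all elements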
+
+package AI::MXNet::Gluon::CosineEmbeddingLoss;
+use AI::MXNet::Gluon::Mouse;
+extends 'AI::MXNet::Gluon::Loss';
+has 'margin' => (is => 'rw', isa => 'Num', default => 0);
+
+=head1 NAME
+
+    AI::MXNet::Gluon::CosineEmbeddingLoss
+=cut
+
+=head1 DESCRIPTION
+
+    For a target label 1 or -1 and vectors input1 and input2, the function computes the cosine
+    distance between the vectors, which can be interpreted as a measure of how similar or
+    dissimilar the two inputs are.
+
+    .. math::
+
+        L = \sum_i \begin{cases} 1 - {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = 1\\
+            \max(0, {cos\_sim({input1}_i, {input2}_i)} - margin) & \text{ if } {label}_i = -1 \end{cases}\\
+        cos\_sim(input1, input2) = \frac{{input1}_i.{input2}_i}{||{input1}_i||.||{input2}_i||}
+
+    `input1`, `input2` can have arbitrary shape as long as they have the same number of elements.
+
+    Parameters
+    ----------
+    weight : float or None
+        Global scalar weight for loss.
+    batch_axis : int, default 0
+        The axis that represents the mini-batch.
+    margin : float
+        Margin of separation between the correct and incorrect pair.
+
+
+    Inputs:
+        - **input1**: a tensor with arbitrary shape.
+        - **input2**: another tensor with the same shape as input1, to which input1 is
+          compared for similarity and loss calculation.
+        - **label**: a 1-D tensor indicating, for each input1/input2 pair, whether the
+          target label is 1 or -1.
+        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
+          to the same shape as input1. For example, if input1 has shape (64, 10)
+          and you want to weigh each sample in the batch separately,
+          sample_weight should have shape (64, 1).
+
+    Outputs:
+        - **loss**: The loss tensor with shape (batch_size,).
+=cut
+
+method hybrid_forward(
+    GluonClass $F, GluonInput $input1, GluonInput $input2, GluonInput $label, Maybe[GluonInput] $sample_weight=
+)
+{
+    $input1 = __PACKAGE__->_reshape_like($F, $input1, $input2);
+    $label = $label->reshape([-1, 1]);
+    my $cos_sim = $self->_cosine_similarity($F, $input1, $input2);
+    my $y_1 = $label == 1;
+    my $y_minus_1 = $label == -1;
+    my $cos_sim_a = (1 - $cos_sim) * $y_1;
+
+    my $z_array;
+    if($F eq 'AI::MXNet::NDArray')
+    {
+        $z_array = $F->array([0]);
+    }
+    else
+    {
+        $z_array = $F->zeros([1, 1]);
+    }
+    my $cos_sim_b = $F->broadcast_maximum($z_array, $y_minus_1 * ($cos_sim - $self->margin), { axis=>1 });
+    my $loss = $cos_sim_a + $cos_sim_b;
+    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
+    return $loss;
+}
+
+method _cosine_similarity($F, $x, $y, $axis=-1)
+{
+    my $x_norm = $F->norm($x, axis=>$axis)->reshape([-1, 1]);
+    my $y_norm = $F->norm($y, axis=>$axis)->reshape([-1, 1]);
+    my $x_dot_y = $F->sum($x*$y, axis=>$axis)->reshape([-1, 1]);
+    my $eps_arr;
+    if($F eq 'AI::MXNet::NDArray')
+    {
+        $eps_arr = $F->array([1e-12]);
+    }
+    else
+    {
+        $eps_arr = $F->full([1, 1], 1e-12);
+    }
+    return ($x_dot_y / $F->broadcast_maximum($x_norm * $y_norm, $eps_arr));
+}
+
+__PACKAGE__->register('AI::MXNet::Gluon::Loss');
+
 1;
\ No newline at end of file
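And a matching sketch for the cosine loss (mirrors test_cosine_loss below; the shapes and margin are illustrative):

    use AI::MXNet 'mx';
    use AI::MXNet::Gluon 'gluon';

    my $loss_fn = gluon->loss->CosineEmbeddingLoss(margin => 0.1);
    my $input1  = mx->nd->random->randn(3, 2);
    my $input2  = mx->nd->random->randn(3, 2);
    my $label   = mx->nd->sign(mx->nd->random->randn(3));   # entries in {-1, 1}
    my $loss    = $loss_fn->($input1, $input2, $label);     # one loss value per row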
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
index c14b792e77d7..89493c7b8bfb 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
@@ -57,6 +57,7 @@ method _get_begin_state(GluonClass $F, $begin_state, GluonInput $inputs, $batch_
     return $begin_state;
 }

+
 method _format_sequence($length, $inputs, $layout, $merge, $in_layout=)
 {
     assert(
@@ -118,7 +119,7 @@ method _format_sequence($length, $inputs, $layout, $merge, $in_layout=)
         if($merge)
         {
             $inputs = [map { $F->expand_dims($_, axis => $axis) } @{ $inputs }];
-            $inputs = $F->concat(@{ $inputs }, dim => $axis);
+            $inputs = $F->stack(@{ $inputs }, axis => $axis);
             $in_axis = $axis;
         }
     }
@@ -129,6 +130,54 @@ method _format_sequence($length, $inputs, $layout, $merge, $in_layout=)
     return ($inputs, $axis, $F, $batch_size);
 }

+method _mask_sequence_variable_length($F, $data, $length, $valid_length, $time_axis, $merge)
+{
+    assert(defined $valid_length);
+    if(not blessed $data)
+    {
+        $data = $F->stack(@$data, axis=>$time_axis);
+    }
+    my $outputs = $F->SequenceMask($data, { sequence_length=>$valid_length, use_sequence_length=>1,
+                                            axis=>$time_axis});
+    if(not $merge)
+    {
+        $outputs = $F->split($outputs, { num_outputs=>$length, axis=>$time_axis,
+                                         squeeze_axis=>1});
+        if(not ref $outputs eq 'ARRAY')
+        {
+            $outputs = [$outputs];
+        }
+    }
+    return $outputs;
+}
+
+method _reverse_sequences($sequences, $unroll_step, $valid_length=)
+{
+    my $F;
+    if($sequences->[0]->isa('AI::MXNet::Symbol'))
+    {
+        $F = 'AI::MXNet::Symbol';
+    }
+    else
+    {
+        $F = 'AI::MXNet::NDArray';
+    }
+
+    my $reversed_sequences;
+    if(not defined $valid_length)
+    {
+        $reversed_sequences = [reverse(@$sequences)];
+    }
+    else
+    {
+        $reversed_sequences = $F->SequenceReverse($F->stack(@$sequences, axis=>0),
+                                                  {sequence_length=>$valid_length,
+                                                   use_sequence_length=>1});
+        $reversed_sequences = $F->split($reversed_sequences, {axis=>0, num_outputs=>$unroll_step, squeeze_axis=>1});
+    }
+    return $reversed_sequences;
+}
+
 =head1 NAME

     AI::MXNet::Gluon::RNN::RecurrentCell
@@ -280,21 +329,39 @@ method unroll(
     Maybe[GluonInput] $inputs,
     Maybe[GluonInput] :$begin_state=,
     Str :$layout='NTC',
-    Maybe[Bool] :$merge_outputs=
+    Maybe[Bool] :$merge_outputs=,
+    Maybe[GluonInput] :$valid_length=
 )
 {
     $self->reset();
-    my ($F, $batch_size);
-    ($inputs, undef, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0);
+    my ($F, $batch_size, $axis);
+    ($inputs, $axis, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0);
     $begin_state //= $self->_get_begin_state($F, $begin_state, $inputs, $batch_size);
     my $states = $begin_state;
     my $outputs = [];
+    my $all_states = [];
     for my $i (0..$length-1)
     {
         my $output;
         ($output, $states) = $self->($inputs->[$i], $states);
         push @$outputs, $output;
+        if(defined $valid_length)
+        {
+            push @$all_states, $states;
+        }
+    }
+    if(defined $valid_length)
+    {
+        $states = [];
+        for(zip(@$all_states))
+        {
+            push @$states, $F->SequenceLast($F->stack(@$_, axis=>0),
+                                            sequence_length=>$valid_length,
+                                            use_sequence_length=>1,
+                                            axis=>0);
+        }
+        $outputs = $self->_mask_sequence_variable_length($F, $outputs, $length, $valid_length, $axis, 1);
     }
     ($outputs) = $self->_format_sequence($length, $outputs, $layout, $merge_outputs);
     return ($outputs, $states);
@@ -304,8 +371,17 @@ method _get_activation(GluonClass $F, GluonInput $inputs, Activation $activation
 {
     if(not blessed $activation)
     {
+        my %act = map { $_ => 1 } qw(tanh relu sigmoid softsign);
+        if(exists $act{$activation})
+        {
+            return $F->$activation($inputs, %kwargs);
+        }
         return $F->Activation($inputs, act_type=>$activation, %kwargs);
     }
+    elsif(ref($activation) =~ /LeakyReLU/)
+    {
+        return $F->LeakyReLU($inputs, act_type=>'leaky', slope => $activation->alpha, %kwargs);
+    }
     else
     {
         return $activation->($inputs, %kwargs);
@@ -430,7 +506,7 @@ has [qw/
 method python_constructor_arguments()
 {
     [qw/
-        hidden_size activation
+        hidden_size activation
         i2h_weight_initializer h2h_weight_initializer
         i2h_bias_initializer h2h_bias_initializer
         input_size
@@ -476,16 +552,17 @@ method hybrid_forward(
 {
     my $prefix = "t${\ $self->counter}_";
     my $i2h = $F->FullyConnected(
-        $inputs, $i2h_weight, $i2h_bias,
+        data => $inputs, weight => $i2h_weight, bias => $i2h_bias,
         num_hidden => $self->hidden_size,
         name => "${prefix}i2h"
     );
     my $h2h = $F->FullyConnected(
-        $states->[0], $h2h_weight, $h2h_bias,
+        data => $states->[0], weight => $h2h_weight, bias => $h2h_bias,
         num_hidden => $self->hidden_size,
         name => "${prefix}h2h"
     );
-    my $output = $self->_get_activation($F, $i2h + $h2h, $self->activation, name => "${prefix}out");
+    my $i2h_plus_h2h = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0");
+    my $output = $self->_get_activation($F, $i2h_plus_h2h, $self->activation, name => "${prefix}out");
     return ($output, [$output]);
 }

@@ -555,6 +632,7 @@ method python_constructor_arguments()
     /];
 }

+
 sub BUILD
 {
     my $self = shift;
@@ -606,14 +684,18 @@ method hybrid_forward(
         num_hidden => $self->hidden_size*4,
         name => "${prefix}h2h"
     );
-    my $gates = $i2h + $h2h;
+    my $gates = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0");
     my @slice_gates = @{ $F->SliceChannel($gates, num_outputs => 4, name => "${prefix}slice") };
     my $in_gate = $F->Activation($slice_gates[0], act_type=>"sigmoid", name => "${prefix}i");
     my $forget_gate = $F->Activation($slice_gates[1], act_type=>"sigmoid", name => "${prefix}f");
     my $in_transform = $F->Activation($slice_gates[2], act_type=>"tanh", name => "${prefix}c");
     my $out_gate = $F->Activation($slice_gates[3], act_type=>"sigmoid", name => "${prefix}o");
-    my $next_c = $F->_plus($forget_gate * $states->[1], $in_gate * $in_transform, name => "${prefix}state");
-    my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh"), name => "${prefix}out");
+    my $next_c = $F->_plus(
+        $F->elemwise_mul($forget_gate, $states->[1], name => "${prefix}mul0"),
+        $F->elemwise_mul($in_gate, $in_transform, name => "${prefix}mul1"),
+        name => "${prefix}state"
+    );
+    my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh", name => "${prefix}activation0"), name => "${prefix}out");
     return ($next_h, [$next_h, $next_c]);
 }

@@ -735,10 +817,29 @@ method hybrid_forward(
     my ($i2h_r, $i2h_z, $h2h_r, $h2h_z);
     ($i2h_r, $i2h_z, $i2h) = @{ $F->SliceChannel($i2h, num_outputs => 3, name => "${prefix}i2h_slice") };
     ($h2h_r, $h2h_z, $h2h) = @{ $F->SliceChannel($h2h, num_outputs => 3, name => "${prefix}h2h_slice") };
-    my $reset_gate = $F->Activation($i2h_r + $h2h_r, act_type=>"sigmoid", name => "${prefix}r_act");
-    my $update_gate = $F->Activation($i2h_z + $h2h_z, act_type=>"sigmoid", name => "${prefix}z_act");
-    my $next_h_tmp = $F->Activation($i2h + $reset_gate * $h2h, act_type => "tanh", name => "${prefix}h_act");
-    my $next_h = $F->_plus((1 - $update_gate) * $next_h_tmp, $update_gate * $prev_state_h, name => "${prefix}out");
+    my $reset_gate = $F->Activation($F->elemwise_add($i2h_r, $h2h_r, name => "${prefix}plus0"), act_type=>"sigmoid", name => "${prefix}r_act");
+    my $update_gate = $F->Activation($F->elemwise_add($i2h_z, $h2h_z, name => "${prefix}plus1"), act_type=>"sigmoid", name => "${prefix}z_act");
+    my $next_h_tmp = $F->Activation(
+        $F->elemwise_add(
+            $i2h,
+            $F->elemwise_mul(
+                $reset_gate, $h2h, name => "${prefix}mul0"
+            ),
+            name => "${prefix}plus2"
+        ),
+        act_type => "tanh",
+        name => "${prefix}h_act"
+    );
+    my $ones = $F->ones_like($update_gate, name => "${prefix}ones_like0");
+    my $next_h = $F->_plus(
+        $F->elemwise_mul(
+            $F->elemwise_sub($ones, $update_gate, name => "${prefix}minus0"),
+            $next_h_tmp,
+            name => "${prefix}mul1"
+        ),
+        $F->elemwise_mul($update_gate, $prev_state_h, name => "${prefix}mul2"),
+        name => "${prefix}out"
+    );
     return ($next_h, [$next_h]);
 }
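The new valid_length path in unroll can be exercised like this (a sketch; the cell type, shapes, and lengths are hypothetical):

    use AI::MXNet 'mx';
    use AI::MXNet::Gluon 'gluon';

    my $cell = gluon->rnn->LSTMCell(100, input_size => 50);
    $cell->collect_params->initialize;
    my $inputs = mx->nd->ones([2, 3, 50]);   # batch of 2, 3 time steps (NTC)
    my $valid  = mx->nd->array([2, 3]);      # per-sample valid lengths
    # outputs past each sample's valid length are masked to zero, and the
    # returned states are taken from the last valid step of each sample
    my ($outputs, $states) = $cell->unroll(3, $inputs, layout => 'NTC',
                                           merge_outputs => 1, valid_length => $valid);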
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm
index 2b6e8a5bdae4..08212ab20f6d 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm
@@ -21,7 +21,7 @@ package AI::MXNet::Gluon::RNN::Layer;
 use AI::MXNet::Function::Parameters;
 use AI::MXNet::Gluon::Mouse;
 use AI::MXNet::Base;
-extends 'AI::MXNet::Gluon::Block';
+extends 'AI::MXNet::Gluon::HybridBlock';

 has 'hidden_size' => (is => 'rw', isa => 'Int');
 has 'num_layers' => (is => 'rw', isa => 'Int');
@@ -29,18 +29,19 @@ has 'layout' => (is => 'rw', isa => 'Str');
 has 'dropout' => (is => 'rw', isa => 'Num');
 has 'bidirectional' => (is => 'rw', isa => 'Bool');
 has 'input_size' => (is => 'rw', isa => 'Int', default => 0);
+has 'projection_size' => (is => 'rw', isa => 'Maybe[Int]');
+has [qw/lstm_state_clip_min
+        lstm_state_clip_max/] => (is => 'rw', isa => 'Maybe[Num]');
+has 'lstm_state_clip_nan' => (is => 'rw', isa => 'Bool', default => 0);
 has [qw/
     i2h_weight_initializer h2h_weight_initializer
     i2h_bias_initializer h2h_bias_initializer
+    h2r_weight_initializer
 /] => (is => 'rw', isa => 'Maybe[Initializer]');
 has 'mode' => (is => 'rw', isa => 'Str');
 has [qw/dir gates
-        i2h_weight
-        h2h_weight
-        i2h_bias
-        h2h_bias
         unfused/] => (is => 'rw', init_arg => undef);

 method python_constructor_arguments()
@@ -50,7 +51,8 @@ method python_constructor_arguments()
         dropout bidirectional input_size
         i2h_weight_initializer h2h_weight_initializer
         i2h_bias_initializer h2h_bias_initializer
-        mode
+        mode projection_size h2r_weight_initializer
+        lstm_state_clip_min lstm_state_clip_max lstm_state_clip_nan
     /];
 }
@@ -61,41 +63,76 @@ sub BUILD
         ($self->layout eq 'TNC' or $self->layout eq 'NTC'),
         "Invalid layout [${\ $self->layout }]; must be one of ['TNC' or 'NTC']"
     );
-    $self->i2h_weight([]);
-    $self->h2h_weight([]);
-    $self->i2h_bias([]);
-    $self->h2h_bias([]);
     $self->dir($self->bidirectional ? 2 : 1);
     $self->gates({qw/rnn_relu 1 rnn_tanh 1 lstm 4 gru 3/}->{$self->mode});
     my ($ng, $ni, $nh) = ($self->gates, $self->input_size, $self->hidden_size);
-    for my $i (0..$self->num_layers-1)
+    if(not $self->projection_size)
     {
-        for my $j ($self->dir == 2 ? ('l', 'r') : ('l'))
+        for my $i (0..$self->num_layers-1)
         {
-            push @{ $self->i2h_weight }, $self->params->get(
-                "$j${i}_i2h_weight", shape=>[$ng*$nh, $ni],
-                init => $self->i2h_weight_initializer,
-                allow_deferred_init => 1
-            );
-            push @{ $self->h2h_weight }, $self->params->get(
-                "$j${i}_h2h_weight", shape=>[$ng*$nh, $nh],
-                init => $self->h2h_weight_initializer,
-                allow_deferred_init => 1
-            );
-            push @{ $self->i2h_bias }, $self->params->get(
-                "$j${i}_i2h_bias", shape=>[$ng*$nh],
-                init => $self->i2h_bias_initializer,
-                allow_deferred_init => 1
-            );
-            push @{ $self->h2h_bias }, $self->params->get(
-                "$j${i}_h2h_bias", shape=>[$ng*$nh],
-                init => $self->h2h_bias_initializer,
-                allow_deferred_init => 1
-            );
+            for my $j ($self->dir == 2 ? ('l', 'r') : ('l'))
+            {
+                $self->_register_param(
+                    "$j${i}_i2h_weight", [$ng*$nh, $ni],
+                    $self->i2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_weight", [$ng*$nh, $nh],
+                    $self->h2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_i2h_bias", [$ng*$nh],
+                    $self->i2h_bias_initializer,
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_bias", [$ng*$nh],
+                    $self->h2h_bias_initializer,
+                );
+            }
+            $ni = $nh * $self->dir;
+        }
+    }
+    else
+    {
+        my $np = $self->projection_size;
+        for my $i (0..$self->num_layers-1)
+        {
+            for my $j ($self->dir == 2 ? ('l', 'r') : ('l'))
+            {
+                $self->_register_param(
+                    "$j${i}_i2h_weight", [$ng*$nh, $ni],
+                    $self->i2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_weight", [$ng*$nh, $np],
+                    $self->h2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_i2h_bias", [$ng*$nh],
+                    $self->i2h_bias_initializer,
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_bias", [$ng*$nh],
+                    $self->h2h_bias_initializer,
+                );
+                $self->_register_param(
+                    "$j${i}_h2r_weight", [$np, $nh],
+                    $self->h2r_weight_initializer,
+                );
+            }
+            $ni = $np * $self->dir;
         }
-        $ni = $nh * $self->dir;
     }
-    $self->unfused($self->_unfuse());
+}
+
+method _register_param($name, $shape, $init)
+{
+    my $p = $self->params->get(
+        $name, shape=>$shape, init=>$init,
+        allow_deferred_init=>1
+    );
+    $self->$name($p);
+    return $p;
 }

 use overload '""' => sub {
@@ -119,15 +156,55 @@ use overload '""' => sub {
     return $s;
 };

+method _collect_params_with_prefix(Str $prefix='')
+{
+    $prefix .= '.' if($prefix);
+    my $pattern = qr/(l|r)(\d+)_(i2h|h2h)_(weight|bias)$/;
+    my $convert_key = sub { my ($m, $bidirectional) = @_;
+        my ($d, $l, $g, $t) = @$m;
+        if($bidirectional)
+        {
+            return "_unfused.$l.${d}_cell.${g}_$t";
+        }
+        else
+        {
+            return "_unfused.$l.${g}_$t";
+        }
+    };
+    my $bidirectional = 0;
+    my %params = %{ $self->_reg_params };
+    for my $k (keys %params)
+    {
+        $k =~ $pattern;
+        $bidirectional = 1 if $1 and $1 eq 'r';
+    }
+    my %ret;
+    for my $k (keys %params)
+    {
+        $k =~ $pattern;
+        $ret{ $prefix . $convert_key->([$1, $2, $3, $4], $bidirectional) } = $params{$k};
+    }
+    my $iter = $self->_children->iterator;
+    while(my ($name, $child) = $iter->())
+    {
+        %ret = (%ret, %{ $child->_collect_params_with_prefix("$prefix$name") });
+    }
+    return \%ret;
+}
+
 method state_info($batch_size=0)
 {
     confess('NotImplementedError');
 }

-# Unfuses the fused RNN in to a stack of rnn cells.
 method _unfuse()
 {
+    assert((not $self->projection_size), "_unfuse does not support projection layer yet!");
+    assert(
+        (not $self->lstm_state_clip_min and not $self->lstm_state_clip_max),
+        "_unfuse does not support state clipping yet!"
+    );
     my $get_cell = {
         rnn_relu => sub {
             my %kwargs = @_;
@@ -218,89 +295,105 @@ method begin_state(
 }

 use Data::Dumper;
-method forward(GluonInput $inputs, Maybe[GluonInput] $states=)
+method hybrid_forward(GluonClass $F, GluonInput $inputs, @args)
 {
-    my $batch_size = $inputs->shape->[index($self->layout, 'N')];
-    my $skip_states = not defined $states;
-    if($skip_states)
+    my $states;
+    if(@args)
     {
-        $states = $self->begin_state($batch_size, ctx=>$inputs->context);
+        if(not defined $args[0] or ref $args[0])
+        {
+            $states = shift(@args);
+            undef $states if(ref $states eq 'ARRAY' and not @$states);
+        }
     }
-    if(blessed $states and $states->isa('AI::MXNet::NDArray'))
+    use Data::Dumper;
+
+    my $batch_size;
+    if($F eq 'AI::MXNet::NDArray')
     {
-        $states = [$states];
+        $batch_size = $inputs->shape->[index($self->layout, 'N')];
     }
-    for(zip($states, $self->state_info($batch_size))) {
-        my ($state, $info) = @$_;
-        if(Dumper($state->shape) ne Dumper($info->{shape}))
+    my $skip_states = not defined $states;
+    if($skip_states)
+    {
+        if($F eq 'AI::MXNet::NDArray')
         {
-            my @state_shape = @{ $state->shape };
-            confess("Invalid recurrent state shape. Expecting @{$info->{shape}}, got @state_shape.");
+            $states = $self->begin_state($batch_size, ctx=>$inputs->context, dtype=>$inputs->dtype);
         }
-    }
-    if($self->input_size == 0)
-    {
-        for my $i (0..$self->dir-1)
+        else
         {
-            $self->i2h_weight->[$i]->shape([$self->gates*$self->hidden_size, $inputs->shape->[2]]);
-            $self->i2h_weight->[$i]->_finish_deferred_init();
+            $states = $self->begin_state(0, func=>sub { return AI::MXNet::Symbol->zeros(@_) });
         }
     }
-    my $out;
-    if($inputs->context->device_type eq 'gpu')
+    if(blessed $states and ($states->isa('AI::MXNet::NDArray') or $states->isa('AI::MXNet::Symbol')))
     {
-        $out = $self->_forward_gpu($inputs, $states);
+        $states = [$states];
     }
-    else
+    if($F eq 'AI::MXNet::NDArray')
     {
-        $out = $self->_forward_cpu($inputs, $states);
+        for(zip($states, $self->state_info($batch_size)))
+        {
+            my ($state, $info) = @$_;
+            if(Dumper($state->shape) ne Dumper($info->{shape}))
+            {
+                my @state_shape = @{ $state->shape };
+                confess("Invalid recurrent state shape. Expecting @{$info->{shape}}, got @state_shape.");
+            }
+        }
     }
-
-    # out is (output, state)
+    my $out = $self->_forward_kernel($F, $inputs, $states, @args);
     return $skip_states ? $out->[0] : $out;
 }
-method _forward_cpu($inputs, $states)
+method _forward_kernel($F, $inputs, $states, %kwargs)
 {
-    my $ns = @{ $states };
-    my $axis = index($self->layout, 'T');
-    $states = [map { @{$_} } @{ $states }];
-    my $outputs;
-    ($outputs, $states) = $self->unfused->unroll(
-        $inputs->shape->[$axis], $inputs, begin_state => $states,
-        layout => $self->layout, merge_outputs => 1
-    );
-    my @new_states;
-    for my $i (0..$ns-1)
+    if($self->layout eq 'NTC')
     {
-        my @tmp;
-        for (my $j = $i; $j < @{ $states }; $j += $ns)
+        $inputs = $F->swapaxes($inputs, dim1=>0, dim2=>1);
+    }
+    my @params;
+    if(not defined $self->projection_size)
+    {
+        for my $t ('weight', 'bias')
         {
-            push @tmp, $states->[$j];
+            for my $l (0..$self->num_layers-1)
+            {
+                for my $d ($self->dir == 2 ? ('l', 'r') : ('l'))
+                {
+                    for my $g ('i2h', 'h2h')
+                    {
+                        push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1]);
+                    }
+                }
+            }
         }
-        my $state = AI::MXNet::NDArray->concat((map { $_->reshape([1, @{ $_->shape }]) } @tmp), dim => 0);
-        push @new_states, $state;
     }
-    return [$outputs, \@new_states];
-}
-
-method _forward_gpu($inputs, $states)
-{
-    if($self->layout eq 'NTC')
+    else
     {
-        $inputs = $inputs->swapaxes(dim1 => 0, dim2 => 1);
+        for my $t ('weight', 'bias')
+        {
+            for my $l (0..$self->num_layers-1)
+            {
+                for my $d ($self->dir == 2 ? ('l', 'r') : ('l'))
+                {
+                    for my $g ('i2h', 'h2h', 'h2r')
+                    {
+                        push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1])
+                            unless($g eq 'h2r' and $t eq 'bias');
+                    }
+                }
+            }
+        }
     }
-    my $ctx = $inputs->context;
-    my @params = map { $_->data($ctx)->reshape([-1]) } map { @{ $_ } } (
-        $self->i2h_weight, $self->h2h_weight,
-        $self->i2h_bias, $self->h2h_bias
-    );
-    my $params = AI::MXNet::NDArray->concat(@params, dim => 0);
-    my $rnn = AI::MXNet::NDArray->RNN(
-        $inputs, $params, @{ $states }, state_size => $self->hidden_size,
+    my $params = $F->_rnn_param_concat(@params, dim=>0);
+    my $rnn = $F->RNN(
+        $inputs, $params, @{ $states }, { state_size => $self->hidden_size,
         num_layers => $self->num_layers, bidirectional => $self->dir == 2 ? 1 : 0,
-        p => $self->dropout, state_outputs => 1, mode => $self->mode
-    );
+        p => $self->dropout, state_outputs => 1, mode => $self->mode,
+        (defined $self->lstm_state_clip_min ? (lstm_state_clip_min=>$self->lstm_state_clip_min) : ()),
+        (defined $self->lstm_state_clip_max ? (lstm_state_clip_max=>$self->lstm_state_clip_max) : ()),
+        (defined $self->lstm_state_clip_nan ? (lstm_state_clip_nan=>$self->lstm_state_clip_nan) : ())
+    });
     my $outputs;
     my @rnn = @{$rnn};
     if($self->mode eq 'lstm')
@@ -318,7 +411,6 @@
     return [$outputs, $states];
 }

-
 package AI::MXNet::Gluon::RNN::RNN;

 =head1 NAME
@@ -552,7 +644,10 @@ method state_info(DimSize $batch_size=0)
 {
     return [
         {
-            shape => [$self->num_layers * $self->dir, $batch_size, $self->hidden_size],
+            shape => [
+                $self->num_layers * $self->dir, $batch_size,
+                defined $self->projection_size ? $self->projection_size : $self->hidden_size
+            ],
             __layout__ => 'LNC'
         },
         {
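The fused layer now also accepts LSTM projection (LSTMP) and cell-state clipping options; per the asserts added to _unfuse above, both are only supported on the fused kernel path. A hypothetical configuration:

    use AI::MXNet 'mx';
    use AI::MXNet::Gluon 'gluon';

    # 2-layer LSTM with a 128-dim recurrent projection and clipped cell state
    my $lstm = gluon->rnn->LSTM(512, 2,
        projection_size     => 128,
        lstm_state_clip_min => -5,
        lstm_state_clip_max => 5
    );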
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
index 0359cc3640d4..75c8b1e3dad1 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
@@ -191,6 +191,16 @@ method call(Str|AI::MXNet::InitDesc $desc, AI::MXNet::NDArray $arr)
         $self->$method($desc, $arr);
         $self->_verbose_print($desc, $1, $arr);
     }
+    elsif($desc =~ /min$/)
+    {
+        $self->_init_zero($desc, $arr);
+        $self->_verbose_print($desc, 'min', $arr);
+    }
+    elsif($desc =~ /max$/)
+    {
+        $self->_init_one($desc, $arr);
+        $self->_verbose_print($desc, 'max', $arr);
+    }
     else
     {
         $self->_init_default($desc, $arr)
@@ -250,6 +260,14 @@ method _legacy_init(Str $name, AI::MXNet::NDArray $arr)
     {
         $self->_init_zero($name, $arr);
     }
+    elsif($name =~ /min$/)
+    {
+        $self->_init_zero($name, $arr);
+    }
+    elsif($name =~ /max$/)
+    {
+        $self->_init_one($name, $arr);
+    }
     else
     {
         $self->_init_default($name, $arr);
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
index 3a7b6bab2e2c..72f6cc772178 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
@@ -1226,6 +1226,9 @@ method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$alway
     :$repeat=1 : number, optional
         The repeating time of all elements.
         E.g. repeat=3, the element a will be repeated three times --> a, a, a.
+    :$infer_range=0 : Bool
+        When set to 1, infer the stop position from start, step, repeat, and the
+        output tensor size.
     :$ctx : Context, optional
         The context of the NDArray, defaults to the current default context.
     :$dtype : data type, optional
@@ -1237,7 +1240,7 @@ method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$alway
         The created NDArray
 =cut

-method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1,
+method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1, Bool :$infer_range=0,
               AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, Dtype :$dtype='float32')
 {
     return __PACKAGE__->_arange({
@@ -1246,6 +1249,7 @@ method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$rep
         step => $step, repeat => $repeat,
         dtype => $dtype,
+        infer_range => $infer_range,
         ctx => "$ctx"
     });
 }
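For reference, the repeat semantics documented above in imperative form (infer_range is left at its default here, since it only matters when the stop value is to be derived during symbolic shape inference):

    use AI::MXNet 'mx';
    # 0,0,0,1,1,1,2,2,2 -- each element is repeated three times
    my $x = mx->nd->arange(start => 0, stop => 3, repeat => 3);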
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
index 57bfdf1d977c..04dd1cbfc441 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
@@ -1411,16 +1411,19 @@ method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[St

     Parameters
     ----------
-    start : number
+    :$start=0 : number
         Start of interval. The interval includes this value. The default start value is 0.
-    stop : number, optional
+    :$stop= : number, optional
         End of interval. The interval does not include this value.
-    step : number, optional
+    :$step=1.0 : number, optional
         Spacing between values.
-    repeat : int, optional
+    :$repeat=1 : int, optional
         The repeating time of all elements.
         E.g. repeat=3, the element a will be repeated three times --> a, a, a.
-    dtype : type, optional
+    :$infer_range=0 : Bool
+        When set to 1, infer the stop position from start, step, repeat, and the
+        output tensor size.
+    :$dtype='float32' : type, optional
         The value type of the NDArray, defaults to float32

     Returns
@@ -1429,11 +1432,12 @@ method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[St
         The created Symbol
 =cut

-method arange(Index :$start=0, Index :$stop=, Num :$step=1.0, Index :$repeat=1, Maybe[Str] :$name=, Dtype :$dtype='float32')
+method arange(Index :$start=0, Maybe[Index] :$stop=, Num :$step=1.0, Index :$repeat=1, Bool :$infer_range=0, Maybe[Str] :$name=, Dtype :$dtype='float32')
 {
     return __PACKAGE__->_arange({
         start => $start, (defined $stop ? (stop => $stop) : ()),
-        step => $step, repeat => $repeat, name => $name, dtype => $dtype
+        step => $step, repeat => $repeat, name => $name, dtype => $dtype,
+        infer_range => $infer_range
     });
 }
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
index 20811f10fedf..1574ea58307f 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
@@ -172,6 +172,10 @@ method print_summary(
                 $cur_param = $num_filter * 2;
             }
         }
+        elsif($op eq 'Embedding')
+        {
+            $cur_param = $node->{attrs}{input_dim} * $node->{attrs}{output_dim};
+        }
         my $first_connection;
         if(not $pre_node)
         {
diff --git a/perl-package/AI-MXNet/t/test_gluon_rnn.t b/perl-package/AI-MXNet/t/test_gluon_rnn.t
index 83b294d110ce..51e6ad53e171 100644
--- a/perl-package/AI-MXNet/t/test_gluon_rnn.t
+++ b/perl-package/AI-MXNet/t/test_gluon_rnn.t
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
 use strict;
 use warnings;
 use Test::More tests => 77;
@@ -276,14 +277,15 @@ sub check_rnn_layer_forward
     $inputs->attach_grad;
     my $out;
     mx->autograd->record(sub {
-        $out = $layer->($inputs, $states);
         if(defined $states)
         {
+            $out = $layer->($inputs, $states);
             ok(@$out == 2);
             $out = $out->[0];
         }
         else
         {
+            $out = $layer->($inputs);
             ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
         }
         $out->backward();
@@ -292,21 +294,19 @@ sub check_rnn_layer_forward
     my $pdl_out = $out->aspdl;
     my $pdl_dx = $inputs->grad->aspdl;
     $layer->hybridize;
-
     mx->autograd->record(sub {
-        $out = $layer->($inputs, $states);
         if(defined $states)
         {
-            ok(@$out == 2);
-            $out = $out->[0]
+            ($out, $states) = $layer->($inputs, $states);
+            ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
         }
         else
         {
+            $out = $layer->($inputs, $states);
             ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
         }
         $out->backward();
     });
-
     ok(almost_equal($pdl_out, $out->aspdl, 1e-3));
     ok(almost_equal($pdl_dx, $inputs->grad->aspdl, 1e-3));
 }
@@ -314,21 +314,12 @@ sub test_rnn_layers
 {
     check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20]), mx->nd->ones([2, 3, 10]));
+    check_rnn_layer_forward(gluon->rnn->RNN(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10]));
     check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20]), [mx->nd->ones([2, 3, 10]), mx->nd->ones([2, 3, 10])]);
+    check_rnn_layer_forward(gluon->rnn->LSTM(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), [mx->nd->ones([4, 3, 10]), mx->nd->ones([4, 3, 10])]);
     check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20]), mx->nd->ones([2, 3, 10]));
-
-#     my $net = gluon->nn->Sequential();
-#     $net->add(gluon->rnn->LSTM(10, 2, bidirectional=>1));
-#     $net->add(gluon->nn->BatchNorm(axis=>2));
-#     $net->add(gluon->nn->Flatten());
-#     $net->add(gluon->nn->Dense(3, activation=>'relu'));
-#     $net->collect_params()->initialize();
-#     mx->autograd->record(sub {
-#         $net->(mx->nd->ones([2, 3, 10]))->backward();
-#     });
+    check_rnn_layer_forward(gluon->rnn->GRU(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10]));
 }

 test_rnn_layers();
+
diff --git a/perl-package/AI-MXNet/t/test_loss.t b/perl-package/AI-MXNet/t/test_loss.t
index 7fc7ee81d0de..5a9e413bbfaf 100644
--- a/perl-package/AI-MXNet/t/test_loss.t
+++ b/perl-package/AI-MXNet/t/test_loss.t
@@ -17,7 +17,7 @@

 use strict;
 use warnings;
-use Test::More tests => 30;
+use Test::More tests => 32;
 use AI::MXNet 'mx';
 use AI::MXNet::Gluon 'gluon';
 use AI::MXNet::TestUtils 'almost_equal';
@@ -435,3 +435,47 @@ sub test_triplet_loss

 test_triplet_loss();

+sub test_cosine_loss
+{
+    my $input1 = mx->nd->random->randn(3, 2);
+    my $input2 = mx->nd->random->randn(3, 2);
+    my $label = mx->nd->sign(mx->nd->random->randn($input1->shape->[0]));
+
+    my $Loss = gluon->loss->CosineEmbeddingLoss();
+    my $loss = $Loss->($input1, $input2, $label);
+
+    my $numerator = mx->nd->sum($input1 * $input2, keepdims => 1, axis => 1);
+    my $denominator = mx->nd->sqrt(mx->nd->sum($input1**2, axis=>1, keepdims=>1))
+                      *
+                      mx->nd->sqrt(mx->nd->sum($input2**2, axis=>1, keepdims=>1));
+    my $pdl_loss = mx->nd->where(
+        ($label == 1), 1-$numerator/$denominator,
+        mx->nd->broadcast_maximum(mx->nd->array([0]), $numerator/$denominator, { axis=>1 })
+    );
+    ok(almost_equal($loss->aspdl, $pdl_loss->aspdl));
+}
+
+test_cosine_loss();
+
+sub test_poisson_nllloss
+{
+    my $N = 1000;
+    mx->random->seed(1234);
+    srand(1234);
+    my $data = mx->random->poisson(shape=>[$N, 2]);
+    my $label = mx->random->poisson(lam=>4, shape=>[$N, 1]);
+    my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>20, label_name=>'label', shuffle=>1);
+    my $output = mx->sym->exp(get_net(1));
+    my $l = mx->symbol->Variable('label');
+    my $Loss = gluon->loss->PoissonNLLLoss(from_logits=>0);
+    my $loss = $Loss->($output, $l);
+    $loss = mx->sym->make_loss($loss);
+    my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']);
+    local($AI::MXNet::Logging::silent) = 1;
+    $mod->fit($data_iter, num_epoch=>20, optimizer_params=>{learning_rate => 0.01},
+              initializer=>mx->init->Normal(sigma=>0.1), eval_metric=>mx->metric->Loss(),
+              optimizer=>'adam');
+    ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05);
+}
+
+test_poisson_nllloss();
diff --git a/perl-package/AI-MXNetCAPI/Changes b/perl-package/AI-MXNetCAPI/Changes
index 08ad085abce9..cdbbdab57cdf 100644
--- a/perl-package/AI-MXNetCAPI/Changes
+++ b/perl-package/AI-MXNetCAPI/Changes
@@ -1,5 +1,8 @@
 Revision history for Perl extension AI::MXNetCAPI

+1.4  Mon Feb 18 11:54:07 PST 2019
+    - Support for 64bit integers.
+
 1.33 Thu Oct 4 13:25:56 PDT 2018
     - Gluon: Better sparse support for KVStore.
     - Gpu memory info via mxnet api call.
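The 64bit support is what lets values larger than 2**32 (for example the byte counts behind gpu_memory_info) survive the trip into Perl; precision is that of a double, per the newSVnv conversion in the typemap below. A sketch of the raw CAPI call (the error-code-first return order follows the check_call convention in AI::MXNet::Base, and device 0 is an assumption):

    use AI::MXNetCAPI;
    my ($err, $free, $total) = AI::MXNetCAPI::GetGPUMemoryInformation64(0);
    die AI::MXNetCAPI::MXGetLastError() if $err;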
diff --git a/perl-package/AI-MXNetCAPI/META.json b/perl-package/AI-MXNetCAPI/META.json
index 1849e6b3bc18..82bee1ace8f8 100644
--- a/perl-package/AI-MXNetCAPI/META.json
+++ b/perl-package/AI-MXNetCAPI/META.json
@@ -37,5 +37,5 @@
       }
    },
    "release_status" : "stable",
-   "version" : "1.33"
+   "version" : "1.4"
 }
diff --git a/perl-package/AI-MXNetCAPI/META.yml b/perl-package/AI-MXNetCAPI/META.yml
index d870f05fbe52..bd4af4047378 100644
--- a/perl-package/AI-MXNetCAPI/META.yml
+++ b/perl-package/AI-MXNetCAPI/META.yml
@@ -36,4 +36,4 @@ no_index:
     - inc
 requires:
   Test::More: '0'
-version: '1.33'
+version: '1.4'
diff --git a/perl-package/AI-MXNetCAPI/README b/perl-package/AI-MXNetCAPI/README
index 67b77ccd1614..848b4d03ab21 100644
--- a/perl-package/AI-MXNetCAPI/README
+++ b/perl-package/AI-MXNetCAPI/README
@@ -1,4 +1,4 @@
-AI-MXNetCAPI version 1.33
+AI-MXNetCAPI version 1.4
 =====================

 Swig interface to MXNet c api.
diff --git a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm
index bc7676047d76..e3b71f8efc92 100644
--- a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm
+++ b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm
@@ -18,7 +18,7 @@
 package AI::MXNetCAPI;
 use base qw(DynaLoader);
 bootstrap AI::MXNetCAPI;
-our $VERSION = '1.33';
+our $VERSION = '1.4';
 1;
 __END__
diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
index 68e11ca74e1a..50296c2aaba5 100644
--- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
+++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
@@ -115,7 +115,7 @@
     }
 }

-%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp)
+%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp), (uint64_t *out) (uint64_t temp)
 {
     temp = 0;
     $1 = &temp;
@@ -131,6 +131,17 @@
     }
 }

+%typemap(argout) (uint64_t *out)
+{
+    if(!result)
+    {
+        $result = newSVnv((double)(*$1));
+        sv_2mortal($result);
+        argvi++;
+    }
+}
+
+
 %typemap(in,numinputs=0) (const int **out_stypes) (int* temp)
 {
     temp = NULL;