From 96c6033f957806131b98391f64fc1f488b1fd2b7 Mon Sep 17 00:00:00 2001
From: Kyle Gao
Date: Mon, 4 Dec 2017 10:11:54 -0500
Subject: [PATCH] 0.1.5 + minor fixes (#106)

* Modified parameter order of DecoderRNN.forward (#85)

* Updated TopKDecoder (#86)

* Fixed topk decoder.

* Use torchtext from PyPI (#87)

* Use torchtext from PyPI.

* Fixed torchtext sorting order.

* attention is not required when only using teacher forcing in decoder (#90)

* attention is not required when only using teacher forcing in decoder

* Updated docs and version.

* Fixed code style.

* bugfix (#92)

Fixed field arguments validation.

* Removed `initial_lr` when resuming optimizer with scheduler. (#95)

* shuffle the training data (#97)

* 0.1.5 (#91)

* Modified parameter order of DecoderRNN.forward (#85)

* Updated TopKDecoder (#86)

* Fixed topk decoder.

* Use torchtext from PyPI (#87)

* Use torchtext from PyPI.

* Fixed torchtext sorting order.

* attention is not required when only using teacher forcing in decoder (#90)

* attention is not required when only using teacher forcing in decoder

* Updated docs and version.

* Fixed code style.

* shuffle the training data

* fix example of inflate function in TopKDecoder.py (#98)

* fix example of inflate function in TopKDecoder.py

* Fix hidden_layer size for one-directional decoder (#99)

* Fix hidden_layer size for one-directional decoder

The decoder's hidden size was given as `hidden_size * 2 if bidirectional else 1`, resulting in a dimensionality error for non-bidirectional decoders. Changed `1` to `hidden_size`.

* Adapt load to allow CPU loading of GPU models (#100)

* Adapt load to allow CPU loading of GPU models

Pass a map_location to torch.load so that models trained on the GPU can be loaded on a CPU-only machine, depending on the availability of CUDA.

* Fix wrong parameter use on DecoderRNN (#103)

* Fix wrong parameter use on DecoderRNN
---
 examples/sample.py                    |  2 +-
 seq2seq/dataset/fields.py             |  2 +-
 seq2seq/models/DecoderRNN.py          |  2 +-
 seq2seq/models/TopKDecoder.py         | 13 ++++++-------
 seq2seq/models/attention.py           |  2 +-
 seq2seq/trainer/supervised_trainer.py |  4 +++-
 seq2seq/util/checkpoint.py            | 10 +++++++---
 7 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/examples/sample.py b/examples/sample.py
index 889bfcb..8db6847 100644
--- a/examples/sample.py
+++ b/examples/sample.py
@@ -100,7 +100,7 @@ def len_filter(example):
     bidirectional = True
     encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                          bidirectional=bidirectional, variable_lengths=True)
-    decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size * 2 if bidirectional else 1,
+    decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size * 2 if bidirectional else hidden_size,
                          dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
                          eos_id=tgt.eos_id, sos_id=tgt.sos_id)
     seq2seq = Seq2seq(encoder, decoder)
diff --git a/seq2seq/dataset/fields.py b/seq2seq/dataset/fields.py
index 8ee01be..a844000 100644
--- a/seq2seq/dataset/fields.py
+++ b/seq2seq/dataset/fields.py
@@ -11,7 +11,7 @@ def __init__(self, **kwargs):
         if kwargs.get('batch_first') is False:
             logger.warning("Option batch_first has to be set to use pytorch-seq2seq. Changed to True.")
         kwargs['batch_first'] = True
-        if kwargs.get('batch_first') is False:
+        if kwargs.get('include_lengths') is False:
Changed to True.") kwargs['include_lengths'] = True diff --git a/seq2seq/models/DecoderRNN.py b/seq2seq/models/DecoderRNN.py index b46e198..7915f1e 100644 --- a/seq2seq/models/DecoderRNN.py +++ b/seq2seq/models/DecoderRNN.py @@ -131,7 +131,7 @@ def decode(step, step_output, step_attn): eos_batches = symbols.data.eq(self.eos_id) if eos_batches.dim() > 0: eos_batches = eos_batches.cpu().view(-1).numpy() - update_idx = ((lengths > di) & eos_batches) != 0 + update_idx = ((lengths > step) & eos_batches) != 0 lengths[update_idx] = len(sequence_symbols) return symbols diff --git a/seq2seq/models/TopKDecoder.py b/seq2seq/models/TopKDecoder.py index 626d27c..ae0d465 100644 --- a/seq2seq/models/TopKDecoder.py +++ b/seq2seq/models/TopKDecoder.py @@ -9,7 +9,7 @@ def _inflate(tensor, times, dim): Args: tensor: A :class:`Tensor` to inflate times: number of repetitions - dimension: axis for inflation (default=0) + dim: axis for inflation (default=0) Returns: A :class:`Tensor` @@ -20,17 +20,16 @@ def _inflate(tensor, times, dim): 1 2 3 4 [torch.LongTensor of size 2x2] - >> decoder = TopKDecoder(nn.RNN(10, 20, 2), 3) - >> b = decoder._inflate(a, 1, dimension=1) + >> b = ._inflate(a, 2, dim=1) >> b - 1 1 2 2 - 3 3 4 4 + 1 2 1 2 + 3 4 3 4 [torch.LongTensor of size 2x4] - >> c = decoder._inflate(a, 1, dimension=0) + >> c = _inflate(a, 2, dim=0) >> c 1 2 - 1 2 3 4 + 1 2 3 4 [torch.LongTensor of size 4x2] diff --git a/seq2seq/models/attention.py b/seq2seq/models/attention.py index 376896f..0f06916 100644 --- a/seq2seq/models/attention.py +++ b/seq2seq/models/attention.py @@ -10,7 +10,7 @@ class Attention(nn.Module): .. math:: \begin{array}{ll} x = context*output \\ - attn = exp(x_i - max_i x_i) / sum_j exp(x_j - max_i x_i) \\ + attn = exp(x_i) / sum_j exp(x_j) \\ output = \tanh(w * (attn * context) + b * output) \end{array} diff --git a/seq2seq/trainer/supervised_trainer.py b/seq2seq/trainer/supervised_trainer.py index 57dae64..68c2711 100644 --- a/seq2seq/trainer/supervised_trainer.py +++ b/seq2seq/trainer/supervised_trainer.py @@ -75,7 +75,8 @@ def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, device = None if torch.cuda.is_available() else -1 batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, - sort=True, sort_key=lambda x: len(x.src), + sort=False, sort_within_batch=True, + sort_key=lambda x: len(x.src), device=device, repeat=False) steps_per_epoch = len(batch_iterator) @@ -166,6 +167,7 @@ def train(self, model, data, num_epochs=5, resume_optim = self.optimizer.optimizer defaults = resume_optim.param_groups[0] defaults.pop('params', None) + defaults.pop('initial_lr', None) self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults) start_epoch = resume_checkpoint.epoch diff --git a/seq2seq/util/checkpoint.py b/seq2seq/util/checkpoint.py index d0bf482..f28a401 100644 --- a/seq2seq/util/checkpoint.py +++ b/seq2seq/util/checkpoint.py @@ -91,9 +91,13 @@ def load(cls, path): Returns: checkpoint (Checkpoint): checkpoint object with fields copied from those stored on disk """ - print("Loading checkpoints from {}".format(path)) - resume_checkpoint = torch.load(os.path.join(path, cls.TRAINER_STATE_NAME)) - model = torch.load(os.path.join(path, cls.MODEL_NAME)) + if torch.cuda.is_available(): + resume_checkpoint = torch.load(os.path.join(path, cls.TRAINER_STATE_NAME)) + model = torch.load(os.path.join(path, cls.MODEL_NAME)) + else: + resume_checkpoint = torch.load(os.path.join(path, cls.TRAINER_STATE_NAME), map_location=lambda 
+            resume_checkpoint = torch.load(os.path.join(path, cls.TRAINER_STATE_NAME), map_location=lambda storage, loc: storage)
+            model = torch.load(os.path.join(path, cls.MODEL_NAME), map_location=lambda storage, loc: storage)
+        model.flatten_parameters() # make RNN parameters contiguous
         with open(os.path.join(path, cls.INPUT_VOCAB_FILE), 'rb') as fin:
             input_vocab = dill.load(fin)
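
Side note (not part of the patch): the checkpoint.py change hinges on torch.load's map_location argument. A minimal standalone sketch of the same pattern follows; the load_model helper and the "model.pt" file name are hypothetical, only torch.load and its map_location callable are assumed.

import os
import torch

def load_model(path, filename="model.pt"):
    """Load a serialized model, falling back to CPU when CUDA is unavailable."""
    full_path = os.path.join(path, filename)
    if torch.cuda.is_available():
        # Storages are restored onto the GPU devices they were saved from.
        return torch.load(full_path)
    # Remap every storage to CPU so GPU-trained checkpoints load on CPU-only hosts.
    return torch.load(full_path, map_location=lambda storage, loc: storage)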
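
Side note (not part of the patch): the corrected _inflate docstring example describes tiling the whole tensor along one axis, i.e. Tensor.repeat semantics rather than element-wise repetition. A small sketch under that assumption; the inflate helper below is illustrative, not a copy of TopKDecoder._inflate.

import torch

def inflate(tensor, times, dim=0):
    # Tile the whole tensor `times` times along `dim`, matching the corrected example.
    repeat_dims = [1] * tensor.dim()
    repeat_dims[dim] = times
    return tensor.repeat(*repeat_dims)

a = torch.LongTensor([[1, 2], [3, 4]])
print(inflate(a, 2, dim=1))  # [[1, 2, 1, 2], [3, 4, 3, 4]]
print(inflate(a, 2, dim=0))  # [[1, 2], [3, 4], [1, 2], [3, 4]]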
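
Side note (not part of the patch): the sample.py fix matters because a bidirectional encoder emits features of size 2 * hidden_size per step while a unidirectional one emits hidden_size, so the decoder's hidden size must follow the same rule; the literal 1 broke the unidirectional case. A toy shape check using a plain nn.GRU as a stand-in for EncoderRNN:

import torch
import torch.nn as nn

hidden_size, bidirectional = 128, False
encoder = nn.GRU(input_size=32, hidden_size=hidden_size,
                 batch_first=True, bidirectional=bidirectional)
outputs, _ = encoder(torch.randn(4, 10, 32))  # (batch, seq_len, num_directions * hidden_size)
decoder_hidden_size = hidden_size * 2 if bidirectional else hidden_size
assert outputs.size(-1) == decoder_hidden_size  # fails if the fallback is the literal 1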