Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add multinomial probability distribution #38820

Merged
merged 3 commits into from
Jan 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/paddle/distribution/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .distribution import Distribution
from .exponential_family import ExponentialFamily
from .kl import kl_divergence, register_kl
from .multinomial import Multinomial
from .normal import Normal
from .uniform import Uniform

Expand All @@ -27,8 +28,9 @@
'Dirichlet',
'Distribution',
'ExponentialFamily',
'Multinomial',
'Normal',
'Uniform',
'kl_divergence',
'register_kl'
'register_kl',
]
47 changes: 30 additions & 17 deletions python/paddle/distribution/beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@

class Beta(ExponentialFamily):
r"""
Beta distribution parameterized by alpha and beta
Beta distribution parameterized by alpha and beta.

In probability theory and statistics, the beta distribution is a family of
continuous probability distributions defined on the interval [0, 1]
parameterized by two positive shape parameters, denoted by alpha and beta,
that appear as exponents of the random variable and control the shape of
the distribution. The generalization to multiple variables is called a
Dirichlet distribution.

The probability density function (pdf) is

Expand All @@ -37,8 +44,14 @@ class Beta(ExponentialFamily):


Args:
alpha (float|Tensor): alpha parameter of beta distribution, positive(>0).
beta (float|Tensor): beta parameter of beta distribution, positive(>0).
alpha (float|Tensor): Alpha parameter. It supports broadcast semantics.
The value of alpha must be positive. When the parameter is a tensor,
it represents multiple independent distributions with
a batch_shape(refer to ``Distribution`` ).
beta (float|Tensor): Beta parameter. It supports broadcast semantics.
The value of beta must be positive(>0). When the parameter is tensor,
it represents multiple independent distributions with
a batch_shape(refer to ``Distribution`` ).

Examples:

Expand Down Expand Up @@ -86,56 +99,56 @@ def __init__(self, alpha, beta):

@property
def mean(self):
"""mean of beta distribution.
"""Mean of beta distribution.
"""
return self.alpha / (self.alpha + self.beta)

@property
def variance(self):
"""variance of beat distribution
"""Variance of beat distribution
"""
sum = self.alpha + self.beta
return self.alpha * self.beta / (sum.pow(2) * (sum + 1))

def prob(self, value):
"""probability density funciotn evaluated at value
"""Probability density funciotn evaluated at value

Args:
value (Tensor): value to be evaluated.
value (Tensor): Value to be evaluated.

Returns:
Tensor: probability.
Tensor: Probability.
"""
return paddle.exp(self.log_prob(value))

def log_prob(self, value):
"""log probability density funciton evaluated at value
"""Log probability density funciton evaluated at value

Args:
value (Tensor): value to be evaluated
value (Tensor): Value to be evaluated

Returns:
Tensor: log probability.
Tensor: Log probability.
"""
return self._dirichlet.log_prob(paddle.stack([value, 1.0 - value], -1))

def sample(self, shape=()):
"""sample from beta distribution with sample shape.
"""Sample from beta distribution with sample shape.

Args:
shape (Sequence[int], optional): sample shape.
shape (Sequence[int], optional): Sample shape.

Returns:
sampled data with shape `sample_shape` + `batch_shape` + `event_shape`.
Sampled data with shape `sample_shape` + `batch_shape` + `event_shape`.
"""
shape = shape if isinstance(shape, tuple) else tuple(shape)
return paddle.squeeze(self._dirichlet.sample(shape)[..., 0])
return paddle.squeeze(self._dirichlet.sample(shape)[..., 0], axis=-1)

def entropy(self):
"""entropy of dirichlet distribution
"""Entropy of dirichlet distribution

Returns:
Tensor: entropy.
Tensor: Entropy.
"""
return self._dirichlet.entropy()

Expand Down
82 changes: 46 additions & 36 deletions python/paddle/distribution/categorical.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -16,6 +16,7 @@
import warnings

import numpy as np
import paddle
from paddle import _C_ops

from ..fluid import core
Expand Down Expand Up @@ -123,7 +124,7 @@ def sample(self, shape):

Returns:
Tensor: A tensor with prepended dimensions shape.

Examples:
.. code-block:: python

Expand Down Expand Up @@ -153,14 +154,22 @@ def sample(self, shape):
logits_shape = list(self.logits.shape)
if len(logits_shape) > 1:
sample_shape = shape + logits_shape[:-1]
logits = nn.reshape(self.logits,
[np.prod(logits_shape[:-1]), logits_shape[-1]])
logits = paddle.reshape(
self.logits, [np.prod(logits_shape[:-1]), logits_shape[-1]])
else:
sample_shape = shape
logits = self.logits

sample_index = multinomial(logits, num_samples, True)
return nn.reshape(sample_index, sample_shape, name=name)
sample_index = multinomial(
self._logits_to_probs(logits), num_samples, True)

# multinomial sample shape is (logits.shape[:-1], num_samples), need to
# transpose to (num_samples, logits.shape[:-1])
permute = list(range(sample_index.dim()))
permute.insert(0, permute.pop(-1))
sample_index = sample_index.transpose(permute)

return paddle.reshape(sample_index, sample_shape, name=name)

def kl_divergence(self, other):
"""The KL-divergence between two Categorical distributions.
Expand All @@ -170,7 +179,7 @@ def kl_divergence(self, other):

Returns:
Tensor: kl-divergence between two Categorical distributions.

Examples:
.. code-block:: python

Expand Down Expand Up @@ -200,19 +209,20 @@ def kl_divergence(self, other):
if not in_dygraph_mode():
check_type(other, 'other', Categorical, 'kl_divergence')

logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
other_logits = other.logits - nn.reduce_max(
other.logits, dim=-1, keep_dim=True)
logits = self.logits - \
paddle.max(self.logits, axis=-1, keepdim=True)
other_logits = other.logits - paddle.max(
other.logits, axis=-1, keepdim=True)
e_logits = ops.exp(logits)
other_e_logits = ops.exp(other_logits)
z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True)
z = paddle.sum(e_logits, axis=-1, keepdim=True)
other_z = paddle.sum(other_e_logits, axis=-1, keepdim=True)
prob = e_logits / z
kl = nn.reduce_sum(
prob * (logits - nn.log(z) - other_logits + nn.log(other_z)),
dim=-1,
keep_dim=True,
name=name)
kl = paddle.sum(prob * (
logits - paddle.log(z) - other_logits + paddle.log(other_z)),
axis=-1,
keepdim=True,
name=name)

return kl

Expand All @@ -221,7 +231,7 @@ def entropy(self):

Returns:
Tensor: Shannon entropy of Categorical distribution. The data type is float32.

Examples:
.. code-block:: python

Expand All @@ -241,14 +251,14 @@ def entropy(self):

"""
name = self.name + '_entropy'
logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
logits = self.logits - \
paddle.max(self.logits, axis=-1, keepdim=True)
e_logits = ops.exp(logits)
z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
z = paddle.sum(e_logits, axis=-1, keepdim=True)
prob = e_logits / z

neg_entropy = nn.reduce_sum(
prob * (logits - nn.log(z)), dim=-1, keep_dim=True)
entropy = nn.scale(neg_entropy, scale=-1.0, name=name)
neg_entropy = paddle.sum(prob * (logits - paddle.log(z)), axis=-1)
entropy = paddle.scale(neg_entropy, scale=-1.0, name=name)
return entropy

def probs(self, value):
Expand All @@ -266,7 +276,7 @@ def probs(self, value):

Returns:
Tensor: probability according to the category index.

Examples:
.. code-block:: python

Expand All @@ -288,41 +298,41 @@ def probs(self, value):
"""
name = self.name + '_probs'

dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True)
dist_sum = paddle.sum(self.logits, axis=-1, keepdim=True)
prob = self.logits / dist_sum

shape = list(prob.shape)
value_shape = list(value.shape)
if len(shape) == 1:
num_value_in_one_dist = np.prod(value_shape)
index_value = nn.reshape(value, [num_value_in_one_dist, 1])
index_value = paddle.reshape(value, [num_value_in_one_dist, 1])
index = index_value
else:
num_dist = np.prod(shape[:-1])
num_value_in_one_dist = value_shape[-1]
prob = nn.reshape(prob, [num_dist, shape[-1]])
prob = paddle.reshape(prob, [num_dist, shape[-1]])
if len(value_shape) == 1:
value = nn.expand(value, [num_dist])
value_shape = shape[:-1] + value_shape
index_value = nn.reshape(value, [num_dist, -1, 1])
index_value = paddle.reshape(value, [num_dist, -1, 1])
if shape[:-1] != value_shape[:-1]:
raise ValueError(
"shape of value {} must match shape of logits {}".format(
str(value_shape[:-1]), str(shape[:-1])))

index_prefix = nn.unsqueeze(
index_prefix = paddle.unsqueeze(
arange(
num_dist, dtype=index_value.dtype), axes=-1)
num_dist, dtype=index_value.dtype), axis=-1)
index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist])
index_prefix = nn.unsqueeze(index_prefix, axes=-1)
index_prefix = paddle.unsqueeze(index_prefix, axis=-1)

if index_value.dtype != index_prefix.dtype:
tensor.cast(index_prefix, dtype=index_value.dtype)
index = concat([index_prefix, index_value], axis=-1)

# value is the category index to search for the corresponding probability.
select_prob = gather_nd(prob, index)
return nn.reshape(select_prob, value_shape, name=name)
return paddle.reshape(select_prob, value_shape, name=name)

def log_prob(self, value):
"""Log probabilities of the given category. Refer to ``probs`` method.
Expand All @@ -332,7 +342,7 @@ def log_prob(self, value):

Returns:
Tensor: Log probability.

Examples:
.. code-block:: python

Expand All @@ -354,4 +364,4 @@ def log_prob(self, value):
"""
name = self.name + '_log_prob'

return nn.log(self.probs(value), name=name)
return paddle.log(self.probs(value), name=name)
Loading