diff --git a/python/paddle/incubate/distributed/models/moe/__init__.py b/python/paddle/incubate/distributed/models/moe/__init__.py
index e1663029ef1f8..fd06b4b8e5287 100644
--- a/python/paddle/incubate/distributed/models/moe/__init__.py
+++ b/python/paddle/incubate/distributed/models/moe/__init__.py
@@ -11,3 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate
+from .moe_layer import MoELayer
+from .grad_clip import ClipGradForMOEByGlobalNorm
+ClipGradByGlobalNorm = ClipGradForMOEByGlobalNorm  # same class, shorter name
diff --git a/python/setup.py.in b/python/setup.py.in
index 0f231e34168d9..4cf8bc3fc6a2e 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -312,6 +312,8 @@ packages=['paddle',
           'paddle.distributed.auto_parallel.tuner',
           'paddle.distributed.auto_parallel.cost',
           'paddle.distributed.passes',
+          'paddle.distributed.models',
+          'paddle.distributed.models.moe',
           'paddle.framework',
           'paddle.jit',
           'paddle.jit.dy2static',
@@ -366,6 +368,10 @@ packages=['paddle',
           'paddle.incubate.nn.functional',
           'paddle.incubate.nn.layer',
           'paddle.incubate.optimizer.functional',
+          'paddle.incubate.distributed',
+          'paddle.incubate.distributed.models',
+          'paddle.incubate.distributed.models.moe',
+          'paddle.incubate.distributed.models.moe.gate',
           'paddle.io',
           'paddle.optimizer',
           'paddle.nn',