[PIR] fix pir open bugs #9248

Merged 1 commit on Oct 12, 2024
13 changes: 9 additions & 4 deletions llm/predict/predictor.py
@@ -24,7 +24,7 @@
 import numpy as np
 import paddle
 import paddle.incubate.multiprocessing as mp
-from paddle.base.framework import in_cinn_mode, in_pir_executor_mode
+from paddle.base.framework import in_cinn_mode, in_pir_executor_mode, use_pir_api
 from paddle.distributed import fleet

 from paddlenlp.generation import GenerationConfig, TextIteratorStreamer
@@ -624,8 +624,10 @@ def _create_predictor(self, predictor_args: PredictorArgument):
         infer_model_path = llm_utils.get_infer_model_path(
             predictor_args.model_name_or_path, predictor_args.model_prefix
         )

-        config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")
+        if use_pir_api():
+            config = paddle.inference.Config(infer_model_path + ".json", infer_model_path + ".pdiparams")
+        else:
+            config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")

         config.switch_ir_optim(True)
         # remove `gpu_cpu_map_matmul_v2_to_matmul_pass` to avoid mapping matmul_v2 -> matmul op
@@ -1057,7 +1059,10 @@ def _create_predictor(self, predictor_args: PredictorArgument):
             predictor_args.model_name_or_path, predictor_args.model_prefix
         )

-        config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")
+        if use_pir_api():
+            config = paddle.inference.Config(infer_model_path + ".json", infer_model_path + ".pdiparams")
+        else:
+            config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")

         config.switch_ir_optim(False)
         if predictor_args.device in paddle.device.get_all_custom_device_type():
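Note on the predictor change: it keys the exported-program filename off the runtime IR mode. With the PIR API enabled, Paddle serializes the inference program as a `.json` file, while the legacy program format uses `.pdmodel`; the parameters file stays `.pdiparams` in both cases. A minimal standalone sketch of the same selection, assuming an inference model has already been exported under the matching mode (the `./inference/model` prefix is hypothetical):

```python
import paddle
from paddle.base.framework import use_pir_api


def build_inference_config(model_prefix: str) -> paddle.inference.Config:
    # Under PIR the exported program is <prefix>.json; the legacy format is <prefix>.pdmodel.
    program_suffix = ".json" if use_pir_api() else ".pdmodel"
    return paddle.inference.Config(model_prefix + program_suffix, model_prefix + ".pdiparams")


config = build_inference_config("./inference/model")  # hypothetical export prefix
config.switch_ir_optim(True)
```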
10 changes: 5 additions & 5 deletions paddlenlp/experimental/transformers/chatglm/modeling.py
@@ -294,19 +294,19 @@
         time_step=None,
         **kwargs,
     ):
+        is_decoder = cache is not None
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
             batch_size, seq_length = input_ids.shape[:2]
         elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape[:2]
+            batch_size, seq_length, _ = inputs_embeds.shape[:3]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

-        encode_seq_length = input_ids.shape[1]
-        seq_lens = seq_len_decoder if encode_seq_length == 1 else seq_len_encoder
+        seq_lens = seq_len_decoder if is_decoder else seq_len_encoder

-        if encode_seq_length > 1:
+        if not is_decoder:
             ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder)
         else:
             ids_remove_padding = input_ids
@@ -354,7 +354,7 @@
         hidden_states = self.input_layernorm(hidden_states)

         position_offset = 0
-        if encode_seq_length > 1 and pre_caches is not None:
+        if not is_decoder and pre_caches is not None:
             position_offset = 128

         with dy2st_nocheck_guard_context():
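Note on the modeling change: it replaces the `encode_seq_length` heuristic (which read `input_ids.shape[1]` even on the `inputs_embeds`-only path) with an explicit `is_decoder = cache is not None` flag, so the prefill/decode split is decided by whether a KV cache already exists rather than by sequence length. A minimal sketch of that branching pattern, with toy stand-ins for the real prefill and decode kernels:

```python
from typing import List, Optional


def forward_step(token_ids: List[int], cache: Optional[List[int]] = None) -> List[int]:
    # Decode phase iff a cache from an earlier step already exists.
    is_decoder = cache is not None
    if not is_decoder:
        # Prefill: consume the whole prompt and build the cache from scratch.
        return list(token_ids)
    # Decode: keep the existing cache and append only the newest token.
    return cache + token_ids[-1:]


cache = forward_step([1, 2, 3])           # prefill over the prompt
cache = forward_step([4], cache=cache)    # single-token decode step
```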