From a7bd53032d5d88727e8399fe68eb64b60190257b Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Wed, 7 Dec 2022 19:38:38 +0800
Subject: [PATCH 01/10] add unique_op

---
 .../opencl/utils/tune_cache_generated.h       | 383 +++++++++++++
 .../x86_mobilenetv1_full_demo/CMakeLists.txt  |  73 +++
 .../x86_mobilenetv1_light_demo/CMakeLists.txt |  73 +++
 lite/kernels/host/CMakeLists.txt              |   1 +
 lite/kernels/host/unique_compute.cc           | 530 ++++++++++++++++++
 lite/kernels/host/unique_compute.h            |  36 ++
 lite/operators/CMakeLists.txt                 |   1 +
 lite/operators/op_params.h                    |  15 +
 lite/operators/unique_op.cc                   |  88 +++
 lite/operators/unique_op.h                    |  52 ++
 10 files changed, 1252 insertions(+)
 create mode 100644 lite/backends/opencl/utils/tune_cache_generated.h
 create mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
 create mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
 create mode 100644 lite/kernels/host/unique_compute.cc
 create mode 100644 lite/kernels/host/unique_compute.h
 create mode 100644 lite/operators/unique_op.cc
 create mode 100644 lite/operators/unique_op.h
diff --git a/lite/backends/opencl/utils/tune_cache_generated.h b/lite/backends/opencl/utils/tune_cache_generated.h
new file mode 100644
index 00000000000..bb091cce383
--- /dev/null
+++ b/lite/backends/opencl/utils/tune_cache_generated.h
@@ -0,0 +1,383 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_
+#define FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+namespace opencl {
+namespace proto {
+namespace TuneCache_ {
+
+struct TunePair;
+struct TunePairBuilder;
+struct TunePairT;
+
+}  // namespace TuneCache_
+
+struct TuneCache;
+struct TuneCacheBuilder;
+struct TuneCacheT;
+
+namespace TuneCache_ {
+
+bool operator==(const TunePairT &lhs, const TunePairT &rhs);
+bool operator!=(const TunePairT &lhs, const TunePairT &rhs);
+}  // namespace TuneCache_
+
+bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs);
+bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs);
+
+namespace TuneCache_ {
+
+inline const flatbuffers::TypeTable *TunePairTypeTable();
+
+}  // namespace TuneCache_
+
+inline const flatbuffers::TypeTable *TuneCacheTypeTable();
+
+namespace TuneCache_ {
+
+struct TunePairT : public flatbuffers::NativeTable {
+  typedef TunePair TableType;
+  std::string key;
+  std::vector<int32_t> value;
+  TunePairT() {
+  }
+};
+
+inline bool operator==(const TunePairT &lhs, const TunePairT &rhs) {
+  return
+      (lhs.key == rhs.key) &&
+      (lhs.value == rhs.value);
+}
+
+inline bool operator!=(const TunePairT &lhs, const TunePairT &rhs) {
+    return !(lhs == rhs);
+}
+
+
+struct TunePair FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef TunePairT NativeTableType;
+  typedef TunePairBuilder Builder;
+  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
+    return TunePairTypeTable();
+  }
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_KEY = 4,
+    VT_VALUE = 6
+  };
+  const flatbuffers::String *key() const {
+    return GetPointer<const flatbuffers::String *>(VT_KEY);
+  }
+  flatbuffers::String *mutable_key() {
+    return GetPointer<flatbuffers::String *>(VT_KEY);
+  }
+  bool KeyCompareLessThan(const TunePair *o) const {
+    return *key() < *o->key();
+  }
+  int KeyCompareWithValue(const char *val) const {
+    return strcmp(key()->c_str(), val);
+  }
+  const flatbuffers::Vector<int32_t> *value() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_VALUE);
+  }
+  flatbuffers::Vector<int32_t> *mutable_value() {
+    return GetPointer<flatbuffers::Vector<int32_t> *>(VT_VALUE);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffsetRequired(verifier, VT_KEY) &&
+           verifier.VerifyString(key()) &&
+           VerifyOffset(verifier, VT_VALUE) &&
+           verifier.VerifyVector(value()) &&
+           verifier.EndTable();
+  }
+  TunePairT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<TunePair> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct TunePairBuilder {
+  typedef TunePair Table;
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_key(flatbuffers::Offset<flatbuffers::String> key) {
+    fbb_.AddOffset(TunePair::VT_KEY, key);
+  }
+  void add_value(flatbuffers::Offset<flatbuffers::Vector<int32_t>> value) {
+    fbb_.AddOffset(TunePair::VT_VALUE, value);
+  }
+  explicit TunePairBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  TunePairBuilder &operator=(const TunePairBuilder &);
+  flatbuffers::Offset<TunePair> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<TunePair>(end);
+    fbb_.Required(o, TunePair::VT_KEY);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<TunePair> CreateTunePair(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::String> key = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> value = 0) {
+  TunePairBuilder builder_(_fbb);
+  builder_.add_value(value);
+  builder_.add_key(key);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<TunePair> CreateTunePairDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const char *key = nullptr,
+    const std::vector<int32_t> *value = nullptr) {
+  auto key__ = key ? _fbb.CreateString(key) : 0;
+  auto value__ = value ? _fbb.CreateVector<int32_t>(*value) : 0;
+  return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair(
+      _fbb,
+      key__,
+      value__);
+}
+
+flatbuffers::Offset<TunePair> CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+}  // namespace TuneCache_
+
+struct TuneCacheT : public flatbuffers::NativeTable {
+  typedef TuneCache TableType;
+  std::vector<std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT>> tune_map;
+  TuneCacheT() {
+  }
+};
+
+inline bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs) {
+  return
+      (lhs.tune_map == rhs.tune_map);
+}
+
+inline bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs) {
+    return !(lhs == rhs);
+}
+
+
+struct TuneCache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef TuneCacheT NativeTableType;
+  typedef TuneCacheBuilder Builder;
+  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
+    return TuneCacheTypeTable();
+  }
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_TUNE_MAP = 4
+  };
+  const flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *tune_map() const {
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *>(VT_TUNE_MAP);
+  }
+  flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *mutable_tune_map() {
+    return GetPointer<flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *>(VT_TUNE_MAP);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffsetRequired(verifier, VT_TUNE_MAP) &&
+           verifier.VerifyVector(tune_map()) &&
+           verifier.VerifyVectorOfTables(tune_map()) &&
+           verifier.EndTable();
+  }
+  TuneCacheT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<TuneCache> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct TuneCacheBuilder {
+  typedef TuneCache Table;
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_tune_map(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>>> tune_map) {
+    fbb_.AddOffset(TuneCache::VT_TUNE_MAP, tune_map);
+  }
+  explicit TuneCacheBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  TuneCacheBuilder &operator=(const TuneCacheBuilder &);
+  flatbuffers::Offset<TuneCache> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<TuneCache>(end);
+    fbb_.Required(o, TuneCache::VT_TUNE_MAP);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<TuneCache> CreateTuneCache(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>>> tune_map = 0) {
+  TuneCacheBuilder builder_(_fbb);
+  builder_.add_tune_map(tune_map);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<TuneCache> CreateTuneCacheDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    std::vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *tune_map = nullptr) {
+  auto tune_map__ = tune_map ? _fbb.CreateVectorOfSortedTables<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>(tune_map) : 0;
+  return paddle::lite::fbs::opencl::proto::CreateTuneCache(
+      _fbb,
+      tune_map__);
+}
+
+flatbuffers::Offset<TuneCache> CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+namespace TuneCache_ {
+
+inline TunePairT *TunePair::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT> _o = std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT>(new TunePairT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void TunePair::UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = key(); if (_e) _o->key = _e->str(); }
+  { auto _e = value(); if (_e) { _o->value.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->value[_i] = _e->Get(_i); } } }
+}
+
+inline flatbuffers::Offset<TunePair> TunePair::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTunePair(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<TunePair> CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TunePairT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _key = _fbb.CreateString(_o->key);
+  auto _value = _fbb.CreateVector(_o->value);
+  return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair(
+      _fbb,
+      _key,
+      _value);
+}
+
+}  // namespace TuneCache_
+
+inline TuneCacheT *TuneCache::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT> _o = std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT>(new TuneCacheT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void TuneCache::UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = tune_map(); if (_e) { _o->tune_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tune_map[_i] = std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT>(_e->Get(_i)->UnPack(_resolver)); } } }
+}
+
+inline flatbuffers::Offset<TuneCache> TuneCache::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTuneCache(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<TuneCache> CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TuneCacheT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _tune_map = _fbb.CreateVector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> (_o->tune_map.size(), [](size_t i, _VectorArgs *__va) { return CreateTunePair(*__va->__fbb, __va->__o->tune_map[i].get(), __va->__rehasher); }, &_va );
+  return paddle::lite::fbs::opencl::proto::CreateTuneCache(
+      _fbb,
+      _tune_map);
+}
+
+namespace TuneCache_ {
+
+inline const flatbuffers::TypeTable *TunePairTypeTable() {
+  static const flatbuffers::TypeCode type_codes[] = {
+    { flatbuffers::ET_STRING, 0, -1 },
+    { flatbuffers::ET_INT, 1, -1 }
+  };
+  static const char * const names[] = {
+    "key",
+    "value"
+  };
+  static const flatbuffers::TypeTable tt = {
+    flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names
+  };
+  return &tt;
+}
+
+}  // namespace TuneCache_
+
+inline const flatbuffers::TypeTable *TuneCacheTypeTable() {
+  static const flatbuffers::TypeCode type_codes[] = {
+    { flatbuffers::ET_SEQUENCE, 1, 0 }
+  };
+  static const flatbuffers::TypeFunction type_refs[] = {
+    paddle::lite::fbs::opencl::proto::TuneCache_::TunePairTypeTable
+  };
+  static const char * const names[] = {
+    "tune_map"
+  };
+  static const flatbuffers::TypeTable tt = {
+    flatbuffers::ST_TABLE, 1, type_codes, type_refs, nullptr, names
+  };
+  return &tt;
+}
+
+inline const paddle::lite::fbs::opencl::proto::TuneCache *GetTuneCache(const void *buf) {
+  return flatbuffers::GetRoot<paddle::lite::fbs::opencl::proto::TuneCache>(buf);
+}
+
+inline const paddle::lite::fbs::opencl::proto::TuneCache *GetSizePrefixedTuneCache(const void *buf) {
+  return flatbuffers::GetSizePrefixedRoot<paddle::lite::fbs::opencl::proto::TuneCache>(buf);
+}
+
+inline TuneCache *GetMutableTuneCache(void *buf) {
+  return flatbuffers::GetMutableRoot<TuneCache>(buf);
+}
+
+inline bool VerifyTuneCacheBuffer(
+    flatbuffers::Verifier &verifier) {
+  return verifier.VerifyBuffer<paddle::lite::fbs::opencl::proto::TuneCache>(nullptr);
+}
+
+inline bool VerifySizePrefixedTuneCacheBuffer(
+    flatbuffers::Verifier &verifier) {
+  return verifier.VerifySizePrefixedBuffer<paddle::lite::fbs::opencl::proto::TuneCache>(nullptr);
+}
+
+inline void FinishTuneCacheBuffer(
+    flatbuffers::FlatBufferBuilder &fbb,
+    flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache> root) {
+  fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedTuneCacheBuffer(
+    flatbuffers::FlatBufferBuilder &fbb,
+    flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache> root) {
+  fbb.FinishSizePrefixed(root);
+}
+
+inline std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT> UnPackTuneCache(
+    const void *buf,
+    const flatbuffers::resolver_function_t *res = nullptr) {
+  return std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT>(GetTuneCache(buf)->UnPack(res));
+}
+
+inline std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT> UnPackSizePrefixedTuneCache(
+    const void *buf,
+    const flatbuffers::resolver_function_t *res = nullptr) {
+  return std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT>(GetSizePrefixedTuneCache(buf)->UnPack(res));
+}
+
+}  // namespace proto
+}  // namespace opencl
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
+
+#endif  // FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_
diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
new file mode 100644
index 00000000000..234ec1c85e3
--- /dev/null
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 2.8)
+project(mobilenet_full_api)
+set(TARGET mobilenet_full_api)
+
+# 1. path to Paddle-Lite lib and mklml lib
+set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
+set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
+
+if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  set(MSVC_STATIC_CRT )
+  if(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+  else(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
+  endif(MSVC_STATIC_CRT)
+endif()
+
+if (APPLE AND METAL)
+  message(STATUS "set METAL=ON")
+  add_definitions("-DMETAL")
+  find_library(METAL_LIBRARY Metal REQUIRED)
+  find_library(GRAPHIC CoreGraphics REQUIRED)
+  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
+endif()
+
+# 2. link mklml and Paddle-Lite directory
+link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
+include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
+
+# 3. compile options 
+if (NOT WIN32)
+  add_definitions(-std=c++11 -g -O3 -pthread)
+  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
+endif()
+
+# 4. add the executable target and link its libraries
+add_executable(${TARGET} ${TARGET}.cc)
+if (WIN32)
+  set(WITH_STATIC_MKL )
+  if(WITH_STATIC_MKL)
+    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
+                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+
+  target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib)
+  target_link_libraries(${TARGET} shlwapi.lib)
+  target_link_libraries(${TARGET} ${MATH_LIB})
+
+  add_custom_command(TARGET ${TARGET} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+  )
+  if(NOT WITH_STATIC_MKL)
+    add_custom_command(TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+    )
+  endif()
+else()
+    if (APPLE AND METAL)
+      target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
+    endif()
+    target_link_libraries(${TARGET} -lpaddle_full_api_shared)
+    target_link_libraries(${TARGET} -liomp5)
+    target_link_libraries(${TARGET} -ldl)
+endif()
diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
new file mode 100644
index 00000000000..3a91bfafbd3
--- /dev/null
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 2.8)
+project(mobilenet_light_api)
+set(TARGET mobilenet_light_api)
+
+# 1. path to Paddle-Lite lib and mklml lib
+set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
+set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
+
+if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  set(MSVC_STATIC_CRT )
+  if(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+  else(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
+  endif(MSVC_STATIC_CRT)
+endif()
+
+if (APPLE AND METAL)
+  message(STATUS "set METAL=ON")
+  add_definitions("-DMETAL")
+  find_library(METAL_LIBRARY Metal REQUIRED)
+  find_library(GRAPHIC CoreGraphics REQUIRED)
+  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
+endif()
+
+# 2. link mklml and Paddle-Lite directory
+link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
+include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
+
+# 3. compile options 
+if (NOT WIN32)
+  add_definitions(-std=c++11 -g -O3 -pthread)
+  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
+endif()
+
+# 4. add the executable target and link its libraries
+add_executable(${TARGET} ${TARGET}.cc)
+if (WIN32)
+  set(WITH_STATIC_MKL )
+  if(WITH_STATIC_MKL)
+    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
+                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+
+  target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib)
+  target_link_libraries(${TARGET} shlwapi.lib)
+  target_link_libraries(${TARGET} ${MATH_LIB})
+
+  add_custom_command(TARGET ${TARGET} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+  )
+  if(NOT WITH_STATIC_MKL)
+    add_custom_command(TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+    )
+  endif()
+else()
+  if (APPLE AND METAL)
+    target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
+  endif()
+  target_link_libraries(${TARGET} -lpaddle_light_api_shared)
+  target_link_libraries(${TARGET} -liomp5)
+  target_link_libraries(${TARGET} -ldl)
+endif()
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index 564fcc0c88a..bbf6d9f3e4a 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -108,6 +108,7 @@ add_kernel(distribute_fpn_proposals_compute_host Host extra SRCS distribute_fpn_
 add_kernel(collect_fpn_proposals_compute_host Host extra SRCS collect_fpn_proposals_compute.cc)
 add_kernel(flip_compute_host Host extra SRCS flip_compute.cc)
 add_kernel(unique_with_counts_compute  Host extra SRCS unique_with_counts_compute.cc)
+add_kernel(unique_compute Host extra SRCS unique_compute.cc)
 add_kernel(roi_align_compute Host extra SRCS roi_align_compute.cc)
 add_kernel(box_clip_compute Host extra SRCS box_clip_compute.cc)
 add_kernel(gaussian_random_compute Host extra SRCS gaussian_random_compute.cc)
diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
new file mode 100644
index 00000000000..97363f2bbe9
--- /dev/null
+++ b/lite/kernels/host/unique_compute.cc
@@ -0,0 +1,530 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/host/unique_compute.h"
+#include "lite/backends/host/math/concat.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+// Order-preserving unique over the flattened input.
+//   x     : input tensor, read as a flat array of InT.
+//   out   : distinct values, in order of first occurrence.
+//   index : for every input element, its position in out. Not resized here;
+//           assumed to already hold x->numel() elements (TODO confirm).
+//   count : optional (may be nullptr); occurrences of each out value.
+template <typename InT, typename IndexT>
+void UniqueFunc(const lite::Tensor* x,
+                lite::Tensor* out,
+                lite::Tensor* index,
+                lite::Tensor* count) {
+  const InT* in_data = x->template data<InT>();
+  IndexT* index_data = index->mutable_data<IndexT>();
+  const int64_t numel = x->numel();
+
+  // dict maps a value to its slot in uniq; uniq keeps first-occurrence order.
+  std::unordered_map<InT, int64_t> dict;
+  std::vector<InT> uniq;
+
+  for (int64_t i = 0; i < numel; ++i) {
+    auto it = dict.find(in_data[i]);
+    if (it == dict.end()) {
+      const int64_t slot = static_cast<int64_t>(uniq.size());
+      dict.emplace(in_data[i], slot);
+      // The value must be recorded: out and count are sized from uniq.
+      uniq.emplace_back(in_data[i]);
+      index_data[i] = static_cast<IndexT>(slot);
+    } else {
+      index_data[i] = static_cast<IndexT>(it->second);
+    }
+  }
+
+  const int64_t uniq_num = static_cast<int64_t>(uniq.size());
+  if (count != nullptr) {
+    // Resize the count tensor dims to allocate the memory
+    count->Resize({uniq_num});
+    IndexT* count_data = count->mutable_data<IndexT>();
+    std::fill_n(count_data, uniq.size(), static_cast<IndexT>(0));
+    // index_data[i] is the slot of element i, so one pass tallies every slot.
+    for (int64_t i = 0; i < numel; ++i) {
+      count_data[static_cast<int64_t>(index_data[i])] += static_cast<IndexT>(1);
+    }
+  }
+
+  out->Resize({uniq_num});
+  InT* out_data = out->mutable_data<InT>();
+  std::copy(uniq.begin(), uniq.end(), out_data);
+}
+
+// Sorted unique over the flattened input: out holds the distinct values in
+// ascending order; indices (first input position of each out value), index
+// (position in out of every input element) and count (occurrences of each out
+// value) are only written when their return_* flag is set.
+template <typename InT, typename IndexT>
+void UniqueFlattendTensorFunc(const lite::Tensor& in,
+                              lite::Tensor* out,
+                              lite::Tensor* index,
+                              lite::Tensor* indices,
+                              lite::Tensor* count,
+                              bool return_index,
+                              bool return_inverse,
+                              bool return_counts) {
+  const InT* in_data = in.data<InT>();
+  std::set<InT> unique(in_data, in_data + in.numel());
+  out->Resize({static_cast<int64_t>(unique.size())});
+  auto out_data = out->mutable_data<InT>();
+  std::copy(unique.begin(), unique.end(), out_data);
+
+  if (return_index) {
+    indices->Resize({out->numel()});
+    auto indices_data = indices->mutable_data<IndexT>();
+    std::unordered_map<InT, IndexT> indices_map;
+    indices_map.reserve(out->numel());
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      indices_map.emplace(in_data[i], static_cast<IndexT>(i));  // first wins
+    }
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      indices_data[i] = indices_map[out_data[i]];
+    }
+  }
+
+  if (return_inverse) {
+    // One entry per input element: sized by in.numel(), not out->numel().
+    index->Resize({in.numel()});
+    auto inverse_data = index->mutable_data<IndexT>();
+    std::unordered_map<InT, IndexT> inverse_map;
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      inverse_map[out_data[i]] = static_cast<IndexT>(i);
+    }
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      inverse_data[i] = inverse_map[in_data[i]];
+    }
+  }
+
+  if (return_counts) {
+    count->Resize({out->numel()});
+    auto count_data = count->mutable_data<IndexT>();
+    std::unordered_map<InT, IndexT> counts_map;
+    counts_map.reserve(out->numel());
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      counts_map[in_data[i]] += 1;  // operator[] value-initialises to 0
+    }
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      count_data[i] = counts_map[out_data[i]];
+    }
+  }
+}
+
+// Splits `in` along its first dimension: element k of the result is
+// in.Slice(k, k + 1), for k in [0, in.dims()[0]).
+static std::vector<lite::Tensor> Unbind(const lite::Tensor& in) {
+  const int64_t rows = in.dims()[0];
+  std::vector<lite::Tensor> slices(rows);
+  for (int64_t k = 0; k < rows; ++k) slices[k] = in.Slice(k, k + 1);
+  return slices;
+}
+
+// Element-wise equality of two tensors read as T; false on numel mismatch.
+template <typename T>
+static bool Equal(const lite::Tensor& a, const lite::Tensor& b) {
+  const int64_t total = a.numel();
+  if (total != b.numel()) return false;
+  int64_t pos = 0;
+  while (pos < total && a.data<T>()[pos] == b.data<T>()[pos]) {
+    ++pos;
+  }
+  // Reaching the end means no mismatching element was found.
+  return pos == total;
+}
+
+// std::unique-style in-place dedup of [first, last); element i of the range is
+// identified by id = sorted_indices_vec[i]. Fills indices_vec (id of the first
+// element of each run), counts_vec (run lengths) and inverse_vec (id -> run).
+template <class ForwardIt, typename InT, typename IndexT>
+static ForwardIt UniqueDimImpl(ForwardIt first,
+                               ForwardIt last,
+                               const std::vector<IndexT>& sorted_indices_vec,
+                               std::vector<IndexT>* inverse_vec,
+                               std::vector<IndexT>* counts_vec,
+                               std::vector<IndexT>* indices_vec) {
+  if (first == last) return last;
+  auto& inverse = *inverse_vec;
+  auto& counts = *counts_vec;
+  auto& origin = *indices_vec;
+  inverse[sorted_indices_vec[0]] = 0;
+  counts[0] = 1;
+  origin[0] = sorted_indices_vec[0];
+
+  ForwardIt kept = first;  // last element retained so far
+  int64_t kept_pos = 0;    // offset of `kept` from the start of the range
+  for (int64_t scan_pos = 1; ++first != last; ++scan_pos) {
+    if (!Equal<InT>(*kept, *first)) {
+      if (++kept != first) {
+        *kept = std::move(*first);
+      }
+      ++kept_pos;
+      origin[kept_pos] = sorted_indices_vec[scan_pos];
+    }
+    inverse[sorted_indices_vec[scan_pos]] = kept_pos;
+    counts[kept_pos] += 1;  // assumes counts_vec was zero-filled by the caller
+  }
+  return ++kept;
+}
+
+// Copies src into dst's host buffer; dst is not resized here, so it is assumed
+// to already be shaped for src.size() elements (TODO confirm with callers).
+template <class T>
+void TensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
+  const auto bytes = src.size() * sizeof(T);
+  void* to = dst->mutable_data<T>();
+  lite::TargetWrapperHost::MemcpySync(to, src.data(), bytes, IoDirection::HtoH);
+}
+
+// Permutes the axes of a dense row-major host tensor.
+//
+// Output axis j is taken from input axis orders[j], i.e. for every output
+// coordinate c the element read is input[c'] with c'[orders[j]] = c[j].
+// Example: input dims [2, 3, 4] with orders {1, 0, 2} and output dims
+// [3, 2, 4] gives output(j, i, k) == input(i, j, k).
+//
+// Parameters:
+//   T      : element type of both tensors.
+//   input  : source tensor, read as T.
+//   output : destination tensor, written as T. Its dims are read as-is, so
+//            the caller must already have resized it to the permuted shape
+//            (output->dims()[j] == input.dims()[orders[j]]).
+//   orders : one input-axis index per axis of `input`.
+//
+// None of these preconditions are checked here.
+//
+// Strides are computed from the actual dims, so any rank is supported.
+template <typename T>
+void TransCompute(const Tensor &input,
+                  Tensor *output,
+                  const std::vector<int> &orders) {
+  auto in_dims = input.dims();
+  auto out_dims = output->dims();
+  const int num_axes = static_cast<int>(in_dims.size());
+  const int64_t count = in_dims.production();
+
+  const T *din = input.data<T>();
+  T *dout = output->mutable_data<T>();
+
+  // Row-major element strides of the input: the innermost axis has stride 1
+  // and each outer axis spans the product of all dims inside it.
+  std::vector<int64_t> old_steps(num_axes, 1);
+  for (int i = num_axes - 2; i >= 0; --i) {
+    old_steps[i] = old_steps[i + 1] * in_dims[i + 1];
+  }
+
+  // Same for the output layout.
+  std::vector<int64_t> new_steps(num_axes, 1);
+  for (int i = num_axes - 2; i >= 0; --i) {
+    new_steps[i] = new_steps[i + 1] * out_dims[i + 1];
+  }
+
+  for (int64_t i = 0; i < count; ++i) {
+    // Peel the output offset into per-axis coordinates and accumulate the
+    // offset of the matching input element.
+    int64_t old_idx = 0;
+    int64_t idx = i;
+    for (int j = 0; j < num_axes; ++j) {
+      const int order = orders[j];
+      old_idx += (idx / new_steps[j]) * old_steps[order];
+      idx %= new_steps[j];
+    }
+    dout[i] = din[old_idx];
+  }
+}
+
+lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) {
+  const auto rows = src.Slice(0, num_col_dims).production();  // leading dims
+  return lite::DDim(rows, src.Slice(num_col_dims, src.size()).production());
+}
+
+// Axis-wise unique: de-duplicates the slices of `in` taken along `axis`.
+// Slices are sorted lexicographically before duplicates are dropped, and the
+// survivors are written to `out`. `index` (inverse map), `count` and
+// `indices` are filled only when the corresponding return_* flag is set.
+template <typename InT, typename IndexT>
+void UniqueDimFunc(const lite::Tensor& in,
+                   lite::Tensor* out,
+                   lite::Tensor* index,
+                   lite::Tensor* indices,
+                   lite::Tensor* count,
+                   int axis,
+                   bool return_index,
+                   bool return_inverse,
+                   bool return_counts) {
+  if (axis < 0) axis += static_cast<int>(in.dims().size());  // negative axis
+  // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
+  std::vector<int> permute(in.dims().size());
+  std::iota(permute.begin(), permute.end(), 0);
+  permute[axis] = 0;
+  permute[0] = axis;
+  std::vector<int64_t> in_trans_dim_vec(in.dims());
+  in_trans_dim_vec[axis] = in.dims()[0];
+  in_trans_dim_vec[0] = in.dims()[axis];
+  lite::Tensor in_trans;
+  lite::DDim in_trans_dims = in_trans_dim_vec;
+  in_trans.Resize(in_trans_dims);
+  in_trans.mutable_data<InT>();
+  TransCompute<InT>(in, &in_trans, permute);
+  // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
+  lite::DDim in_trans_flat_dims = FlattenTo2d(in_trans_dims, 1);
+  in_trans.Resize(in_trans_flat_dims);
+
+  // sort row indices by lexicographic comparison of the flattened rows
+  std::vector<IndexT> sorted_indices_vec(in_trans.dims()[0]);
+  std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
+  int64_t col = in_trans.dims()[1];
+  const InT* in_trans_data = in_trans.data<InT>();
+  std::sort(sorted_indices_vec.begin(),
+            sorted_indices_vec.end(),
+            [&](int64_t a, int64_t b) -> bool {
+              for (int64_t i = 0; i < col; ++i) {
+                InT lhs = in_trans_data[i + a * col];
+                InT rhs = in_trans_data[i + b * col];
+                if (lhs < rhs) {
+                  return true;
+                } else if (lhs > rhs) {
+                  return false;
+                }
+              }
+              return false;
+            });
+
+  // gather rows in sorted order (mutable_data() yields the writable pointer)
+  lite::Tensor input_sorted;
+  input_sorted.Resize(in_trans_dims);
+  InT* input_sorted_data = input_sorted.mutable_data<InT>();
+  for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
+    memcpy(input_sorted_data + i * col,
+           in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
+           col * sizeof(InT));
+  }
+
+  std::vector<lite::Tensor> input_unbind = Unbind(input_sorted);
+  std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
+  std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
+  std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
+  // forward IndexT so the helper's index type matches the vectors above
+  auto last = UniqueDimImpl<std::vector<lite::Tensor>::iterator, InT, IndexT>(
+      input_unbind.begin(), input_unbind.end(), sorted_indices_vec,
+      &inverse_vec, &counts_vec, &indices_vec);
+  input_unbind.erase(last, input_unbind.end());
+  counts_vec.resize(input_unbind.size());
+  indices_vec.resize(input_unbind.size());
+
+  lite::Tensor out_trans;
+  std::vector<int64_t> out_trans_dims_vec = in_trans_dim_vec;
+  out_trans_dims_vec[0] = input_unbind.size();
+  out_trans.Resize(out_trans_dims_vec);
+  out_trans.mutable_data<InT>();
+  std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
+  out->Resize(out_trans_dims_vec);
+  out->mutable_data<InT>();
+  lite::host::math::concat_func<InT>(input_unbind, 0, &out_trans);
+  TransCompute<InT>(out_trans, out, permute);
+
+  if (return_inverse) {
+    TensorFromVector(inverse_vec, index);
+  }
+
+  if (return_counts) {
+    TensorFromVector(counts_vec, count);
+  }
+
+  if (return_index) {
+    TensorFromVector(indices_vec, indices);
+  }
+}
+
+// Kernel entry point for the `unique` op. Selects the <input type, index type>
+// instantiation from X's precision and the `dtype` attribute, then runs
+// UniqueFunc (is_sorted == false), the flattened or the axis-wise variant.
+void UniqueCompute::Run() {
+  auto& param = Param<operators::UniqueParam>();
+  auto x = param.X;
+  auto output = param.Out;
+  auto index = param.Index;
+  auto indices = param.Indices;
+  auto count = param.Counts;
+  auto return_index = param.return_index;
+  auto return_inverse = param.return_inverse;
+  auto return_counts = param.return_counts;
+
+  // `dtype` is a framework VarType code (2 = INT32, 3 = INT64), not a
+  // lite_api::PrecisionType, so compare it against those codes directly.
+  const bool index_is_int32 = (param.dtype == 2);
+  lite_api::PrecisionType type = x->precision();
+  CHECK_EQ(index_is_int32 || param.dtype == 3, true)
+      << "Index holds the wrong type, it holds " << param.dtype
+      << " but desires to be int32 or int64";
+
+  if (!param.is_sorted) {
+    if (index_is_int32) {
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueFunc<float, int32_t>(x, output, index, count);
+          break;
+        case PRECISION(kInt32):
+          UniqueFunc<int32_t, int32_t>(x, output, index, count);
+          break;
+        case PRECISION(kInt64):
+          UniqueFunc<int64_t, int32_t>(x, output, index, count);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+      }
+    } else {
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueFunc<float, int64_t>(x, output, index, count);
+          break;
+        case PRECISION(kInt32):
+          UniqueFunc<int32_t, int64_t>(x, output, index, count);
+          break;
+        case PRECISION(kInt64):
+          UniqueFunc<int64_t, int64_t>(x, output, index, count);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+      }
+    }
+    return;
+  }
+
+  if (x->numel() == 0) {
+    // Nothing to de-duplicate: only materialise Out with X's element type.
+    if (type == PRECISION(kFloat)) output->mutable_data<float>();
+    if (type == PRECISION(kInt32)) output->mutable_data<int32_t>();
+    if (type == PRECISION(kInt64)) output->mutable_data<int64_t>();
+    return;
+  }
+  if (param.axis.empty()) {
+    if (index_is_int32) {
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueFlattendTensorFunc<float, int32_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt32):
+          UniqueFlattendTensorFunc<int32_t, int32_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt64):
+          UniqueFlattendTensorFunc<int64_t, int32_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+      }
+    } else {
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueFlattendTensorFunc<float, int64_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt32):
+          UniqueFlattendTensorFunc<int32_t, int64_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt64):
+          UniqueFlattendTensorFunc<int64_t, int64_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+      }
+    }
+  } else {
+    // UniqueDimFunc takes its input by const reference, hence the `*x`.
+    int axis = param.axis[0];
+    if (index_is_int32) {
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueDimFunc<float, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt32):
+          UniqueDimFunc<int32_t, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt64):
+          UniqueDimFunc<int64_t, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+      }
+    } else {
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueDimFunc<float, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt32):
+          UniqueDimFunc<int32_t, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          break;
+        case PRECISION(kInt64):
+          UniqueDimFunc<int64_t, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+      }
+    }
+  }
+}
+
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// Output names here must match the ones UniqueOp::AttachImpl looks up.
+REGISTER_LITE_KERNEL(unique,
+                     kHost,
+                     kAny,
+                     kAny,
+                     paddle::lite::kernels::host::UniqueCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kAny))})
+    .BindOutput("Index",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .BindOutput("Indices",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .BindOutput("Counts",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
\ No newline at end of file
diff --git a/lite/kernels/host/unique_compute.h b/lite/kernels/host/unique_compute.h
new file mode 100644
index 00000000000..631eb5b5682
--- /dev/null
+++ b/lite/kernels/host/unique_compute.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+// Host kernel for the `unique` operator; the computation lives in Run().
+class UniqueCompute
+    : public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+  virtual ~UniqueCompute() = default;
+  void Run() override;
+};
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt
index 9138e42ffe0..8ff0540a311 100755
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -206,6 +206,7 @@ add_operator(one_hot_v2_op extra SRCS one_hot_v2_op.cc)
 add_operator(strided_slice_op extra SRCS strided_slice_op.cc)
 add_operator(where_op extra SRCS where_op.cc)
 add_operator(unique_with_counts_op extra SRCS unique_with_counts_op.cc)
+add_operator(unique_op extra SRCS unique_op.cc)
 
 # for content-dnn specific
 add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc)
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index d09d9ffff54..1b9e121e4b7 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -2284,6 +2284,21 @@ struct UniqueWithCountsParam : ParamBase {
   lite::Tensor* Count{};
 };
 
+/// --------------- unique operators ---------------
+struct UniqueParam : ParamBase {
+  const lite::Tensor* X{};  // input tensor
+  lite::Tensor* Out{};  // de-duplicated result
+  lite::Tensor* Index{}; // inverse mapping; see return_inverse
+  lite::Tensor* Indices{}; // filled when return_index is set
+  lite::Tensor* Counts{};  // filled when return_counts is set
+  int dtype{-1};  // selects int32 / int64 for the index outputs; -1 = unset
+  bool return_index{false}; // requests Indices
+  bool return_inverse{false}; // requests Index
+  bool return_counts{false};  // requests Counts
+  std::vector<int> axis{};  // empty: flattened input; otherwise axis[0] is used
+  bool is_sorted{false};  // false: the kernel takes the UniqueFunc path
+};
+
 struct GaussRandomParam : ParamBase {
   const lite::Tensor* ShapeTensor{nullptr};
   std::vector<const lite::Tensor*> ShapeTensorList{};
diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc
new file mode 100644
index 00000000000..3cab23bbc81
--- /dev/null
+++ b/lite/operators/unique_op.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/unique_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Validates the tensors the kernel dereferences: X and Out are always
+// required, while Indices / Index / Counts are only required when the
+// matching return_* attribute asks for them.
+bool UniqueOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+
+  // Optional outputs.
+  if (param_.return_index) { CHECK_OR_FALSE(param_.Indices); }
+  if (param_.return_inverse) { CHECK_OR_FALSE(param_.Index); }
+  if (param_.return_counts) { CHECK_OR_FALSE(param_.Counts); }
+
+  return true;
+}
+
+bool UniqueOp::InferShapeImpl() const {
+  // Optional outputs stay null unless the op desc declares them; skip those.
+  DDim in_dims = param_.X->dims();
+  for (auto* t : {param_.Out, param_.Index, param_.Indices, param_.Counts}) {
+    if (t != nullptr) t->Resize(in_dims);
+  }
+  return true;
+}
+
+// Binds the tensors and attributes named in `opdesc` to param_. X and Out
+// are mandatory; Index / Indices / Counts are attached only when declared.
+bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  param_.X = scope->FindTensor(opdesc.Input("X").front());
+  CHECK(param_.X) << "Input(X) of UniqueOp should not be null.";
+  param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front());
+  CHECK(param_.Out) << "Output(Out) of UniqueOp should not be null.";
+  if (opdesc.HasOutput("Index")) {
+    param_.Index = scope->FindMutableTensor(opdesc.Output("Index").front());
+    CHECK(param_.Index) << "Output(Index) of UniqueOp should not be null.";
+  }
+  if (opdesc.HasOutput("Indices")) {
+    param_.Indices = scope->FindMutableTensor(opdesc.Output("Indices").front());
+    CHECK(param_.Indices) << "Output(Indices) of UniqueOp should not be null.";
+  }
+  if (opdesc.HasOutput("Counts")) {
+    param_.Counts = scope->FindMutableTensor(opdesc.Output("Counts").front());
+    CHECK(param_.Counts) << "Output(Counts) of UniqueOp should not be null.";
+  }
+  // Guarded attributes keep their UniqueParam defaults when absent.
+  if (opdesc.HasAttr("dtype")) {
+    param_.dtype = opdesc.GetAttr<int>("dtype");
+  }
+  if (opdesc.HasAttr("return_index")) {
+    param_.return_index = opdesc.GetAttr<bool>("return_index");
+  }
+  if (opdesc.HasAttr("return_inverse")) {
+    param_.return_inverse = opdesc.GetAttr<bool>("return_inverse");
+  }
+  if (opdesc.HasAttr("return_counts")) {
+    param_.return_counts = opdesc.GetAttr<bool>("return_counts");
+  }
+  param_.axis = opdesc.GetAttr<std::vector<int>>("axis");
+  if (opdesc.HasAttr("is_sorted")) {
+    param_.is_sorted = opdesc.GetAttr<bool>("is_sorted");
+  }
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(unique, paddle::lite::operators::UniqueOp);
diff --git a/lite/operators/unique_op.h b/lite/operators/unique_op.h
new file mode 100644
index 00000000000..c9e302b7566
--- /dev/null
+++ b/lite/operators/unique_op.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Operator-side definition of `unique`: holds the UniqueParam that AttachImpl
+// fills from the op desc and hands it to the kernel in AttachKernel.
+class UniqueOp : public OpLite {
+ public:
+  UniqueOp() {}
+  explicit UniqueOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "unique"; }
+  // Out carries the same precision as X.
+  bool InferType() override {
+    const auto in_precision = param_.X->precision();
+    param_.Out->set_precision(in_precision);
+    return true;
+  }
+
+ protected:
+  mutable UniqueParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
\ No newline at end of file

From 7b4b379d494cee3eed460122516b552143c399ca Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Thu, 8 Dec 2022 18:58:31 +0800
Subject: [PATCH 02/10] add and update unique_op

---
 lite/backends/arm/math/dotprod/gemm_sdot.h    | 442 +++++++++++++
 lite/backends/arm/math/dotprod/gemm_vsdot.h   |  54 ++
 .../cxx/mobile_light/mobilenetv1_light_api.cc | 620 +++++++++---------
 .../x86_mobilenetv1_full_demo/CMakeLists.txt  |  73 ---
 .../x86_mobilenetv1_light_demo/CMakeLists.txt |  73 ---
 lite/kernels/host/index_select_compute.cc     |  13 +
 lite/kernels/host/unique_compute.cc           | 142 ++--
 7 files changed, 894 insertions(+), 523 deletions(-)
 create mode 100644 lite/backends/arm/math/dotprod/gemm_sdot.h
 create mode 100644 lite/backends/arm/math/dotprod/gemm_vsdot.h
 delete mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
 delete mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt

diff --git a/lite/backends/arm/math/dotprod/gemm_sdot.h b/lite/backends/arm/math/dotprod/gemm_sdot.h
new file mode 100644
index 00000000000..1eea169b15f
--- /dev/null
+++ b/lite/backends/arm/math/dotprod/gemm_sdot.h
@@ -0,0 +1,442 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// clang-format off
+#define GEMM_SDOT_INT8_KERNEL                                              \
+  "ldp    q0, q1, [%[a_ptr]], #32\n"     /* load a00,a01 to q0, q1*/       \
+  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b0, b1 to q4, q5*/        \
+  "eor    v8.16b,  v8.16b, v8.16b\n"     /* out0 = 0 */                    \
+  "eor    v9.16b,  v9.16b, v9.16b\n"     /* out1 = 0 */                    \
+  "eor    v10.16b,  v10.16b, v10.16b\n"  /* out2 = 0 */                    \
+  "eor    v11.16b,  v11.16b, v11.16b\n"  /* out3 = 0 */                    \
+  "eor    v12.16b,  v12.16b, v12.16b\n"  /* out4 = 0 */                    \
+  "prfm   pldl1keep, [%[b_ptr], #64]\n"  /* preload b*/                    \
+  "eor    v13.16b,  v13.16b, v13.16b\n"  /* out5 = 0 */                    \
+  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                    \
+  "eor    v14.16b,  v14.16b, v14.16b\n"  /* out6 = 0 */                    \
+  "prfm   pldl1keep, [%[b_ptr], #128]\n" /* preload b*/                    \
+  "eor    v15.16b,  v15.16b, v15.16b\n"  /* out7 = 0 */                    \
+  "prfm   pldl1keep, [%[a_ptr], #128]\n" /* preload a*/                    \
+  "eor    v16.16b,  v16.16b, v16.16b\n"  /* out8 = 0 */                    \
+  "prfm   pldl1keep, [%[b_ptr], #192]\n" /* preload b*/                    \
+  "eor    v17.16b,  v17.16b, v17.16b\n"  /* out9 = 0 */                    \
+  "prfm   pldl1keep, [%[b_ptr], #256]\n" /* preload b*/                    \
+  "eor    v18.16b,  v18.16b, v18.16b\n"  /* out10 = 0 */                   \
+  "prfm   pldl1keep, [%[a_ptr], #192]\n" /* preload a*/                    \
+  "eor    v19.16b,  v19.16b, v19.16b\n"  /* out11 = 0 */                   \
+  "prfm   pldl1keep, [%[b_ptr], #320]\n" /* preload b*/                    \
+  "eor    v20.16b,  v20.16b, v20.16b\n"  /* out12 = 0 */                   \
+  "prfm   pldl1keep, [%[a_ptr], #256]\n" /* preload a*/                    \
+  "eor    v21.16b,  v21.16b, v21.16b\n"  /* out13 = 0 */                   \
+  "prfm   pldl1keep, [%[b_ptr], #384]\n" /* preload b*/                    \
+  "eor    v22.16b,  v22.16b, v22.16b\n"  /* out14 = 0 */                   \
+  "eor    v23.16b,  v23.16b, v23.16b\n"  /* out15 = 0 */                   \
+  "eor    v24.16b,  v24.16b, v24.16b\n"  /* out16 = 0 */                   \
+  "eor    v25.16b,  v25.16b, v25.16b\n"  /* out17 = 0 */                   \
+  "eor    v26.16b,  v26.16b, v26.16b\n"  /* out18 = 0 */                   \
+  "eor    v27.16b,  v27.16b, v27.16b\n"  /* out19 = 0 */                   \
+  "eor    v28.16b,  v28.16b, v28.16b\n"  /* out20 = 0 */                   \
+  "eor    v29.16b,  v29.16b, v29.16b\n"  /* out21 = 0 */                   \
+  "eor    v30.16b,  v30.16b, v30.16b\n"  /* out22 = 0 */                   \
+  "eor    v31.16b,  v31.16b, v31.16b\n"  /* out23 = 0 */                   \
+  "cbz    %w[k], 2f\n" /* check loop count > 0 */                          \
+  /* main loop, unrool 0*/                                                 \
+  "1:\n"                                 /* main loop */                   \
+".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
+  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b2, b0 to q6, q7       */ \
+".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
+  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q3, q4     */ \
+".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
+".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
+".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
+".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
+".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
+".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
+".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
+".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
+".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
+  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b1, b2 to q4, q5       */ \
+".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\
+".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\
+  "prfm   pldl1keep, [%[b_ptr], #384]\n"                                   \
+".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\
+".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\
+".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\
+".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\
+".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\
+".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\
+  "ldp    q0, q1, [%[a_ptr]], #32\n"    /* load a00, a01 to q0, q1 */      \
+  /* unrool 1 */                                                           \
+".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\
+".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\
+".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\
+  "prfm   pldl1keep, [%[a_ptr], #256]\n"                                   \
+".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\
+".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\
+".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\
+".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\
+".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\
+  "ldp    q6, q7, [%[b_ptr]], #32\n"    /* load b0, b1 to q6, q7       */  \
+".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\
+".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\
+".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\
+".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\
+".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\
+".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\
+".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\
+".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\
+".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\
+".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\
+".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\
+".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\
+".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\
+".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\
+".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\
+".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\
+  "ldp    q4, q5, [%[b_ptr]], #32\n"    /* load b2, b0 to q4, q5 */        \
+  /* unrool 2*/                                                            \
+".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\
+".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\
+  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q3, q4*/      \
+".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\
+".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\
+".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\
+".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\
+".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\
+".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\
+".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\
+".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\
+  "prfm   pldl1keep, [%[b_ptr], #384]\n"                                   \
+".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\
+".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\
+".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\
+".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\
+".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\
+".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\
+  "ldp    q6, q7, [%[b_ptr]], #32\n"    /* load b1, b2 to q6, q7*/         \
+".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\
+".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\
+".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\
+  "ldp    q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/          \
+  /* unrool 3*/                                                            \
+".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\
+".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\
+".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\
+".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\
+".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\
+".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\
+".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\
+".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\
+  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b0, b1 to q4, q5*/        \
+".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\
+".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\
+  "prfm   pldl1keep, [%[a_ptr], #256]\n"                                   \
+".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\
+".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\
+".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\
+".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\
+".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\
+  "prfm   pldl1keep, [%[b_ptr], #384]\n"                                   \
+".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\
+".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\
+".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\
+".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\
+".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\
+".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\
+".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\
+  "subs   %w[k], %w[k], #1\n"           /* loop count - 1*/                \
+".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\
+".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\
+  "bne    1b\n" /* Target to use when K is 1 or 2 */                       \
+  "2:\n"                                             /* process tail*/     \
+  "subs       %w[tail], %w[tail], #1\n"              /* tail--*/           \
+  "beq        3f\n" /*jump to tail = 1*/                                   \
+  /* final unrool 0, unrool 0, tail > 1*/                                  \
+".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
+  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b2, b0 to q6, q7*/        \
+".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
+  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q2, q3*/      \
+".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
+  "subs   %w[tail], %w[tail], #1\n"      /* tail--*/                       \
+".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
+".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
+".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
+".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
+".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
+".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
+".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
+".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
+  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b1, b2 to q4, q5*/        \
+".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\
+".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\
+".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\
+".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\
+".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\
+".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\
+".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\
+".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\
+  "beq        4f\n" /*jump to tail = 2*/                                   \
+  /* unroll 1, tail > 2*/                                                  \
+  "ldp    q0, q1, [%[a_ptr]], #32\n"     /* load a00, a01 to q0, q1*/      \
+".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\
+".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\
+".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\
+".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\
+".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\
+".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\
+".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\
+".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\
+  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b0, b1 to q6, q7*/        \
+".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\
+".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\
+".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\
+".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\
+".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\
+".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\
+".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\
+".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\
+  "subs   %w[tail], %w[tail], #1\n"      /* tail--*/                       \
+".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\
+".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\
+".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\
+".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\
+".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\
+".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\
+".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\
+".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\
+  "beq        5f\n" /*jump to tail = 3*/                                   \
+  /* unroll 2, tail = 4*/                                                  \
+  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b2, b0 to q4, q5*/        \
+".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\
+".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\
+  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q3, q4*/      \
+".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\
+".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\
+".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\
+".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\
+".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\
+".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\
+".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\
+".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\
+".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\
+".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\
+".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\
+".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\
+".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\
+".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\
+  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b1, b2 to q6, q7*/        \
+".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\
+".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\
+".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\
+  /* unroll 3, tail = 4*/                                                  \
+".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\
+".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\
+".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\
+".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\
+".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\
+".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\
+".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\
+".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\
+".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\
+".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\
+".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\
+".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\
+".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\
+".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\
+".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\
+".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\
+".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\
+".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\
+".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\
+".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\
+".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\
+".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\
+".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\
+".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\
+  "b      11f\n"                         /* tails==1 final tail*/          \
+  "3: \n"                                /* tail=1*/                       \
+  "ldr    q6, [%[b_ptr]], #16\n"         /* load b2 to q6*/                \
+".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
+".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
+".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
+".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
+".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
+".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
+".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
+".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
+".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
+".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
+".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
+".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\
+".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\
+".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\
+".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\
+".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\
+".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\
+".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\
+".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\
+  "b      11f\n"                         /* tails==2 final tail*/          \
+  "4:\n"                                 /* tail = 2*/                     \
+".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\
+".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\
+".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\
+".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\
+".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\
+".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\
+".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\
+".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\
+".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\
+".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\
+".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\
+".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\
+".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\
+".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\
+".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\
+".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\
+".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\
+".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\
+".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\
+".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\
+".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\
+".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\
+".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\
+".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\
+  "b      11f\n"                         /* tails==3 final tail*/          \
+  "5:\n"                                 /* tail = 3*/                     \
+  "ldr    q4, [%[b_ptr]], #16\n"         /* load b2, b0 to q4*/            \
+".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\
+".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\
+".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\
+".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\
+".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\
+".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\
+".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\
+".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\
+".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\
+".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\
+".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\
+".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\
+".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\
+".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\
+".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\
+".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\
+".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\
+".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\
+".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\
+  "11: \n"                               /* end */
+
+#define GEMM_SDOT_INT8_KERNEL_8x8 /* clears 16 accumulators, then runs the sdot loop k times; NOTE(review): no k == 0 guard before label 1 - presumably the caller guarantees k >= 1 */ \
+  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* prefetch a_ptr + 64 */          \
+  "eor    v8.16b,  v8.16b,  v8.16b \n"     /* clear accumulator v8 */       \
+  "eor    v11.16b, v11.16b, v11.16b\n"     /* clear accumulator v11 */      \
+  "eor    v14.16b, v14.16b, v14.16b\n"     /* clear accumulator v14 */      \
+  "eor    v17.16b, v17.16b, v17.16b\n"     /* clear accumulator v17 */      \
+  "eor    v20.16b, v20.16b, v20.16b\n"     /* clear accumulator v20 */      \
+  "eor    v23.16b, v23.16b, v23.16b\n"     /* clear accumulator v23 */      \
+  "eor    v26.16b, v26.16b, v26.16b\n"     /* clear accumulator v26 */      \
+  "eor    v29.16b, v29.16b, v29.16b\n"     /* clear accumulator v29 */      \
+  "prfm   pldl1keep, [%[b_ptr], #64]\n"  /* prefetch b_ptr + 64 */          \
+  "eor    v9.16b,  v9.16b,  v9.16b \n"     /* clear accumulator v9 */       \
+  "eor    v12.16b, v12.16b, v12.16b\n"     /* clear accumulator v12 */      \
+  "eor    v15.16b, v15.16b, v15.16b\n"     /* clear accumulator v15 */      \
+  "eor    v18.16b, v18.16b, v18.16b\n"     /* clear accumulator v18 */      \
+  "eor    v21.16b, v21.16b, v21.16b\n"     /* clear accumulator v21 */      \
+  "eor    v24.16b, v24.16b, v24.16b\n"     /* clear accumulator v24 */      \
+  "eor    v27.16b, v27.16b, v27.16b\n"     /* clear accumulator v27 */      \
+  "eor    v30.16b, v30.16b, v30.16b\n"     /* clear accumulator v30 */      \
+  "1:\n"                                 /* loop head, one pass per k */    \
+  "ldp    q0, q1, [%[a_ptr]], #32\n"     /* load 32 bytes of a into q0, q1; a_ptr += 32 */ \
+  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load 32 bytes of b into q4, q5; b_ptr += 32 */ \
+".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
+".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
+  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* prefetch a_ptr + 64 */          \
+".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
+".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
+  "prfm   pldl1keep, [%[a_ptr], #128]\n"  /* prefetch a_ptr + 128 */        \
+  "prfm   pldl1keep, [%[b_ptr], #64]\n"  /* prefetch b_ptr + 64 */          \
+".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
+".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
+".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
+".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
+  "prfm   pldl1keep, [%[b_ptr], #128]\n"  /* prefetch b_ptr + 128 */        \
+".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
+".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
+".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
+".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
+  "subs %w[k], %w[k], #1\n"              /* k-- and set flags */            \
+  "bne 1b\n"                             /* repeat while k != 0 */
+
+#define GEMM_SDOT_INT8_KERNEL_8x4 /* clears 8 accumulators, then runs the sdot loop k times; NOTE(review): no k == 0 guard before label 1 - presumably the caller guarantees k >= 1 */ \
+  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* prefetch a_ptr + 64 */           \
+  "eor    v8.16b,  v8.16b,  v8.16b \n"     /* clear accumulator v8 */        \
+  "eor    v11.16b, v11.16b, v11.16b\n"     /* clear accumulator v11 */       \
+  "eor    v14.16b, v14.16b, v14.16b\n"     /* clear accumulator v14 */       \
+  "eor    v17.16b, v17.16b, v17.16b\n"     /* clear accumulator v17 */       \
+  "prfm   pldl1keep, [%[b_ptr], #32]\n"  /* prefetch b_ptr + 32 */           \
+  "eor    v20.16b, v20.16b, v20.16b\n"     /* clear accumulator v20 */       \
+  "eor    v23.16b, v23.16b, v23.16b\n"     /* clear accumulator v23 */       \
+  "eor    v26.16b, v26.16b, v26.16b\n"     /* clear accumulator v26 */       \
+  "eor    v29.16b, v29.16b, v29.16b\n"     /* clear accumulator v29 */       \
+  "1:\n"                              /* loop head, one pass per k */ \
+  "ldp    q0, q1, [%[a_ptr]], #32\n"  /* load 32 bytes of a into q0, q1; a_ptr += 32 */ \
+  "ldr    q4,  [%[b_ptr]], #16\n"     /* load 16 bytes of b into q4; b_ptr += 16 */ \
+".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
+".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
+  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* prefetch a_ptr + 64 */           \
+".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
+".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
+  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* same address as the prefetch three lines up; NOTE(review): possibly meant #128 as in the 8x8 kernel - confirm */ \
+".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
+".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
+  "prfm   pldl1keep, [%[b_ptr], #32]\n"  /* prefetch b_ptr + 32 */           \
+".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
+".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
+  "subs %w[k], %w[k], #1\n"           /* k-- and set flags */ \
+  "bne 1b\n"                          /* repeat while k != 0 */
diff --git a/lite/backends/arm/math/dotprod/gemm_vsdot.h b/lite/backends/arm/math/dotprod/gemm_vsdot.h
new file mode 100644
index 00000000000..9929ade9b95
--- /dev/null
+++ b/lite/backends/arm/math/dotprod/gemm_vsdot.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// clang-format off
+#define GEMM_DOT_INT8_KERNEL /* clears q4-q15, then runs the vsdot loop k times (skipped when k == 0); NOTE(review): a is loaded once up front and again at the end of every pass, so one extra 24-byte step of a_ptr is read - confirm the buffer is padded */ \
+  "vld1.s8  {q0}, [%[a_ptr]]!  \n"     /* load 16 bytes of a into q0 */  \
+  "vld1.s8  {d2}, [%[a_ptr]]!  \n"     /* load 8 bytes of a into d2 */   \
+  "veor.s32    q4,  q4, q4     \n"     /* clear accumulator q4 */        \
+  "veor.s32    q5,  q5, q5     \n"     /* clear accumulator q5 */        \
+  "veor.s32    q6,  q6, q6     \n"     /* clear accumulator q6 */        \
+  "veor.s32    q7,  q7, q7     \n"     /* clear accumulator q7 */        \
+  "veor.s32    q8,  q8, q8     \n"     /* clear accumulator q8 */        \
+  "veor.s32    q9,  q9, q9     \n"     /* clear accumulator q9 */        \
+  "veor.s32    q10,  q10, q10  \n"     /* clear accumulator q10 */       \
+  "veor.s32    q11,  q11, q11  \n"     /* clear accumulator q11 */       \
+  "veor.s32    q12,  q12, q12  \n"     /* clear accumulator q12 */       \
+  "veor.s32    q13,  q13, q13  \n"     /* clear accumulator q13 */       \
+  "veor.s32    q14,  q14, q14  \n"     /* clear accumulator q14 */       \
+  "veor.s32    q15,  q15, q15  \n"     /* clear accumulator q15 */       \
+  "cmp   %[k], #0              \n"     /* nothing to accumulate? */      \
+  "beq   2f                    \n"     /* k == 0: skip the loop */       \
+  "1:                          \n"     /* loop head, one pass per k */   \
+  "vld1.s8  {q2}, [%[b_ptr]]!  \n"     /* load 16 bytes of b into q2 */  \
+  "vld1.s8  {q3}, [%[b_ptr]]!  \n"     /* load 16 bytes of b into q3 */  \
+".word 0x8d40fe24\n" /* vsdot.s8 q4, q2, d0[0] */\
+".word 0xcd60fe24\n" /* vsdot.s8 q6, q2, d0[1] */\
+".word 0x0d41fe64\n" /* vsdot.s8 q8, q2, d1[0] */\
+".word 0x4d61fe64\n" /* vsdot.s8 q10, q2, d1[1] */\
+".word 0x8d42fe64\n" /* vsdot.s8 q12, q2, d2[0] */\
+".word 0xcd62fe64\n" /* vsdot.s8 q14, q2, d2[1] */\
+".word 0xad40fe26\n" /* vsdot.s8 q5, q3, d0[0] */\
+".word 0xed60fe26\n" /* vsdot.s8 q7, q3, d0[1] */\
+".word 0x2d41fe66\n" /* vsdot.s8 q9, q3, d1[0] */\
+".word 0x6d61fe66\n" /* vsdot.s8 q11, q3, d1[1] */\
+".word 0xad42fe66\n" /* vsdot.s8 q13, q3, d2[0] */\
+".word 0xed62fe66\n" /* vsdot.s8 q15, q3, d2[1] */\
+  "vld1.s8  {q0}, [%[a_ptr]]!  \n"     /* next 16 bytes of a into q0 */  \
+  "vld1.s8  {d2}, [%[a_ptr]]!  \n"     /* next 8 bytes of a into d2 */   \
+  "subs %[k], %[k], #1         \n"     /* k-- and set flags */           \
+  "bne    1b                   \n"     /* repeat while k != 0 */         \
+  "2:                          \n"     /* loop exit */
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
index bb430c8d8f6..e493bebfc50 100644
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
@@ -1,364 +1,336 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sys/time.h>
-#include <time.h>
+#include <chrono>  // NOLINT(build/c++11)
 #include <cmath>
 #include <iostream>
-#include <string>
 #include <vector>
-
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include <stdexcept>
 #include "paddle_api.h"  // NOLINT
+
+#define IPTCORE_PADDLE_MOBILE
+#define IPTCORE_PADDLE_BENCHMARK
 /////////////////////////////////////////////////////////////////////////
-// If this demo is linked to static library:libpaddle_api_light_bundled.a
+// If this demo is linked to static library:libpaddle_api_full_bundled.a
 // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to
 // avoid linking errors such as `unsupport ops or kernels`.
 /////////////////////////////////////////////////////////////////////////
-// #include "paddle_use_kernels.h"  // NOLINT
-// #include "paddle_use_ops.h"      // NOLINT
-
-using namespace paddle::lite_api;  // NOLINT
+#ifdef IPTCORE_PADDLE_MOBILE
+#else
+#ifdef _WIN32
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#endif
+#endif
 
-int64_t ShapeProduction(const shape_t& shape) {
-  int64_t res = 1;
-  for (auto i : shape) res *= i;
-  return res;
-}
+#ifdef IPTCORE_PADDLE_BENCHMARK
+class Timer {  // Stopwatch over std::chrono::high_resolution_clock.
+private:
+    std::chrono::high_resolution_clock::time_point inTime, outTime;  // start / most recent stop
+
+public:
+    void startTimer() { inTime = std::chrono::high_resolution_clock::now(); }  // records the start point
+
+    // Returns the time elapsed since startTimer() in milliseconds (microsecond count / 1e3).
+    float getCostTimer() {
+        outTime = std::chrono::high_resolution_clock::now();
+        return static_cast<float>(
+            std::chrono::duration_cast<std::chrono::microseconds>(outTime - inTime)
+                .count() /
+                1e+3);  // the double divisor keeps the fractional milliseconds
+    }
+};
+#endif
 
-std::string ShapePrint(const std::vector<shape_t>& shapes) {
-  std::string shapes_str{""};
-  for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) {
-    auto shape = shapes[shape_idx];
-    std::string shape_str;
-    for (auto i : shape) {
-      shape_str += std::to_string(i) + ",";
+template<typename T>
+double compute_mean(const T* in, const size_t length) {  // arithmetic mean of in[0 .. length)
+    double sum = 0.;  // accumulated in double regardless of T
+    for (size_t i = 0; i < length; ++i) {
+        sum += in[i];
     }
-    shapes_str += shape_str;
-    shapes_str +=
-        (shape_idx != 0 && shape_idx == shapes.size() - 1) ? "" : " : ";
-  }
-  return shapes_str;
+    return sum / length;  // length == 0 gives 0.0 / 0 (NaN); no guard here
 }
 
-std::string ShapePrint(const shape_t& shape) {
-  std::string shape_str{""};
-  for (auto i : shape) {
-    shape_str += std::to_string(i) + " ";
-  }
-  return shape_str;
-}
+template<typename T>
+double compute_standard_deviation(const T* in,  // standard deviation of in[0 .. length), dividing by length
+                                  const size_t length,
+                                  bool has_mean = false,  // true: trust the caller-supplied mean
+                                  double mean = 10000) {  // only read when has_mean is true
+    if (!has_mean) {
+        mean = compute_mean<T>(in, length);  // overwrites the default placeholder
+    }
 
-std::vector<std::string> split_string(const std::string& str_in) {
-  std::vector<std::string> str_out;
-  std::string tmp_str = str_in;
-  while (!tmp_str.empty()) {
-    size_t next_offset = tmp_str.find(":");
-    str_out.push_back(tmp_str.substr(0, next_offset));
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp_str = tmp_str.substr(next_offset + 1);
+    double variance = 0.;
+    for (size_t i = 0; i < length; ++i) {
+        variance += pow((in[i] - mean), 2);  // squared deviation from the mean
     }
-  }
-  return str_out;
+    variance /= length;  // divides by length, not length - 1
+    return sqrt(variance);
 }
 
-std::vector<int64_t> get_shape(const std::string& str_shape) {
-  std::vector<int64_t> shape;
-  std::string tmp_str = str_shape;
-  while (!tmp_str.empty()) {
-    int dim = atoi(tmp_str.data());
-    shape.push_back(dim);
-    size_t next_offset = tmp_str.find(",");
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp_str = tmp_str.substr(next_offset + 1);
+int64_t shape_production(const paddle::lite_api::shape_t& shape) {  // product of all entries in shape
+    int64_t res = 1;  // an empty shape therefore yields 1
+    for (auto i : shape) {
+        res *= i;
     }
-  }
-  return shape;
+    return res;
 }
 
-template <typename T>
-double compute_mean(const T* in, const size_t length) {
-  double sum = 0.;
-  for (size_t i = 0; i < length; ++i) {
-    sum += in[i];
-  }
-  return sum / length;
+class InputData {  // One named input tensor as parsed from the sample file.
+public:
+    int _type = -1; /// -1 = unset; UserPersonaInfer::prepare sets 0 = int32, 1 = int64, 2 = float32
+    bool _lod = false;  // set to true by prepare() when the line tag is "lod_i"
+    std::vector<int64_t> _shape;  // numbers extracted from the shape column
+    std::vector<int32_t> _int32_data;  // payload when _type == 0
+    std::vector<int64_t> _int64_data;  // payload when _type == 1
+    std::vector<float> _float32_data;  // payload when _type == 2
+    std::vector<std::vector<uint64_t>> _lod_data = {{0, 1}, {0, 1}};  // fixed default; prepare() rejects any other "lods" value
+};
+
+class UserPersonaInfer {  // Owns a Paddle-Lite predictor plus the samples parsed by prepare().
+public:
+#ifdef IPTCORE_PADDLE_MOBILE
+    void create_paddle_light_predictor(const std::string& model_file);  // builds _paddle_predictor from a MobileConfig
+#else
+    void create_paddle_full_predictor(const std::string& model_dir);  // builds _paddle_predictor from a CxxConfig
+#endif
+    void prepare(const std::string& path);  // parses the tab-separated sample file at path into _batch
+    void infer();
+private:
+    void infer_specific_item(paddle::lite_api::PaddlePredictor *predictor);
+    std::shared_ptr<paddle::lite_api::PaddlePredictor> _paddle_predictor;  // assigned by the create_* method
+    std::vector<std::map<std::string, InputData> > _batch;  // one map (tensor name -> data) per sample index
+};
+
+#ifdef IPTCORE_PADDLE_MOBILE
+void UserPersonaInfer::create_paddle_light_predictor(const std::string& model_file) {  // stores the new predictor in _paddle_predictor
+    // 1. Set MobileConfig: model read from model_file, power mode fixed to LITE_POWER_HIGH
+    paddle::lite_api::MobileConfig config;
+    config.set_model_from_file(model_file);
+    config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
+    // 2. Create PaddlePredictor by MobileConfig
+    _paddle_predictor =
+        paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(config);
 }
-
-template <typename T>
-double compute_standard_deviation(const T* in,
-                                  const size_t length,
-                                  bool has_mean = false,
-                                  double mean = 10000) {
-  if (!has_mean) {
-    mean = compute_mean<T>(in, length);
-  }
-
-  double variance = 0.;
-  for (size_t i = 0; i < length; ++i) {
-    variance += pow((in[i] - mean), 2);
-  }
-  variance /= length;
-  return sqrt(variance);
+#else
+void UserPersonaInfer::create_paddle_full_predictor(const std::string& model_dir) {  // stores the new predictor in _paddle_predictor
+    // 1. Create CxxConfig: model read from model_dir, valid places are kX86 and kHost, both kFloat
+    paddle::lite_api::CxxConfig config;
+    config.set_model_dir(model_dir);
+    config.set_valid_places({paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+                                paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
+    // 2. Create PaddlePredictor by CxxConfig
+    _paddle_predictor =
+        paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);
 }
+#endif
+namespace {  // file-local helpers
+using namespace std;  // NOTE(review): names also become visible after this namespace, because an unnamed namespace is implicitly used by its enclosing scope
+template <class T>
+void extract_num(const string &str, vector<T> &results) {  // appends every value of type T found in str to results
+    stringstream ss;
 
-inline double GetCurrentUS() {
-  struct timeval time;
-  gettimeofday(&time, NULL);
-  return 1e+6 * time.tv_sec + time.tv_usec;
-}
+    /* Storing the whole string into string stream */
+    ss << str;
 
-void RunModel(std::string model_dir,
-              const std::vector<shape_t>& input_shapes,
-              size_t repeats,
-              size_t warmup,
-              size_t power_mode,
-              size_t thread_num,
-              size_t accelerate_opencl,
-              size_t print_output_elem) {
-  // 1. Set MobileConfig
-  MobileConfig config;
-  config.set_model_from_file(model_dir);
-
-#ifdef METAL
-  std::string metal_lib_path = "../../../metal/lite.metallib";
-  config.set_metal_lib_path(metal_lib_path);
-  config.set_metal_use_mps(true);
-#else
-  // NOTE: Use android gpu with opencl, you should ensure:
-  //  first, [compile **cpu+opencl** paddlelite
-  //    lib](/~https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md);
-  //  second, [convert and use opencl nb
-  //    model](/~https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md).
-
-  bool is_opencl_backend_valid =
-      ::IsOpenCLBackendValid(/*check_fp16_valid = false*/);
-  std::cout << "is_opencl_backend_valid:"
-            << (is_opencl_backend_valid ? "true" : "false") << std::endl;
-  if (is_opencl_backend_valid) {
-    if (accelerate_opencl != 0) {
-      // Set opencl kernel binary.
-      // Large addtitional prepare time is cost due to algorithm selecting and
-      // building kernel from source code.
-      // Prepare time can be reduced dramitically after building algorithm file
-      // and OpenCL kernel binary on the first running.
-      // The 1st running time will be a bit longer due to the compiling time if
-      // you don't call `set_opencl_binary_path_name` explicitly.
-      // So call `set_opencl_binary_path_name` explicitly is strongly
-      // recommended.
-
-      // Make sure you have write permission of the binary path.
-      // We strongly recommend each model has a unique binary name.
-      const std::string bin_path = "/data/local/tmp/";
-      const std::string bin_name = "lite_opencl_kernel.bin";
-      config.set_opencl_binary_path_name(bin_path, bin_name);
-
-      // opencl tune option
-      // CL_TUNE_NONE: 0
-      // CL_TUNE_RAPID: 1
-      // CL_TUNE_NORMAL: 2
-      // CL_TUNE_EXHAUSTIVE: 3
-      const std::string tuned_path = "/data/local/tmp/";
-      const std::string tuned_name = "lite_opencl_tuned.bin";
-      config.set_opencl_tune(CL_TUNE_NORMAL, tuned_path, tuned_name);
-
-      // opencl precision option
-      // CL_PRECISION_AUTO: 0, first fp16 if valid, default
-      // CL_PRECISION_FP32: 1, force fp32
-      // CL_PRECISION_FP16: 2, force fp16
-      config.set_opencl_precision(CL_PRECISION_FP16);
+    /* Running loop till the end of the stream */
+    string temp;
+    T found;
+    while (!ss.eof()) {
+
+        /* extracting word by word from stream (whitespace-delimited) */
+        ss >> temp;
+
+        /* Keep the token only if it starts with a value parseable as T. NOTE(review): a token that begins with a bracket, e.g. "(1," or "[1.0,", fails this extraction, so the first number of a bracketed list looks like it is dropped - confirm against the real input files */
+        if (stringstream(temp) >> found)
+            results.emplace_back(found);
+
+        /* To save from space at the end of string */
+        temp = "";
     }
-  } else {
-    std::cout << "*** nb model will be running on cpu. ***" << std::endl;
-    // you can give backup cpu nb model instead
-    // config.set_model_from_file(cpu_nb_model_dir);
-  }
-#endif
+}
+}
+
+void UserPersonaInfer::prepare(const std::string& path) {  // parses the sample file at path into _batch
+    /// Each line is tab-separated: tag, sample index, tensor name, dtype, shape, data - e.g. xia_i / 186 / tgt_generation_mask / float32 / (1, 1, 33) / [1.0, 1.0, ...]
+    std::ifstream in(path.c_str());
+    std::string line;
+    std::string current_idx;  // sample index (column 1) of the group being filled
+    while (std::getline(in, line)) {
+        if (line.empty()) {
+            break;  // parsing stops at the first empty line
+        }
+        if (line.back() == '\r') {
+            line.pop_back();  // tolerate CRLF line endings
+        }
+        if (line.empty()) {
+            break;  // a line that held only '\r' also ends parsing
+        }
+        std::vector<std::string> strings;  // the tab-separated columns of this line
+        std::istringstream f(line);
+        std::string s;
+        while (getline(f, s, '\t')) {
+            strings.push_back(s);
+        }
+        if (current_idx != strings.at(1)) {  // .at() throws std::out_of_range when a column is missing
+            _batch.push_back(std::map<std::string, InputData>());  // new sample index: start a new map
+            current_idx = strings[1];
+        }
+        if (strings.at(2) == "lods") {
+            if (strings.at(3) != "[[0, 1], [0, 1]]") {  // the only lod value accepted
+                throw std::invalid_argument("invalid lod");
+            }
+            continue;  // "lods" lines are validated only, nothing is stored
+        }
+        auto& input_data = _batch.back()[strings.at(2)];  // operator[] creates the entry for this tensor name if absent
+
+        extract_num(strings.at(4), input_data._shape);  // shape column
+        if (strings[0] == "lod_i") {
+            input_data._lod = true;
+        }
+        if (strings.at(3) == "int32") {  // dtype column selects the type code and the payload vector
+            input_data._type = 0;
+            extract_num(strings.at(5), input_data._int32_data);
+        } else if (strings.at(3) == "int64") {
+            input_data._type = 1;
+            extract_num(strings.at(5), input_data._int64_data);
+        } else if (strings.at(3) == "float32") {
+            input_data._type = 2;
+            extract_num(strings.at(5), input_data._float32_data);
+        } else {
+            throw std::invalid_argument("invalid type");
+        }
 
-  // NOTE: To load model transformed by model_optimize_tool before
-  // release/v2.3.0, plese use `set_model_dir` API as listed below.
-  // config.set_model_dir(model_dir);
-  config.set_power_mode(static_cast<paddle::lite_api::PowerMode>(power_mode));
-  config.set_threads(thread_num);
-  // 2. Create PaddlePredictor by MobileConfig
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<MobileConfig>(config);
-
-  // 3. Prepare input data
-  std::cout << "input_shapes.size():" << input_shapes.size() << std::endl;
-  for (int j = 0; j < input_shapes.size(); ++j) {
-    auto input_tensor = predictor->GetInput(j);
-    input_tensor->Resize(input_shapes[j]);
-    auto input_data = input_tensor->mutable_data<float>();
-    int input_num = 1;
-    for (int i = 0; i < input_shapes[j].size(); ++i) {
-      input_num *= input_shapes[j][i];
     }
+}
 
-    for (int i = 0; i < input_num; ++i) {
-      input_data[i] = 1.f;
+void UserPersonaInfer::infer_specific_item(paddle::lite_api::PaddlePredictor *predictor){
+    static int count = 0;
+    if (_batch.empty()) {
+        return;
     }
-  }
-
-  // 4. Run predictor
-  double first_duration{-1};
-  for (size_t widx = 0; widx < warmup; ++widx) {
-    if (widx == 0) {
-      auto start = GetCurrentUS();
-      predictor->Run();
-      first_duration = (GetCurrentUS() - start) / 1000.0;
-    } else {
-      predictor->Run();
+    auto &inputs = _batch[count];
+    auto names = predictor->GetInputNames();
+    for (auto &name : names) {
+        auto& input = inputs[name];
+        auto tensor = predictor->GetInputByName(name);
+        tensor->Resize(input._shape);
+        if (input._type == 0) {
+            auto input_data = tensor->mutable_data<int32_t>();
+            std::copy(input._int32_data.begin(), input._int32_data.end(), input_data);
+        } else if (input._type == 1) {
+            auto input_data = tensor->mutable_data<int64_t>();
+            std::copy(input._int64_data.begin(), input._int64_data.end(), input_data);
+        } else if (input._type == 2) {
+            auto input_data = tensor->mutable_data<float>();
+            std::copy(input._float32_data.begin(), input._float32_data.end(), input_data);
+        } else {
+            throw std::invalid_argument("invalid name");
+        }
+        if (input._lod) {
+            tensor->SetLoD(input._lod_data);
+        }
     }
-  }
-
-  double sum_duration = 0.0;  // millisecond;
-  double max_duration = 1e-5;
-  double min_duration = 1e5;
-  double avg_duration = -1;
-  for (size_t ridx = 0; ridx < repeats; ++ridx) {
-    auto start = GetCurrentUS();
 
     predictor->Run();
 
-    auto duration = (GetCurrentUS() - start) / 1000.0;
-    sum_duration += duration;
-    max_duration = duration > max_duration ? duration : max_duration;
-    min_duration = duration < min_duration ? duration : min_duration;
-    std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration
-              << " ms" << std::endl;
-    if (first_duration < 0) {
-      first_duration = duration;
+    std::cout << "\n";
+    for (int idx = 0; idx != 2; ++idx) {
+        auto output_tensor = predictor->GetOutput(idx);
+        auto total_size = shape_production(output_tensor->shape());
+        std::cout << "xiarj_" << count << "\t";
+        for (int i = 0; i < total_size; ++i) {
+            if (idx == 0) {
+                std::cout << output_tensor->data<int64_t>()[i] << "\t";
+            } else {
+                std::cout << output_tensor->data<float>()[i] << "\t";
+            }
+        }
+        std::cout << "\n";
     }
-  }
-  avg_duration = sum_duration / static_cast<float>(repeats);
-  std::cout << "\n======= benchmark summary =======\n"
-            << "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n"
-            << "model_dir:" << model_dir << "\n"
-            << "warmup:" << warmup << "\n"
-            << "repeats:" << repeats << "\n"
-            << "power_mode:" << power_mode << "\n"
-            << "thread_num:" << thread_num << "\n"
-            << "*** time info(ms) ***\n"
-            << "1st_duration:" << first_duration << "\n"
-            << "max_duration:" << max_duration << "\n"
-            << "min_duration:" << min_duration << "\n"
-            << "avg_duration:" << avg_duration << "\n";
-
-  // 5. Get output
-  std::cout << "\n====== output summary ====== " << std::endl;
-  size_t output_tensor_num = predictor->GetOutputNames().size();
-  std::cout << "output tensor num:" << output_tensor_num << std::endl;
-
-  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
-    std::unique_ptr<const paddle::lite_api::Tensor> output_tensor =
-        predictor->GetOutput(tidx);
-    std::cout << "\n--- output tensor " << tidx << " ---" << std::endl;
-    auto out_shape = output_tensor->shape();
-    auto out_data = output_tensor->data<float>();
-    auto out_mean = compute_mean<float>(out_data, ShapeProduction(out_shape));
-    auto out_std_dev = compute_standard_deviation<float>(
-        out_data, ShapeProduction(out_shape), true, out_mean);
-
-    std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl;
-    std::cout << "output tensor " << tidx
-              << " elem num:" << ShapeProduction(out_shape) << std::endl;
-    std::cout << "output tensor " << tidx
-              << " standard deviation:" << out_std_dev << std::endl;
-    std::cout << "output tensor " << tidx << " mean value:" << out_mean
-              << std::endl;
-
-    // print output
-    if (print_output_elem) {
-      for (int i = 0; i < ShapeProduction(out_shape); ++i) {
-        std::cout << "out[" << tidx << "][" << i
-                  << "]:" << output_tensor->data<float>()[i] << std::endl;
-      }
+    std::cout << std::flush;
+
+    if (++count == _batch.size()){
+        count = 0;
     }
-  }
 }
 
-int main(int argc, char** argv) {
-  std::vector<std::string> str_input_shapes;
-  std::vector<shape_t> input_shapes{
-      {1, 3, 224, 224}};  // shape_t ==> std::vector<int64_t>
-
-  int repeats = 10;
-  int warmup = 10;
-  // set arm power mode:
-  // 0 for big cluster, high performance
-  // 1 for little cluster
-  // 2 for all cores
-  // 3 for no bind
-  size_t power_mode = 0;
-  size_t thread_num = 1;
-  int accelerate_opencl = 1;
-  int print_output_elem = 0;
-
-  if (argc > 2 && argc < 9) {
-    std::cerr
-        << "usage: ./" << argv[0] << "\n"
-        << "  <naive_buffer_model_dir>\n"
-        << "  <raw_input_shapes>, eg: 1,3,224,224 for 1 input; "
-           "1,3,224,224:1,5 for 2 inputs\n"
-        << "  <repeats>, eg: 100\n"
-        << "  <warmup>, eg: 10\n"
-        << "  <power_mode>, 0: big cluster, high performance\n"
-           "                1: little cluster\n"
-           "                2: all cores\n"
-           "                3: no bind\n"
-        << "  <thread_num>, eg: 1 for single thread \n"
-        << "  <accelerate_opencl>, this option takes effect only when model "
-           "can be running on opencl backend.\n"
-           "                       0: disable opencl kernel cache & tuning\n"
-           "                       1: enable opencl kernel cache & tuning\n"
-        << "  <print_output>, 0: disable print outputs to stdout\n"
-           "                  1: enable print outputs to stdout\n"
-        << std::endl;
-    return 0;
-  }
-
-  std::string model_dir = argv[1];
-  if (argc >= 9) {
-    input_shapes.clear();
-    std::string raw_input_shapes = argv[2];
-    std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl;
-    str_input_shapes = split_string(raw_input_shapes);
-    for (size_t i = 0; i < str_input_shapes.size(); ++i) {
-      std::cout << "input shape: " << str_input_shapes[i] << std::endl;
-      input_shapes.push_back(get_shape(str_input_shapes[i]));
+void UserPersonaInfer::infer() {
+    static int idx = 0;
+    auto predictor = _paddle_predictor.get();
+    if (!predictor) {
+        return;
+    }
+    // 3. Prepare input data
+
+    // 4. Run predictor
+#ifdef IPTCORE_PADDLE_BENCHMARK
+    int warmup = 10;
+    int repeats = 400;
+    Timer timeInstance;
+    double first_duration{-1};
+    for (size_t widx = 0; widx < warmup; ++widx) {
+        if (widx == 0) {
+            timeInstance.startTimer();
+            infer_specific_item(predictor);
+            first_duration = timeInstance.getCostTimer();
+        } else {
+            infer_specific_item(predictor);
+        }
+    }
+
+    double sum_duration = 0.0;
+    double max_duration = 1e-5;
+    double min_duration = 1e5;
+    double avg_duration = -1;
+    for (size_t ridx = 0; ridx < repeats; ++ridx) {
+        timeInstance.startTimer();
+
+        infer_specific_item(predictor);
+
+        double duration = timeInstance.getCostTimer();
+        sum_duration += duration;
+        max_duration = duration > max_duration ? duration : max_duration;
+        min_duration = duration < min_duration ? duration : min_duration;
+//        std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration
+//                  << " ms" << std::endl;
+        if (first_duration < 0) {
+            first_duration = duration;
+        }
     }
+    avg_duration = sum_duration / static_cast<float>(repeats);
+    std::cout << "\n======= benchmark summary =======\n"
+              << "warmup:" << warmup << "\n"
+              << "repeats:" << repeats << "\n"
+              << "*** time info(ms) ***\n"
+              //<< "1st_duration:" << first_duration << "\n"
+              << "max_duration:" << max_duration << "\n"
+              << "min_duration:" << min_duration << "\n"
+              << "avg_duration:" << avg_duration << "\n";
+#else
+    infer_specific_item(predictor);
+#endif
+
+    // 5. Get output
+}
 
-    repeats = atoi(argv[3]);
-    warmup = atoi(argv[4]);
-    power_mode = atoi(argv[5]);
-    thread_num = atoi(argv[6]);
-    accelerate_opencl = atoi(argv[7]);
-    print_output_elem = atoi(argv[8]);
-  }
-
-  RunModel(model_dir,
-           input_shapes,
-           repeats,
-           warmup,
-           power_mode,
-           thread_num,
-           accelerate_opencl,
-           print_output_elem);
-
-  return 0;
+int main(int argc, char** argv) {
+    UserPersonaInfer user_persona_infer;
+#ifdef IPTCORE_PADDLE_MOBILE
+//    user_persona_infer.create_paddle_light_predictor(
+//        "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\model_x86.nb");
+    user_persona_infer.create_paddle_light_predictor(
+        "./model_naive_buffer_arm.nb");
+    std::cout << "xiarj" << std::endl;
+#else
+//    user_persona_infer.create_paddle_full_predictor(
+//        "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\honor_2_11\\cls_ernie_3.0_tiny_fc_ch_dy_15_3L128H_decrypt_inference_1");
+#endif
+    //user_persona_infer.prepare("D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\xia.txt");
+    user_persona_infer.prepare("./xia.txt");
+    user_persona_infer.infer();
+
+    return 0;
 }
+
diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
deleted file mode 100644
index 234ec1c85e3..00000000000
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-cmake_minimum_required(VERSION 2.8)
-project(mobilenet_full_api)
-set(TARGET mobilenet_full_api)
-
-# 1. path to Paddle-Lite lib and mklml lib
-set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
-set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
-
-if (WIN32)
-  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  set(MSVC_STATIC_CRT )
-  if(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-  else(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
-  endif(MSVC_STATIC_CRT)
-endif()
-
-if (APPLE AND METAL)
-  message(STATUS "set METAL=ON")
-  add_definitions("-DMETAL")
-  find_library(METAL_LIBRARY Metal REQUIRED)
-  find_library(GRAPHIC CoreGraphics REQUIRED)
-  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
-endif()
-
-# 2. link mklml and Paddle-Lite directory
-link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
-include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
-
-# 3. compile options 
-if (NOT WIN32)
-  add_definitions(-std=c++11 -g -O3 -pthread)
-  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
-endif()
-
-# 4.add executable output
-add_executable(${TARGET} ${TARGET}.cc)
-if (WIN32)
-  set(WITH_STATIC_MKL )
-  if(WITH_STATIC_MKL)
-    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  else()
-    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
-                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  endif()
-
-  target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib)
-  target_link_libraries(${TARGET} shlwapi.lib)
-  target_link_libraries(${TARGET} ${MATH_LIB})
-
-  add_custom_command(TARGET ${TARGET} POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-  )
-  if(NOT WITH_STATIC_MKL)
-    add_custom_command(TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-    )
-  endif()
-else()
-    if (APPLE AND METAL)
-      target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
-    endif()
-    target_link_libraries(${TARGET} -lpaddle_full_api_shared)
-    target_link_libraries(${TARGET} -liomp5)
-    target_link_libraries(${TARGET} -ldl)
-endif()
diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
deleted file mode 100644
index 3a91bfafbd3..00000000000
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-cmake_minimum_required(VERSION 2.8)
-project(mobilenet_light_api)
-set(TARGET mobilenet_light_api)
-
-# 1. path to Paddle-Lite lib and mklml lib
-set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
-set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
-
-if (WIN32)
-  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  set(MSVC_STATIC_CRT )
-  if(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-  else(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
-  endif(MSVC_STATIC_CRT)
-endif()
-
-if (APPLE AND METAL)
-  message(STATUS "set METAL=ON")
-  add_definitions("-DMETAL")
-  find_library(METAL_LIBRARY Metal REQUIRED)
-  find_library(GRAPHIC CoreGraphics REQUIRED)
-  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
-endif()
-
-# 2. link mklml and Paddle-Lite directory
-link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
-include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
-
-# 3. compile options 
-if (NOT WIN32)
-  add_definitions(-std=c++11 -g -O3 -pthread)
-  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
-endif()
-
-# 4.add executable output
-add_executable(${TARGET} ${TARGET}.cc)
-if (WIN32)
-  set(WITH_STATIC_MKL )
-  if(WITH_STATIC_MKL)
-    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  else()
-    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
-                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  endif()
-
-  target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib)
-  target_link_libraries(${TARGET} shlwapi.lib)
-  target_link_libraries(${TARGET} ${MATH_LIB})
-
-  add_custom_command(TARGET ${TARGET} POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-  )
-  if(NOT WITH_STATIC_MKL)
-    add_custom_command(TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-    )
-  endif()
-else()
-  if (APPLE AND METAL)
-    target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
-  endif()
-  target_link_libraries(${TARGET} -lpaddle_light_api_shared)
-  target_link_libraries(${TARGET} -liomp5)
-  target_link_libraries(${TARGET} -ldl)
-endif()
diff --git a/lite/kernels/host/index_select_compute.cc b/lite/kernels/host/index_select_compute.cc
index b65342cd92d..f4ff2b1ad8c 100644
--- a/lite/kernels/host/index_select_compute.cc
+++ b/lite/kernels/host/index_select_compute.cc
@@ -72,6 +72,19 @@ REGISTER_LITE_KERNEL(index_select,
 
 #ifdef LITE_BUILD_EXTRA
 
+REGISTER_LITE_KERNEL(index_select,
+                     kHost,
+                     kAny,
+                     kNCHW,
+                     paddle::lite::kernels::host::Index_selectCompute<int64_t>,
+                     int64)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
+    .BindInput("Index",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
+    .BindPaddleOpVersion("index_select", 1)
+    .Finalize();
+
 REGISTER_LITE_KERNEL(index_select,
                      kHost,
                      kAny,
diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
index 97363f2bbe9..2e62c03f938 100644
--- a/lite/kernels/host/unique_compute.cc
+++ b/lite/kernels/host/unique_compute.cc
@@ -1,19 +1,5 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 #include "lite/kernels/host/unique_compute.h"
-#include "lite/backends/host/math/concat.h"
+#include "lite/core/tensor.h"
 
 #include <algorithm>
 #include <cmath>
@@ -55,11 +41,11 @@ void UniqueFunc(const lite::Tensor* x,
   if (count != nullptr) {
     // Resize the count tensor dims to allocate the memory
     count->Resize({static_cast<int64_t>(uniq.size())});
-    IndexT* count_data = count->template mutable<IndexT>();
+    IndexT* count_data = count->template mutable_data<IndexT>();
     // init count_data to 0
     memset(count_data, 0, uniq.size() * sizeof(IndexT));
 
-    if (IndexT == int32_t) {
+    if (typeid(IndexT).name() == typeid(int32_t).name()) {
       for (auto i = 0; i < x->numel(); ++i) {
         const IndexT& index = index_data[i];
         count_data[static_cast<int32_t>(index)] += static_cast<IndexT>(1);
@@ -108,7 +94,7 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in,
 
   if (return_inverse) {
     auto* inverse = index;
-    inverse->Resize{{out->numel()}};
+    inverse->Resize({out->numel()});
     auto inverse_data = inverse->mutable_data<IndexT>();
     std::unordered_map<InT, IndexT> inverse_map;
     for (int64_t i = 0; i < out->numel(); ++i) {
@@ -136,11 +122,12 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in,
   }
 }
 
+template <typename T>
 static std::vector<lite::Tensor> Unbind(const lite::Tensor& in) {
   int64_t size = in.dims()[0];
   std::vector<lite::Tensor> tensors(size);
   for (int64_t i = 0; i < size; ++i) {
-    tensors[i] = in.Slice(i, i + 1);
+    tensors[i] = in.Slice<T>(i, i + 1);
   }
   return tensors;
 }
@@ -213,26 +200,26 @@ void TransCompute(const Tensor &input,
   const T *din = input.data<T>();
   T *dout = output->mutable_data<T>();
 
-  std::vector<int> old_temp;
+  std::vector<int> old_temps;
   int temp = 1;
   for (int i = 0; i < num_axes; ++i) {
-    old_temp.push_back(temp);
+    old_temps.push_back(temp);
     temp *= in_dims[num_axes - 1 - i];
   }
-  std::vector<int> old_step;
+  std::vector<int> old_steps;
   for (int i = 0; i < num_axes; i++) {
-    old_step.push_back(old_temp[num_axes - 1 - i]);
+    old_steps.push_back(old_temps[num_axes - 1 - i]);
   }
 
-  std::vector<int> new_temp;
+  std::vector<int> new_temps;
   temp = 1;
   for (int i = 0; i < num_axes; ++i) {
-    new_temp.push_back(temp);
+    new_temps.push_back(temp);
     temp *= out_dims[num_axes - 1 - i];
   }
-  std::vector<int> new_step;
+  std::vector<int> new_steps;
   for (int i = 0; i < num_axes; i++) {
-    new_step.push_back(new_temp[num_axes - 1 - i]);
+    new_steps.push_back(new_temps[num_axes - 1 - i]);
   }
 
   // std::vector<int> old_steps(
@@ -259,8 +246,43 @@ void TransCompute(const Tensor &input,
 }
 
 lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) {
-  return lite::DDim((src.Slice(0, num_col_dims)).production(),
-                    (src.Slice(num_col_dims, src.size())).production());
+  return DDim(std::vector<DDim::value_type>{
+              src.Slice(0, num_col_dims).production(),
+              src.Slice(num_col_dims, src.size()).production()});
+}
+
+template <typename T>
+void concat_func(const std::vector<lite::Tensor>& input,
+                 const int axis,
+                 lite::Tensor* output) {
+  size_t num = input.size();
+  auto dim_0 = input[0].dims();
+  int64_t concat_input_size = 1;
+  int64_t num_cancats = 1;
+  for (int i = axis + 1; i < dim_0.size(); i++) {
+    concat_input_size *= dim_0[i];
+  }
+  for (int i = 0; i < axis; i++) {
+    num_cancats *= dim_0[i];
+  }
+
+  auto* dst_ptr = output->mutable_data<T>();
+  const int out_concat_axis = output->dims()[axis];
+  int64_t offset_concat_axis = 0;
+  int64_t out_sum = out_concat_axis * concat_input_size;
+  for (int n = 0; n < num; n++) {
+    auto dims = input[n].dims();
+    auto* src_ptr = input[n].data<T>();
+    int64_t in_concat_axis = dims[axis];
+    auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
+    int64_t in_sum = in_concat_axis * concat_input_size;
+    for (int i = 0; i < num_cancats; i++) {
+      std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum);
+      dout_ptr += out_sum;
+      src_ptr += in_sum;
+    }
+    offset_concat_axis += in_concat_axis;
+  }
 }
 
 template <typename InT, typename IndexT>
@@ -278,11 +300,11 @@ void UniqueDimFunc(const lite::Tensor& in,
   std::iota(permute.begin(), permute.end(), 0);
   permute[axis] = 0;
   permute[0] = axis;
-  std::vector<int64_t> in_trans_dim_vec(in.dims());
+  std::vector<int64_t> in_trans_dim_vec(in.dims().Vectorize());
   in_trans_dim_vec[axis] = in.dims()[0];
   in_trans_dim_vec[0] = in.dims()[axis];
   lite::Tensor in_trans;
-  lite::DDim in_trans_dims = in_trans_dim_vec;
+  lite::DDim in_trans_dims = DDim(in_trans_dim_vec);
   in_trans.Resize(in_trans_dims);
   in_trans.mutable_data<InT>();
   TransCompute<InT>(in, &in_trans, permute);
@@ -314,7 +336,7 @@ void UniqueDimFunc(const lite::Tensor& in,
   lite::Tensor input_sorted;
   input_sorted.Resize(in_trans_dims);
   input_sorted.mutable_data<InT>();
-  InT* input_sorted_data = input_sorted.data<InT>();
+  InT* input_sorted_data = const_cast<InT*>(input_sorted.data<InT>());
   for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
     memcpy(input_sorted_data + i * col,
            in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
@@ -322,11 +344,11 @@ void UniqueDimFunc(const lite::Tensor& in,
 
   }
 
-  std::vector<lite::Tensor> input_unbind = Unbind(input_sorted);
+  std::vector<lite::Tensor> input_unbind = Unbind<InT>(input_sorted);
   std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
   std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
   std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
-  auto last = UniqueDimImpl<std::vector<lite::Tensor>::iterator, InT, int32_t>(
+  auto last = UniqueDimImpl<std::vector<lite::Tensor>::iterator, InT, IndexT>(
     input_unbind.begin(),
     input_unbind.end(),
     sorted_indices_vec,
@@ -345,7 +367,7 @@ void UniqueDimFunc(const lite::Tensor& in,
   std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
   out->Resize(out_trans_dims_vec);
   out->mutable_data<InT>();
-  lite::host::math::concat_func<InT>(input_unbind, 0, &out_trans);
+  concat_func<InT>(input_unbind, 0, &out_trans);
   TransCompute<InT>(out_trans, out, permute);
 
   if (return_inverse) {
@@ -375,8 +397,7 @@ void UniqueCompute::Run() {
   auto axis_vec = param.axis;
   auto is_sorted = param.is_sorted;
 
-  // lite_api::PrecisionType index_type = index->precision();
-  lite_api::PrecisionType index_type = dtype;
+  lite_api::PrecisionType index_type = index->precision();
   bool index_type_match = index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64);
   lite_api::PrecisionType type = x->precision();
   CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds "
@@ -420,21 +441,36 @@ void UniqueCompute::Run() {
     return;
   }
 
-  if (x->numel() = 0) {
-    out->template mutable_data<T>();
+  if (x->numel() == 0) {
+    switch (type) {
+      case PRECISION(kFloat):
+        output->template mutable_data<float>();
+        break;
+      case PRECISION(kInt32):
+        output->template mutable_data<int32_t>();
+        break;
+      case PRECISION(kInt64):
+        output->template mutable_data<int64_t>();
+        break;
+      default:
+        LOG(FATAL) << "unique does not implement for the "
+                   << "input type:" << static_cast<int>(type);
+        break;
+    }
+    
     return;
   }
   if (axis_vec.empty()) {
     if (index_type == PRECISION(kInt32)) {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueFlattendTensorFunc<float, int32_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<float, int32_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueFlattendTensorFunc<int32_t, int32_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int32_t, int32_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueFlattendTensorFunc<int64_t, int32_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int64_t, int32_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -444,13 +480,13 @@ void UniqueCompute::Run() {
     } else {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueFlattendTensorFunc<float, int64_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<float, int64_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueFlattendTensorFunc<int32_t, int64_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int32_t, int64_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueFlattendTensorFunc<int64_t, int64_t>(x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int64_t, int64_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -463,13 +499,13 @@ void UniqueCompute::Run() {
     if (index_type == PRECISION(kInt32)) {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueDimFunc<float, int32_t>(x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<float, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueDimFunc<int32_t, int32_t>(x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int32_t, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueDimFunc<int64_t, int32_t>(x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int64_t, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -478,13 +514,13 @@ void UniqueCompute::Run() {
     } else {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueDimFunc<float, int64_t>(x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<float, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueDimFunc<int32_t, int64_t>(x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int32_t, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueDimFunc<int64_t, int64_t>(x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int64_t, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -523,8 +559,8 @@ REGISTER_LITE_KERNEL(unique,
                 {LiteType::GetTensorTy(TARGET(kHost),
                                        PRECISION(kInt32),
                                        DATALAYOUT(kAny))})
-    .BindOutput("Count",
+    .BindOutput("Counts",
                 {LiteType::GetTensorTy(TARGET(kHost),
                                        PRECISION(kInt32),
                                        DATALAYOUT(kAny))})
-    .Finalize();
\ No newline at end of file
+    .Finalize();

From 41db8dd53dd656f07dee491cd40ead2410561a8a Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Tue, 13 Dec 2022 15:35:07 +0800
Subject: [PATCH 03/10] update unique_op on 12.13

---
 lite/core/program.cc                          | 243 ++++++++++++++++--
 .../cxx/mobile_full/mobilenetv1_full_api.cc   | 164 ++++++++++--
 .../x86_mobilenetv1_full_demo/CMakeLists.txt  |  73 ++++++
 .../x86_mobilenetv1_light_demo/CMakeLists.txt |  73 ++++++
 lite/kernels/host/unique_compute.cc           |  78 +++---
 lite/operators/unique_op.cc                   |   8 +-
 .../unittest_py/__main___cache_dir/model      | Bin 0 -> 2164 bytes
 .../__main___cache_dir/opt_model/model        | Bin 0 -> 609 bytes
 .../__main___cache_dir/opt_model/params       |   0
 .../unittest_py/__main___cache_dir/params     | Bin 0 -> 28 bytes
 lite/tests/unittest_py/op/statics_data        | Bin 0 -> 1185 bytes
 lite/tests/unittest_py/op/test_unique_op.py   | 105 ++++++++
 12 files changed, 653 insertions(+), 91 deletions(-)
 create mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
 create mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
 create mode 100644 lite/tests/unittest_py/__main___cache_dir/model
 create mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/model
 create mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/params
 create mode 100644 lite/tests/unittest_py/__main___cache_dir/params
 create mode 100644 lite/tests/unittest_py/op/statics_data
 create mode 100644 lite/tests/unittest_py/op/test_unique_op.py

diff --git a/lite/core/program.cc b/lite/core/program.cc
index 8f0c0a5043a..069da1e78eb 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -605,27 +605,9 @@ void RuntimeProgram::Run() {
 #ifdef LITE_WITH_OPENCL
     // delegate flush judgement to specify target , it is too heavy for Inst
     inst.Flush(idx);
-#if defined(LITE_WITH_PROFILE) || defined(LITE_WITH_PRECISION_PROFILE)
-    VLOG(4) << "kernel name " << idx << " " << inst.kernel()->name();
-    const auto* op_info = inst.op()->op_info();
-    auto var_in_names = op_info->input_names();
-    for (int i = 0; i < var_in_names.size(); i++) {
-      VLOG(4) << "input var_in_names: " << var_in_names[i];
-    }
-    auto var_out_names = op_info->output_names();
-    for (int i = 0; i < var_out_names.size(); i++) {
-      VLOG(4) << "output var_out_names: " << var_out_names[i];
-    }
-#endif
 #endif
 
     inst.Run();
-#ifdef LITE_WITH_PRECISION_PROFILE
-    if (inst.op()->Type() != "while") {
-      precision_profiler_summary +=
-          inst_precision_profiler.GetInstPrecision(&inst);
-    }
-#endif  // LITE_WITH_PRECISION_PROFILE
   }
 
 #ifdef LITE_WITH_METAL
@@ -815,6 +797,231 @@ void Instruction::Run() {
   kernel_->Launch();
   has_run_ = true;
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#if 1
+  // clang-format off
+  /*
+  time_t t;
+  struct tm* timeinfo;
+  time(&t);
+  timeinfo = localtime(&t);
+  std::cout << "time: " << asctime(timeinfo) << std::endl;
+  */
+  std::cout << "***-----------------------------******-----------------------------***" << std::endl;
+  // get precision
+  std::string op_name = op_->op_info()->Type();
+  std::cout << "op_type: " << op_name << std::endl;
+  if ((op_->op_info()->Type() != "fetch") &&
+      (op_->op_info()->Type() != "while") &&
+      (op_->op_info()->Type() != "conditional_block")) {
+    auto op_scope = op_->scope();
+    auto out_names = op_->op_info()->output_names();
+    auto in_names = op_->op_info()->input_names();
+    for (auto& out_name : in_names) {
+      std::string out_arg_name;
+      op_->op_info()->GetInputArgname(out_name, &out_arg_name);
+      //auto type = kernel_->GetInputDeclType(out_arg_name);
+       // if (type->IsTensor()) {
+      auto tmp = op_scope->FindVar(out_name);
+      if (tmp->IsType<Tensor>()) {
+        const Tensor* tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
+        if (tout->IsInitialized()) {
+          auto size = tout->numel();
+          auto dim = tout->dims();
+          double sum = 0.0;
+          if (tout->precision() == PrecisionType::kFloat) {
+            const float* dout = tout->data<float>();
+            for (int i = 0; i < size; i++) {
+              sum += dout[i];
+            }
+          } else if (tout->precision() == PrecisionType::kFP16) {
+            const float16_t* dout = tout->data<float16_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else if (tout->precision() == PrecisionType::kInt32) {
+            const int32_t* dout = tout->data<int32_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else if (tout->precision() == PrecisionType::kInt64) {
+            const int64_t* dout = tout->data<int64_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else if (tout->precision() == PrecisionType::kInt8) {
+            const int8_t* dout = tout->data<int8_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else {
+            std::cout << "This data_type is not support: "
+                      << PrecisionToStr(tout->precision()) << std::endl;
+          }
+          double avg = sum / static_cast<double>(size);
+          std::cout << "in_name: " << out_name
+                    << ", type: " << PrecisionToStr(tout->precision())
+                    << ", size: " << size << ", input avg: " << avg;
+         
+          std::cout<<", dim size:"<< dim.size() << "[";
+          for(int i = 0; i < dim.size(); i++)
+            std::cout << dim[i] << ",";
+          std::cout<<"]\n";
+        } else {
+          std::cout << out_name << " is not inited." << std::endl;
+        }
+      } else if (tmp->IsType<std::vector<Tensor>>()) {
+        // Tensor-array input: print the mean of every initialised float tensor.
+        auto touts = tmp->GetMutable<std::vector<Tensor>>();
+        for (const auto& t : *touts) {  // by reference: no per-element copy
+          if (!t.IsInitialized()) {
+            std::cout << out_name << " is not inited." << std::endl;
+          } else if (t.precision() != PrecisionType::kFloat) {
+            // data<float>() on any other precision would misread the buffer.
+            std::cout << "This data_type is not support: "
+                      << PrecisionToStr(t.precision()) << std::endl;
+          } else {
+            auto size = t.numel();
+            const float* dout = t.data<float>();
+            double sum = 0.0;
+            for (int i = 0; i < size; i++) sum += dout[i];
+            std::cout << "op_type: " << op_name << ", input avg: "
+                      << sum / static_cast<double>(size) << std::endl;
+          }
+        }
+      }
+    }
+    for (auto& out_name : out_names) {
+      std::string out_arg_name;
+      op_->op_info()->GetOutputArgname(out_name, &out_arg_name);
+      //auto type = kernel_->GetOutputDeclType(out_arg_name);
+      std::string op_name = op_->op_info()->Type();
+      //if (type->IsTensor()) {
+      auto tmp = op_scope->FindVar(out_name);
+      if (tmp->IsType<Tensor>()) {
+        const Tensor* tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
+        if (tout->IsInitialized()) {
+          auto size = tout->numel();
+          auto dim = tout->dims();
+          double sum = 0.0;
+          if (tout->precision() == PrecisionType::kFloat) {
+            const float* dout = tout->data<float>();
+            for (int i = 0; i < size; i++) {
+              sum += dout[i];
+            }
+          } else if (tout->precision() == PrecisionType::kFP16) {
+            const float16_t* dout = tout->data<float16_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else if (tout->precision() == PrecisionType::kInt32) {
+            const int32_t* dout = tout->data<int32_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else if (tout->precision() == PrecisionType::kInt64) {
+            const int64_t* dout = tout->data<int64_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else if (tout->precision() == PrecisionType::kInt8) {
+            const int8_t* dout = tout->data<int8_t>();
+            for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+            }
+          } else {
+            std::cout << "This data_type is not support: "
+                      << PrecisionToStr(tout->precision()) << std::endl;
+          }
+            double avg = sum / static_cast<double>(size);
+          std::cout << "out_name: " << out_name
+                    << ", type: " << PrecisionToStr(tout->precision())
+                    << ", sum: " << sum << ", output avg: " << avg;
+          std::cout<<", dim size:"<< dim.size() << "[";
+          for(int i = 0; i < dim.size(); i++)
+            std::cout << dim[i] << ",";
+          std::cout<<"]\n";
+        } else {
+          std::cout << out_name << " is not inited." << std::endl;
+        }
+      } else if (tmp->IsType<std::vector<Tensor>>()) {
+        auto touts =
+            op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
+        for (auto t : *touts) {
+          const Tensor* tout = &t;
+          if (tout->IsInitialized()) {
+            auto size = tout->numel();
+            double sum = 0.0;
+            if (tout->precision() == PrecisionType::kFloat) {
+              const float* dout = tout->data<float>();
+              for (int i = 0; i < size; i++) {
+                sum += dout[i];
+                std::cout << dout[i] << ", ";
+              }
+            } else if (tout->precision() == PrecisionType::kFP16) {
+              const float16_t* dout = tout->data<float16_t>();
+              for (int i = 0; i < size; i++) {
+                sum += static_cast<double>(dout[i]);
+                std::cout << dout[i] << ", ";
+              }
+            } else if (tout->precision() == PrecisionType::kInt32) {
+              const int32_t* dout = tout->data<int32_t>();
+              for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+                std::cout << dout[i] << ", ";
+              }
+            } else if (tout->precision() == PrecisionType::kInt64) {
+              const int64_t* dout = tout->data<int64_t>();
+              for (int i = 0; i < size; i++) {
+              sum += static_cast<double>(dout[i]);
+                std::cout << dout[i] << ", ";
+              }
+            } else {
+              std::cout << "This data_type is not support: "
+                        << PrecisionToStr(tout->precision()) << std::endl;
+            }
+            double avg = sum / static_cast<double>(size);
+            std::cout << std::endl;
+            std::cout << "op_type: " << op_name << out_name
+                      << ", type: " << PrecisionToStr(tout->precision())
+                      << ", output avg: " << avg << std::endl;
+          } else {
+            std::cout << out_name << " is not inited." << std::endl;
+          }
+        }
+      }
+    }
+    std::cout << "***-----------------------------******-----------------------------***" << std::endl;
+  }
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 #ifdef LITE_WITH_PROFILE
   if (first_epoch_for_profiler_) {
     kernel_->SetIsKernelTest(false);
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index 3db0f2c9c93..1759a484175 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -11,13 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include <gflags/gflags.h>
 #include <iostream>
 #include <vector>
 #include "paddle_api.h"         // NOLINT
 #include "paddle_use_passes.h"  // NOLINT
-
 /////////////////////////////////////////////////////////////////////////
 // If this demo is linked to static library:libpaddle_api_full_bundled.a
 // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to
@@ -27,9 +25,7 @@
 #include "paddle_use_kernels.h"  // NOLINT
 #include "paddle_use_ops.h"      // NOLINT
 #endif
-
 using namespace paddle::lite_api;  // NOLINT
-
 DEFINE_string(model_dir,
               "",
               "Model dir path. Set it when the model is uncombined format.");
@@ -54,13 +50,11 @@ DEFINE_int32(threads, 1, "threads num");
 DEFINE_int32(warmup, 10, "warmup times");
 DEFINE_int32(repeats, 100, "repeats times");
 DEFINE_bool(use_gpu, false, "use opencl backend");
-
 int64_t ShapeProduction(const shape_t& shape) {
   int64_t res = 1;
   for (auto i : shape) res *= i;
   return res;
 }
-
 void RunModel() {
   // 1. Set CxxConfig
   CxxConfig config;
@@ -72,7 +66,6 @@ void RunModel() {
   }
   config.set_power_mode((paddle::lite_api::PowerMode)FLAGS_power_mode);
   config.set_threads(FLAGS_threads);
-
   std::vector<Place> valid_places;
   if (FLAGS_use_gpu) {
     valid_places.emplace_back(
@@ -93,51 +86,167 @@ void RunModel() {
   } else {
     valid_places.emplace_back(Place{TARGET(kARM), PRECISION(kFloat)});
   }
-
   if (FLAGS_prefer_int8_kernel) {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
   }
   config.set_valid_places(valid_places);
-
   // 2. Create PaddlePredictor by CxxConfig
   std::shared_ptr<PaddlePredictor> predictor =
       CreatePaddlePredictor<CxxConfig>(config);
-
   // 3. Save the optimized model
   // WARN: The `predictor->SaveOptimizedModel` method must be executed
   // before the `predictor->Run` method. Because some kernels' `PrepareForRun`
   // method maybe change some parameters' values.
   predictor->SaveOptimizedModel(FLAGS_optimized_model_dir,
                                 LiteModelType::kNaiveBuffer);
-
   // 4. Prepare input data
-  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-  input_tensor->Resize(shape_t({1, 3, 224, 224}));
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-    data[i] = 1;
+  const lod_t lodd = {{0,1},{0,1}};
+  {
+    // src_ids
+    int64_t pre_data[100] = {41, 2, 69, 2, 68, 2, 78, 2, 83, 2, 22, 29, 21, 28,
+    27, 18, 8, 2, 788, 342, 6431, 17, 2, 788, 96, 6431, 6622};
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+    input_tensor->Resize(shape_t({1,27,1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = pre_data[i];
+    }
   }
-
-  // 5. Run predictor
-  for (int i = 0; i < FLAGS_warmup; ++i) {
-    predictor->Run();
+  {
+    // pos_ids
+    int64_t pre_data[100] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26};
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(1)));
+    input_tensor->Resize(shape_t({1,27,1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = pre_data[i];
+    }
+  }
+  {
+    // input_mask
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(2)));
+    input_tensor->Resize(shape_t({1,27,27}));
+    auto* data = input_tensor->mutable_data<float>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 1;
+    }
+  }
+  {
+    // pos_ids_extra
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(3)));
+    input_tensor->Resize(shape_t({1,27,1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 0;
+    }
+  }
+  {
+    // tgt_ids
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(4)));
+    input_tensor->Resize(shape_t({1,1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 6621;
+    }
+    input_tensor->SetLoD(lodd);
+  }
+  {
+    // tgt_pos
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(5)));
+    input_tensor->Resize(shape_t({1,1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 26;
+    }
+    input_tensor->SetLoD(lodd);
+  }
+  {
+    // init_score
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(6)));
+    input_tensor->Resize(shape_t({1,1}));
+    auto* data = input_tensor->mutable_data<float>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 0;
+    }
+    input_tensor->SetLoD(lodd);
+  }
+  {
+    // parent_idx
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(7)));
+    input_tensor->Resize(shape_t({1,1}));
+    auto* data = input_tensor->mutable_data<int>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 0;
+    }
   }
-
-  for (int j = 0; j < FLAGS_repeats; ++j) {
+  {
+    // tgt_generation_mask
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(8)));
+    input_tensor->Resize(shape_t({1,1,27}));
+    auto* data = input_tensor->mutable_data<float>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 1;
+    }
+  }
+  {
+    // max_dec_len
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(9)));
+    input_tensor->Resize(shape_t({1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 10;
+    }
+  }
+  {
+    // tgt_pos_extra
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(10)));
+    input_tensor->Resize(shape_t({1,1}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = 1;
+    }
+    input_tensor->SetLoD(lodd);
+  }
+  {
+    // cand_ids
+    int64_t cand[500]={41, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83, 6623, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(11)));
+    input_tensor->Resize(shape_t({5,32}));
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+      data[i] = cand[i];
+    }
+  }
+ 
+  // 5. Run predictor
+  for (int j = 0; j < 1; ++j) {
     predictor->Run();
   }
-
   // 6. Get output
+  double sum = 0;  // running total reused for the mean of each output
   std::unique_ptr<const Tensor> output_tensor(
       std::move(predictor->GetOutput(0)));
-  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
+  std::cout << "Output0 shape " << output_tensor->shape()[0] <<","<< output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) {
-    std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
-              << std::endl;
+    sum += static_cast<double>(output_tensor->data<int64_t>()[i]);  // widen directly; no float round-trip
+  }
+  std::cout << "output0 mean is "<<sum / ShapeProduction(output_tensor->shape())<<"\n";
+  sum = 0;
+  std::unique_ptr<const Tensor> output_tensor1(
+      std::move(predictor->GetOutput(1)));  // second output; index 0 was already read above
+  std::cout << "Output1 shape " << output_tensor1->shape()[0] <<","<< output_tensor1->shape()[1] << std::endl;
+  for (int i = 0; i < ShapeProduction(output_tensor1->shape()); i++) {
+    sum += output_tensor1->data<float>()[i] * 1.f;
   }
+  std::cout << "output1 mean is "<<sum / ShapeProduction(output_tensor1->shape())<<"\n";
 }
-
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
   if (FLAGS_model_dir.empty() &&
@@ -162,7 +271,6 @@ int main(int argc, char** argv) {
         << " --use_gpu=false              bool    Use gpu or not.\n";
     exit(1);
   }
-
   RunModel();
   return 0;
 }
diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
new file mode 100644
index 00000000000..234ec1c85e3
--- /dev/null
+++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 2.8)
+project(mobilenet_full_api)
+set(TARGET mobilenet_full_api)  # executable name; its source is expected at ${TARGET}.cc
+
+# 1. Paths to the Paddle-Lite and MKLML directories, relative to this project
+set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
+set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
+
+if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  set(MSVC_STATIC_CRT )  # no value is assigned here
+  if(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+  else(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
+  endif(MSVC_STATIC_CRT)
+endif()
+
+if (APPLE AND METAL)
+  message(STATUS "set METAL=ON")
+  add_definitions("-DMETAL")
+  find_library(METAL_LIBRARY Metal REQUIRED)
+  find_library(GRAPHIC CoreGraphics REQUIRED)
+  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
+endif()
+
+# 2. Add the Paddle-Lite and MKLML lib/include directories to the search paths
+link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
+include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
+
+# 3. Compile options (applied on non-Windows platforms only)
+if (NOT WIN32)
+  add_definitions(-std=c++11 -g -O3 -pthread)
+  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})  # emit the binary into the project source directory
+endif()
+
+# 4. Build the executable and link it against Paddle-Lite and the math libraries
+add_executable(${TARGET} ${TARGET}.cc)
+if (WIN32)
+  set(WITH_STATIC_MKL )  # no value is assigned here
+  if(WITH_STATIC_MKL)
+    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
+                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+
+  target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib)
+  target_link_libraries(${TARGET} shlwapi.lib)
+  target_link_libraries(${TARGET} ${MATH_LIB})
+
+  add_custom_command(TARGET ${TARGET} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+  )
+  if(NOT WITH_STATIC_MKL)
+    add_custom_command(TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+    )
+  endif()
+else()
+    if (APPLE AND METAL)
+      target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
+    endif()
+    target_link_libraries(${TARGET} -lpaddle_full_api_shared)
+    target_link_libraries(${TARGET} -liomp5)
+    target_link_libraries(${TARGET} -ldl)
+endif()
diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
new file mode 100644
index 00000000000..3a91bfafbd3
--- /dev/null
+++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 2.8)
+project(mobilenet_light_api)
+set(TARGET mobilenet_light_api)  # executable name; its source is expected at ${TARGET}.cc
+
+# 1. Paths to the Paddle-Lite and MKLML directories, relative to this project
+set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
+set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
+
+if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  set(MSVC_STATIC_CRT )  # no value is assigned here
+  if(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+  else(MSVC_STATIC_CRT)
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
+  endif(MSVC_STATIC_CRT)
+endif()
+
+if (APPLE AND METAL)
+  message(STATUS "set METAL=ON")
+  add_definitions("-DMETAL")
+  find_library(METAL_LIBRARY Metal REQUIRED)
+  find_library(GRAPHIC CoreGraphics REQUIRED)
+  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
+endif()
+
+# 2. Add the Paddle-Lite and MKLML lib/include directories to the search paths
+link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
+include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
+
+# 3. Compile options (applied on non-Windows platforms only)
+if (NOT WIN32)
+  add_definitions(-std=c++11 -g -O3 -pthread)
+  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})  # emit the binary into the project source directory
+endif()
+
+# 4. Build the executable and link it against Paddle-Lite and the math libraries
+add_executable(${TARGET} ${TARGET}.cc)
+if (WIN32)
+  set(WITH_STATIC_MKL )  # no value is assigned here
+  if(WITH_STATIC_MKL)
+    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
+                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+
+  target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib)
+  target_link_libraries(${TARGET} shlwapi.lib)
+  target_link_libraries(${TARGET} ${MATH_LIB})
+
+  add_custom_command(TARGET ${TARGET} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+  )
+  if(NOT WITH_STATIC_MKL)
+    add_custom_command(TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
+    )
+  endif()
+else()
+  if (APPLE AND METAL)
+    target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
+  endif()
+  target_link_libraries(${TARGET} -lpaddle_light_api_shared)
+  target_link_libraries(${TARGET} -liomp5)
+  target_link_libraries(${TARGET} -ldl)
+endif()
diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
index 2e62c03f938..4c96e7f8c11 100644
--- a/lite/kernels/host/unique_compute.cc
+++ b/lite/kernels/host/unique_compute.cc
@@ -1,3 +1,17 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "lite/kernels/host/unique_compute.h"
 #include "lite/core/tensor.h"
 
@@ -8,6 +22,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include <iostream>
 
 namespace paddle {
 namespace lite {
@@ -18,7 +33,7 @@ template <typename InT, typename IndexT>
 void UniqueFunc(const lite::Tensor* x, 
                       lite::Tensor* out,
                       lite::Tensor* index,
-                      lite::Tensor* count) {
+                      lite::Tensor* count = nullptr) {
   const InT* in_data = x->template data<InT>();
   IndexT* index_data = index->mutable_data<IndexT>();
 
@@ -31,6 +46,7 @@ void UniqueFunc(const lite::Tensor* x,
     auto it = dict.find(in_data[i]);
     if (it == dict.end()) {
       dict.emplace(std::make_pair(in_data[i], j));
+      uniq.emplace_back(in_data[i]);
       index_data[i] = static_cast<IndexT>(j);
       j++;
     } else {
@@ -44,20 +60,11 @@ void UniqueFunc(const lite::Tensor* x,
     IndexT* count_data = count->template mutable_data<IndexT>();
     // init count_data to 0
     memset(count_data, 0, uniq.size() * sizeof(IndexT));
-
-    if (typeid(IndexT).name() == typeid(int32_t).name()) {
-      for (auto i = 0; i < x->numel(); ++i) {
-        const IndexT& index = index_data[i];
-        count_data[static_cast<int32_t>(index)] += static_cast<IndexT>(1);
-      }
-    } else {
-      for (auto i = 0; i < x->numel(); ++i) {
-        const IndexT& index = index_data[i];
-        count_data[static_cast<int64_t>(index)] += static_cast<IndexT>(1);
-      }
+    for (auto i = 0; i < x->numel(); ++i) {
+      const IndexT& index = index_data[i];
+      count_data[index] += static_cast<IndexT>(1);
     }
   }
-
   out->Resize({static_cast<int64_t>(uniq.size())});
   auto out_data = out->mutable_data<InT>();
   std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT));
@@ -222,17 +229,6 @@ void TransCompute(const Tensor &input,
     new_steps.push_back(new_temps[num_axes - 1 - i]);
   }
 
-  // std::vector<int> old_steps(
-  //     {static_cast<int>(in_dims[1] * in_dims[2] * in_dims[3]),
-  //      static_cast<int>(in_dims[2] * in_dims[3]),
-  //      static_cast<int>(in_dims[3]),
-  //      1});
-  // std::vector<int> new_steps(
-  //     {static_cast<int>(out_dims[1] * out_dims[2] * out_dims[3]),
-  //      static_cast<int>(out_dims[2] * out_dims[3]),
-  //      static_cast<int>(out_dims[3]),
-  //      1});
-
   for (int i = 0; i < count; ++i) {
     int old_idx = 0;
     int idx = i;
@@ -422,24 +418,24 @@ void UniqueCompute::Run() {
           break;
       }
     } else {
-        switch (type) {
-          case PRECISION(kFloat):
-            UniqueFunc<float, int64_t>(x, output, index, count);
-            break;
-          case PRECISION(kInt32):
-            UniqueFunc<int32_t, int64_t>(x, output, index, count);
-            break;
-          case PRECISION(kInt64):
-            UniqueFunc<int64_t, int64_t>(x, output, index, count);
-            break;
-          default:
-            LOG(FATAL) << "unique does not implement for the "
-                       << "input type:" << static_cast<int>(type);
-            break;
+      switch (type) {
+        case PRECISION(kFloat):
+          UniqueFunc<float, int64_t>(x, output, index, count);
+          break;
+        case PRECISION(kInt32):
+          UniqueFunc<int32_t, int64_t>(x, output, index, count);
+          break;
+        case PRECISION(kInt64):
+          UniqueFunc<int64_t, int64_t>(x, output, index, count);
+          break;
+        default:
+          LOG(FATAL) << "unique does not implement for the "
+                     << "input type:" << static_cast<int>(type);
+          break;
         }
-    }
-    return;
-  }
+     }
+     return;
+  } 
 
   if (x->numel() == 0) {
     switch (type) {
diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc
index 3cab23bbc81..adab6096cd8 100644
--- a/lite/operators/unique_op.cc
+++ b/lite/operators/unique_op.cc
@@ -35,10 +35,10 @@ bool UniqueOp::CheckShape() const {
 
 bool UniqueOp::InferShapeImpl() const {
   DDim in_dims = param_.X->dims();
-  param_.Out->Resize(in_dims);
-  param_.Index->Resize(in_dims);
-  param_.Indices->Resize(in_dims);
-  param_.Counts->Resize(in_dims);
+  if (param_.Out) param_.Out->Resize(in_dims);
+  if (param_.Index) param_.Index->Resize(in_dims);
+  if (param_.Indices) param_.Indices->Resize(in_dims);
+  if (param_.Counts) param_.Counts->Resize(in_dims);
   return true;
 }
 
diff --git a/lite/tests/unittest_py/__main___cache_dir/model b/lite/tests/unittest_py/__main___cache_dir/model
new file mode 100644
index 0000000000000000000000000000000000000000..ad9c9d92f78368ee4c4713242cba7c0dad4929b5
GIT binary patch
literal 2164
zcmZuyO>P@E6!u86EkDW0SV>w(DGEbrfx?>t&H;La7HAjU6a+QHkqB|fG0Bmota^-Y
zyUTHUmR=zJNNOxassY2EH}CKJ-uK?nUxVRU{<-_frehjZLX?x<@ai_3UekFce6hqM
z-DT5vG%~GkTF*-E`D8r2oDS(?*v~2NPv6t4M)5+hrBS7@>`BPQ(oYCZ7c>~2=NS@C
ziQJMmA|>eF_42`8Hv5sTDEobq>fccR=N8d>?`eE7yP|$!RDO1QHv810w>0`S<oiu%
zXY8T-WRfx*c09eVq*AOf+Ig=1Y((d!-!=l-`m>jGRT+!^i>A#lemR@vG&T(@xKi*g
zR{42wfu$5{S%|!Me>R&$7tL$o3WGes_JS_qWQ_`G|GY!cX!p$4+#<_cqD0iLbVvT@
zYzDp~;C!^9O>D_|i!jx8xzE!>_Wd9B-=Ai0=;f32OZM0T4&&b2UFXy~!RnPNwT_kt
z$^*BDWK711pfurW`g2I{VEFf^kN5wK>1(<HGpyOpr7oEl&NBu@3<!K^<C#Et1w%$4
zbpHTy7FKW%E%+tZ(IF>N!nuPGSZeOA+%R3c=<|YxTcZq{n@xOvM{gEF3yVywv8FMO
zE63Ik@#VL4TS|~5=Ph1Wjo!Q9eUHG9P@)#vKS@x+%OLHHru6#ITq$F0DcAAcQAjOQ
z>x9ivvWYGcnk<>)b)$rhwtM=n;w#7pShY|r)++-Y+RKla2H!9~4-k6<h)QbLU|nJ?
zP@F(yiqC~g{$sj;Xx9&^c~nQgtF~dD&sEAWqi<@nmU_YX8dNO+b?#zK2L%V8$i-n?
zM<Ip=g}cYt2|DXKjxnf=@ziaaFNRgMP8!V?wvGNbG+J^CCYDy}RW#kwq}B4V1r=B{
zQM6vJQ<I?3rQ?#(9Zj97d|)L?xUeSZLVHYj8}nS#CASOK@qav)T}#lQ6F%(-!uo_F
zSiaEj$@IGkM-}qbmRX?^3~=CWHw|!Ljbx26DGM^z(uE_KVU5T>&oo-lK$P^e#$gEO
zk!fS==<%9fSgu!LKY<VIMJos>MdO!ICuzD2JwO%4CAz@<g8Su8UmYDhZN)3PZsZ0!
zbIZ1>b}GQC;5aH#_>)!02oYbLKXlQfpscobZR!**kR@&-&fs(VAiglrkb@Bb!krC~
zV4`SoUl_!bqK_wJl3YP&ZNdibioh-6tZ*0<-{OnXu@qQ1;@}6Goo-d9&`#qq{wvD=
zhs7T4>>qHszF-$vyYsZaSpc<P*)Hwzn4*2Rk{=RANTleF1MuRTQK&S=6A?o3F#)z%
zY#NS>%Ac><5reLsM^~5^ymd};&48gaiEr#4uPUhpwo#@&pyLLMm>5MPVZ|ENro#%v
zPK0{+wW3JZ4mg_qtbs%EI#tF0yyT%&y!2ruxRQ&Y-abCnecl$aJM8<(@k(dw2u@?3
GLco9K>qUeB

literal 0
HcmV?d00001

diff --git a/lite/tests/unittest_py/__main___cache_dir/opt_model/model b/lite/tests/unittest_py/__main___cache_dir/opt_model/model
new file mode 100644
index 0000000000000000000000000000000000000000..99d11af804fa2902ea6a97be38c42308ea14546c
GIT binary patch
literal 609
zcmaJ<OHRWu6wI$}^3sp71hkgeSVBT&5f#|e8-N50B!sLean!0I35nAd4#J)tN8%h@
zpg1wD(oMaw<aux2%sj)7Wou}&Jv!-v6>%P7!*+%`Apph9&o~Qx;bX8nqz#sBAZw_R
zGjP*VWTjAGSAsUQZ4&`9CESBn#6>WZOueB-pTPnB2`lm=Fz-r%8{DJ@=on5In=W{s
z@R*5rIoKB>pE8Dwalg{#g*@^m-i>!f9WaA5Mw%r5G++eoL#6bQCq<e!sWDd5nrcZe
zn#VB<(xec6BIxyjxu<)jI{_2wLMyB^N(m+%fD@&8z*&%$sC}<h4{63izMKbKs@LcU
zAd7uj@!zw}e4|sDRUI>O*RZa+3X!v=AD5~X0h9%2uZuWLlA65_Ed}f6FZu5+J%+(*
zE@telly>US>aq+_m3q@U?y5M{c8I#@<$(YO^&WS|_mY<`tFN*3)VXN5vh1(#k3SIb
Bv<UzJ

literal 0
HcmV?d00001

diff --git a/lite/tests/unittest_py/__main___cache_dir/opt_model/params b/lite/tests/unittest_py/__main___cache_dir/opt_model/params
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/lite/tests/unittest_py/__main___cache_dir/params b/lite/tests/unittest_py/__main___cache_dir/params
new file mode 100644
index 0000000000000000000000000000000000000000..0b26358c07d8cb50206b90e46ed2267c0250d19f
GIT binary patch
literal 28
VcmZQzKm{y7E(fasBLhQ&Jpckz0O9}u

literal 0
HcmV?d00001

diff --git a/lite/tests/unittest_py/op/statics_data b/lite/tests/unittest_py/op/statics_data
new file mode 100644
index 0000000000000000000000000000000000000000..276804d5f93042d2bd7c043f3033e5504a09bccb
GIT binary patch
literal 1185
zcmajeWp~s-6b9fzfkJUB?zUKQ*HWZFTi`+=Xh?Tc638a|ZYHJ3;um*!cXx;1#!O~8
ze1XH)d+*G7?t5nT6y{KnFp3Ha3WSy2WQ!1ulC_cR3D*ycLt7N1@p=>*P2@mD5DGN0
z7)=Syj1pP#eJf>Ugq<~0cD<XlP2Wn}A&MM%ra7Ypq1b38=Pb`Ng&hhr$b@K_L#xD}
z3tOw<HVLDvBB+@<S57AbKNOa)PLyW0p{>#3cUvmWwAGMrw6huQ2_1|=**7LnL`QX9
zCq`#NNn9+WDs)lBu8eMk?nW~?xg=xz<(256>OC2~2)&J_vR-8i%R?Vk?#t*$=&vtc
zT7v<qIFK=jFxY4zXRE8r^zs;zLn2|ck+m(6??rxt>zfTC>0zikGmJ5uFv4gp`$;QZ
zn|0-KX4;D|QngDNqX?sowzBQlyD8U_QTbtz%?gZB17jKE2;<}XU0>LqT-}zFet-$8
zKanwsFgc2hmhy@)N+#_vjFX;{!_<Tm2R!XR0Z;!|z%yjPGYPYtIM~^YGQu1u4z!#x
zmoU$XQ=HFOK&Wuy>=rT>5f(dfW|fR8LbW4PT9R<$NR~2|5y*)HSk72MsBz+DS29)+
z3@6@fHDe87t=vpKq977x%B`!5S6i3E`uu7e7$#w(2Fll2OGs*9ieVG#G;kBcA-Ed2
znX!f7X<(Y+69NUy<bhyh384lGMntIBz^#l1!Zr=u&e%cNDS_%Yc&?B?AP{C4Wimll
z*r}MB%VAfZx|^|wuvb&}G4>M<Xeu)f5)NtVVa5@{QB6I@I8HdBsV5ny2&WbGOrCm{
zagK0aQ!g+s5-w@#WyTf4RZYFdxK6kssp_L;tW?VT8}?=nxANHAj5~z88hej%pYT9q
zA2J>h9&79q##6#GjeX8|L3pXLuNbchZxr@z9{Y~*p723qKQcZMK5Oh3##h2Ojs4E}
KLHHR(QRy$IOqiSi

literal 0
HcmV?d00001

diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py
new file mode 100644
index 00000000000..24fb6b63a3f
--- /dev/null
+++ b/lite/tests/unittest_py/op/test_unique_op.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('../')
+
+from auto_scan_test import AutoScanTest, IgnoreReasons
+from program_config import TensorConfig, ProgramConfig, OpConfig, CxxConfig, TargetType, PrecisionType, DataLayoutType, Place
+
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+from functools import partial
+import random
+import numpy as np
+
+
+class TestUniqueWithCountsOp(AutoScanTest):
+    def __init__(self, *args, **kwargs):
+        AutoScanTest.__init__(self, *args, **kwargs)
+        host_places = [
+            Place(TargetType.Host, PrecisionType.FP32, DataLayoutType.NCHW)
+        ]
+        self.enable_testing_on_place(places=host_places, thread=[1,4])
+
+    def is_program_valid(self,
+                         program_config: ProgramConfig,
+                         predictor_config: CxxConfig) -> bool:
+        return True
+
+    def sample_program_configs(self, draw):
+        in_shape = draw(
+            st.lists(
+                st.integers(
+                    min_value=2, max_value=100),
+                min_size=1,
+                max_size=3))
+        in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64]))
+
+        def generate_X_data():
+            return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype)
+
+        def generate_IndexTensor():
+            return np.random.randint(1, 5, size=in_shape).astype(np.int32)
+
+        axis = draw(st.sampled_from([[0, 1, 2], [1], [0, 2], [2, 1], [0, 1]]))
+
+        unique_op = OpConfig(
+            type = "unique",
+            input = {"X": ["input_data"]},
+            outputs = {
+                "Out": ["Out_data"],
+                "Index": ["Index_data"],
+                "Indices": ["Indices_data"],
+                "Counts": ["Counts_data"]
+            },
+            attrs={
+                "dtype": 2,
+                "return_index": False,
+                "return_inverse": False,
+                "return_counts": False,
+                "axis": axis,
+                "is_sorted": False
+            }
+        )
+
+        unique_op.outputs_dtype = {"Out_data": in_dtype}
+        unique_op.outputs_dtype = {"Index_data": np.int32}
+        unique_op.outputs_dtype = {"Counts_data":np.int32}
+
+        program_config = ProgramConfig(
+            ops=[unique_op],
+            weights={
+                "Index_data":
+                TensorConfig(data_gen=partial(generate_IndexTensor))
+            },
+            inputs={
+                "input_data": TensorConfig(data_gen=partial(generate_X_data))
+            },
+            outputs=["Out_data", "Index_data", "Counts_data"]
+        )
+        return program_config
+
+    def sample_predictor_configs(self):
+        return self.get_predictor_configs(), [""], (1e-5, 1e-5)
+
+    def add_ignore_pass_case(self):
+        pass
+
+    def test(self, *args, **kwargs):
+        self.run_and_statis(quant=False, max_examples=25)
+
+if __name__ == "__main__":
+    unittest.main(argv=[''])    

From 794fb857f296bbbbae530a3927656a41b2a4dc09 Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Tue, 13 Dec 2022 15:54:39 +0800
Subject: [PATCH 04/10] update mobilenetv1_full_api for unique_op

---
 lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index 1759a484175..c1fe58927b5 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -240,7 +240,7 @@ void RunModel() {
   std::cout << "output0 mean is "<<sum / ShapeProduction(output_tensor->shape())<<"\n";
   sum = 0;
   std::unique_ptr<const Tensor> output_tensor1(
-      std::move(predictor->GetOutput(0)));
+      std::move(predictor->GetOutput(1)));
   std::cout << "Output1 shape " << output_tensor1->shape()[0] <<","<< output_tensor1->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor1->shape()); i++) {
     sum += output_tensor1->data<float>()[i] * 1.f;

From 0d81d6b6655cda0c4358bf2f28030bb592a91b2a Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Wed, 14 Dec 2022 12:25:14 +0800
Subject: [PATCH 05/10] update test_unique_op for unique_op

---
 lite/backends/arm/math/dotprod/gemm_sdot.h    | 442 -------------
 lite/backends/arm/math/dotprod/gemm_vsdot.h   |  54 --
 lite/core/program.cc                          |   2 +-
 .../cxx/mobile_full/mobilenetv1_full_api.cc   | 164 +----
 .../cxx/mobile_light/mobilenetv1_light_api.cc | 620 +++++++++---------
 .../x86_mobilenetv1_full_demo/CMakeLists.txt  |  73 ---
 .../x86_mobilenetv1_light_demo/CMakeLists.txt |  73 ---
 .../unittest_py/__main___cache_dir/model      | Bin 2164 -> 0 bytes
 .../__main___cache_dir/opt_model/model        | Bin 609 -> 0 bytes
 .../__main___cache_dir/opt_model/params       |   0
 .../unittest_py/__main___cache_dir/params     | Bin 28 -> 0 bytes
 lite/tests/unittest_py/op/statics_data        | Bin 1185 -> 0 bytes
 lite/tests/unittest_py/op/test_unique_op.py   |   5 +-
 13 files changed, 356 insertions(+), 1077 deletions(-)
 delete mode 100644 lite/backends/arm/math/dotprod/gemm_sdot.h
 delete mode 100644 lite/backends/arm/math/dotprod/gemm_vsdot.h
 delete mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
 delete mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
 delete mode 100644 lite/tests/unittest_py/__main___cache_dir/model
 delete mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/model
 delete mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/params
 delete mode 100644 lite/tests/unittest_py/__main___cache_dir/params
 delete mode 100644 lite/tests/unittest_py/op/statics_data

diff --git a/lite/backends/arm/math/dotprod/gemm_sdot.h b/lite/backends/arm/math/dotprod/gemm_sdot.h
deleted file mode 100644
index 1eea169b15f..00000000000
--- a/lite/backends/arm/math/dotprod/gemm_sdot.h
+++ /dev/null
@@ -1,442 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-// clang-format off
-#define GEMM_SDOT_INT8_KERNEL                                              \
-  "ldp    q0, q1, [%[a_ptr]], #32\n"     /* load a00,a01 to q0, q1*/       \
-  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b0, b1 to q4, q5*/        \
-  "eor    v8.16b,  v8.16b, v8.16b\n"     /* out0 = 0 */                    \
-  "eor    v9.16b,  v9.16b, v9.16b\n"     /* out1 = 0 */                    \
-  "eor    v10.16b,  v10.16b, v10.16b\n"  /* out2 = 0 */                    \
-  "eor    v11.16b,  v11.16b, v11.16b\n"  /* out3 = 0 */                    \
-  "eor    v12.16b,  v12.16b, v12.16b\n"  /* out4 = 0 */                    \
-  "prfm   pldl1keep, [%[b_ptr], #64]\n"  /* preload b*/                    \
-  "eor    v13.16b,  v13.16b, v13.16b\n"  /* out5 = 0 */                    \
-  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                    \
-  "eor    v14.16b,  v14.16b, v14.16b\n"  /* out6 = 0 */                    \
-  "prfm   pldl1keep, [%[b_ptr], #128]\n" /* preload b*/                    \
-  "eor    v15.16b,  v15.16b, v15.16b\n"  /* out7 = 0 */                    \
-  "prfm   pldl1keep, [%[a_ptr], #128]\n" /* preload a*/                    \
-  "eor    v16.16b,  v16.16b, v16.16b\n"  /* out8 = 0 */                    \
-  "prfm   pldl1keep, [%[b_ptr], #192]\n" /* preload b*/                    \
-  "eor    v17.16b,  v17.16b, v17.16b\n"  /* out9 = 0 */                    \
-  "prfm   pldl1keep, [%[b_ptr], #256]\n" /* preload b*/                    \
-  "eor    v18.16b,  v18.16b, v18.16b\n"  /* out10 = 0 */                   \
-  "prfm   pldl1keep, [%[a_ptr], #192]\n" /* preload a*/                    \
-  "eor    v19.16b,  v19.16b, v19.16b\n"  /* out11 = 0 */                   \
-  "prfm   pldl1keep, [%[b_ptr], #320]\n" /* preload b*/                    \
-  "eor    v20.16b,  v20.16b, v20.16b\n"  /* out12 = 0 */                   \
-  "prfm   pldl1keep, [%[a_ptr], #256]\n" /* preload a*/                    \
-  "eor    v21.16b,  v21.16b, v21.16b\n"  /* out13 = 0 */                   \
-  "prfm   pldl1keep, [%[b_ptr], #384]\n" /* preload b*/                    \
-  "eor    v22.16b,  v22.16b, v22.16b\n"  /* out14 = 0 */                   \
-  "eor    v23.16b,  v23.16b, v23.16b\n"  /* out15 = 0 */                   \
-  "eor    v24.16b,  v24.16b, v24.16b\n"  /* out16 = 0 */                   \
-  "eor    v25.16b,  v25.16b, v25.16b\n"  /* out17 = 0 */                   \
-  "eor    v26.16b,  v26.16b, v26.16b\n"  /* out18 = 0 */                   \
-  "eor    v27.16b,  v27.16b, v27.16b\n"  /* out19 = 0 */                   \
-  "eor    v28.16b,  v28.16b, v28.16b\n"  /* out20 = 0 */                   \
-  "eor    v29.16b,  v29.16b, v29.16b\n"  /* out21 = 0 */                   \
-  "eor    v30.16b,  v30.16b, v30.16b\n"  /* out22 = 0 */                   \
-  "eor    v31.16b,  v31.16b, v31.16b\n"  /* out23 = 0 */                   \
-  "cbz    %w[k], 2f\n" /* check loop count > 0 */                          \
-  /* main loop, unrool 0*/                                                 \
-  "1:\n"                                 /* main loop */                   \
-".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
-  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b2, b0 to q6, q7       */ \
-".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
-  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q3, q4     */ \
-".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
-".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
-".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
-".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
-".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
-".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
-".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
-".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
-".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
-  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b1, b2 to q4, q5       */ \
-".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\
-".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\
-  "prfm   pldl1keep, [%[b_ptr], #384]\n"                                   \
-".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\
-".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\
-".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\
-".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\
-".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\
-".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\
-  "ldp    q0, q1, [%[a_ptr]], #32\n"    /* load a00, a01 to q0, q1 */      \
-  /* unrool 1 */                                                           \
-".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\
-".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\
-".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\
-  "prfm   pldl1keep, [%[a_ptr], #256]\n"                                   \
-".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\
-".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\
-".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\
-".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\
-".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\
-  "ldp    q6, q7, [%[b_ptr]], #32\n"    /* load b0, b1 to q6, q7       */  \
-".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\
-".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\
-".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\
-".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\
-".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\
-".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\
-".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\
-".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\
-".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\
-".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\
-".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\
-".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\
-".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\
-".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\
-".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\
-".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\
-  "ldp    q4, q5, [%[b_ptr]], #32\n"    /* load b2, b0 to q4, q5 */        \
-  /* unrool 2*/                                                            \
-".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\
-".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\
-  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q3, q4*/      \
-".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\
-".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\
-".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\
-".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\
-".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\
-".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\
-".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\
-".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\
-  "prfm   pldl1keep, [%[b_ptr], #384]\n"                                   \
-".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\
-".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\
-".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\
-".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\
-".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\
-".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\
-  "ldp    q6, q7, [%[b_ptr]], #32\n"    /* load b1, b2 to q6, q7*/         \
-".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\
-".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\
-".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\
-  "ldp    q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/          \
-  /* unrool 3*/                                                            \
-".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\
-".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\
-".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\
-".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\
-".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\
-".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\
-".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\
-".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\
-  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b0, b1 to q4, q5*/        \
-".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\
-".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\
-  "prfm   pldl1keep, [%[a_ptr], #256]\n"                                   \
-".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\
-".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\
-".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\
-".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\
-".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\
-  "prfm   pldl1keep, [%[b_ptr], #384]\n"                                   \
-".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\
-".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\
-".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\
-".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\
-".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\
-".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\
-".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\
-  "subs   %w[k], %w[k], #1\n"           /* loop count - 1*/                \
-".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\
-".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\
-  "bne    1b\n" /* Target to use when K is 1 or 2 */                       \
-  "2:\n"                                             /* process tail*/     \
-  "subs       %w[tail], %w[tail], #1\n"              /* tail--*/           \
-  "beq        3f\n" /*jump to tail = 1*/                                   \
-  /* final unrool 0, unrool 0, tail > 1*/                                  \
-".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
-  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b2, b0 to q6, q7*/        \
-".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
-  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q2, q3*/      \
-".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
-  "subs   %w[tail], %w[tail], #1\n"      /* tail--*/                       \
-".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
-".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
-".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
-".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
-".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
-".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
-".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
-".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
-  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b1, b2 to q4, q5*/        \
-".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\
-".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\
-".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\
-".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\
-".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\
-".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\
-".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\
-".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\
-  "beq        4f\n" /*jump to tail = 2*/                                   \
-  /* unrool 1, tail > 2*/                                                  \
-  "ldp    q0, q1, [%[a_ptr]], #32\n"     /* load a00, a01 to q0, q1*/      \
-".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\
-".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\
-".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\
-".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\
-".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\
-".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\
-".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\
-".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\
-  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b0, b1 to q6, q7*/        \
-".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\
-".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\
-".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\
-".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\
-".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\
-".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\
-".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\
-".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\
-  "subs   %w[tail], %w[tail], #1\n"      /* tail--*/                       \
-".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\
-".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\
-".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\
-".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\
-".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\
-".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\
-".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\
-".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\
-  "beq        5f\n" /*jump to tail = 3*/                                   \
-  /* unrool 2, tail = 4*/                                                  \
-  "ldp    q4, q5, [%[b_ptr]], #32\n"     /* load b2, b0 to q4, q5*/        \
-".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\
-".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\
-  "ldp    q2, q3, [%[a_ptr]], #32\n"     /* load a10, a11 to q3, q4*/      \
-".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\
-".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\
-".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\
-".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\
-".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\
-".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\
-".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\
-".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\
-".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\
-".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\
-".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\
-".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\
-".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\
-".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\
-  "ldp    q6, q7, [%[b_ptr]], #32\n"     /* load b1, b2 to q6, q7*/        \
-".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\
-".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\
-".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\
-  /* unrool 3, tail = 4*/                                                  \
-".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\
-".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\
-".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\
-".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\
-".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\
-".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\
-".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\
-".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\
-".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\
-".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\
-".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\
-".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\
-".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\
-".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\
-".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\
-".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\
-".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\
-".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\
-".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\
-".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\
-".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\
-".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\
-".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\
-".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\
-  "b      11f\n"                         /* tails==1 final tail*/          \
-  "3: \n"                                /* tail=1*/                       \
-  "ldr    q6, [%[b_ptr]], #16\n"         /* load b2 to q6*/                \
-".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
-".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
-".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
-".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
-".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
-".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
-".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
-".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
-".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
-".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
-".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
-".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\
-".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\
-".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\
-".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\
-".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\
-".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\
-".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\
-".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\
-  "b      11f\n"                         /* tails==2 final tail*/          \
-  "4:\n"                                 /* tail = 2*/                     \
-".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\
-".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\
-".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\
-".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\
-".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\
-".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\
-".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\
-".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\
-".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\
-".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\
-".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\
-".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\
-".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\
-".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\
-".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\
-".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\
-".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\
-".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\
-".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\
-".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\
-".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\
-".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\
-".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\
-".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\
-  "b      11f\n"                         /* tails==3 final tail*/          \
-  "5:\n"                                 /* tail = 3*/                     \
-  "ldr    q4, [%[b_ptr]], #16\n"         /* load b2, b0 to q4*/            \
-".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\
-".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\
-".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\
-".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\
-".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\
-".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\
-".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\
-".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\
-".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\
-".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\
-".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\
-".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\
-".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\
-".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\
-".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\
-".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\
-".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\
-".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\
-".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\
-  "11: \n"                               /* end */
-
-#define GEMM_SDOT_INT8_KERNEL_8x8     \
-  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                     \
-  "eor    v8.16b,  v8.16b,  v8.16b \n"     /* out0 = 0 */                   \
-  "eor    v11.16b, v11.16b, v11.16b\n"     /* out0 = 0 */                   \
-  "eor    v14.16b, v14.16b, v14.16b\n"     /* out0 = 0 */                   \
-  "eor    v17.16b, v17.16b, v17.16b\n"     /* out0 = 0 */                   \
-  "eor    v20.16b, v20.16b, v20.16b\n"     /* out0 = 0 */                   \
-  "eor    v23.16b, v23.16b, v23.16b\n"     /* out0 = 0 */                   \
-  "eor    v26.16b, v26.16b, v26.16b\n"     /* out0 = 0 */                   \
-  "eor    v29.16b, v29.16b, v29.16b\n"     /* out0 = 0 */                   \
-  "prfm   pldl1keep, [%[b_ptr], #64]\n"  /* preload b*/                     \
-  "eor    v9.16b,  v9.16b,  v9.16b \n"     /* out0 = 0 */                   \
-  "eor    v12.16b, v12.16b, v12.16b\n"     /* out0 = 0 */                   \
-  "eor    v15.16b, v15.16b, v15.16b\n"     /* out0 = 0 */                   \
-  "eor    v18.16b, v18.16b, v18.16b\n"     /* out0 = 0 */                   \
-  "eor    v21.16b, v21.16b, v21.16b\n"     /* out0 = 0 */                   \
-  "eor    v24.16b, v24.16b, v24.16b\n"     /* out0 = 0 */                   \
-  "eor    v27.16b, v27.16b, v27.16b\n"     /* out0 = 0 */                   \
-  "eor    v30.16b, v30.16b, v30.16b\n"     /* out0 = 0 */                   \
-  "1:\n"                                                                    \
-  "ldp    q0, q1, [%[a_ptr]], #32\n"                                        \
-  "ldp    q4, q5, [%[b_ptr]], #32\n"                                        \
-".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
-".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
-  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                     \
-".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
-".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
-  "prfm   pldl1keep, [%[a_ptr], #128]\n"  /* preload b*/                    \
-  "prfm   pldl1keep, [%[b_ptr], #64]\n"  /* preload b*/                     \
-".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\
-".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\
-".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\
-".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\
-  "prfm   pldl1keep, [%[b_ptr], #128]\n"  /* preload b*/                    \
-".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\
-".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\
-".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\
-".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\
-  "subs %w[k], %w[k], #1\n"                                                 \
-  "bne 1b\n"
-
-#define GEMM_SDOT_INT8_KERNEL_8x4     \
-  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                      \
-  "eor    v8.16b,  v8.16b,  v8.16b \n"     /* out0 = 0 */                    \
-  "eor    v11.16b, v11.16b, v11.16b\n"     /* out0 = 0 */                    \
-  "eor    v14.16b, v14.16b, v14.16b\n"     /* out0 = 0 */                    \
-  "eor    v17.16b, v17.16b, v17.16b\n"     /* out0 = 0 */                    \
-  "prfm   pldl1keep, [%[b_ptr], #32]\n"  /* preload b*/                      \
-  "eor    v20.16b, v20.16b, v20.16b\n"     /* out0 = 0 */                    \
-  "eor    v23.16b, v23.16b, v23.16b\n"     /* out0 = 0 */                    \
-  "eor    v26.16b, v26.16b, v26.16b\n"     /* out0 = 0 */                    \
-  "eor    v29.16b, v29.16b, v29.16b\n"     /* out0 = 0 */                    \
-  "1:\n"                              \
-  "ldp    q0, q1, [%[a_ptr]], #32\n"  \
-  "ldr    q4,  [%[b_ptr]], #16\n"     \
-".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\
-".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\
-  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                      \
-".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\
-".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\
-  "prfm   pldl1keep, [%[a_ptr], #64]\n"  /* preload a*/                      \
-".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\
-".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\
-  "prfm   pldl1keep, [%[b_ptr], #32]\n"  /* preload b*/                      \
-".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\
-".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\
-  "subs %w[k], %w[k], #1\n"           \
-  "bne 1b\n"
diff --git a/lite/backends/arm/math/dotprod/gemm_vsdot.h b/lite/backends/arm/math/dotprod/gemm_vsdot.h
deleted file mode 100644
index 9929ade9b95..00000000000
--- a/lite/backends/arm/math/dotprod/gemm_vsdot.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-// clang-format off
-#define GEMM_DOT_INT8_KERNEL                                           \
-  "vld1.s8  {q0}, [%[a_ptr]]!  \n"     /* load a00,a01 to q0, q1*/       \
-  "vld1.s8  {d2}, [%[a_ptr]]!  \n"     /* load a00,a01 to q0, q1*/       \
-  "veor.s32    q4,  q4, q4     \n"     /* out0 = 0 */                    \
-  "veor.s32    q5,  q5, q5     \n"     /* out0 = 0 */                    \
-  "veor.s32    q6,  q6, q6     \n"     /* out0 = 0 */                    \
-  "veor.s32    q7,  q7, q7     \n"     /* out0 = 0 */                    \
-  "veor.s32    q8,  q8, q8     \n"     /* out0 = 0 */                    \
-  "veor.s32    q9,  q9, q9     \n"     /* out0 = 0 */                    \
-  "veor.s32    q10,  q10, q10  \n"     /* out0 = 0 */                    \
-  "veor.s32    q11,  q11, q11  \n"     /* out0 = 0 */                    \
-  "veor.s32    q12,  q12, q12  \n"     /* out0 = 0 */                    \
-  "veor.s32    q13,  q13, q13  \n"     /* out0 = 0 */                    \
-  "veor.s32    q14,  q14, q14  \n"     /* out0 = 0 */                    \
-  "veor.s32    q15,  q15, q15  \n"     /* out0 = 0 */                    \
-  "cmp   %[k], #0              \n"                                       \
-  "beq   2f                    \n"                                       \
-  "1:                          \n"                                       \
-  "vld1.s8  {q2}, [%[b_ptr]]!  \n"                                       \
-  "vld1.s8  {q3}, [%[b_ptr]]!  \n"                                       \
-".word 0x8d40fe24\n" /* vsdot.s8 q4, q2, d0[0] */\
-".word 0xcd60fe24\n" /* vsdot.s8 q6, q2, d0[1] */\
-".word 0x0d41fe64\n" /* vsdot.s8 q8, q2, d1[0] */\
-".word 0x4d61fe64\n" /* vsdot.s8 q10, q2, d1[1] */\
-".word 0x8d42fe64\n" /* vsdot.s8 q12, q2, d2[0] */\
-".word 0xcd62fe64\n" /* vsdot.s8 q14, q2, d2[1] */\
-".word 0xad40fe26\n" /* vsdot.s8 q5, q3, d0[0] */\
-".word 0xed60fe26\n" /* vsdot.s8 q7, q3, d0[1] */\
-".word 0x2d41fe66\n" /* vsdot.s8 q9, q3, d1[0] */\
-".word 0x6d61fe66\n" /* vsdot.s8 q11, q3, d1[1] */\
-".word 0xad42fe66\n" /* vsdot.s8 q13, q3, d2[0] */\
-".word 0xed62fe66\n" /* vsdot.s8 q15, q3, d2[1] */\
-  "vld1.s8  {q0}, [%[a_ptr]]!  \n"     /* load a00,a01 to q0, q1*/       \
-  "vld1.s8  {d2}, [%[a_ptr]]!  \n"     /* load a00,a01 to q0, q1*/       \
-  "subs %[k], %[k], #1         \n"                                       \
-  "bne    1b                   \n"                                       \
-  "2:                          \n"
diff --git a/lite/core/program.cc b/lite/core/program.cc
index 069da1e78eb..a6c2698ca37 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -811,7 +811,7 @@ void Instruction::Run() {
 
 
 
-#if 1
+#if 0
   // clang-format off
   /*
   time_t t;
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index c1fe58927b5..3db0f2c9c93 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -11,11 +11,13 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include <gflags/gflags.h>
 #include <iostream>
 #include <vector>
 #include "paddle_api.h"         // NOLINT
 #include "paddle_use_passes.h"  // NOLINT
+
 /////////////////////////////////////////////////////////////////////////
 // If this demo is linked to static library:libpaddle_api_full_bundled.a
 // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to
@@ -25,7 +27,9 @@
 #include "paddle_use_kernels.h"  // NOLINT
 #include "paddle_use_ops.h"      // NOLINT
 #endif
+
 using namespace paddle::lite_api;  // NOLINT
+
 DEFINE_string(model_dir,
               "",
               "Model dir path. Set it when the model is uncombined format.");
@@ -50,11 +54,13 @@ DEFINE_int32(threads, 1, "threads num");
 DEFINE_int32(warmup, 10, "warmup times");
 DEFINE_int32(repeats, 100, "repeats times");
 DEFINE_bool(use_gpu, false, "use opencl backend");
+
 int64_t ShapeProduction(const shape_t& shape) {
   int64_t res = 1;
   for (auto i : shape) res *= i;
   return res;
 }
+
 void RunModel() {
   // 1. Set CxxConfig
   CxxConfig config;
@@ -66,6 +72,7 @@ void RunModel() {
   }
   config.set_power_mode((paddle::lite_api::PowerMode)FLAGS_power_mode);
   config.set_threads(FLAGS_threads);
+
   std::vector<Place> valid_places;
   if (FLAGS_use_gpu) {
     valid_places.emplace_back(
@@ -86,167 +93,51 @@ void RunModel() {
   } else {
     valid_places.emplace_back(Place{TARGET(kARM), PRECISION(kFloat)});
   }
+
   if (FLAGS_prefer_int8_kernel) {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
   }
   config.set_valid_places(valid_places);
+
   // 2. Create PaddlePredictor by CxxConfig
   std::shared_ptr<PaddlePredictor> predictor =
       CreatePaddlePredictor<CxxConfig>(config);
+
   // 3. Save the optimized model
   // WARN: The `predictor->SaveOptimizedModel` method must be executed
   // before the `predictor->Run` method. Because some kernels' `PrepareForRun`
   // method maybe change some parameters' values.
   predictor->SaveOptimizedModel(FLAGS_optimized_model_dir,
                                 LiteModelType::kNaiveBuffer);
+
   // 4. Prepare input data
-  const lod_t lodd = {{0,1},{0,1}};
-  {
-    // src_ids
-    int64_t pre_data[100] = {41, 2, 69, 2, 68, 2, 78, 2, 83, 2, 22, 29, 21, 28,
-    27, 18, 8, 2, 788, 342, 6431, 17, 2, 788, 96, 6431, 6622};
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-    input_tensor->Resize(shape_t({1,27,1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = pre_data[i];
-    }
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize(shape_t({1, 3, 224, 224}));
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
   }
-  {
-    // pos_ids
-    int64_t pre_data[100] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26};
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(1)));
-    input_tensor->Resize(shape_t({1,27,1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = pre_data[i];
-    }
-  }
-  {
-    // input_mask
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(2)));
-    input_tensor->Resize(shape_t({1,27,27}));
-    auto* data = input_tensor->mutable_data<float>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 1;
-    }
-  }
-  {
-    // pos_ids_extra
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(3)));
-    input_tensor->Resize(shape_t({1,27,1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 0;
-    }
-  }
-  {
-    // tgt_ids
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(4)));
-    input_tensor->Resize(shape_t({1,1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 6621;
-    }
-    input_tensor->SetLoD(lodd);
-  }
-  {
-    // tgt_pos
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(5)));
-    input_tensor->Resize(shape_t({1,1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 26;
-    }
-    input_tensor->SetLoD(lodd);
-  }
-  {
-    // init_score
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(6)));
-    input_tensor->Resize(shape_t({1,1}));
-    auto* data = input_tensor->mutable_data<float>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 0;
-    }
-    input_tensor->SetLoD(lodd);
-  }
-  {
-    // parent_idx
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(7)));
-    input_tensor->Resize(shape_t({1,1}));
-    auto* data = input_tensor->mutable_data<int>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 0;
-    }
-  }
-  {
-    // tgt_generation_mask
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(8)));
-    input_tensor->Resize(shape_t({1,1,27}));
-    auto* data = input_tensor->mutable_data<float>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 1;
-    }
-  }
-  {
-    // max_dec_len
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(9)));
-    input_tensor->Resize(shape_t({1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 10;
-    }
-  }
-  {
-    // tgt_pos_extra
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(10)));
-    input_tensor->Resize(shape_t({1,1}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = 1;
-    }
-    input_tensor->SetLoD(lodd);
-  }
-  {
-    // cand_ids
-    int64_t cand[500]={41, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83, 6623, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(11)));
-    input_tensor->Resize(shape_t({5,32}));
-    auto* data = input_tensor->mutable_data<int64_t>();
-    for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-      data[i] = cand[i];
-    }
-  }
- 
+
   // 5. Run predictor
-  for (int j = 0; j < 1; ++j) {
+  for (int i = 0; i < FLAGS_warmup; ++i) {
     predictor->Run();
   }
+
+  for (int j = 0; j < FLAGS_repeats; ++j) {
+    predictor->Run();
+  }
+
   // 6. Get output
-  double sum = 0;
   std::unique_ptr<const Tensor> output_tensor(
       std::move(predictor->GetOutput(0)));
-  std::cout << "Output0 shape " << output_tensor->shape()[0] <<","<< output_tensor->shape()[1] << std::endl;
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) {
-    sum += output_tensor->data<int64_t>()[i] * 1.f;
-  }
-  std::cout << "output0 mean is "<<sum / ShapeProduction(output_tensor->shape())<<"\n";
-  sum = 0;
-  std::unique_ptr<const Tensor> output_tensor1(
-      std::move(predictor->GetOutput(1)));
-  std::cout << "Output1 shape " << output_tensor1->shape()[0] <<","<< output_tensor1->shape()[1] << std::endl;
-  for (int i = 0; i < ShapeProduction(output_tensor1->shape()); i++) {
-    sum += output_tensor1->data<float>()[i] * 1.f;
+    std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
+              << std::endl;
   }
-  std::cout << "output1 mean is "<<sum / ShapeProduction(output_tensor1->shape())<<"\n";
 }
+
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
   if (FLAGS_model_dir.empty() &&
@@ -271,6 +162,7 @@ int main(int argc, char** argv) {
         << " --use_gpu=false              bool    Use gpu or not.\n";
     exit(1);
   }
+
   RunModel();
   return 0;
 }
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
index e493bebfc50..bb430c8d8f6 100644
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
@@ -1,336 +1,364 @@
-#include <chrono>  // NOLINT(build/c++11)
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/time.h>
+#include <time.h>
 #include <cmath>
 #include <iostream>
+#include <string>
 #include <vector>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <map>
-#include <stdexcept>
-#include "paddle_api.h"  // NOLINT
 
-#define IPTCORE_PADDLE_MOBILE
-#define IPTCORE_PADDLE_BENCHMARK
+#include "paddle_api.h"  // NOLINT
 /////////////////////////////////////////////////////////////////////////
-// If this demo is linked to static library:libpaddle_api_full_bundled.a
+// If this demo is linked to static library:libpaddle_api_light_bundled.a
 // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to
 // avoid linking errors such as `unsupport ops or kernels`.
 /////////////////////////////////////////////////////////////////////////
-#ifdef IPTCORE_PADDLE_MOBILE
-#else
-#ifdef _WIN32
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-#endif
-#endif
+// #include "paddle_use_kernels.h"  // NOLINT
+// #include "paddle_use_ops.h"      // NOLINT
 
-#ifdef IPTCORE_PADDLE_BENCHMARK
-class Timer {
-private:
-    std::chrono::high_resolution_clock::time_point inTime, outTime;
-
-public:
-    void startTimer() { inTime = std::chrono::high_resolution_clock::now(); }
-
-    // unit millisecond
-    float getCostTimer() {
-        outTime = std::chrono::high_resolution_clock::now();
-        return static_cast<float>(
-            std::chrono::duration_cast<std::chrono::microseconds>(outTime - inTime)
-                .count() /
-                1e+3);
-    }
-};
-#endif
+using namespace paddle::lite_api;  // NOLINT
 
-template<typename T>
-double compute_mean(const T* in, const size_t length) {
-    double sum = 0.;
-    for (size_t i = 0; i < length; ++i) {
-        sum += in[i];
-    }
-    return sum / length;
+int64_t ShapeProduction(const shape_t& shape) {  // product of all dims
+  int64_t res = 1;  // so an empty shape yields 1
+  for (auto i : shape) res *= i;
+  return res;
 }
 
-template<typename T>
-double compute_standard_deviation(const T* in,
-                                  const size_t length,
-                                  bool has_mean = false,
-                                  double mean = 10000) {
-    if (!has_mean) {
-        mean = compute_mean<T>(in, length);
+// Renders each shape as "d0,d1,...," (every dim is followed by a comma) and
+// joins the shapes with " : "; nothing is appended after the last shape.
+std::string ShapePrint(const std::vector<shape_t>& shapes) {
+  std::string shapes_str;
+  for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) {
+    // Iterate the stored shape in place instead of copying it first.
+    for (auto i : shapes[shape_idx]) {
+      shapes_str += std::to_string(i) + ",";
     }
+    // Separator only between shapes, so a single shape has no trailing " : ".
+    if (shape_idx + 1 != shapes.size()) shapes_str += " : ";
+  }
+  return shapes_str;
+}
 
-    double variance = 0.;
-    for (size_t i = 0; i < length; ++i) {
-        variance += pow((in[i] - mean), 2);
-    }
-    variance /= length;
-    return sqrt(variance);
+// Formats a single shape: every dim is rendered with std::to_string and is
+// followed by one space (so the result ends with a space when non-empty).
+std::string ShapePrint(const shape_t& shape) {
+  std::string text;
+  for (const auto dim : shape) text.append(std::to_string(dim)).append(" ");
+  return text;
 }
 
-int64_t shape_production(const paddle::lite_api::shape_t& shape) {
-    int64_t res = 1;
-    for (auto i : shape) {
-        res *= i;
+// Splits str_in on ':' (empty input or a trailing ':' adds no empty field).
+std::vector<std::string> split_string(const std::string& str_in) {
+  std::vector<std::string> fields;
+  size_t begin = 0;  // offset of the current field; avoids re-copying the tail
+  while (begin < str_in.size()) {
+    const size_t colon = str_in.find(':', begin);
+    fields.push_back(str_in.substr(begin, colon - begin));  // clamps at end
+    if (colon == std::string::npos) {
+      break;
     }
-    return res;
+    begin = colon + 1;
+  }
+  return fields;
 }
 
-class InputData {
-public:
-    int _type = -1; ///int32, int64, float32
-    bool _lod = false;
-    std::vector<int64_t> _shape;
-    std::vector<int32_t> _int32_data;
-    std::vector<int64_t> _int64_data;
-    std::vector<float> _float32_data;
-    std::vector<std::vector<uint64_t>> _lod_data = {{0, 1}, {0, 1}};
-};
-
-class UserPersonaInfer {
-public:
-#ifdef IPTCORE_PADDLE_MOBILE
-    void create_paddle_light_predictor(const std::string& model_file);
-#else
-    void create_paddle_full_predictor(const std::string& model_dir);
-#endif
-    void prepare(const std::string& path);
-    void infer();
-private:
-    void infer_specific_item(paddle::lite_api::PaddlePredictor *predictor);
-    std::shared_ptr<paddle::lite_api::PaddlePredictor> _paddle_predictor;
-    std::vector<std::map<std::string, InputData> > _batch;
-};
-
-#ifdef IPTCORE_PADDLE_MOBILE
-void UserPersonaInfer::create_paddle_light_predictor(const std::string& model_file) {
-    // 1. Set MobileConfig
-    paddle::lite_api::MobileConfig config;
-    config.set_model_from_file(model_file);
-    config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
-    // 2. Create PaddlePredictor by MobileConfig
-    _paddle_predictor =
-        paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(config);
-}
-#else
-void UserPersonaInfer::create_paddle_full_predictor(const std::string& model_dir) {
-    // 1. Create CxxConfig
-    paddle::lite_api::CxxConfig config;
-    config.set_model_dir(model_dir);
-    config.set_valid_places({paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
-                                paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
-    // 2. Create PaddlePredictor by CxxConfig
-    _paddle_predictor =
-        paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);
+// Parses a comma-separated dim list such as "1,3,224,224". Every field goes
+// through atoi(), so a field without a leading integer contributes 0.
+std::vector<int64_t> get_shape(const std::string& str_shape) {
+  std::vector<int64_t> shape;
+  // Walk the input by offset rather than re-copying the unparsed tail.
+  size_t begin = 0;
+  while (begin < str_shape.size()) {
+    // atoi() parses the leading integer and ignores the rest of the buffer.
+    shape.push_back(atoi(str_shape.c_str() + begin));
+    const size_t comma = str_shape.find(',', begin);
+    if (comma == std::string::npos) break;
+    begin = comma + 1;  // a trailing ',' ends the loop without a new field
+  }
+  return shape;
 }
-#endif
-namespace {
-using namespace std;
-template <class T>
-void extract_num(const string &str, vector<T> &results) {
-    stringstream ss;
-
-    /* Storing the whole string into string stream */
-    ss << str;
 
-    /* Running loop till the end of the stream */
-    string temp;
-    T found;
-    while (!ss.eof()) {
-
-        /* extracting word by word from stream */
-        ss >> temp;
-
-        /* Checking the given word is integer or not */
-        if (stringstream(temp) >> found)
-            results.emplace_back(found);
+// Arithmetic mean of the `length` values starting at `in`, accumulated in
+// double. A zero `length` divides 0.0 by 0 rather than being special-cased.
+template <typename T>
+double compute_mean(const T* in, const size_t length) {
+  double total = 0.;
+  for (const T* it = in; it != in + length; ++it) total += *it;
+  return total / length;
+}
 
-        /* To save from space at the end of string */
-        temp = "";
-    }
+// Population standard deviation of the `length` values starting at `in`.
+// `mean` is used only when `has_mean` is true; else compute_mean() is called.
+template <typename T>
+double compute_standard_deviation(const T* in,
+                                  const size_t length,
+                                  bool has_mean = false,
+                                  double mean = 10000) {
+  if (!has_mean) mean = compute_mean<T>(in, length);
+  double sq_sum = 0.;
+  for (size_t i = 0; i < length; ++i) {
+    const double diff = in[i] - mean;
+    sq_sum += diff * diff;  // plain multiply instead of pow(diff, 2)
+  }
+  // The sum of squared deviations is divided by `length` (not length - 1).
+  return sqrt(sq_sum / length);
 }
+
+inline double GetCurrentUS() {  // current gettimeofday() time in microseconds
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;  // seconds scaled to us + us part
 }
 
-void UserPersonaInfer::prepare(const std::string& path) {
-    ///xia_i	186	tgt_generation_mask	float32	(1, 1, 33)	[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
-    std::ifstream in(path.c_str());
-    std::string line;
-    std::string current_idx;
-    while (std::getline(in, line)) {
-        if (line.empty()) {
-            break;
-        }
-        if (line.back() == '\r') {
-            line.pop_back();
-        }
-        if (line.empty()) {
-            break;
-        }
-        std::vector<std::string> strings;
-        std::istringstream f(line);
-        std::string s;
-        while (getline(f, s, '\t')) {
-            strings.push_back(s);
-        }
-        if (current_idx != strings.at(1)) {
-            _batch.push_back(std::map<std::string, InputData>());
-            current_idx = strings[1];
-        }
-        if (strings.at(2) == "lods") {
-            if (strings.at(3) != "[[0, 1], [0, 1]]") {
-                throw std::invalid_argument("invalid lod");
-            }
-            continue;
-        }
-        auto& input_data = _batch.back()[strings.at(2)];
-
-        extract_num(strings.at(4), input_data._shape);
-        if (strings[0] == "lod_i") {
-            input_data._lod = true;
-        }
-        if (strings.at(3) == "int32") {
-            input_data._type = 0;
-            extract_num(strings.at(5), input_data._int32_data);
-        } else if (strings.at(3) == "int64") {
-            input_data._type = 1;
-            extract_num(strings.at(5), input_data._int64_data);
-        } else if (strings.at(3) == "float32") {
-            input_data._type = 2;
-            extract_num(strings.at(5), input_data._float32_data);
-        } else {
-            throw std::invalid_argument("invalid type");
-        }
+void RunModel(std::string model_dir,
+              const std::vector<shape_t>& input_shapes,
+              size_t repeats,
+              size_t warmup,
+              size_t power_mode,
+              size_t thread_num,
+              size_t accelerate_opencl,
+              size_t print_output_elem) {
+  // 1. Set MobileConfig
+  MobileConfig config;
+  config.set_model_from_file(model_dir);
+
+#ifdef METAL
+  std::string metal_lib_path = "../../../metal/lite.metallib";
+  config.set_metal_lib_path(metal_lib_path);
+  config.set_metal_use_mps(true);
+#else
+  // NOTE: To use the Android GPU with OpenCL, you should ensure:
+  //  first, [compile **cpu+opencl** paddlelite
+  //    lib](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md);
+  //  second, [convert and use opencl nb
+  //    model](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md).
+
+  bool is_opencl_backend_valid =
+      ::IsOpenCLBackendValid(/*check_fp16_valid = false*/);
+  std::cout << "is_opencl_backend_valid:"
+            << (is_opencl_backend_valid ? "true" : "false") << std::endl;
+  if (is_opencl_backend_valid) {
+    if (accelerate_opencl != 0) {
+      // Set opencl kernel binary.
+      // Significant additional prepare time is spent on algorithm selection
+      // and on building kernels from source code.
+      // Prepare time drops dramatically once the algorithm file and the OpenCL
+      // kernel binary have been built during the first run.
+      // The 1st running time will be a bit longer due to the compiling time if
+      // you don't call `set_opencl_binary_path_name` explicitly.
+      // So call `set_opencl_binary_path_name` explicitly is strongly
+      // recommended.
+
+      // Make sure you have write permission of the binary path.
+      // We strongly recommend each model has a unique binary name.
+      const std::string bin_path = "/data/local/tmp/";
+      const std::string bin_name = "lite_opencl_kernel.bin";
+      config.set_opencl_binary_path_name(bin_path, bin_name);
+
+      // opencl tune option
+      // CL_TUNE_NONE: 0
+      // CL_TUNE_RAPID: 1
+      // CL_TUNE_NORMAL: 2
+      // CL_TUNE_EXHAUSTIVE: 3
+      const std::string tuned_path = "/data/local/tmp/";
+      const std::string tuned_name = "lite_opencl_tuned.bin";
+      config.set_opencl_tune(CL_TUNE_NORMAL, tuned_path, tuned_name);
+
+      // opencl precision option
+      // CL_PRECISION_AUTO: 0, first fp16 if valid, default
+      // CL_PRECISION_FP32: 1, force fp32
+      // CL_PRECISION_FP16: 2, force fp16
+      config.set_opencl_precision(CL_PRECISION_FP16);
+    }
+  } else {
+    std::cout << "*** nb model will be running on cpu. ***" << std::endl;
+    // you can give backup cpu nb model instead
+    // config.set_model_from_file(cpu_nb_model_dir);
+  }
+#endif
 
+  // NOTE: To load model transformed by model_optimize_tool before
+  // release/v2.3.0, please use `set_model_dir` API as listed below.
+  // config.set_model_dir(model_dir);
+  config.set_power_mode(static_cast<paddle::lite_api::PowerMode>(power_mode));
+  config.set_threads(thread_num);
+  // 2. Create PaddlePredictor by MobileConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // 3. Prepare input data
+  std::cout << "input_shapes.size():" << input_shapes.size() << std::endl;
+  for (int j = 0; j < input_shapes.size(); ++j) {
+    auto input_tensor = predictor->GetInput(j);
+    input_tensor->Resize(input_shapes[j]);
+    auto input_data = input_tensor->mutable_data<float>();
+    int input_num = 1;
+    for (int i = 0; i < input_shapes[j].size(); ++i) {
+      input_num *= input_shapes[j][i];
     }
-}
 
-void UserPersonaInfer::infer_specific_item(paddle::lite_api::PaddlePredictor *predictor){
-    static int count = 0;
-    if (_batch.empty()) {
-        return;
+    for (int i = 0; i < input_num; ++i) {
+      input_data[i] = 1.f;
     }
-    auto &inputs = _batch[count];
-    auto names = predictor->GetInputNames();
-    for (auto &name : names) {
-        auto& input = inputs[name];
-        auto tensor = predictor->GetInputByName(name);
-        tensor->Resize(input._shape);
-        if (input._type == 0) {
-            auto input_data = tensor->mutable_data<int32_t>();
-            std::copy(input._int32_data.begin(), input._int32_data.end(), input_data);
-        } else if (input._type == 1) {
-            auto input_data = tensor->mutable_data<int64_t>();
-            std::copy(input._int64_data.begin(), input._int64_data.end(), input_data);
-        } else if (input._type == 2) {
-            auto input_data = tensor->mutable_data<float>();
-            std::copy(input._float32_data.begin(), input._float32_data.end(), input_data);
-        } else {
-            throw std::invalid_argument("invalid name");
-        }
-        if (input._lod) {
-            tensor->SetLoD(input._lod_data);
-        }
+  }
+
+  // 4. Run predictor
+  double first_duration{-1};
+  for (size_t widx = 0; widx < warmup; ++widx) {
+    if (widx == 0) {
+      auto start = GetCurrentUS();
+      predictor->Run();
+      first_duration = (GetCurrentUS() - start) / 1000.0;
+    } else {
+      predictor->Run();
     }
+  }
 
-    predictor->Run();
-
-    std::cout << "\n";
-    for (int idx = 0; idx != 2; ++idx) {
-        auto output_tensor = predictor->GetOutput(idx);
-        auto total_size = shape_production(output_tensor->shape());
-        std::cout << "xiarj_" << count << "\t";
-        for (int i = 0; i < total_size; ++i) {
-            if (idx == 0) {
-                std::cout << output_tensor->data<int64_t>()[i] << "\t";
-            } else {
-                std::cout << output_tensor->data<float>()[i] << "\t";
-            }
-        }
-        std::cout << "\n";
-    }
-    std::cout << std::flush;
+  double sum_duration = 0.0;  // millisecond;
+  double max_duration = 1e-5;
+  double min_duration = 1e5;
+  double avg_duration = -1;
+  for (size_t ridx = 0; ridx < repeats; ++ridx) {
+    auto start = GetCurrentUS();
 
-    if (++count == _batch.size()){
-        count = 0;
-    }
-}
+    predictor->Run();
 
-void UserPersonaInfer::infer() {
-    static int idx = 0;
-    auto predictor = _paddle_predictor.get();
-    if (!predictor) {
-        return;
+    auto duration = (GetCurrentUS() - start) / 1000.0;
+    sum_duration += duration;
+    max_duration = duration > max_duration ? duration : max_duration;
+    min_duration = duration < min_duration ? duration : min_duration;
+    std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration
+              << " ms" << std::endl;
+    if (first_duration < 0) {
+      first_duration = duration;
     }
-    // 3. Prepare input data
-
-    // 4. Run predictor
-#ifdef IPTCORE_PADDLE_BENCHMARK
-    int warmup = 10;
-    int repeats = 400;
-    Timer timeInstance;
-    double first_duration{-1};
-    for (size_t widx = 0; widx < warmup; ++widx) {
-        if (widx == 0) {
-            timeInstance.startTimer();
-            infer_specific_item(predictor);
-            first_duration = timeInstance.getCostTimer();
-        } else {
-            infer_specific_item(predictor);
-        }
+  }
+  avg_duration = sum_duration / static_cast<float>(repeats);
+  std::cout << "\n======= benchmark summary =======\n"
+            << "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n"
+            << "model_dir:" << model_dir << "\n"
+            << "warmup:" << warmup << "\n"
+            << "repeats:" << repeats << "\n"
+            << "power_mode:" << power_mode << "\n"
+            << "thread_num:" << thread_num << "\n"
+            << "*** time info(ms) ***\n"
+            << "1st_duration:" << first_duration << "\n"
+            << "max_duration:" << max_duration << "\n"
+            << "min_duration:" << min_duration << "\n"
+            << "avg_duration:" << avg_duration << "\n";
+
+  // 5. Get output
+  std::cout << "\n====== output summary ====== " << std::endl;
+  size_t output_tensor_num = predictor->GetOutputNames().size();
+  std::cout << "output tensor num:" << output_tensor_num << std::endl;
+
+  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
+    std::unique_ptr<const paddle::lite_api::Tensor> output_tensor =
+        predictor->GetOutput(tidx);
+    std::cout << "\n--- output tensor " << tidx << " ---" << std::endl;
+    auto out_shape = output_tensor->shape();
+    auto out_data = output_tensor->data<float>();
+    auto out_mean = compute_mean<float>(out_data, ShapeProduction(out_shape));
+    auto out_std_dev = compute_standard_deviation<float>(
+        out_data, ShapeProduction(out_shape), true, out_mean);
+
+    std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl;
+    std::cout << "output tensor " << tidx
+              << " elem num:" << ShapeProduction(out_shape) << std::endl;
+    std::cout << "output tensor " << tidx
+              << " standard deviation:" << out_std_dev << std::endl;
+    std::cout << "output tensor " << tidx << " mean value:" << out_mean
+              << std::endl;
+
+    // print output
+    if (print_output_elem) {
+      for (int i = 0; i < ShapeProduction(out_shape); ++i) {
+        std::cout << "out[" << tidx << "][" << i
+                  << "]:" << output_tensor->data<float>()[i] << std::endl;
+      }
     }
-
-    double sum_duration = 0.0;
-    double max_duration = 1e-5;
-    double min_duration = 1e5;
-    double avg_duration = -1;
-    for (size_t ridx = 0; ridx < repeats; ++ridx) {
-        timeInstance.startTimer();
-
-        infer_specific_item(predictor);
-
-        double duration = timeInstance.getCostTimer();
-        sum_duration += duration;
-        max_duration = duration > max_duration ? duration : max_duration;
-        min_duration = duration < min_duration ? duration : min_duration;
-//        std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration
-//                  << " ms" << std::endl;
-        if (first_duration < 0) {
-            first_duration = duration;
-        }
-    }
-    avg_duration = sum_duration / static_cast<float>(repeats);
-    std::cout << "\n======= benchmark summary =======\n"
-              << "warmup:" << warmup << "\n"
-              << "repeats:" << repeats << "\n"
-              << "*** time info(ms) ***\n"
-              //<< "1st_duration:" << first_duration << "\n"
-              << "max_duration:" << max_duration << "\n"
-              << "min_duration:" << min_duration << "\n"
-              << "avg_duration:" << avg_duration << "\n";
-#else
-    infer_specific_item(predictor);
-#endif
-
-    // 5. Get output
+  }
 }
 
 int main(int argc, char** argv) {
-    UserPersonaInfer user_persona_infer;
-#ifdef IPTCORE_PADDLE_MOBILE
-//    user_persona_infer.create_paddle_light_predictor(
-//        "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\model_x86.nb");
-    user_persona_infer.create_paddle_light_predictor(
-        "./model_naive_buffer_arm.nb");
-    std::cout << "xiarj" << std::endl;
-#else
-//    user_persona_infer.create_paddle_full_predictor(
-//        "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\honor_2_11\\cls_ernie_3.0_tiny_fc_ch_dy_15_3L128H_decrypt_inference_1");
-#endif
-    //user_persona_infer.prepare("D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\xia.txt");
-    user_persona_infer.prepare("./xia.txt");
-    user_persona_infer.infer();
-
+  std::vector<std::string> str_input_shapes;
+  std::vector<shape_t> input_shapes{
+      {1, 3, 224, 224}};  // shape_t ==> std::vector<int64_t>
+
+  int repeats = 10;
+  int warmup = 10;
+  // set arm power mode:
+  // 0 for big cluster, high performance
+  // 1 for little cluster
+  // 2 for all cores
+  // 3 for no bind
+  size_t power_mode = 0;
+  size_t thread_num = 1;
+  int accelerate_opencl = 1;
+  int print_output_elem = 0;
+
+  if (argc < 2 || (argc > 2 && argc < 9)) {  // argv[1] is mandatory
+    std::cerr
+        << "usage: ./" << argv[0] << "\n"
+        << "  <naive_buffer_model_dir>\n"
+        << "  <raw_input_shapes>, eg: 1,3,224,224 for 1 input; "
+           "1,3,224,224:1,5 for 2 inputs\n"
+        << "  <repeats>, eg: 100\n"
+        << "  <warmup>, eg: 10\n"
+        << "  <power_mode>, 0: big cluster, high performance\n"
+           "                1: little cluster\n"
+           "                2: all cores\n"
+           "                3: no bind\n"
+        << "  <thread_num>, eg: 1 for single thread \n"
+        << "  <accelerate_opencl>, this option takes effect only when model "
+           "can be running on opencl backend.\n"
+           "                       0: disable opencl kernel cache & tuning\n"
+           "                       1: enable opencl kernel cache & tuning\n"
+        << "  <print_output>, 0: disable print outputs to stdout\n"
+           "                  1: enable print outputs to stdout\n"
+        << std::endl;
     return 0;
-}
+  }
+
+  std::string model_dir = argv[1];
+  if (argc >= 9) {
+    input_shapes.clear();
+    std::string raw_input_shapes = argv[2];
+    std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl;
+    str_input_shapes = split_string(raw_input_shapes);
+    for (size_t i = 0; i < str_input_shapes.size(); ++i) {
+      std::cout << "input shape: " << str_input_shapes[i] << std::endl;
+      input_shapes.push_back(get_shape(str_input_shapes[i]));
+    }
 
+    repeats = atoi(argv[3]);
+    warmup = atoi(argv[4]);
+    power_mode = atoi(argv[5]);
+    thread_num = atoi(argv[6]);
+    accelerate_opencl = atoi(argv[7]);
+    print_output_elem = atoi(argv[8]);
+  }
+
+  RunModel(model_dir,
+           input_shapes,
+           repeats,
+           warmup,
+           power_mode,
+           thread_num,
+           accelerate_opencl,
+           print_output_elem);
+
+  return 0;
+}
diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
deleted file mode 100644
index 234ec1c85e3..00000000000
--- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-cmake_minimum_required(VERSION 2.8)
-project(mobilenet_full_api)
-set(TARGET mobilenet_full_api)
-
-# 1. path to Paddle-Lite lib and mklml lib
-set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
-set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
-
-if (WIN32)
-  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  set(MSVC_STATIC_CRT )
-  if(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-  else(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
-  endif(MSVC_STATIC_CRT)
-endif()
-
-if (APPLE AND METAL)
-  message(STATUS "set METAL=ON")
-  add_definitions("-DMETAL")
-  find_library(METAL_LIBRARY Metal REQUIRED)
-  find_library(GRAPHIC CoreGraphics REQUIRED)
-  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
-endif()
-
-# 2. link mklml and Paddle-Lite directory
-link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
-include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
-
-# 3. compile options 
-if (NOT WIN32)
-  add_definitions(-std=c++11 -g -O3 -pthread)
-  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
-endif()
-
-# 4.add executable output
-add_executable(${TARGET} ${TARGET}.cc)
-if (WIN32)
-  set(WITH_STATIC_MKL )
-  if(WITH_STATIC_MKL)
-    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  else()
-    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
-                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  endif()
-
-  target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib)
-  target_link_libraries(${TARGET} shlwapi.lib)
-  target_link_libraries(${TARGET} ${MATH_LIB})
-
-  add_custom_command(TARGET ${TARGET} POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-  )
-  if(NOT WITH_STATIC_MKL)
-    add_custom_command(TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-    )
-  endif()
-else()
-    if (APPLE AND METAL)
-      target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
-    endif()
-    target_link_libraries(${TARGET} -lpaddle_full_api_shared)
-    target_link_libraries(${TARGET} -liomp5)
-    target_link_libraries(${TARGET} -ldl)
-endif()
diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
deleted file mode 100644
index 3a91bfafbd3..00000000000
--- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-cmake_minimum_required(VERSION 2.8)
-project(mobilenet_light_api)
-set(TARGET mobilenet_light_api)
-
-# 1. path to Paddle-Lite lib and mklml lib
-set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
-set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
-
-if (WIN32)
-  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  set(MSVC_STATIC_CRT )
-  if(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-  else(MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD")
-  endif(MSVC_STATIC_CRT)
-endif()
-
-if (APPLE AND METAL)
-  message(STATUS "set METAL=ON")
-  add_definitions("-DMETAL")
-  find_library(METAL_LIBRARY Metal REQUIRED)
-  find_library(GRAPHIC CoreGraphics REQUIRED)
-  find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED)
-endif()
-
-# 2. link mklml and Paddle-Lite directory
-link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
-include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
-
-# 3. compile options 
-if (NOT WIN32)
-  add_definitions(-std=c++11 -g -O3 -pthread)
-  set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
-endif()
-
-# 4.add executable output
-add_executable(${TARGET} ${TARGET}.cc)
-if (WIN32)
-  set(WITH_STATIC_MKL )
-  if(WITH_STATIC_MKL)
-    set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  else()
-    set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
-                 ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
-  endif()
-
-  target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib)
-  target_link_libraries(${TARGET} shlwapi.lib)
-  target_link_libraries(${TARGET} ${MATH_LIB})
-
-  add_custom_command(TARGET ${TARGET} POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-  )
-  if(NOT WITH_STATIC_MKL)
-    add_custom_command(TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release
-    )
-  endif()
-else()
-  if (APPLE AND METAL)
-    target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY})
-  endif()
-  target_link_libraries(${TARGET} -lpaddle_light_api_shared)
-  target_link_libraries(${TARGET} -liomp5)
-  target_link_libraries(${TARGET} -ldl)
-endif()
diff --git a/lite/tests/unittest_py/__main___cache_dir/model b/lite/tests/unittest_py/__main___cache_dir/model
deleted file mode 100644
index ad9c9d92f78368ee4c4713242cba7c0dad4929b5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2164
zcmZuyO>P@E6!u86EkDW0SV>w(DGEbrfx?>t&H;La7HAjU6a+QHkqB|fG0Bmota^-Y
zyUTHUmR=zJNNOxassY2EH}CKJ-uK?nUxVRU{<-_frehjZLX?x<@ai_3UekFce6hqM
z-DT5vG%~GkTF*-E`D8r2oDS(?*v~2NPv6t4M)5+hrBS7@>`BPQ(oYCZ7c>~2=NS@C
ziQJMmA|>eF_42`8Hv5sTDEobq>fccR=N8d>?`eE7yP|$!RDO1QHv810w>0`S<oiu%
zXY8T-WRfx*c09eVq*AOf+Ig=1Y((d!-!=l-`m>jGRT+!^i>A#lemR@vG&T(@xKi*g
zR{42wfu$5{S%|!Me>R&$7tL$o3WGes_JS_qWQ_`G|GY!cX!p$4+#<_cqD0iLbVvT@
zYzDp~;C!^9O>D_|i!jx8xzE!>_Wd9B-=Ai0=;f32OZM0T4&&b2UFXy~!RnPNwT_kt
z$^*BDWK711pfurW`g2I{VEFf^kN5wK>1(<HGpyOpr7oEl&NBu@3<!K^<C#Et1w%$4
zbpHTy7FKW%E%+tZ(IF>N!nuPGSZeOA+%R3c=<|YxTcZq{n@xOvM{gEF3yVywv8FMO
zE63Ik@#VL4TS|~5=Ph1Wjo!Q9eUHG9P@)#vKS@x+%OLHHru6#ITq$F0DcAAcQAjOQ
z>x9ivvWYGcnk<>)b)$rhwtM=n;w#7pShY|r)++-Y+RKla2H!9~4-k6<h)QbLU|nJ?
zP@F(yiqC~g{$sj;Xx9&^c~nQgtF~dD&sEAWqi<@nmU_YX8dNO+b?#zK2L%V8$i-n?
zM<Ip=g}cYt2|DXKjxnf=@ziaaFNRgMP8!V?wvGNbG+J^CCYDy}RW#kwq}B4V1r=B{
zQM6vJQ<I?3rQ?#(9Zj97d|)L?xUeSZLVHYj8}nS#CASOK@qav)T}#lQ6F%(-!uo_F
zSiaEj$@IGkM-}qbmRX?^3~=CWHw|!Ljbx26DGM^z(uE_KVU5T>&oo-lK$P^e#$gEO
zk!fS==<%9fSgu!LKY<VIMJos>MdO!ICuzD2JwO%4CAz@<g8Su8UmYDhZN)3PZsZ0!
zbIZ1>b}GQC;5aH#_>)!02oYbLKXlQfpscobZR!**kR@&-&fs(VAiglrkb@Bb!krC~
zV4`SoUl_!bqK_wJl3YP&ZNdibioh-6tZ*0<-{OnXu@qQ1;@}6Goo-d9&`#qq{wvD=
zhs7T4>>qHszF-$vyYsZaSpc<P*)Hwzn4*2Rk{=RANTleF1MuRTQK&S=6A?o3F#)z%
zY#NS>%Ac><5reLsM^~5^ymd};&48gaiEr#4uPUhpwo#@&pyLLMm>5MPVZ|ENro#%v
zPK0{+wW3JZ4mg_qtbs%EI#tF0yyT%&y!2ruxRQ&Y-abCnecl$aJM8<(@k(dw2u@?3
GLco9K>qUeB

diff --git a/lite/tests/unittest_py/__main___cache_dir/opt_model/model b/lite/tests/unittest_py/__main___cache_dir/opt_model/model
deleted file mode 100644
index 99d11af804fa2902ea6a97be38c42308ea14546c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 609
zcmaJ<OHRWu6wI$}^3sp71hkgeSVBT&5f#|e8-N50B!sLean!0I35nAd4#J)tN8%h@
zpg1wD(oMaw<aux2%sj)7Wou}&Jv!-v6>%P7!*+%`Apph9&o~Qx;bX8nqz#sBAZw_R
zGjP*VWTjAGSAsUQZ4&`9CESBn#6>WZOueB-pTPnB2`lm=Fz-r%8{DJ@=on5In=W{s
z@R*5rIoKB>pE8Dwalg{#g*@^m-i>!f9WaA5Mw%r5G++eoL#6bQCq<e!sWDd5nrcZe
zn#VB<(xec6BIxyjxu<)jI{_2wLMyB^N(m+%fD@&8z*&%$sC}<h4{63izMKbKs@LcU
zAd7uj@!zw}e4|sDRUI>O*RZa+3X!v=AD5~X0h9%2uZuWLlA65_Ed}f6FZu5+J%+(*
zE@telly>US>aq+_m3q@U?y5M{c8I#@<$(YO^&WS|_mY<`tFN*3)VXN5vh1(#k3SIb
Bv<UzJ

diff --git a/lite/tests/unittest_py/__main___cache_dir/opt_model/params b/lite/tests/unittest_py/__main___cache_dir/opt_model/params
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/lite/tests/unittest_py/__main___cache_dir/params b/lite/tests/unittest_py/__main___cache_dir/params
deleted file mode 100644
index 0b26358c07d8cb50206b90e46ed2267c0250d19f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 28
VcmZQzKm{y7E(fasBLhQ&Jpckz0O9}u

diff --git a/lite/tests/unittest_py/op/statics_data b/lite/tests/unittest_py/op/statics_data
deleted file mode 100644
index 276804d5f93042d2bd7c043f3033e5504a09bccb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1185
zcmajeWp~s-6b9fzfkJUB?zUKQ*HWZFTi`+=Xh?Tc638a|ZYHJ3;um*!cXx;1#!O~8
ze1XH)d+*G7?t5nT6y{KnFp3Ha3WSy2WQ!1ulC_cR3D*ycLt7N1@p=>*P2@mD5DGN0
z7)=Syj1pP#eJf>Ugq<~0cD<XlP2Wn}A&MM%ra7Ypq1b38=Pb`Ng&hhr$b@K_L#xD}
z3tOw<HVLDvBB+@<S57AbKNOa)PLyW0p{>#3cUvmWwAGMrw6huQ2_1|=**7LnL`QX9
zCq`#NNn9+WDs)lBu8eMk?nW~?xg=xz<(256>OC2~2)&J_vR-8i%R?Vk?#t*$=&vtc
zT7v<qIFK=jFxY4zXRE8r^zs;zLn2|ck+m(6??rxt>zfTC>0zikGmJ5uFv4gp`$;QZ
zn|0-KX4;D|QngDNqX?sowzBQlyD8U_QTbtz%?gZB17jKE2;<}XU0>LqT-}zFet-$8
zKanwsFgc2hmhy@)N+#_vjFX;{!_<Tm2R!XR0Z;!|z%yjPGYPYtIM~^YGQu1u4z!#x
zmoU$XQ=HFOK&Wuy>=rT>5f(dfW|fR8LbW4PT9R<$NR~2|5y*)HSk72MsBz+DS29)+
z3@6@fHDe87t=vpKq977x%B`!5S6i3E`uu7e7$#w(2Fll2OGs*9ieVG#G;kBcA-Ed2
znX!f7X<(Y+69NUy<bhyh384lGMntIBz^#l1!Zr=u&e%cNDS_%Yc&?B?AP{C4Wimll
z*r}MB%VAfZx|^|wuvb&}G4>M<Xeu)f5)NtVVa5@{QB6I@I8HdBsV5ny2&WbGOrCm{
zagK0aQ!g+s5-w@#WyTf4RZYFdxK6kssp_L;tW?VT8}?=nxANHAj5~z88hej%pYT9q
zA2J>h9&79q##6#GjeX8|L3pXLuNbchZxr@z9{Y~*p723qKQcZMK5Oh3##h2Ojs4E}
KLHHR(QRy$IOqiSi

diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py
index 24fb6b63a3f..dc71fcdf799 100644
--- a/lite/tests/unittest_py/op/test_unique_op.py
+++ b/lite/tests/unittest_py/op/test_unique_op.py
@@ -17,6 +17,7 @@
 
 from auto_scan_test import AutoScanTest, IgnoreReasons
 from program_config import TensorConfig, ProgramConfig, OpConfig, CxxConfig, TargetType, PrecisionType, DataLayoutType, Place
+import unittest
 
 import hypothesis
 from hypothesis import given, settings, seed, example, assume
@@ -26,7 +27,7 @@
 import numpy as np
 
 
-class TestUniqueWithCountsOp(AutoScanTest):
+class TestUniqueOp(AutoScanTest):
     def __init__(self, *args, **kwargs):
         AutoScanTest.__init__(self, *args, **kwargs)
         host_places = [
@@ -58,7 +59,7 @@ def generate_IndexTensor():
 
         unique_op = OpConfig(
             type = "unique",
-            input = {"X": ["input_data"]},
+            inputs = {"X": ["input_data"]},
             outputs = {
                 "Out": ["Out_data"],
                 "Index": ["Index_data"],

From 4a8f18be55f38095ce187f26c5f85e7b11953650 Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Wed, 14 Dec 2022 14:10:55 +0800
Subject: [PATCH 06/10] fix test_unique_op

---
 lite/tests/unittest_py/op/test_unique_op.py | 61 ++++++++++++++-------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py
index dc71fcdf799..64fc9d7111e 100644
--- a/lite/tests/unittest_py/op/test_unique_op.py
+++ b/lite/tests/unittest_py/op/test_unique_op.py
@@ -46,50 +46,71 @@ def sample_program_configs(self, draw):
                 st.integers(
                     min_value=2, max_value=100),
                 min_size=1,
-                max_size=3))
+                max_size=1))
         in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64]))
-
+                
         def generate_X_data():
             return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype)
 
         def generate_IndexTensor():
             return np.random.randint(1, 5, size=in_shape).astype(np.int32)
 
+        dtype = 2
+        is_sorted = draw(st.sampled_from([True, False]))
+        return_index = draw(st.sampled_from([False]))
+        if is_sorted: 
+            return_inverse = draw(st.sampled_from([True, False]))
+        else:
+            return_inverse = True
+        return_counts = draw(st.sampled_from([False]))
+        outputs = [
+            "Out_data"
+        ]
+        outputs_config = {
+            "Out": ["Out_data"]
+        }
+        outputs_dtype = {
+            "Out_data": in_dtype
+        }
+        if return_inverse:
+            outputs.append("Index_data")
+            outputs_config["Index"] = ["Index_data"]
+            outputs_dtype["Index_data"] = np.int32
+        if return_index:
+            outputs.append("Indices_data")
+            outputs_config["Indices"] = ["Indices_data"]
+            outputs_dtype["Indices_data"] = np.int32
+        if return_counts:
+            outputs.append("Counts_data")
+            outputs_config["Counts"] = ["Counts_data"] 
+            outputs_dtype["Counts_data"] = np.int32
+
         axis = draw(st.sampled_from([[0, 1, 2], [1], [0, 2], [2, 1], [0, 1]]))
+        axis = []        
 
         unique_op = OpConfig(
             type = "unique",
             inputs = {"X": ["input_data"]},
-            outputs = {
-                "Out": ["Out_data"],
-                "Index": ["Index_data"],
-                "Indices": ["Indices_data"],
-                "Counts": ["Counts_data"]
-            },
+            outputs = outputs_config,
             attrs={
                 "dtype": 2,
-                "return_index": False,
-                "return_inverse": False,
-                "return_counts": False,
+                "return_index": return_index,
+                "return_inverse": return_inverse,
+                "return_counts": return_counts,
                 "axis": axis,
-                "is_sorted": False
+                "is_sorted": is_sorted
             }
         )
 
-        unique_op.outputs_dtype = {"Out_data": in_dtype}
-        unique_op.outputs_dtype = {"Index_data": np.int32}
-        unique_op.outputs_dtype = {"Counts_data":np.int32}
+        unique_op.outputs_dtype = outputs_dtype
 
         program_config = ProgramConfig(
             ops=[unique_op],
-            weights={
-                "Index_data":
-                TensorConfig(data_gen=partial(generate_IndexTensor))
-            },
+            weights={},
             inputs={
                 "input_data": TensorConfig(data_gen=partial(generate_X_data))
             },
-            outputs=["Out_data", "Index_data", "Counts_data"]
+            outputs=outputs
         )
         return program_config
 

From 0d6f7edf982e6a34df5104aa8d8c6823eeafbfe9 Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Mon, 19 Dec 2022 11:57:22 +0800
Subject: [PATCH 07/10] update unique_op on 12.19

---
 lite/kernels/host/unique_compute.cc         | 32 +++++++------
 lite/operators/unique_op.cc                 | 51 +++++++++++++++------
 lite/tests/unittest_py/op/test_unique_op.py | 48 +++++++++++--------
 3 files changed, 83 insertions(+), 48 deletions(-)

diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
index 4c96e7f8c11..b29101e52ed 100644
--- a/lite/kernels/host/unique_compute.cc
+++ b/lite/kernels/host/unique_compute.cc
@@ -24,6 +24,7 @@
 #include <vector>
 #include <iostream>
 
+
 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -33,7 +34,7 @@ template <typename InT, typename IndexT>
 void UniqueFunc(const lite::Tensor* x, 
                       lite::Tensor* out,
                       lite::Tensor* index,
-                      lite::Tensor* count = nullptr) {
+                      lite::Tensor* count) {
   const InT* in_data = x->template data<InT>();
   IndexT* index_data = index->mutable_data<IndexT>();
 
@@ -100,10 +101,10 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in,
   }
 
   if (return_inverse) {
-    auto* inverse = index;
-    inverse->Resize({out->numel()});
-    auto inverse_data = inverse->mutable_data<IndexT>();
+    index->Resize({in.numel()});
+    auto inverse_data = index->mutable_data<IndexT>();
     std::unordered_map<InT, IndexT> inverse_map;
+    inverse_map.reserve(out->numel());
     for (int64_t i = 0; i < out->numel(); ++i) {
       inverse_map[out_data[i]] = i;
     }
@@ -296,11 +297,11 @@ void UniqueDimFunc(const lite::Tensor& in,
   std::iota(permute.begin(), permute.end(), 0);
   permute[axis] = 0;
   permute[0] = axis;
-  std::vector<int64_t> in_trans_dim_vec(in.dims().Vectorize());
-  in_trans_dim_vec[axis] = in.dims()[0];
-  in_trans_dim_vec[0] = in.dims()[axis];
+  std::vector<int64_t> in_trans_dims_vec(in.dims().Vectorize());
+  in_trans_dims_vec[axis] = in.dims()[0];
+  in_trans_dims_vec[0] = in.dims()[axis];
   lite::Tensor in_trans;
-  lite::DDim in_trans_dims = DDim(in_trans_dim_vec);
+  lite::DDim in_trans_dims = DDim(in_trans_dims_vec);
   in_trans.Resize(in_trans_dims);
   in_trans.mutable_data<InT>();
   TransCompute<InT>(in, &in_trans, permute);
@@ -356,7 +357,7 @@ void UniqueDimFunc(const lite::Tensor& in,
   indices_vec.erase(indices_vec.begin() + input_unbind.size(), indices_vec.end());
   
   lite::Tensor out_trans;
-  std::vector<int64_t> out_trans_dims_vec = in_trans_dim_vec;
+  std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
   out_trans_dims_vec[0] = input_unbind.size();
   out_trans.Resize(out_trans_dims_vec);
   out_trans.mutable_data<InT>();
@@ -367,16 +368,20 @@ void UniqueDimFunc(const lite::Tensor& in,
   TransCompute<InT>(out_trans, out, permute);
 
   if (return_inverse) {
+    index->Resize({in.numel()});
     TensorFromVector(inverse_vec, index);
   }
 
   if (return_counts) {
+    count->Resize({out->numel()});
     TensorFromVector(counts_vec, count);
   }
 
   if (return_index) {
+    indices->Resize({out->numel()});
     TensorFromVector(indices_vec, indices);
   }
+
 }
 
 void UniqueCompute::Run() {
@@ -387,9 +392,9 @@ void UniqueCompute::Run() {
   auto indices = param.Indices;
   auto count = param.Counts;
   auto dtype = param.dtype;
-  auto return_index = param.return_index;
-  auto return_inverse = param.return_inverse;
-  auto return_counts = param.return_counts;
+  bool return_index = param.return_index;
+  bool return_inverse = param.return_inverse;
+  bool return_counts = param.return_counts;
   auto axis_vec = param.axis;
   auto is_sorted = param.is_sorted;
 
@@ -399,7 +404,7 @@ void UniqueCompute::Run() {
   CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds "
                                    << static_cast<int>(type)
                                    << "but desires to be int32 or int64";
-    
+
   if (!is_sorted) {
     if (index_type == PRECISION(kInt32)) {
       switch (type) {
@@ -560,3 +565,4 @@ REGISTER_LITE_KERNEL(unique,
                                        PRECISION(kInt32),
                                        DATALAYOUT(kAny))})
     .Finalize();
+
diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc
index adab6096cd8..84f829028a9 100644
--- a/lite/operators/unique_op.cc
+++ b/lite/operators/unique_op.cc
@@ -21,24 +21,45 @@ namespace operators {
 bool UniqueOp::CheckShape() const {
   CHECK_OR_FALSE(param_.X);
   CHECK_OR_FALSE(param_.Out);
-  if (param_.return_index) {
-    CHECK_OR_FALSE(param_.Indices);
-  }
-  if (param_.return_inverse) {
+  if (!param_.is_sorted) {
     CHECK_OR_FALSE(param_.Index);
-  }
-  if (param_.return_counts) {
-    CHECK_OR_FALSE(param_.Counts)
+  } else {
+    if (param_.return_index) {
+      CHECK_OR_FALSE(param_.Indices);
+    }
+    if (param_.return_inverse) {
+      CHECK_OR_FALSE(param_.Index);
+    }
+    if (param_.return_counts) {
+      CHECK_OR_FALSE(param_.Counts)
+    }
   }
   return true;
 }
 
 bool UniqueOp::InferShapeImpl() const {
-  DDim in_dims = param_.X->dims();
-  if (param_.Out) param_.Out->Resize(in_dims);
-  if (param_.Index) param_.Index->Resize(in_dims);
-  if (param_.Indices) param_.Indices->Resize(in_dims);
-  if (param_.Counts) param_.Counts->Resize(in_dims);
+  if (!param_.is_sorted) {
+    DDim in_dims = param_.X->dims();
+    if (param_.Out) param_.Out->Resize({-1});
+    if (param_.Index) param_.Index->Resize(in_dims);
+  } else {
+    DDim in_dims = param_.X->dims();
+    if (param_.axis.empty()) {
+      if (param_.Out) param_.Out->Resize(in_dims);
+      if (param_.return_inverse) param_.Index->Resize(in_dims);
+    } else {
+      int axis_value = param_.axis[0];
+      if (axis_value < 0) {
+        axis_value += in_dims.size();
+      }
+      DDim out_dims = in_dims;
+      out_dims[axis_value] = -1;
+      if (param_.Out) param_.Out->Resize(out_dims);
+      if (param_.return_inverse) param_.Index->Resize({in_dims[axis_value]});
+    }
+    if (param_.return_index) param_.Indices->Resize({-1});
+    if (param_.return_counts) param_.Counts->Resize({-1});
+  }
   return true;
 }
 
@@ -50,11 +71,11 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc,
   CHECK(param_.Out) << "Output(Out) of UniqueOp should not be null.";
   if (opdesc.HasOutput("Index")) {
     param_.Index = scope->FindMutableTensor(opdesc.Output("Index").front());
-    CHECK(param_.Out) << "Output(Index) of UniqueOp should not be null.";
+    CHECK(param_.Index) << "Output(Index) of UniqueOp should not be null.";
   }
   if (opdesc.HasOutput("Indices")) {
     param_.Indices = scope->FindMutableTensor(opdesc.Output("Indices").front());
-    CHECK(param_.Out) << "Output(Indices) of UniqueOp should not be null.";
+    CHECK(param_.Indices) << "Output(Indices) of UniqueOp should not be null.";
   }
   if (opdesc.HasOutput("Counts")) {
     param_.Counts = scope->FindMutableTensor(opdesc.Output("Counts").front());
@@ -67,7 +88,7 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc,
   if (opdesc.HasAttr("return_index")) {
     param_.return_index = opdesc.GetAttr<bool>("return_index");
   }
-  if (opdesc.HasAttr("return_reverse")) {
+  if (opdesc.HasAttr("return_inverse")) {
     param_.return_inverse = opdesc.GetAttr<bool>("return_inverse");
   }
   if (opdesc.HasAttr("return_counts")) {
diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py
index 64fc9d7111e..453ac6bd561 100644
--- a/lite/tests/unittest_py/op/test_unique_op.py
+++ b/lite/tests/unittest_py/op/test_unique_op.py
@@ -47,22 +47,33 @@ def sample_program_configs(self, draw):
                     min_value=2, max_value=100),
                 min_size=1,
                 max_size=1))
+        print(in_shape)
         in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64]))
                 
         def generate_X_data():
-            return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype)
-
-        def generate_IndexTensor():
-            return np.random.randint(1, 5, size=in_shape).astype(np.int32)
-
-        dtype = 2
-        is_sorted = draw(st.sampled_from([True, False]))
-        return_index = draw(st.sampled_from([False]))
-        if is_sorted: 
-            return_inverse = draw(st.sampled_from([True, False]))
-        else:
-            return_inverse = True
-        return_counts = draw(st.sampled_from([False]))
+            t = np.random.normal(0.0, 5.0, in_shape).astype(in_dtype)
+            print(t)
+            return t
+
+        dtype = draw(st.sampled_from([2,3]))
+        is_sorted = draw(st.booleans())
+        return_index = draw(st.booleans())
+        return_inverse = draw(st.sampled_from([True]))
+        return_counts = draw(st.booleans())
+
+        if is_sorted == False:
+            return_index = False
+            return_counts = False
+
+        param_is_sorted = is_sorted
+        param_return_index = return_index
+        param_return_inverse = return_inverse
+        param_return_counts = return_counts
+  
+        axis = draw(st.sampled_from([[2], [1], [0], []]))
+        while len(axis) > 0 and axis[0] >= len(in_shape):
+            axis[0] = axis[0] - 1       
+
         outputs = [
             "Out_data"
         ]
@@ -85,18 +96,15 @@ def generate_IndexTensor():
             outputs_config["Counts"] = ["Counts_data"] 
             outputs_dtype["Counts_data"] = np.int32
 
-        axis = draw(st.sampled_from([[0, 1, 2], [1], [0, 2], [2, 1], [0, 1]]))
-        axis = []        
-
         unique_op = OpConfig(
             type = "unique",
             inputs = {"X": ["input_data"]},
             outputs = outputs_config,
             attrs={
-                "dtype": 2,
-                "return_index": return_index,
-                "return_inverse": return_inverse,
-                "return_counts": return_counts,
+                "dtype": dtype,
+                "return_index": param_return_index,
+                "return_inverse": param_return_inverse,
+                "return_counts": param_return_counts,
                 "axis": axis,
                 "is_sorted": is_sorted
             }

From 256856055fc42bab5511e9466bd4c9d5f0b24e88 Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Mon, 19 Dec 2022 21:33:45 +0800
Subject: [PATCH 08/10] Fix index_select_op and unique_op

---
 .../opencl/utils/tune_cache_generated.h       | 383 ------------------
 lite/core/program.cc                          | 243 +----------
 lite/kernels/host/index_select_compute.cc     |   2 +
 lite/kernels/host/unique_compute.cc           | 196 ++++++---
 lite/operators/op_params.h                    |   8 +-
 lite/operators/unique_op.cc                   |   8 +-
 lite/operators/unique_op.h                    |   4 +-
 lite/tests/unittest_py/op/test_unique_op.py   |  75 ++--
 8 files changed, 204 insertions(+), 715 deletions(-)
 delete mode 100644 lite/backends/opencl/utils/tune_cache_generated.h

diff --git a/lite/backends/opencl/utils/tune_cache_generated.h b/lite/backends/opencl/utils/tune_cache_generated.h
deleted file mode 100644
index bb091cce383..00000000000
--- a/lite/backends/opencl/utils/tune_cache_generated.h
+++ /dev/null
@@ -1,383 +0,0 @@
-// automatically generated by the FlatBuffers compiler, do not modify
-
-
-#ifndef FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_
-#define FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_
-
-#include "flatbuffers/flatbuffers.h"
-
-namespace paddle {
-namespace lite {
-namespace fbs {
-namespace opencl {
-namespace proto {
-namespace TuneCache_ {
-
-struct TunePair;
-struct TunePairBuilder;
-struct TunePairT;
-
-}  // namespace TuneCache_
-
-struct TuneCache;
-struct TuneCacheBuilder;
-struct TuneCacheT;
-
-namespace TuneCache_ {
-
-bool operator==(const TunePairT &lhs, const TunePairT &rhs);
-bool operator!=(const TunePairT &lhs, const TunePairT &rhs);
-}  // namespace TuneCache_
-
-bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs);
-bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs);
-
-namespace TuneCache_ {
-
-inline const flatbuffers::TypeTable *TunePairTypeTable();
-
-}  // namespace TuneCache_
-
-inline const flatbuffers::TypeTable *TuneCacheTypeTable();
-
-namespace TuneCache_ {
-
-struct TunePairT : public flatbuffers::NativeTable {
-  typedef TunePair TableType;
-  std::string key;
-  std::vector<int32_t> value;
-  TunePairT() {
-  }
-};
-
-inline bool operator==(const TunePairT &lhs, const TunePairT &rhs) {
-  return
-      (lhs.key == rhs.key) &&
-      (lhs.value == rhs.value);
-}
-
-inline bool operator!=(const TunePairT &lhs, const TunePairT &rhs) {
-    return !(lhs == rhs);
-}
-
-
-struct TunePair FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef TunePairT NativeTableType;
-  typedef TunePairBuilder Builder;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return TunePairTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_KEY = 4,
-    VT_VALUE = 6
-  };
-  const flatbuffers::String *key() const {
-    return GetPointer<const flatbuffers::String *>(VT_KEY);
-  }
-  flatbuffers::String *mutable_key() {
-    return GetPointer<flatbuffers::String *>(VT_KEY);
-  }
-  bool KeyCompareLessThan(const TunePair *o) const {
-    return *key() < *o->key();
-  }
-  int KeyCompareWithValue(const char *val) const {
-    return strcmp(key()->c_str(), val);
-  }
-  const flatbuffers::Vector<int32_t> *value() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_VALUE);
-  }
-  flatbuffers::Vector<int32_t> *mutable_value() {
-    return GetPointer<flatbuffers::Vector<int32_t> *>(VT_VALUE);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffsetRequired(verifier, VT_KEY) &&
-           verifier.VerifyString(key()) &&
-           VerifyOffset(verifier, VT_VALUE) &&
-           verifier.VerifyVector(value()) &&
-           verifier.EndTable();
-  }
-  TunePairT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<TunePair> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct TunePairBuilder {
-  typedef TunePair Table;
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_key(flatbuffers::Offset<flatbuffers::String> key) {
-    fbb_.AddOffset(TunePair::VT_KEY, key);
-  }
-  void add_value(flatbuffers::Offset<flatbuffers::Vector<int32_t>> value) {
-    fbb_.AddOffset(TunePair::VT_VALUE, value);
-  }
-  explicit TunePairBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  TunePairBuilder &operator=(const TunePairBuilder &);
-  flatbuffers::Offset<TunePair> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<TunePair>(end);
-    fbb_.Required(o, TunePair::VT_KEY);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<TunePair> CreateTunePair(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::String> key = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> value = 0) {
-  TunePairBuilder builder_(_fbb);
-  builder_.add_value(value);
-  builder_.add_key(key);
-  return builder_.Finish();
-}
-
-inline flatbuffers::Offset<TunePair> CreateTunePairDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const char *key = nullptr,
-    const std::vector<int32_t> *value = nullptr) {
-  auto key__ = key ? _fbb.CreateString(key) : 0;
-  auto value__ = value ? _fbb.CreateVector<int32_t>(*value) : 0;
-  return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair(
-      _fbb,
-      key__,
-      value__);
-}
-
-flatbuffers::Offset<TunePair> CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-}  // namespace TuneCache_
-
-struct TuneCacheT : public flatbuffers::NativeTable {
-  typedef TuneCache TableType;
-  std::vector<std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT>> tune_map;
-  TuneCacheT() {
-  }
-};
-
-inline bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs) {
-  return
-      (lhs.tune_map == rhs.tune_map);
-}
-
-inline bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs) {
-    return !(lhs == rhs);
-}
-
-
-struct TuneCache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef TuneCacheT NativeTableType;
-  typedef TuneCacheBuilder Builder;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return TuneCacheTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_TUNE_MAP = 4
-  };
-  const flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *tune_map() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *>(VT_TUNE_MAP);
-  }
-  flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *mutable_tune_map() {
-    return GetPointer<flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *>(VT_TUNE_MAP);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffsetRequired(verifier, VT_TUNE_MAP) &&
-           verifier.VerifyVector(tune_map()) &&
-           verifier.VerifyVectorOfTables(tune_map()) &&
-           verifier.EndTable();
-  }
-  TuneCacheT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<TuneCache> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct TuneCacheBuilder {
-  typedef TuneCache Table;
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_tune_map(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>>> tune_map) {
-    fbb_.AddOffset(TuneCache::VT_TUNE_MAP, tune_map);
-  }
-  explicit TuneCacheBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  TuneCacheBuilder &operator=(const TuneCacheBuilder &);
-  flatbuffers::Offset<TuneCache> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<TuneCache>(end);
-    fbb_.Required(o, TuneCache::VT_TUNE_MAP);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<TuneCache> CreateTuneCache(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>>> tune_map = 0) {
-  TuneCacheBuilder builder_(_fbb);
-  builder_.add_tune_map(tune_map);
-  return builder_.Finish();
-}
-
-inline flatbuffers::Offset<TuneCache> CreateTuneCacheDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    std::vector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> *tune_map = nullptr) {
-  auto tune_map__ = tune_map ? _fbb.CreateVectorOfSortedTables<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>(tune_map) : 0;
-  return paddle::lite::fbs::opencl::proto::CreateTuneCache(
-      _fbb,
-      tune_map__);
-}
-
-flatbuffers::Offset<TuneCache> CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-namespace TuneCache_ {
-
-inline TunePairT *TunePair::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT> _o = std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT>(new TunePairT());
-  UnPackTo(_o.get(), _resolver);
-  return _o.release();
-}
-
-inline void TunePair::UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = key(); if (_e) _o->key = _e->str(); }
-  { auto _e = value(); if (_e) { _o->value.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->value[_i] = _e->Get(_i); } } }
-}
-
-inline flatbuffers::Offset<TunePair> TunePair::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateTunePair(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<TunePair> CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TunePairT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _key = _fbb.CreateString(_o->key);
-  auto _value = _fbb.CreateVector(_o->value);
-  return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair(
-      _fbb,
-      _key,
-      _value);
-}
-
-}  // namespace TuneCache_
-
-inline TuneCacheT *TuneCache::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT> _o = std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT>(new TuneCacheT());
-  UnPackTo(_o.get(), _resolver);
-  return _o.release();
-}
-
-inline void TuneCache::UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = tune_map(); if (_e) { _o->tune_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tune_map[_i] = std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCache_::TunePairT>(_e->Get(_i)->UnPack(_resolver)); } } }
-}
-
-inline flatbuffers::Offset<TuneCache> TuneCache::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateTuneCache(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<TuneCache> CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TuneCacheT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _tune_map = _fbb.CreateVector<flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache_::TunePair>> (_o->tune_map.size(), [](size_t i, _VectorArgs *__va) { return CreateTunePair(*__va->__fbb, __va->__o->tune_map[i].get(), __va->__rehasher); }, &_va );
-  return paddle::lite::fbs::opencl::proto::CreateTuneCache(
-      _fbb,
-      _tune_map);
-}
-
-namespace TuneCache_ {
-
-inline const flatbuffers::TypeTable *TunePairTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_STRING, 0, -1 },
-    { flatbuffers::ET_INT, 1, -1 }
-  };
-  static const char * const names[] = {
-    "key",
-    "value"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names
-  };
-  return &tt;
-}
-
-}  // namespace TuneCache_
-
-inline const flatbuffers::TypeTable *TuneCacheTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_SEQUENCE, 1, 0 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    paddle::lite::fbs::opencl::proto::TuneCache_::TunePairTypeTable
-  };
-  static const char * const names[] = {
-    "tune_map"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 1, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-inline const paddle::lite::fbs::opencl::proto::TuneCache *GetTuneCache(const void *buf) {
-  return flatbuffers::GetRoot<paddle::lite::fbs::opencl::proto::TuneCache>(buf);
-}
-
-inline const paddle::lite::fbs::opencl::proto::TuneCache *GetSizePrefixedTuneCache(const void *buf) {
-  return flatbuffers::GetSizePrefixedRoot<paddle::lite::fbs::opencl::proto::TuneCache>(buf);
-}
-
-inline TuneCache *GetMutableTuneCache(void *buf) {
-  return flatbuffers::GetMutableRoot<TuneCache>(buf);
-}
-
-inline bool VerifyTuneCacheBuffer(
-    flatbuffers::Verifier &verifier) {
-  return verifier.VerifyBuffer<paddle::lite::fbs::opencl::proto::TuneCache>(nullptr);
-}
-
-inline bool VerifySizePrefixedTuneCacheBuffer(
-    flatbuffers::Verifier &verifier) {
-  return verifier.VerifySizePrefixedBuffer<paddle::lite::fbs::opencl::proto::TuneCache>(nullptr);
-}
-
-inline void FinishTuneCacheBuffer(
-    flatbuffers::FlatBufferBuilder &fbb,
-    flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache> root) {
-  fbb.Finish(root);
-}
-
-inline void FinishSizePrefixedTuneCacheBuffer(
-    flatbuffers::FlatBufferBuilder &fbb,
-    flatbuffers::Offset<paddle::lite::fbs::opencl::proto::TuneCache> root) {
-  fbb.FinishSizePrefixed(root);
-}
-
-inline std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT> UnPackTuneCache(
-    const void *buf,
-    const flatbuffers::resolver_function_t *res = nullptr) {
-  return std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT>(GetTuneCache(buf)->UnPack(res));
-}
-
-inline std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT> UnPackSizePrefixedTuneCache(
-    const void *buf,
-    const flatbuffers::resolver_function_t *res = nullptr) {
-  return std::unique_ptr<paddle::lite::fbs::opencl::proto::TuneCacheT>(GetSizePrefixedTuneCache(buf)->UnPack(res));
-}
-
-}  // namespace proto
-}  // namespace opencl
-}  // namespace fbs
-}  // namespace lite
-}  // namespace paddle
-
-#endif  // FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_
diff --git a/lite/core/program.cc b/lite/core/program.cc
index a6c2698ca37..8f0c0a5043a 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -605,9 +605,27 @@ void RuntimeProgram::Run() {
 #ifdef LITE_WITH_OPENCL
     // delegate flush judgement to specify target , it is too heavy for Inst
     inst.Flush(idx);
+#if defined(LITE_WITH_PROFILE) || defined(LITE_WITH_PRECISION_PROFILE)
+    VLOG(4) << "kernel name " << idx << " " << inst.kernel()->name();
+    const auto* op_info = inst.op()->op_info();
+    auto var_in_names = op_info->input_names();
+    for (int i = 0; i < var_in_names.size(); i++) {
+      VLOG(4) << "input var_in_names: " << var_in_names[i];
+    }
+    auto var_out_names = op_info->output_names();
+    for (int i = 0; i < var_out_names.size(); i++) {
+      VLOG(4) << "output var_out_names: " << var_out_names[i];
+    }
+#endif
 #endif
 
     inst.Run();
+#ifdef LITE_WITH_PRECISION_PROFILE
+    if (inst.op()->Type() != "while") {
+      precision_profiler_summary +=
+          inst_precision_profiler.GetInstPrecision(&inst);
+    }
+#endif  // LITE_WITH_PRECISION_PROFILE
   }
 
 #ifdef LITE_WITH_METAL
@@ -797,231 +815,6 @@ void Instruction::Run() {
   kernel_->Launch();
   has_run_ = true;
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#if 0
-  // clang-format off
-  /*
-  time_t t;
-  struct tm* timeinfo;
-  time(&t);
-  timeinfo = localtime(&t);
-  std::cout << "time: " << asctime(timeinfo) << std::endl;
-  */
-  std::cout << "***-----------------------------******-----------------------------***" << std::endl;
-  // get precision
-  std::string op_name = op_->op_info()->Type();
-  std::cout << "op_type: " << op_name << std::endl;
-  if ((op_->op_info()->Type() != "fetch") &&
-      (op_->op_info()->Type() != "while") &&
-      (op_->op_info()->Type() != "conditional_block")) {
-    auto op_scope = op_->scope();
-    auto out_names = op_->op_info()->output_names();
-    auto in_names = op_->op_info()->input_names();
-    for (auto& out_name : in_names) {
-      std::string out_arg_name;
-      op_->op_info()->GetInputArgname(out_name, &out_arg_name);
-      //auto type = kernel_->GetInputDeclType(out_arg_name);
-       // if (type->IsTensor()) {
-      auto tmp = op_scope->FindVar(out_name);
-      if (tmp->IsType<Tensor>()) {
-        const Tensor* tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
-        if (tout->IsInitialized()) {
-          auto size = tout->numel();
-          auto dim = tout->dims();
-          double sum = 0.0;
-          if (tout->precision() == PrecisionType::kFloat) {
-            const float* dout = tout->data<float>();
-            for (int i = 0; i < size; i++) {
-              sum += dout[i];
-            }
-          } else if (tout->precision() == PrecisionType::kFP16) {
-            const float16_t* dout = tout->data<float16_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else if (tout->precision() == PrecisionType::kInt32) {
-            const int32_t* dout = tout->data<int32_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else if (tout->precision() == PrecisionType::kInt64) {
-            const int64_t* dout = tout->data<int64_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else if (tout->precision() == PrecisionType::kInt8) {
-            const int8_t* dout = tout->data<int8_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else {
-            std::cout << "This data_type is not support: "
-                      << PrecisionToStr(tout->precision()) << std::endl;
-          }
-          double avg = sum / static_cast<double>(size);
-          std::cout << "in_name: " << out_name
-                    << ", type: " << PrecisionToStr(tout->precision())
-                    << ", size: " << size << ", input avg: " << avg;
-         
-          std::cout<<", dim size:"<< dim.size() << "[";
-          for(int i = 0; i < dim.size(); i++)
-            std::cout << dim[i] << ",";
-          std::cout<<"]\n";
-        } else {
-          std::cout << out_name << " is not inited." << std::endl;
-        }
-      } else if (tmp->IsType<std::vector<Tensor>>()) {
-        auto touts =
-            op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
-        for (auto t : *touts) {
-          const Tensor* tout = &t;
-          if (tout->IsInitialized()) {
-            auto size = tout->numel();
-            const float* dout = tout->data<float>();
-            double sum = 0.0;
-            for (int i = 0; i < size; i++) {
-              sum += dout[i];
-            }
-            double avg = sum / static_cast<double>(size);
-            std::cout << "op_type: " << op_name << ", input avg: " << avg
-                      << std::endl;
-          } else {
-            std::cout << out_name << " is not inited." << std::endl;
-          }
-        }
-      }
-    }
-    for (auto& out_name : out_names) {
-      std::string out_arg_name;
-      op_->op_info()->GetOutputArgname(out_name, &out_arg_name);
-      //auto type = kernel_->GetOutputDeclType(out_arg_name);
-      std::string op_name = op_->op_info()->Type();
-      //if (type->IsTensor()) {
-      auto tmp = op_scope->FindVar(out_name);
-      if (tmp->IsType<Tensor>()) {
-        const Tensor* tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
-        if (tout->IsInitialized()) {
-          auto size = tout->numel();
-          auto dim = tout->dims();
-          double sum = 0.0;
-          if (tout->precision() == PrecisionType::kFloat) {
-            const float* dout = tout->data<float>();
-            for (int i = 0; i < size; i++) {
-              sum += dout[i];
-            }
-          } else if (tout->precision() == PrecisionType::kFP16) {
-            const float16_t* dout = tout->data<float16_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else if (tout->precision() == PrecisionType::kInt32) {
-            const int32_t* dout = tout->data<int32_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else if (tout->precision() == PrecisionType::kInt64) {
-            const int64_t* dout = tout->data<int64_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else if (tout->precision() == PrecisionType::kInt8) {
-            const int8_t* dout = tout->data<int8_t>();
-            for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-            }
-          } else {
-            std::cout << "This data_type is not support: "
-                      << PrecisionToStr(tout->precision()) << std::endl;
-          }
-            double avg = sum / static_cast<double>(size);
-          std::cout << "out_name: " << out_name
-                    << ", type: " << PrecisionToStr(tout->precision())
-                    << ", sum: " << sum << ", output avg: " << avg;
-          std::cout<<", dim size:"<< dim.size() << "[";
-          for(int i = 0; i < dim.size(); i++)
-            std::cout << dim[i] << ",";
-          std::cout<<"]\n";
-        } else {
-          std::cout << out_name << " is not inited." << std::endl;
-        }
-      } else if (tmp->IsType<std::vector<Tensor>>()) {
-        auto touts =
-            op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
-        for (auto t : *touts) {
-          const Tensor* tout = &t;
-          if (tout->IsInitialized()) {
-            auto size = tout->numel();
-            double sum = 0.0;
-            if (tout->precision() == PrecisionType::kFloat) {
-              const float* dout = tout->data<float>();
-              for (int i = 0; i < size; i++) {
-                sum += dout[i];
-                std::cout << dout[i] << ", ";
-              }
-            } else if (tout->precision() == PrecisionType::kFP16) {
-              const float16_t* dout = tout->data<float16_t>();
-              for (int i = 0; i < size; i++) {
-                sum += static_cast<double>(dout[i]);
-                std::cout << dout[i] << ", ";
-              }
-            } else if (tout->precision() == PrecisionType::kInt32) {
-              const int32_t* dout = tout->data<int32_t>();
-              for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-                std::cout << dout[i] << ", ";
-              }
-            } else if (tout->precision() == PrecisionType::kInt64) {
-              const int64_t* dout = tout->data<int64_t>();
-              for (int i = 0; i < size; i++) {
-              sum += static_cast<double>(dout[i]);
-                std::cout << dout[i] << ", ";
-              }
-            } else {
-              std::cout << "This data_type is not support: "
-                        << PrecisionToStr(tout->precision()) << std::endl;
-            }
-            double avg = sum / static_cast<double>(size);
-            std::cout << std::endl;
-            std::cout << "op_type: " << op_name << out_name
-                      << ", type: " << PrecisionToStr(tout->precision())
-                      << ", output avg: " << avg << std::endl;
-          } else {
-            std::cout << out_name << " is not inited." << std::endl;
-          }
-        }
-      }
-    }
-    std::cout << "***-----------------------------******-----------------------------***" << std::endl;
-  }
-#endif
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 #ifdef LITE_WITH_PROFILE
   if (first_epoch_for_profiler_) {
     kernel_->SetIsKernelTest(false);
diff --git a/lite/kernels/host/index_select_compute.cc b/lite/kernels/host/index_select_compute.cc
index f4ff2b1ad8c..1c4be1c8df2 100644
--- a/lite/kernels/host/index_select_compute.cc
+++ b/lite/kernels/host/index_select_compute.cc
@@ -35,6 +35,8 @@ void Index_selectCompute<T>::Run() {
   auto index_ddim = index->dims();
   auto output_ddim = output->dims();
 
+  if (param.dim < 0) param.dim += input_ddim.size();
+
   int left = input_ddim.count(0, param.dim);
   int middle = input_ddim[param.dim];
   int right = input_ddim.count(param.dim + 1, input_ddim.size());
diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
index b29101e52ed..5f1458f2bc2 100644
--- a/lite/kernels/host/unique_compute.cc
+++ b/lite/kernels/host/unique_compute.cc
@@ -17,13 +17,12 @@
 
 #include <algorithm>
 #include <cmath>
+#include <iostream>
 #include <numeric>
 #include <set>
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include <iostream>
-
 
 namespace paddle {
 namespace lite {
@@ -31,10 +30,10 @@ namespace kernels {
 namespace host {
 
 template <typename InT, typename IndexT>
-void UniqueFunc(const lite::Tensor* x, 
-                      lite::Tensor* out,
-                      lite::Tensor* index,
-                      lite::Tensor* count) {
+void UniqueFunc(const lite::Tensor* x,
+                lite::Tensor* out,
+                lite::Tensor* index,
+                lite::Tensor* count) {
   const InT* in_data = x->template data<InT>();
   IndexT* index_data = index->mutable_data<IndexT>();
 
@@ -187,9 +186,10 @@ static ForwardIt UniqueDimImpl(ForwardIt first,
   return ++result;
 }
 
-template <class T>
+template <typename T>
 void TensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
   auto* src_ptr = static_cast<const void*>(src.data());
+  dst->Resize({static_cast<int64_t>(src.size())});
   auto* dst_ptr = static_cast<void*>(dst->mutable_data<T>());
   auto size = src.size() * sizeof(T);
   lite::TargetWrapperHost::MemcpySync(
@@ -197,16 +197,16 @@ void TensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
 }
 
 template <typename T>
-void TransCompute(const Tensor &input,
-               Tensor *output,
-               const std::vector<int> &orders) {
+void TransCompute(const Tensor& input,
+                  Tensor* output,
+                  const std::vector<int>& orders) {
   auto in_dims = input.dims();
   auto out_dims = output->dims();
   int num_axes = in_dims.size();
   int count = in_dims.production();
 
-  const T *din = input.data<T>();
-  T *dout = output->mutable_data<T>();
+  const T* din = input.data<T>();
+  T* dout = output->mutable_data<T>();
 
   std::vector<int> old_temps;
   int temp = 1;
@@ -244,8 +244,8 @@ void TransCompute(const Tensor &input,
 
 lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) {
   return DDim(std::vector<DDim::value_type>{
-              src.Slice(0, num_col_dims).production(),
-              src.Slice(num_col_dims, src.size()).production()});
+      src.Slice(0, num_col_dims).production(),
+      src.Slice(num_col_dims, src.size()).production()});
 }
 
 template <typename T>
@@ -318,8 +318,8 @@ void UniqueDimFunc(const lite::Tensor& in,
             sorted_indices_vec.end(),
             [&](int64_t a, int64_t b) -> bool {
               for (int64_t i = 0; i < col; ++i) {
-                InT lhs = in_trans_data[i + a*col];
-                InT rhs = in_trans_data[i + b*col];
+                InT lhs = in_trans_data[i + a * col];
+                InT rhs = in_trans_data[i + b * col];
                 if (lhs < rhs) {
                   return true;
                 } else if (lhs > rhs) {
@@ -328,7 +328,7 @@ void UniqueDimFunc(const lite::Tensor& in,
               }
               return false;
             });
-  
+
   // sort tensor according to indices
   lite::Tensor input_sorted;
   input_sorted.Resize(in_trans_dims);
@@ -338,7 +338,6 @@ void UniqueDimFunc(const lite::Tensor& in,
     memcpy(input_sorted_data + i * col,
            in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
            col * sizeof(InT));
-
   }
 
   std::vector<lite::Tensor> input_unbind = Unbind<InT>(input_sorted);
@@ -346,16 +345,17 @@ void UniqueDimFunc(const lite::Tensor& in,
   std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
   std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
   auto last = UniqueDimImpl<std::vector<lite::Tensor>::iterator, InT, IndexT>(
-    input_unbind.begin(),
-    input_unbind.end(),
-    sorted_indices_vec,
-    &inverse_vec,
-    &counts_vec,
-    &indices_vec);
+      input_unbind.begin(),
+      input_unbind.end(),
+      sorted_indices_vec,
+      &inverse_vec,
+      &counts_vec,
+      &indices_vec);
   input_unbind.erase(last, input_unbind.end());
   counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
-  indices_vec.erase(indices_vec.begin() + input_unbind.size(), indices_vec.end());
-  
+  indices_vec.erase(indices_vec.begin() + input_unbind.size(),
+                    indices_vec.end());
+
   lite::Tensor out_trans;
   std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
   out_trans_dims_vec[0] = input_unbind.size();
@@ -368,20 +368,16 @@ void UniqueDimFunc(const lite::Tensor& in,
   TransCompute<InT>(out_trans, out, permute);
 
   if (return_inverse) {
-    index->Resize({in.numel()});
     TensorFromVector(inverse_vec, index);
   }
 
   if (return_counts) {
-    count->Resize({out->numel()});
     TensorFromVector(counts_vec, count);
   }
 
   if (return_index) {
-    indices->Resize({out->numel()});
     TensorFromVector(indices_vec, indices);
   }
-
 }
 
 void UniqueCompute::Run() {
@@ -399,7 +395,8 @@ void UniqueCompute::Run() {
   auto is_sorted = param.is_sorted;
 
   lite_api::PrecisionType index_type = index->precision();
-  bool index_type_match = index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64);
+  bool index_type_match =
+      index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64);
   lite_api::PrecisionType type = x->precision();
   CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds "
                                    << static_cast<int>(type)
@@ -437,10 +434,10 @@ void UniqueCompute::Run() {
           LOG(FATAL) << "unique does not implement for the "
                      << "input type:" << static_cast<int>(type);
           break;
-        }
-     }
-     return;
-  } 
+      }
+    }
+    return;
+  }
 
   if (x->numel() == 0) {
     switch (type) {
@@ -458,20 +455,41 @@ void UniqueCompute::Run() {
                    << "input type:" << static_cast<int>(type);
         break;
     }
-    
+
     return;
   }
   if (axis_vec.empty()) {
     if (index_type == PRECISION(kInt32)) {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueFlattendTensorFunc<float, int32_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<float, int32_t>(*x,
+                                                   output,
+                                                   index,
+                                                   indices,
+                                                   count,
+                                                   return_index,
+                                                   return_inverse,
+                                                   return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueFlattendTensorFunc<int32_t, int32_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int32_t, int32_t>(*x,
+                                                     output,
+                                                     index,
+                                                     indices,
+                                                     count,
+                                                     return_index,
+                                                     return_inverse,
+                                                     return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueFlattendTensorFunc<int64_t, int32_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int64_t, int32_t>(*x,
+                                                     output,
+                                                     index,
+                                                     indices,
+                                                     count,
+                                                     return_index,
+                                                     return_inverse,
+                                                     return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -481,13 +499,34 @@ void UniqueCompute::Run() {
     } else {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueFlattendTensorFunc<float, int64_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<float, int64_t>(*x,
+                                                   output,
+                                                   index,
+                                                   indices,
+                                                   count,
+                                                   return_index,
+                                                   return_inverse,
+                                                   return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueFlattendTensorFunc<int32_t, int64_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int32_t, int64_t>(*x,
+                                                     output,
+                                                     index,
+                                                     indices,
+                                                     count,
+                                                     return_index,
+                                                     return_inverse,
+                                                     return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueFlattendTensorFunc<int64_t, int64_t>(*x, output, index, indices, count, return_index, return_inverse, return_counts);
+          UniqueFlattendTensorFunc<int64_t, int64_t>(*x,
+                                                     output,
+                                                     index,
+                                                     indices,
+                                                     count,
+                                                     return_index,
+                                                     return_inverse,
+                                                     return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -500,13 +539,37 @@ void UniqueCompute::Run() {
     if (index_type == PRECISION(kInt32)) {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueDimFunc<float, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<float, int32_t>(*x,
+                                        output,
+                                        index,
+                                        indices,
+                                        count,
+                                        axis,
+                                        return_index,
+                                        return_inverse,
+                                        return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueDimFunc<int32_t, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int32_t, int32_t>(*x,
+                                          output,
+                                          index,
+                                          indices,
+                                          count,
+                                          axis,
+                                          return_index,
+                                          return_inverse,
+                                          return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueDimFunc<int64_t, int32_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int64_t, int32_t>(*x,
+                                          output,
+                                          index,
+                                          indices,
+                                          count,
+                                          axis,
+                                          return_index,
+                                          return_inverse,
+                                          return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -515,13 +578,37 @@ void UniqueCompute::Run() {
     } else {
       switch (type) {
         case PRECISION(kFloat):
-          UniqueDimFunc<float, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<float, int64_t>(*x,
+                                        output,
+                                        index,
+                                        indices,
+                                        count,
+                                        axis,
+                                        return_index,
+                                        return_inverse,
+                                        return_counts);
           break;
         case PRECISION(kInt32):
-          UniqueDimFunc<int32_t, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int32_t, int64_t>(*x,
+                                          output,
+                                          index,
+                                          indices,
+                                          count,
+                                          axis,
+                                          return_index,
+                                          return_inverse,
+                                          return_counts);
           break;
         case PRECISION(kInt64):
-          UniqueDimFunc<int64_t, int64_t>(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts);
+          UniqueDimFunc<int64_t, int64_t>(*x,
+                                          output,
+                                          index,
+                                          indices,
+                                          count,
+                                          axis,
+                                          return_index,
+                                          return_inverse,
+                                          return_counts);
           break;
         default:
           LOG(FATAL) << "unique does not implement for the "
@@ -529,21 +616,15 @@ void UniqueCompute::Run() {
       }
     }
   }
-} 
-
+}
 
 }  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
-
-REGISTER_LITE_KERNEL(unique,
-                     kHost,
-                     kAny,
-                     kAny,
-                     paddle::lite::kernels::host::UniqueCompute,
-                     def)
+REGISTER_LITE_KERNEL(
+    unique, kHost, kAny, kAny, paddle::lite::kernels::host::UniqueCompute, def)
     .BindInput("X",
                {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kAny),
@@ -565,4 +646,3 @@ REGISTER_LITE_KERNEL(unique,
                                        PRECISION(kInt32),
                                        DATALAYOUT(kAny))})
     .Finalize();
-
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 1b9e121e4b7..a023deee21c 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -2288,12 +2288,12 @@ struct UniqueWithCountsParam : ParamBase {
 struct UniqueParam : ParamBase {
   const lite::Tensor* X{};
   lite::Tensor* Out{};
-  lite::Tensor* Index{}; // the indices in the original input
-  lite::Tensor* Indices{}; // the indices in the result
+  lite::Tensor* Index{};    // the indices in the original input
+  lite::Tensor* Indices{};  // the indices in the result
   lite::Tensor* Counts{};
   int dtype{-1};
-  bool return_index{false}; // Indices
-  bool return_inverse{false}; // Index
+  bool return_index{false};    // Indices
+  bool return_inverse{false};  // Index
   bool return_counts{false};
   std::vector<int> axis{};
   bool is_sorted{false};
diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc
index 84f829028a9..01155347491 100644
--- a/lite/operators/unique_op.cc
+++ b/lite/operators/unique_op.cc
@@ -14,6 +14,7 @@
 
 #include "lite/operators/unique_op.h"
 #include "lite/core/op_registry.h"
+
 namespace paddle {
 namespace lite {
 namespace operators {
@@ -63,8 +64,7 @@ bool UniqueOp::InferShapeImpl() const {
   return true;
 }
 
-bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc,
-                                    lite::Scope *scope) {
+bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   param_.X = scope->FindTensor(opdesc.Input("X").front());
   CHECK(param_.X) << "Input(X) of UniqueOp should not be null.";
   param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front());
@@ -81,7 +81,7 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc,
     param_.Counts = scope->FindMutableTensor(opdesc.Output("Counts").front());
     CHECK(param_.Counts) << "Output(Counts) of UniqueOp should not be null.";
   }
-  
+
   if (opdesc.HasAttr("dtype")) {
     param_.dtype = opdesc.GetAttr<int>("dtype");
   }
@@ -97,7 +97,7 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc,
   param_.axis = opdesc.GetAttr<std::vector<int>>("axis");
   if (opdesc.HasAttr("is_sorted")) {
     param_.is_sorted = opdesc.GetAttr<bool>("is_sorted");
-  } 
+  }
 
   return true;
 }
diff --git a/lite/operators/unique_op.h b/lite/operators/unique_op.h
index c9e302b7566..69cb898eae6 100644
--- a/lite/operators/unique_op.h
+++ b/lite/operators/unique_op.h
@@ -35,7 +35,7 @@ class UniqueOp : public OpLite {
   bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
 
   void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
-  
+
   std::string DebugString() const override { return "unique"; }
 
   bool InferType() override {
@@ -49,4 +49,4 @@ class UniqueOp : public OpLite {
 
 }  // namespace operators
 }  // namespace lite
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py
index 453ac6bd561..3130967e702 100644
--- a/lite/tests/unittest_py/op/test_unique_op.py
+++ b/lite/tests/unittest_py/op/test_unique_op.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@ def __init__(self, *args, **kwargs):
         host_places = [
             Place(TargetType.Host, PrecisionType.FP32, DataLayoutType.NCHW)
         ]
-        self.enable_testing_on_place(places=host_places, thread=[1,4])
+        self.enable_testing_on_place(places=host_places, thread=[1, 4])
 
     def is_program_valid(self,
                          program_config: ProgramConfig,
@@ -41,48 +41,46 @@ def is_program_valid(self,
         return True
 
     def sample_program_configs(self, draw):
-        in_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=2, max_value=100),
-                min_size=1,
-                max_size=1))
-        print(in_shape)
-        in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64]))
-                
-        def generate_X_data():
-            t = np.random.normal(0.0, 5.0, in_shape).astype(in_dtype)
-            print(t)
-            return t
-
-        dtype = draw(st.sampled_from([2,3]))
+        dtype = draw(st.sampled_from([2, 3]))
         is_sorted = draw(st.booleans())
         return_index = draw(st.booleans())
         return_inverse = draw(st.sampled_from([True]))
         return_counts = draw(st.booleans())
 
+        in_shape = draw(
+            st.lists(
+                st.integers(
+                    min_value=2, max_value=10), min_size=1, max_size=8))
+
         if is_sorted == False:
             return_index = False
+            return_inverse = True
             return_counts = False
+            in_shape = draw(
+                st.lists(
+                    st.integers(
+                        min_value=2, max_value=10),
+                    min_size=1,
+                    max_size=1))
+
+        in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64]))
+
+        def generate_X_data():
+            return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype)
 
         param_is_sorted = is_sorted
         param_return_index = return_index
         param_return_inverse = return_inverse
         param_return_counts = return_counts
-  
-        axis = draw(st.sampled_from([[2], [1], [0], []]))
+
+        axis = draw(
+            st.sampled_from([[], [0], [1], [2], [3], [4], [5], [6], [7], [8]]))
         while len(axis) > 0 and axis[0] >= len(in_shape):
-            axis[0] = axis[0] - 1       
+            axis[0] = axis[0] - 1
 
-        outputs = [
-            "Out_data"
-        ]
-        outputs_config = {
-            "Out": ["Out_data"]
-        }
-        outputs_dtype = {
-            "Out_data": in_dtype
-        }
+        outputs = ["Out_data"]
+        outputs_config = {"Out": ["Out_data"]}
+        outputs_dtype = {"Out_data": in_dtype}
         if return_inverse:
             outputs.append("Index_data")
             outputs_config["Index"] = ["Index_data"]
@@ -93,13 +91,13 @@ def generate_X_data():
             outputs_dtype["Indices_data"] = np.int32
         if return_counts:
             outputs.append("Counts_data")
-            outputs_config["Counts"] = ["Counts_data"] 
+            outputs_config["Counts"] = ["Counts_data"]
             outputs_dtype["Counts_data"] = np.int32
 
         unique_op = OpConfig(
-            type = "unique",
-            inputs = {"X": ["input_data"]},
-            outputs = outputs_config,
+            type="unique",
+            inputs={"X": ["input_data"]},
+            outputs=outputs_config,
             attrs={
                 "dtype": dtype,
                 "return_index": param_return_index,
@@ -107,8 +105,7 @@ def generate_X_data():
                 "return_counts": param_return_counts,
                 "axis": axis,
                 "is_sorted": is_sorted
-            }
-        )
+            })
 
         unique_op.outputs_dtype = outputs_dtype
 
@@ -118,8 +115,7 @@ def generate_X_data():
             inputs={
                 "input_data": TensorConfig(data_gen=partial(generate_X_data))
             },
-            outputs=outputs
-        )
+            outputs=outputs)
         return program_config
 
     def sample_predictor_configs(self):
@@ -129,7 +125,8 @@ def add_ignore_pass_case(self):
         pass
 
     def test(self, *args, **kwargs):
-        self.run_and_statis(quant=False, max_examples=25)
+        self.run_and_statis(quant=False, max_examples=100)
+
 
 if __name__ == "__main__":
-    unittest.main(argv=[''])    
+    unittest.main(argv=[''])

From 2718edb0a27b326dc0319dc4e0e7b771a43242e7 Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Tue, 20 Dec 2022 18:18:44 +0800
Subject: [PATCH 09/10] complete unique_op


From 2e4ed6e2f242f86919c9e030502ed6144ba2f9ab Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Thu, 22 Dec 2022 11:36:22 +0800
Subject: [PATCH 10/10] rename TensorFromVector

---
 lite/kernels/host/unique_compute.cc | 58 ++++++++---------------------
 1 file changed, 16 insertions(+), 42 deletions(-)

diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
index 5f1458f2bc2..b891d1435c9 100644
--- a/lite/kernels/host/unique_compute.cc
+++ b/lite/kernels/host/unique_compute.cc
@@ -13,8 +13,6 @@
 // limitations under the License.
 
 #include "lite/kernels/host/unique_compute.h"
-#include "lite/core/tensor.h"
-
 #include <algorithm>
 #include <cmath>
 #include <iostream>
@@ -23,6 +21,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include "lite/core/tensor.h"
 
 namespace paddle {
 namespace lite {
@@ -36,12 +35,9 @@ void UniqueFunc(const lite::Tensor* x,
                 lite::Tensor* count) {
   const InT* in_data = x->template data<InT>();
   IndexT* index_data = index->mutable_data<IndexT>();
-
   int64_t j = 0;
-
   std::unordered_map<InT, int64_t> dict;
   std::vector<InT> uniq;
-
   for (auto i = 0; i < x->numel(); i++) {
     auto it = dict.find(in_data[i]);
     if (it == dict.end()) {
@@ -53,7 +49,6 @@ void UniqueFunc(const lite::Tensor* x,
       index_data[i] = static_cast<IndexT>(it->second);
     }
   }
-
   if (count != nullptr) {
     // Resize the count tensor dims to allocate the memory
     count->Resize({static_cast<int64_t>(uniq.size())});
@@ -84,7 +79,6 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in,
   out->Resize({static_cast<int64_t>(unique.size())});
   auto out_data = out->mutable_data<InT>();
   std::copy(unique.begin(), unique.end(), out_data);
-
   if (return_index) {
     indices->Resize({out->numel()});
     auto indices_data = indices->mutable_data<IndexT>();
@@ -98,7 +92,6 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in,
       indices_data[i] = indices_map[out_data[i]];
     }
   }
-
   if (return_inverse) {
     index->Resize({in.numel()});
     auto inverse_data = index->mutable_data<IndexT>();
@@ -111,7 +104,6 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in,
       inverse_data[i] = inverse_map[in_data[i]];
     }
   }
-
   if (return_counts) {
     count->Resize({out->numel()});
     auto count_data = count->mutable_data<IndexT>();
@@ -162,14 +154,11 @@ static ForwardIt UniqueDimImpl(ForwardIt first,
   if (first == last) {
     return last;
   }
-
   (*inverse_vec)[sorted_indices_vec[0]] = 0;
   (*counts_vec)[0] = 1;
   (*indices_vec)[0] = sorted_indices_vec[0];
-
   ForwardIt begin = first;
   ForwardIt result = first;
-
   while (++first != last) {
     int64_t idx_first = std::distance(begin, first);
     int64_t idx_result = std::distance(begin, result);
@@ -187,7 +176,7 @@ static ForwardIt UniqueDimImpl(ForwardIt first,
 }
 
 template <typename T>
-void TensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
+void UniqueTensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
   auto* src_ptr = static_cast<const void*>(src.data());
   dst->Resize({static_cast<int64_t>(src.size())});
   auto* dst_ptr = static_cast<void*>(dst->mutable_data<T>());
@@ -197,17 +186,15 @@ void TensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
 }
 
 template <typename T>
-void TransCompute(const Tensor& input,
-                  Tensor* output,
-                  const std::vector<int>& orders) {
+void UniqueTransCompute(const Tensor& input,
+                        Tensor* output,
+                        const std::vector<int>& orders) {
   auto in_dims = input.dims();
   auto out_dims = output->dims();
   int num_axes = in_dims.size();
   int count = in_dims.production();
-
   const T* din = input.data<T>();
   T* dout = output->mutable_data<T>();
-
   std::vector<int> old_temps;
   int temp = 1;
   for (int i = 0; i < num_axes; ++i) {
@@ -218,7 +205,6 @@ void TransCompute(const Tensor& input,
   for (int i = 0; i < num_axes; i++) {
     old_steps.push_back(old_temps[num_axes - 1 - i]);
   }
-
   std::vector<int> new_temps;
   temp = 1;
   for (int i = 0; i < num_axes; ++i) {
@@ -229,7 +215,6 @@ void TransCompute(const Tensor& input,
   for (int i = 0; i < num_axes; i++) {
     new_steps.push_back(new_temps[num_axes - 1 - i]);
   }
-
   for (int i = 0; i < count; ++i) {
     int old_idx = 0;
     int idx = i;
@@ -242,16 +227,16 @@ void TransCompute(const Tensor& input,
   }
 }
 
-lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) {
+lite::DDim UniqueFlattenTo2d(const lite::DDim& src, int num_col_dims) {
   return DDim(std::vector<DDim::value_type>{
       src.Slice(0, num_col_dims).production(),
       src.Slice(num_col_dims, src.size()).production()});
 }
 
 template <typename T>
-void concat_func(const std::vector<lite::Tensor>& input,
-                 const int axis,
-                 lite::Tensor* output) {
+void UniqueConcatFunc(const std::vector<lite::Tensor>& input,
+                      const int axis,
+                      lite::Tensor* output) {
   size_t num = input.size();
   auto dim_0 = input[0].dims();
   int64_t concat_input_size = 1;
@@ -262,7 +247,6 @@ void concat_func(const std::vector<lite::Tensor>& input,
   for (int i = 0; i < axis; i++) {
     num_cancats *= dim_0[i];
   }
-
   auto* dst_ptr = output->mutable_data<T>();
   const int out_concat_axis = output->dims()[axis];
   int64_t offset_concat_axis = 0;
@@ -304,9 +288,9 @@ void UniqueDimFunc(const lite::Tensor& in,
   lite::DDim in_trans_dims = DDim(in_trans_dims_vec);
   in_trans.Resize(in_trans_dims);
   in_trans.mutable_data<InT>();
-  TransCompute<InT>(in, &in_trans, permute);
+  UniqueTransCompute<InT>(in, &in_trans, permute);
   // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
-  lite::DDim in_trans_flat_dims = FlattenTo2d(in_trans_dims, 1);
+  lite::DDim in_trans_flat_dims = UniqueFlattenTo2d(in_trans_dims, 1);
   in_trans.Resize(in_trans_flat_dims);
 
   // sort indices
@@ -328,7 +312,6 @@ void UniqueDimFunc(const lite::Tensor& in,
               }
               return false;
             });
-
   // sort tensor according to indices
   lite::Tensor input_sorted;
   input_sorted.Resize(in_trans_dims);
@@ -339,7 +322,6 @@ void UniqueDimFunc(const lite::Tensor& in,
            in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
            col * sizeof(InT));
   }
-
   std::vector<lite::Tensor> input_unbind = Unbind<InT>(input_sorted);
   std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
   std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
@@ -355,7 +337,6 @@ void UniqueDimFunc(const lite::Tensor& in,
   counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
   indices_vec.erase(indices_vec.begin() + input_unbind.size(),
                     indices_vec.end());
-
   lite::Tensor out_trans;
   std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
   out_trans_dims_vec[0] = input_unbind.size();
@@ -364,19 +345,16 @@ void UniqueDimFunc(const lite::Tensor& in,
   std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
   out->Resize(out_trans_dims_vec);
   out->mutable_data<InT>();
-  concat_func<InT>(input_unbind, 0, &out_trans);
-  TransCompute<InT>(out_trans, out, permute);
-
+  UniqueConcatFunc<InT>(input_unbind, 0, &out_trans);
+  UniqueTransCompute<InT>(out_trans, out, permute);
   if (return_inverse) {
-    TensorFromVector(inverse_vec, index);
+    UniqueTensorFromVector(inverse_vec, index);
   }
-
   if (return_counts) {
-    TensorFromVector(counts_vec, count);
+    UniqueTensorFromVector(counts_vec, count);
   }
-
   if (return_index) {
-    TensorFromVector(indices_vec, indices);
+    UniqueTensorFromVector(indices_vec, indices);
   }
 }
 
@@ -393,7 +371,6 @@ void UniqueCompute::Run() {
   bool return_counts = param.return_counts;
   auto axis_vec = param.axis;
   auto is_sorted = param.is_sorted;
-
   lite_api::PrecisionType index_type = index->precision();
   bool index_type_match =
       index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64);
@@ -401,7 +378,6 @@ void UniqueCompute::Run() {
   CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds "
                                    << static_cast<int>(type)
                                    << "but desires to be int32 or int64";
-
   if (!is_sorted) {
     if (index_type == PRECISION(kInt32)) {
       switch (type) {
@@ -438,7 +414,6 @@ void UniqueCompute::Run() {
     }
     return;
   }
-
   if (x->numel() == 0) {
     switch (type) {
       case PRECISION(kFloat):
@@ -455,7 +430,6 @@ void UniqueCompute::Run() {
                    << "input type:" << static_cast<int>(type);
         break;
     }
-
     return;
   }
   if (axis_vec.empty()) {