From a7bd53032d5d88727e8399fe68eb64b60190257b Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Wed, 7 Dec 2022 19:38:38 +0800 Subject: [PATCH 01/10] add unique_op --- .../opencl/utils/tune_cache_generated.h | 383 +++++++++++++ .../x86_mobilenetv1_full_demo/CMakeLists.txt | 73 +++ .../x86_mobilenetv1_light_demo/CMakeLists.txt | 73 +++ lite/kernels/host/CMakeLists.txt | 1 + lite/kernels/host/unique_compute.cc | 530 ++++++++++++++++++ lite/kernels/host/unique_compute.h | 36 ++ lite/operators/CMakeLists.txt | 1 + lite/operators/op_params.h | 15 + lite/operators/unique_op.cc | 88 +++ lite/operators/unique_op.h | 52 ++ 10 files changed, 1252 insertions(+) create mode 100644 lite/backends/opencl/utils/tune_cache_generated.h create mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt create mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt create mode 100644 lite/kernels/host/unique_compute.cc create mode 100644 lite/kernels/host/unique_compute.h create mode 100644 lite/operators/unique_op.cc create mode 100644 lite/operators/unique_op.h diff --git a/lite/backends/opencl/utils/tune_cache_generated.h b/lite/backends/opencl/utils/tune_cache_generated.h new file mode 100644 index 00000000000..bb091cce383 --- /dev/null +++ b/lite/backends/opencl/utils/tune_cache_generated.h @@ -0,0 +1,383 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_ +#define FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace paddle { +namespace lite { +namespace fbs { +namespace opencl { +namespace proto { +namespace TuneCache_ { + +struct TunePair; +struct TunePairBuilder; +struct TunePairT; + +} // namespace TuneCache_ + +struct TuneCache; +struct TuneCacheBuilder; +struct TuneCacheT; + +namespace TuneCache_ { + +bool operator==(const TunePairT &lhs, const TunePairT &rhs); +bool 
operator!=(const TunePairT &lhs, const TunePairT &rhs); +} // namespace TuneCache_ + +bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs); +bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs); + +namespace TuneCache_ { + +inline const flatbuffers::TypeTable *TunePairTypeTable(); + +} // namespace TuneCache_ + +inline const flatbuffers::TypeTable *TuneCacheTypeTable(); + +namespace TuneCache_ { + +struct TunePairT : public flatbuffers::NativeTable { + typedef TunePair TableType; + std::string key; + std::vector value; + TunePairT() { + } +}; + +inline bool operator==(const TunePairT &lhs, const TunePairT &rhs) { + return + (lhs.key == rhs.key) && + (lhs.value == rhs.value); +} + +inline bool operator!=(const TunePairT &lhs, const TunePairT &rhs) { + return !(lhs == rhs); +} + + +struct TunePair FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef TunePairT NativeTableType; + typedef TunePairBuilder Builder; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return TunePairTypeTable(); + } + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_KEY = 4, + VT_VALUE = 6 + }; + const flatbuffers::String *key() const { + return GetPointer(VT_KEY); + } + flatbuffers::String *mutable_key() { + return GetPointer(VT_KEY); + } + bool KeyCompareLessThan(const TunePair *o) const { + return *key() < *o->key(); + } + int KeyCompareWithValue(const char *val) const { + return strcmp(key()->c_str(), val); + } + const flatbuffers::Vector *value() const { + return GetPointer *>(VT_VALUE); + } + flatbuffers::Vector *mutable_value() { + return GetPointer *>(VT_VALUE); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffsetRequired(verifier, VT_KEY) && + verifier.VerifyString(key()) && + VerifyOffset(verifier, VT_VALUE) && + verifier.VerifyVector(value()) && + verifier.EndTable(); + } + TunePairT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) 
const; + void UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TunePairBuilder { + typedef TunePair Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_key(flatbuffers::Offset key) { + fbb_.AddOffset(TunePair::VT_KEY, key); + } + void add_value(flatbuffers::Offset> value) { + fbb_.AddOffset(TunePair::VT_VALUE, value); + } + explicit TunePairBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + TunePairBuilder &operator=(const TunePairBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + fbb_.Required(o, TunePair::VT_KEY); + return o; + } +}; + +inline flatbuffers::Offset CreateTunePair( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset key = 0, + flatbuffers::Offset> value = 0) { + TunePairBuilder builder_(_fbb); + builder_.add_value(value); + builder_.add_key(key); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateTunePairDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *key = nullptr, + const std::vector *value = nullptr) { + auto key__ = key ? _fbb.CreateString(key) : 0; + auto value__ = value ? 
_fbb.CreateVector(*value) : 0; + return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair( + _fbb, + key__, + value__); +} + +flatbuffers::Offset CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +} // namespace TuneCache_ + +struct TuneCacheT : public flatbuffers::NativeTable { + typedef TuneCache TableType; + std::vector> tune_map; + TuneCacheT() { + } +}; + +inline bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs) { + return + (lhs.tune_map == rhs.tune_map); +} + +inline bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs) { + return !(lhs == rhs); +} + + +struct TuneCache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef TuneCacheT NativeTableType; + typedef TuneCacheBuilder Builder; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return TuneCacheTypeTable(); + } + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TUNE_MAP = 4 + }; + const flatbuffers::Vector> *tune_map() const { + return GetPointer> *>(VT_TUNE_MAP); + } + flatbuffers::Vector> *mutable_tune_map() { + return GetPointer> *>(VT_TUNE_MAP); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffsetRequired(verifier, VT_TUNE_MAP) && + verifier.VerifyVector(tune_map()) && + verifier.VerifyVectorOfTables(tune_map()) && + verifier.EndTable(); + } + TuneCacheT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TuneCacheBuilder { + typedef TuneCache Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_tune_map(flatbuffers::Offset>> tune_map) { + 
fbb_.AddOffset(TuneCache::VT_TUNE_MAP, tune_map); + } + explicit TuneCacheBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + TuneCacheBuilder &operator=(const TuneCacheBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + fbb_.Required(o, TuneCache::VT_TUNE_MAP); + return o; + } +}; + +inline flatbuffers::Offset CreateTuneCache( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> tune_map = 0) { + TuneCacheBuilder builder_(_fbb); + builder_.add_tune_map(tune_map); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateTuneCacheDirect( + flatbuffers::FlatBufferBuilder &_fbb, + std::vector> *tune_map = nullptr) { + auto tune_map__ = tune_map ? _fbb.CreateVectorOfSortedTables(tune_map) : 0; + return paddle::lite::fbs::opencl::proto::CreateTuneCache( + _fbb, + tune_map__); +} + +flatbuffers::Offset CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +namespace TuneCache_ { + +inline TunePairT *TunePair::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + std::unique_ptr _o = std::unique_ptr(new TunePairT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TunePair::UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = key(); if (_e) _o->key = _e->str(); } + { auto _e = value(); if (_e) { _o->value.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->value[_i] = _e->Get(_i); } } } +} + +inline flatbuffers::Offset TunePair::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateTunePair(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const 
flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TunePairT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _key = _fbb.CreateString(_o->key); + auto _value = _fbb.CreateVector(_o->value); + return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair( + _fbb, + _key, + _value); +} + +} // namespace TuneCache_ + +inline TuneCacheT *TuneCache::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + std::unique_ptr _o = std::unique_ptr(new TuneCacheT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TuneCache::UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = tune_map(); if (_e) { _o->tune_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tune_map[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } +} + +inline flatbuffers::Offset TuneCache::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateTuneCache(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TuneCacheT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _tune_map = _fbb.CreateVector> (_o->tune_map.size(), [](size_t i, _VectorArgs *__va) { return CreateTunePair(*__va->__fbb, __va->__o->tune_map[i].get(), __va->__rehasher); }, &_va ); + return paddle::lite::fbs::opencl::proto::CreateTuneCache( + _fbb, + _tune_map); +} + +namespace TuneCache_ { + +inline const flatbuffers::TypeTable *TunePairTypeTable() { + static const 
flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_STRING, 0, -1 }, + { flatbuffers::ET_INT, 1, -1 } + }; + static const char * const names[] = { + "key", + "value" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names + }; + return &tt; +} + +} // namespace TuneCache_ + +inline const flatbuffers::TypeTable *TuneCacheTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_SEQUENCE, 1, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + paddle::lite::fbs::opencl::proto::TuneCache_::TunePairTypeTable + }; + static const char * const names[] = { + "tune_map" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 1, type_codes, type_refs, nullptr, names + }; + return &tt; +} + +inline const paddle::lite::fbs::opencl::proto::TuneCache *GetTuneCache(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const paddle::lite::fbs::opencl::proto::TuneCache *GetSizePrefixedTuneCache(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline TuneCache *GetMutableTuneCache(void *buf) { + return flatbuffers::GetMutableRoot(buf); +} + +inline bool VerifyTuneCacheBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedTuneCacheBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishTuneCacheBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedTuneCacheBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +inline std::unique_ptr UnPackTuneCache( + const void *buf, + const flatbuffers::resolver_function_t *res = nullptr) { + return std::unique_ptr(GetTuneCache(buf)->UnPack(res)); +} + +inline std::unique_ptr UnPackSizePrefixedTuneCache( + const void 
*buf, + const flatbuffers::resolver_function_t *res = nullptr) { + return std::unique_ptr(GetSizePrefixedTuneCache(buf)->UnPack(res)); +} + +} // namespace proto +} // namespace opencl +} // namespace fbs +} // namespace lite +} // namespace paddle + +#endif // FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_ diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt new file mode 100644 index 00000000000..234ec1c85e3 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +project(mobilenet_full_api) +set(TARGET mobilenet_full_api) + +# 1. path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(MSVC_STATIC_CRT ) + if(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + else(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") + endif(MSVC_STATIC_CRT) +endif() + +if (APPLE AND METAL) + message(STATUS "set METAL=ON") + add_definitions("-DMETAL") + find_library(METAL_LIBRARY Metal REQUIRED) + find_library(GRAPHIC CoreGraphics REQUIRED) + find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) +endif() + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. 
compile options +if (NOT WIN32) + add_definitions(-std=c++11 -g -O3 -pthread) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) +endif() + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +if (WIN32) + set(WITH_STATIC_MKL ) + if(WITH_STATIC_MKL) + set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + + target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib) + target_link_libraries(${TARGET} shlwapi.lib) + target_link_libraries(${TARGET} ${MATH_LIB}) + + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + if(NOT WITH_STATIC_MKL) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + endif() +else() + if (APPLE AND METAL) + target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) + endif() + target_link_libraries(${TARGET} -lpaddle_full_api_shared) + target_link_libraries(${TARGET} -liomp5) + target_link_libraries(${TARGET} -ldl) +endif() diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt new file mode 100644 index 00000000000..3a91bfafbd3 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +project(mobilenet_light_api) +set(TARGET mobilenet_light_api) + +# 1. 
path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(MSVC_STATIC_CRT ) + if(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + else(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") + endif(MSVC_STATIC_CRT) +endif() + +if (APPLE AND METAL) + message(STATUS "set METAL=ON") + add_definitions("-DMETAL") + find_library(METAL_LIBRARY Metal REQUIRED) + find_library(GRAPHIC CoreGraphics REQUIRED) + find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) +endif() + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. 
compile options +if (NOT WIN32) + add_definitions(-std=c++11 -g -O3 -pthread) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) +endif() + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +if (WIN32) + set(WITH_STATIC_MKL ) + if(WITH_STATIC_MKL) + set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + + target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib) + target_link_libraries(${TARGET} shlwapi.lib) + target_link_libraries(${TARGET} ${MATH_LIB}) + + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + if(NOT WITH_STATIC_MKL) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + endif() +else() + if (APPLE AND METAL) + target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) + endif() + target_link_libraries(${TARGET} -lpaddle_light_api_shared) + target_link_libraries(${TARGET} -liomp5) + target_link_libraries(${TARGET} -ldl) +endif() diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index 564fcc0c88a..bbf6d9f3e4a 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -108,6 +108,7 @@ add_kernel(distribute_fpn_proposals_compute_host Host extra SRCS distribute_fpn_ add_kernel(collect_fpn_proposals_compute_host Host extra SRCS collect_fpn_proposals_compute.cc) add_kernel(flip_compute_host Host extra SRCS flip_compute.cc) add_kernel(unique_with_counts_compute Host extra SRCS unique_with_counts_compute.cc) +add_kernel(unique_compute Host extra SRCS unique_compute.cc) add_kernel(roi_align_compute Host extra SRCS roi_align_compute.cc) add_kernel(box_clip_compute Host extra SRCS 
box_clip_compute.cc)
 add_kernel(gaussian_random_compute Host extra SRCS gaussian_random_compute.cc)
diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc
new file mode 100644
index 00000000000..97363f2bbe9
--- /dev/null
+++ b/lite/kernels/host/unique_compute.cc
@@ -0,0 +1,411 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/host/unique_compute.h"
+
+#include <algorithm>
+#include <cstring>
+#include <numeric>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+// Order-preserving unique over the flattened input (is_sorted == false).
+//
+// Writes the distinct values of `x` to `out` in order of first appearance and,
+// for every input element, its position in `out` to `index`. When `count` is
+// non-null it receives the number of occurrences of each distinct value.
+template <typename InT, typename IndexT>
+void UniqueFunc(const lite::Tensor* x,
+                lite::Tensor* out,
+                lite::Tensor* index,
+                lite::Tensor* count) {
+  const InT* in_data = x->template data<InT>();
+  const int64_t numel = x->numel();
+  IndexT* index_data = index->template mutable_data<IndexT>();
+
+  std::unordered_map<InT, int64_t> dict;
+  std::vector<InT> uniq;
+  for (int64_t i = 0; i < numel; ++i) {
+    auto it = dict.find(in_data[i]);
+    if (it == dict.end()) {
+      const int64_t id = static_cast<int64_t>(uniq.size());
+      dict.emplace(in_data[i], id);
+      // `out` and `count` are sized from `uniq`, so the value must be kept.
+      uniq.push_back(in_data[i]);
+      index_data[i] = static_cast<IndexT>(id);
+    } else {
+      index_data[i] = static_cast<IndexT>(it->second);
+    }
+  }
+
+  if (count != nullptr) {
+    // Resize the count tensor dims to allocate the memory
+    count->Resize({static_cast<int64_t>(uniq.size())});
+    IndexT* count_data = count->template mutable_data<IndexT>();
+    std::fill(count_data, count_data + uniq.size(), static_cast<IndexT>(0));
+    for (int64_t i = 0; i < numel; ++i) {
+      count_data[index_data[i]] += 1;
+    }
+  }
+
+  out->Resize({static_cast<int64_t>(uniq.size())});
+  InT* out_data = out->template mutable_data<InT>();
+  if (!uniq.empty()) {
+    std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT));
+  }
+}
+
+// Sorted unique over the flattened input (is_sorted == true, no axis).
+//
+// `out` receives the distinct values in ascending order. Optional outputs:
+//   - `indices` (return_index): input position of the first occurrence of
+//     each value in `out`;
+//   - `index` (return_inverse): for each input element, its position in `out`;
+//   - `count` (return_counts): number of occurrences of each value in `out`.
+template <typename InT, typename IndexT>
+void UniqueFlattendTensorFunc(const lite::Tensor& in,
+                              lite::Tensor* out,
+                              lite::Tensor* index,
+                              lite::Tensor* indices,
+                              lite::Tensor* count,
+                              bool return_index,
+                              bool return_inverse,
+                              bool return_counts) {
+  const InT* in_data = in.data<InT>();
+  const int64_t in_numel = in.numel();
+  std::set<InT> unique(in_data, in_data + in_numel);
+  const int64_t out_numel = static_cast<int64_t>(unique.size());
+  out->Resize({out_numel});
+  InT* out_data = out->mutable_data<InT>();
+  std::copy(unique.begin(), unique.end(), out_data);
+
+  if (return_index) {
+    indices->Resize({out_numel});
+    IndexT* indices_data = indices->mutable_data<IndexT>();
+    std::unordered_map<InT, IndexT> first_pos;
+    first_pos.reserve(out_numel);
+    for (int64_t i = 0; i < in_numel; ++i) {
+      // emplace() keeps an existing entry, i.e. the first occurrence.
+      first_pos.emplace(in_data[i], static_cast<IndexT>(i));
+    }
+    for (int64_t i = 0; i < out_numel; ++i) {
+      indices_data[i] = first_pos[out_data[i]];
+    }
+  }
+
+  if (return_inverse) {
+    // One entry per input element, not per unique value.
+    index->Resize({in_numel});
+    IndexT* inverse_data = index->mutable_data<IndexT>();
+    std::unordered_map<InT, IndexT> out_pos;
+    out_pos.reserve(out_numel);
+    for (int64_t i = 0; i < out_numel; ++i) {
+      out_pos[out_data[i]] = static_cast<IndexT>(i);
+    }
+    for (int64_t i = 0; i < in_numel; ++i) {
+      inverse_data[i] = out_pos[in_data[i]];
+    }
+  }
+
+  if (return_counts) {
+    count->Resize({out_numel});
+    IndexT* count_data = count->mutable_data<IndexT>();
+    std::unordered_map<InT, IndexT> occurrences;
+    occurrences.reserve(out_numel);
+    for (int64_t i = 0; i < in_numel; ++i) {
+      occurrences[in_data[i]] += 1;
+    }
+    for (int64_t i = 0; i < out_numel; ++i) {
+      count_data[i] = occurrences[out_data[i]];
+    }
+  }
+}
+
+// Permutes the axes of `input` into `output`.
+//
+// `output` must already have its dims set; the value at output coordinate
+// (i_0, ..., i_k) is read from the input coordinate that has i_j on axis
+// orders[j].
+template <typename T>
+void TransCompute(const lite::Tensor& input,
+                  lite::Tensor* output,
+                  const std::vector<int>& orders) {
+  const auto in_dims = input.dims();
+  const auto out_dims = output->dims();
+  const int num_axes = static_cast<int>(in_dims.size());
+  const int64_t count = in_dims.production();
+
+  const T* din = input.data<T>();
+  T* dout = output->mutable_data<T>();
+
+  // Row-major strides of the input and output layouts.
+  std::vector<int64_t> old_steps(num_axes, 1);
+  std::vector<int64_t> new_steps(num_axes, 1);
+  for (int i = num_axes - 2; i >= 0; --i) {
+    old_steps[i] = old_steps[i + 1] * in_dims[i + 1];
+    new_steps[i] = new_steps[i + 1] * out_dims[i + 1];
+  }
+
+  for (int64_t i = 0; i < count; ++i) {
+    int64_t old_idx = 0;
+    int64_t idx = i;
+    for (int j = 0; j < num_axes; ++j) {
+      old_idx += (idx / new_steps[j]) * old_steps[orders[j]];
+      idx %= new_steps[j];
+    }
+    dout[i] = din[old_idx];
+  }
+}
+
+// Collapses `src` into two dims: the product of its first `num_col_dims`
+// dims and the product of the remaining ones.
+static lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) {
+  return lite::DDim(std::vector<int64_t>{
+      src.Slice(0, num_col_dims).production(),
+      src.Slice(num_col_dims, static_cast<int>(src.size())).production()});
+}
+
+// Copies `src` into `dst` as a 1-D tensor of src.size() elements.
+template <typename T>
+void TensorFromVector(const std::vector<T>& src, lite::Tensor* dst) {
+  dst->Resize({static_cast<int64_t>(src.size())});
+  T* dst_ptr = dst->mutable_data<T>();
+  if (!src.empty()) {
+    std::memcpy(dst_ptr, src.data(), src.size() * sizeof(T));
+  }
+}
+
+// Sorted unique along one axis (is_sorted == true with an axis).
+//
+// Slices of `in` taken along `axis` are compared lexicographically; `out`
+// receives the distinct slices in ascending order, stacked along `axis`.
+// Optional outputs, all indexed along `axis`:
+//   - `indices` (return_index): input position of the first occurrence of
+//     each slice in `out`;
+//   - `index` (return_inverse): for each input slice, its position in `out`;
+//   - `count` (return_counts): number of occurrences of each slice in `out`.
+// A negative `axis` counts from the last dimension.
+template <typename InT, typename IndexT>
+void UniqueDimFunc(const lite::Tensor& in,
+                   lite::Tensor* out,
+                   lite::Tensor* index,
+                   lite::Tensor* indices,
+                   lite::Tensor* count,
+                   int axis,
+                   bool return_index,
+                   bool return_inverse,
+                   bool return_counts) {
+  const int rank = static_cast<int>(in.dims().size());
+  if (axis < 0) {
+    axis += rank;
+  }
+  CHECK(axis >= 0 && axis < rank)
+      << "unique: axis is out of range for an input of rank " << rank;
+
+  // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
+  std::vector<int> permute(rank);
+  std::iota(permute.begin(), permute.end(), 0);
+  std::swap(permute[0], permute[axis]);
+  std::vector<int64_t> in_trans_dim_vec = in.dims().Vectorize();
+  std::swap(in_trans_dim_vec[0], in_trans_dim_vec[axis]);
+  lite::Tensor in_trans;
+  in_trans.Resize(in_trans_dim_vec);
+  in_trans.mutable_data<InT>();
+  TransCompute<InT>(in, &in_trans, permute);
+
+  // view it as a matrix: eg. [dim1, dim0, dim2] -> [dim1, dim0 * dim2]
+  const lite::DDim flat_dims = FlattenTo2d(in_trans.dims(), 1);
+  const int64_t rows = flat_dims[0];
+  const int64_t col = flat_dims[1];
+  const InT* in_trans_data = in_trans.data<InT>();
+
+  // Sort row ids lexicographically by row content. The sort is stable so
+  // that, among equal rows, the smallest original position comes first.
+  std::vector<int64_t> sorted_rows(rows);
+  std::iota(sorted_rows.begin(), sorted_rows.end(), 0);
+  std::stable_sort(sorted_rows.begin(),
+                   sorted_rows.end(),
+                   [&](int64_t a, int64_t b) -> bool {
+                     const InT* lhs = in_trans_data + a * col;
+                     const InT* rhs = in_trans_data + b * col;
+                     return std::lexicographical_compare(
+                         lhs, lhs + col, rhs, rhs + col);
+                   });
+
+  // Walk the sorted rows, starting a new group whenever the content changes.
+  std::vector<int64_t> unique_rows;  // source row of every distinct row
+  std::vector<IndexT> inverse_vec(rows, 0);
+  std::vector<IndexT> counts_vec;
+  std::vector<IndexT> indices_vec;
+  for (int64_t row : sorted_rows) {
+    const InT* cur = in_trans_data + row * col;
+    const bool starts_group =
+        unique_rows.empty() ||
+        !std::equal(cur, cur + col, in_trans_data + unique_rows.back() * col);
+    if (starts_group) {
+      unique_rows.push_back(row);
+      counts_vec.push_back(0);
+      indices_vec.push_back(static_cast<IndexT>(row));
+    }
+    inverse_vec[row] = static_cast<IndexT>(unique_rows.size() - 1);
+    counts_vec.back() += 1;
+  }
+
+  // Gather the distinct rows, then undo the transpose into `out`.
+  const int64_t num_unique = static_cast<int64_t>(unique_rows.size());
+  std::vector<int64_t> out_trans_dims_vec = in_trans_dim_vec;
+  out_trans_dims_vec[0] = num_unique;
+  lite::Tensor out_trans;
+  out_trans.Resize(out_trans_dims_vec);
+  InT* out_trans_data = out_trans.mutable_data<InT>();
+  for (int64_t i = 0; i < num_unique; ++i) {
+    std::memcpy(out_trans_data + i * col,
+                in_trans_data + unique_rows[i] * col,
+                col * sizeof(InT));
+  }
+  std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
+  out->Resize(out_trans_dims_vec);
+  out->mutable_data<InT>();
+  TransCompute<InT>(out_trans, out, permute);
+
+  if (return_inverse) {
+    TensorFromVector(inverse_vec, index);
+  }
+
+  if (return_counts) {
+    TensorFromVector(counts_vec, count);
+  }
+
+  if (return_index) {
+    TensorFromVector(indices_vec, indices);
+  }
+}
+
+// Runs the unique variant selected by `param` for fixed value/index types.
+template <typename InT, typename IndexT>
+static void UniqueByMode(const operators::UniqueParam& param) {
+  if (!param.is_sorted) {
+    UniqueFunc<InT, IndexT>(param.X, param.Out, param.Index, param.Counts);
+    return;
+  }
+
+  if (param.X->numel() == 0) {
+    // Nothing to deduplicate; just make sure `Out` is allocated.
+    param.Out->mutable_data<InT>();
+    return;
+  }
+
+  if (param.axis.empty()) {
+    UniqueFlattendTensorFunc<InT, IndexT>(*param.X,
+                                          param.Out,
+                                          param.Index,
+                                          param.Indices,
+                                          param.Counts,
+                                          param.return_index,
+                                          param.return_inverse,
+                                          param.return_counts);
+  } else {
+    UniqueDimFunc<InT, IndexT>(*param.X,
+                               param.Out,
+                               param.Index,
+                               param.Indices,
+                               param.Counts,
+                               param.axis[0],
+                               param.return_index,
+                               param.return_inverse,
+                               param.return_counts);
+  }
+}
+
+// Selects the value type from the precision of the input tensor.
+template <typename IndexT>
+static void UniqueByInputType(const operators::UniqueParam& param) {
+  const lite_api::PrecisionType type = param.X->precision();
+  switch (type) {
+    case PRECISION(kFloat):
+      UniqueByMode<float, IndexT>(param);
+      break;
+    case PRECISION(kInt32):
+      UniqueByMode<int32_t, IndexT>(param);
+      break;
+    case PRECISION(kInt64):
+      UniqueByMode<int64_t, IndexT>(param);
+      break;
+    default:
+      LOG(FATAL) << "unique does not implement for the "
+                 << "input type:" << static_cast<int>(type);
+      break;
+  }
+}
+
+void UniqueCompute::Run() {
+  auto& param = Param<operators::UniqueParam>();
+
+  // NOTE(review): assumes param.dtype already holds a lite PrecisionType (not
+  // a framework VarType id); confirm against UniqueParam.
+  lite_api::PrecisionType index_type = param.dtype;
+  CHECK(index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64))
+      << "Index holds the wrong type, it holds "
+      << static_cast<int>(index_type)
+      << " but desires to be int32 or int64";
+
+  if (index_type == PRECISION(kInt32)) {
+    UniqueByInputType<int32_t>(param);
+  } else {
+    UniqueByInputType<int64_t>(param);
+  }
+}
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// NOTE(review): the index outputs are bound as kInt32 although Run() also
+// handles kInt64, and "Count" differs from the `Counts` field read in Run();
+// confirm both against the operator definition.
+REGISTER_LITE_KERNEL(unique,
+                     kHost,
+                     kAny,
+                     kAny,
+                     paddle::lite::kernels::host::UniqueCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kAny))})
+    .BindOutput("Index",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .BindOutput("Indices",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .BindOutput("Count",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
diff --git a/lite/kernels/host/unique_compute.h b/lite/kernels/host/unique_compute.h
new file mode 100644
index 00000000000..631eb5b5682
--- /dev/null
+++ b/lite/kernels/host/unique_compute.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>  // NOTE(review): header name lost in this copy; confirm
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+// Kernel class registered for `unique` as (kHost, kAny, kAny).
+class UniqueCompute
+    : public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+  void Run() override;
+  virtual ~UniqueCompute() = default;
+};
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt
index 9138e42ffe0..8ff0540a311 100755
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -206,6 +206,7 @@ add_operator(one_hot_v2_op extra SRCS one_hot_v2_op.cc)
 add_operator(strided_slice_op extra SRCS strided_slice_op.cc)
 add_operator(where_op extra SRCS where_op.cc)
 add_operator(unique_with_counts_op extra SRCS unique_with_counts_op.cc)
+add_operator(unique_op extra SRCS unique_op.cc)
 
 # for content-dnn specific
 add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc)
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index d09d9ffff54..1b9e121e4b7 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -2284,6 +2284,21 @@ struct UniqueWithCountsParam : ParamBase {
   lite::Tensor* Count{};
 };
 
+/// --------------- unique operators ---------------
+struct UniqueParam : ParamBase {
+  const lite::Tensor* X{};
+  lite::Tensor* Out{};
+  lite::Tensor* Index{};    // optional output; checked when return_inverse
+  lite::Tensor* Indices{};  // optional output; checked when return_index
+  lite::Tensor* Counts{};   // optional output; checked when return_counts
+  int dtype{-1};  // NOTE(review): presumably selects the index precision
+  bool return_index{false};    // requests Indices
+  bool return_inverse{false};  // requests Index
+  bool return_counts{false};   // requests Counts
+  std::vector<int> axis{};  // NOTE(review): element type restored; confirm
+  bool is_sorted{false};
+};
+
 struct GaussRandomParam : ParamBase {
   const lite::Tensor* ShapeTensor{nullptr};
   std::vector ShapeTensorList{};
diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc
new file mode 100644
index 00000000000..3cab23bbc81
--- /dev/null
+++ b/lite/operators/unique_op.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/unique_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Requires X and Out, plus each optional output whose return_* flag is set.
+bool UniqueOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  CHECK_OR_FALSE(!param_.return_index || param_.Indices);
+  CHECK_OR_FALSE(!param_.return_inverse || param_.Index);
+  CHECK_OR_FALSE(!param_.return_counts || param_.Counts);
+  return true;
+}
+
+// Sizes each bound output like X; unbound optional outputs are skipped.
+bool UniqueOp::InferShapeImpl() const {
+  const DDim in_dims = param_.X->dims();
+  param_.Out->Resize(in_dims);
+  if (param_.Index) param_.Index->Resize(in_dims);
+  if (param_.Indices) param_.Indices->Resize(in_dims);
+  if (param_.Counts) param_.Counts->Resize(in_dims);
+  return true;
+}
+
+// Binds the tensors named by `opdesc` from `scope` and reads the attributes.
+bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  param_.X = scope->FindTensor(opdesc.Input("X").front());
+  CHECK(param_.X) << "Input(X) of UniqueOp should not be null.";
+  param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front());
+  CHECK(param_.Out) << "Output(Out) of UniqueOp should not be null.";
+  // Optional outputs stay null unless the op description declares them.
+  if (opdesc.HasOutput("Index")) {
+    param_.Index = scope->FindMutableTensor(opdesc.Output("Index").front());
+    CHECK(param_.Index) << "Output(Index) of UniqueOp should not be null.";
+  }
+  if (opdesc.HasOutput("Indices")) {
+    param_.Indices = scope->FindMutableTensor(opdesc.Output("Indices").front());
+    CHECK(param_.Indices) << "Output(Indices) of UniqueOp should not be null.";
+  }
+  if (opdesc.HasOutput("Counts")) {
+    param_.Counts = scope->FindMutableTensor(opdesc.Output("Counts").front());
+    CHECK(param_.Counts) << "Output(Counts) of UniqueOp should not be null.";
+  }
+
+  if (opdesc.HasAttr("dtype")) {
+    param_.dtype = opdesc.GetAttr<int>("dtype");
+  }
+  if (opdesc.HasAttr("return_index")) {
+    param_.return_index = opdesc.GetAttr<bool>("return_index");
+  }
+  // The guard must probe the same key that GetAttr reads.
+  if (opdesc.HasAttr("return_inverse")) {
+    param_.return_inverse = opdesc.GetAttr<bool>("return_inverse");
+  }
+  if (opdesc.HasAttr("return_counts")) {
+    param_.return_counts = opdesc.GetAttr<bool>("return_counts");
+  }
+  if (opdesc.HasAttr("axis")) {
+    param_.axis = opdesc.GetAttr<std::vector<int>>("axis");
+  }
+  if (opdesc.HasAttr("is_sorted")) {
+    param_.is_sorted = opdesc.GetAttr<bool>("is_sorted");
+  }
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(unique, paddle::lite::operators::UniqueOp);
diff --git a/lite/operators/unique_op.h b/lite/operators/unique_op.h
new file mode 100644
index 00000000000..c9e302b7566
--- /dev/null
+++ b/lite/operators/unique_op.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>  // std::string is used by the constructor and DebugString
+#include <vector>  // NOTE(review): header name lost in this copy; confirm
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Operator front end for `unique`; holds the UniqueParam given to the kernel.
+class UniqueOp : public OpLite {
+ public:
+  UniqueOp() {}
+  explicit UniqueOp(const std::string &op_type) : OpLite(op_type) {}
+
+  // Validation, shape inference and scope binding are defined out of line.
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+  // Hands param_ to the kernel through SetParam.
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "unique"; }
+  // Only Out takes X's precision here; the other outputs are not touched.
+  bool InferType() override {
+    param_.Out->set_precision(param_.X->precision());
+    return true;
+  }
+
+ protected:
+  mutable UniqueParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
\ No newline at end of file

From 7b4b379d494cee3eed460122516b552143c399ca Mon Sep 17 00:00:00 2001
From: Qijian Tian <1741919942@qq.com>
Date: Thu, 8 Dec 2022 18:58:31 +0800
Subject: [PATCH 02/10] add and update unique_op

---
 lite/backends/arm/math/dotprod/gemm_sdot.h    | 442 +++++++++++++
 lite/backends/arm/math/dotprod/gemm_vsdot.h   |  54 ++
 .../cxx/mobile_light/mobilenetv1_light_api.cc | 620 +++++++++---------
 .../x86_mobilenetv1_full_demo/CMakeLists.txt  |  73 ---
 .../x86_mobilenetv1_light_demo/CMakeLists.txt |  73 ---
 lite/kernels/host/index_select_compute.cc     |  13 +
 lite/kernels/host/unique_compute.cc           | 142 ++--
 7 files changed, 894 insertions(+), 523 deletions(-)
 create mode 100644 lite/backends/arm/math/dotprod/gemm_sdot.h
 create mode 100644 lite/backends/arm/math/dotprod/gemm_vsdot.h
 delete mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt
 delete mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt

diff --git a/lite/backends/arm/math/dotprod/gemm_sdot.h b/lite/backends/arm/math/dotprod/gemm_sdot.h
new file mode 100644
index 00000000000..1eea169b15f
--- /dev/null
+++ 
b/lite/backends/arm/math/dotprod/gemm_sdot.h @@ -0,0 +1,442 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// clang-format off +#define GEMM_SDOT_INT8_KERNEL \ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ \ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ + "eor v8.16b, v8.16b, v8.16b\n" /* out0 = 0 */ \ + "eor v9.16b, v9.16b, v9.16b\n" /* out1 = 0 */ \ + "eor v10.16b, v10.16b, v10.16b\n" /* out2 = 0 */ \ + "eor v11.16b, v11.16b, v11.16b\n" /* out3 = 0 */ \ + "eor v12.16b, v12.16b, v12.16b\n" /* out4 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ + "eor v13.16b, v13.16b, v13.16b\n" /* out5 = 0 */ \ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ + "eor v14.16b, v14.16b, v14.16b\n" /* out6 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ + "eor v15.16b, v15.16b, v15.16b\n" /* out7 = 0 */ \ + "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ \ + "eor v16.16b, v16.16b, v16.16b\n" /* out8 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ \ + "eor v17.16b, v17.16b, v17.16b\n" /* out9 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ \ + "eor v18.16b, v18.16b, v18.16b\n" /* out10 = 0 */ \ + "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ \ + "eor v19.16b, v19.16b, v19.16b\n" /* out11 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ \ + "eor v20.16b, v20.16b, 
v20.16b\n" /* out12 = 0 */ \ + "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ \ + "eor v21.16b, v21.16b, v21.16b\n" /* out13 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ \ + "eor v22.16b, v22.16b, v22.16b\n" /* out14 = 0 */ \ + "eor v23.16b, v23.16b, v23.16b\n" /* out15 = 0 */ \ + "eor v24.16b, v24.16b, v24.16b\n" /* out16 = 0 */ \ + "eor v25.16b, v25.16b, v25.16b\n" /* out17 = 0 */ \ + "eor v26.16b, v26.16b, v26.16b\n" /* out18 = 0 */ \ + "eor v27.16b, v27.16b, v27.16b\n" /* out19 = 0 */ \ + "eor v28.16b, v28.16b, v28.16b\n" /* out20 = 0 */ \ + "eor v29.16b, v29.16b, v29.16b\n" /* out21 = 0 */ \ + "eor v30.16b, v30.16b, v30.16b\n" /* out22 = 0 */ \ + "eor v31.16b, v31.16b, v31.16b\n" /* out23 = 0 */ \ + "cbz %w[k], 2f\n" /* check loop count > 0 */ \ + /* main loop, unrool 0*/ \ + "1:\n" /* main loop */ \ +".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ \ +".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ \ +".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ +".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ +".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ +".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ +".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ +".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ +".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ +".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ +".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ \ +".word 0x4f80e0ca\n" 
/* sdot v10.4s, v6.16b, v0.4b[0] */\ +".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\ + "prfm pldl1keep, [%[b_ptr], #384]\n" \ +".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\ +".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\ +".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\ +".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\ +".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\ +".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ \ + /* unrool 1 */ \ +".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\ +".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\ +".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\ + "prfm pldl1keep, [%[a_ptr], #256]\n" \ +".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\ +".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\ +".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\ +".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\ +".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ \ +".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\ +".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\ +".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\ +".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\ +".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\ +".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\ +".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\ +".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\ +".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\ +".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\ +".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\ +".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\ +".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\ +".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\ +".word 0x4f83e8bc\n" 
/* sdot v28.4s, v5.16b, v3.4b[2] */\ +".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ \ + /* unrool 2*/ \ +".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\ +".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ +".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\ +".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\ +".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\ +".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\ +".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\ +".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\ +".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\ +".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\ + "prfm pldl1keep, [%[b_ptr], #384]\n" \ +".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\ +".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\ +".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\ +".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\ +".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\ +".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ +".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\ +".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\ +".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ \ + /* unrool 3*/ \ +".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\ +".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\ +".word 0x4f82e8ae\n" /* sdot v14.4s, 
v5.16b, v2.4b[2] */\ +".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\ +".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\ +".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\ +".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\ +".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ +".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\ +".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\ + "prfm pldl1keep, [%[a_ptr], #256]\n" \ +".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\ +".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\ +".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\ +".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\ +".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\ + "prfm pldl1keep, [%[b_ptr], #384]\n" \ +".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\ +".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\ +".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\ +".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\ +".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\ +".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\ +".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\ + "subs %w[k], %w[k], #1\n" /* loop count - 1*/ \ +".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\ +".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\ + "bne 1b\n" /* Target to use when K is 1 or 2 */ \ + "2:\n" /* process tail*/ \ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ + "beq 3f\n" /*jump to tail = 1*/ \ + /* final unrool 0, unrool 0, tail > 1*/ \ +".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ \ +".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ + "ldp q2, q3, [%[a_ptr]], #32\n" /* 
load a10, a11 to q2, q3*/ \ +".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ +".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ +".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ +".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ +".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ +".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ +".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ +".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ +".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ \ +".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\ +".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\ +".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\ +".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\ +".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\ +".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\ +".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\ +".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\ + "beq 4f\n" /*jump to tail = 2*/ \ + /* unrool 1, tail > 2*/ \ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ \ +".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\ +".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\ +".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\ +".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\ +".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\ +".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\ +".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\ +".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ \ +".word 
0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\ +".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\ +".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\ +".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\ +".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\ +".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\ +".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\ +".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ +".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\ +".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\ +".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\ +".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\ +".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\ +".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\ +".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\ +".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\ + "beq 5f\n" /*jump to tail = 3*/ \ + /* unrool 2, tail = 4*/ \ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ \ +".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\ +".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ +".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\ +".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\ +".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\ +".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\ +".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\ +".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\ +".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\ +".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\ +".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\ +".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\ +".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\ +".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\ 
+".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\ +".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ +".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\ +".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\ +".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\ + /* unrool 3, tail = 4*/ \ +".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\ +".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\ +".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\ +".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\ +".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\ +".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\ +".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\ +".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\ +".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\ +".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\ +".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\ +".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\ +".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\ +".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\ +".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\ +".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\ +".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\ +".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\ +".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\ +".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\ +".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\ +".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\ +".word 0x4f83e8fc\n" 
/* sdot v28.4s, v7.16b, v3.4b[2] */\ +".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\ + "b 11f\n" /* tails==1 final tail*/ \ + "3: \n" /* tail=1*/ \ + "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ \ +".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ +".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ +".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ +".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ +".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ +".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ +".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ +".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ +".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ +".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ +".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ +".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\ +".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\ +".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\ +".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\ +".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\ +".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\ +".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\ +".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\ + "b 11f\n" /* tails==2 final tail*/ \ + "4:\n" /* tail = 2*/ \ +".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\ +".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\ +".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\ +".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\ +".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\ +".word 
0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\ +".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\ +".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\ +".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\ +".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\ +".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\ +".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\ +".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\ +".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\ +".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\ +".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\ +".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\ +".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\ +".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\ +".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\ +".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\ +".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\ +".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\ +".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\ + "b 11f\n" /* tails==3 final tail*/ \ + "5:\n" /* tail = 3*/ \ + "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ \ +".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\ +".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\ +".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\ +".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\ +".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\ +".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\ +".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\ +".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\ +".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\ +".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\ +".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\ +".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\ +".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\ 
+".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\ +".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\ +".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\ +".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\ +".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\ +".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\ + "11: \n" /* end */ + +#define GEMM_SDOT_INT8_KERNEL_8x8 \ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ + "eor v8.16b, v8.16b, v8.16b \n" /* out0 = 0 */ \ + "eor v11.16b, v11.16b, v11.16b\n" /* out0 = 0 */ \ + "eor v14.16b, v14.16b, v14.16b\n" /* out0 = 0 */ \ + "eor v17.16b, v17.16b, v17.16b\n" /* out0 = 0 */ \ + "eor v20.16b, v20.16b, v20.16b\n" /* out0 = 0 */ \ + "eor v23.16b, v23.16b, v23.16b\n" /* out0 = 0 */ \ + "eor v26.16b, v26.16b, v26.16b\n" /* out0 = 0 */ \ + "eor v29.16b, v29.16b, v29.16b\n" /* out0 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ + "eor v9.16b, v9.16b, v9.16b \n" /* out0 = 0 */ \ + "eor v12.16b, v12.16b, v12.16b\n" /* out0 = 0 */ \ + "eor v15.16b, v15.16b, v15.16b\n" /* out0 = 0 */ \ + "eor v18.16b, v18.16b, v18.16b\n" /* out0 = 0 */ \ + "eor v21.16b, v21.16b, v21.16b\n" /* out0 = 0 */ \ + "eor v24.16b, v24.16b, v24.16b\n" /* out0 = 0 */ \ + "eor v27.16b, v27.16b, v27.16b\n" /* out0 = 0 */ \ + "eor v30.16b, v30.16b, v30.16b\n" /* out0 = 0 */ \ + "1:\n" \ + "ldp q0, q1, [%[a_ptr]], #32\n" \ + "ldp q4, q5, [%[b_ptr]], #32\n" \ +".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ +".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ + "prfm pldl1keep, 
[%[a_ptr], #64]\n" /* preload a*/ \ +".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ +".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ +".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ + "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload b*/ \ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ +".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ +".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ +".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ +".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ + "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ +".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ +".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ +".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ +".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ + "subs %w[k], %w[k], #1\n" \ + "bne 1b\n" + +#define GEMM_SDOT_INT8_KERNEL_8x4 \ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ + "eor v8.16b, v8.16b, v8.16b \n" /* out0 = 0 */ \ + "eor v11.16b, v11.16b, v11.16b\n" /* out0 = 0 */ \ + "eor v14.16b, v14.16b, v14.16b\n" /* out0 = 0 */ \ + "eor v17.16b, v17.16b, v17.16b\n" /* out0 = 0 */ \ + "prfm pldl1keep, [%[b_ptr], #32]\n" /* preload b*/ \ + "eor v20.16b, v20.16b, v20.16b\n" /* out0 = 0 */ \ + "eor v23.16b, v23.16b, v23.16b\n" /* out0 = 0 */ \ + "eor v26.16b, v26.16b, v26.16b\n" /* out0 = 0 */ \ + "eor v29.16b, v29.16b, v29.16b\n" /* out0 = 0 */ \ + "1:\n" \ + "ldp q0, q1, [%[a_ptr]], #32\n" \ + "ldr q4, [%[b_ptr]], #16\n" \ +".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ +".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ +".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ +".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ +".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, 
v1.4b[0] */\ +".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ + "prfm pldl1keep, [%[b_ptr], #32]\n" /* preload b*/ \ +".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ +".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ + "subs %w[k], %w[k], #1\n" \ + "bne 1b\n" diff --git a/lite/backends/arm/math/dotprod/gemm_vsdot.h b/lite/backends/arm/math/dotprod/gemm_vsdot.h new file mode 100644 index 00000000000..9929ade9b95 --- /dev/null +++ b/lite/backends/arm/math/dotprod/gemm_vsdot.h @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// clang-format off +#define GEMM_DOT_INT8_KERNEL \ + "vld1.s8 {q0}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ + "vld1.s8 {d2}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ + "veor.s32 q4, q4, q4 \n" /* out0 = 0 */ \ + "veor.s32 q5, q5, q5 \n" /* out0 = 0 */ \ + "veor.s32 q6, q6, q6 \n" /* out0 = 0 */ \ + "veor.s32 q7, q7, q7 \n" /* out0 = 0 */ \ + "veor.s32 q8, q8, q8 \n" /* out0 = 0 */ \ + "veor.s32 q9, q9, q9 \n" /* out0 = 0 */ \ + "veor.s32 q10, q10, q10 \n" /* out0 = 0 */ \ + "veor.s32 q11, q11, q11 \n" /* out0 = 0 */ \ + "veor.s32 q12, q12, q12 \n" /* out0 = 0 */ \ + "veor.s32 q13, q13, q13 \n" /* out0 = 0 */ \ + "veor.s32 q14, q14, q14 \n" /* out0 = 0 */ \ + "veor.s32 q15, q15, q15 \n" /* out0 = 0 */ \ + "cmp %[k], #0 \n" \ + "beq 2f \n" \ + "1: \n" \ + "vld1.s8 {q2}, [%[b_ptr]]! 
\n" \ + "vld1.s8 {q3}, [%[b_ptr]]! \n" \ +".word 0x8d40fe24\n" /* vsdot.s8 q4, q2, d0[0] */\ +".word 0xcd60fe24\n" /* vsdot.s8 q6, q2, d0[1] */\ +".word 0x0d41fe64\n" /* vsdot.s8 q8, q2, d1[0] */\ +".word 0x4d61fe64\n" /* vsdot.s8 q10, q2, d1[1] */\ +".word 0x8d42fe64\n" /* vsdot.s8 q12, q2, d2[0] */\ +".word 0xcd62fe64\n" /* vsdot.s8 q14, q2, d2[1] */\ +".word 0xad40fe26\n" /* vsdot.s8 q5, q3, d0[0] */\ +".word 0xed60fe26\n" /* vsdot.s8 q7, q3, d0[1] */\ +".word 0x2d41fe66\n" /* vsdot.s8 q9, q3, d1[0] */\ +".word 0x6d61fe66\n" /* vsdot.s8 q11, q3, d1[1] */\ +".word 0xad42fe66\n" /* vsdot.s8 q13, q3, d2[0] */\ +".word 0xed62fe66\n" /* vsdot.s8 q15, q3, d2[1] */\ + "vld1.s8 {q0}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ + "vld1.s8 {d2}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ + "subs %[k], %[k], #1 \n" \ + "bne 1b \n" \ + "2: \n" diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index bb430c8d8f6..e493bebfc50 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -1,364 +1,336 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include +#include // NOLINT(build/c++11) #include #include -#include #include - +#include +#include +#include +#include +#include #include "paddle_api.h" // NOLINT + +#define IPTCORE_PADDLE_MOBILE +#define IPTCORE_PADDLE_BENCHMARK ///////////////////////////////////////////////////////////////////////// -// If this demo is linked to static library:libpaddle_api_light_bundled.a +// If this demo is linked to static library:libpaddle_api_full_bundled.a // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to // avoid linking errors such as `unsupport ops or kernels`. ///////////////////////////////////////////////////////////////////////// -// #include "paddle_use_kernels.h" // NOLINT -// #include "paddle_use_ops.h" // NOLINT - -using namespace paddle::lite_api; // NOLINT +#ifdef IPTCORE_PADDLE_MOBILE +#else +#ifdef _WIN32 +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#endif +#endif -int64_t ShapeProduction(const shape_t& shape) { - int64_t res = 1; - for (auto i : shape) res *= i; - return res; -} +#ifdef IPTCORE_PADDLE_BENCHMARK +class Timer { +private: + std::chrono::high_resolution_clock::time_point inTime, outTime; + +public: + void startTimer() { inTime = std::chrono::high_resolution_clock::now(); } + + // unit millisecond + float getCostTimer() { + outTime = std::chrono::high_resolution_clock::now(); + return static_cast( + std::chrono::duration_cast(outTime - inTime) + .count() / + 1e+3); + } +}; +#endif -std::string ShapePrint(const std::vector& shapes) { - std::string shapes_str{""}; - for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) { - auto shape = shapes[shape_idx]; - std::string shape_str; - for (auto i : shape) { - shape_str += std::to_string(i) + ","; +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; } - shapes_str += shape_str; - shapes_str += - (shape_idx != 0 && shape_idx == 
shapes.size() - 1) ? "" : " : "; - } - return shapes_str; + return sum / length; } -std::string ShapePrint(const shape_t& shape) { - std::string shape_str{""}; - for (auto i : shape) { - shape_str += std::to_string(i) + " "; - } - return shape_str; -} +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } -std::vector split_string(const std::string& str_in) { - std::vector str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); } - } - return str_out; + variance /= length; + return sqrt(variance); } -std::vector get_shape(const std::string& str_shape) { - std::vector shape; - std::string tmp_str = str_shape; - while (!tmp_str.empty()) { - int dim = atoi(tmp_str.data()); - shape.push_back(dim); - size_t next_offset = tmp_str.find(","); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); +int64_t shape_production(const paddle::lite_api::shape_t& shape) { + int64_t res = 1; + for (auto i : shape) { + res *= i; } - } - return shape; + return res; } -template -double compute_mean(const T* in, const size_t length) { - double sum = 0.; - for (size_t i = 0; i < length; ++i) { - sum += in[i]; - } - return sum / length; +class InputData { +public: + int _type = -1; ///int32, int64, float32 + bool _lod = false; + std::vector _shape; + std::vector _int32_data; + std::vector _int64_data; + std::vector _float32_data; + std::vector> _lod_data = {{0, 1}, {0, 1}}; +}; + +class UserPersonaInfer { +public: +#ifdef IPTCORE_PADDLE_MOBILE + void create_paddle_light_predictor(const 
std::string& model_file); +#else + void create_paddle_full_predictor(const std::string& model_dir); +#endif + void prepare(const std::string& path); + void infer(); +private: + void infer_specific_item(paddle::lite_api::PaddlePredictor *predictor); + std::shared_ptr _paddle_predictor; + std::vector > _batch; +}; + +#ifdef IPTCORE_PADDLE_MOBILE +void UserPersonaInfer::create_paddle_light_predictor(const std::string& model_file) { + // 1. Set MobileConfig + paddle::lite_api::MobileConfig config; + config.set_model_from_file(model_file); + config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH); + // 2. Create PaddlePredictor by MobileConfig + _paddle_predictor = + paddle::lite_api::CreatePaddlePredictor(config); } - -template -double compute_standard_deviation(const T* in, - const size_t length, - bool has_mean = false, - double mean = 10000) { - if (!has_mean) { - mean = compute_mean(in, length); - } - - double variance = 0.; - for (size_t i = 0; i < length; ++i) { - variance += pow((in[i] - mean), 2); - } - variance /= length; - return sqrt(variance); +#else +void UserPersonaInfer::create_paddle_full_predictor(const std::string& model_dir) { + // 1. Create CxxConfig + paddle::lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_valid_places({paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + // 2. 
Create PaddlePredictor by CxxConfig + _paddle_predictor = + paddle::lite_api::CreatePaddlePredictor(config); } +#endif +namespace { +using namespace std; +template +void extract_num(const string &str, vector &results) { + stringstream ss; -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} + /* Storing the whole string into string stream */ + ss << str; -void RunModel(std::string model_dir, - const std::vector& input_shapes, - size_t repeats, - size_t warmup, - size_t power_mode, - size_t thread_num, - size_t accelerate_opencl, - size_t print_output_elem) { - // 1. Set MobileConfig - MobileConfig config; - config.set_model_from_file(model_dir); - -#ifdef METAL - std::string metal_lib_path = "../../../metal/lite.metallib"; - config.set_metal_lib_path(metal_lib_path); - config.set_metal_use_mps(true); -#else - // NOTE: Use android gpu with opencl, you should ensure: - // first, [compile **cpu+opencl** paddlelite - // lib](/~https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md); - // second, [convert and use opencl nb - // model](/~https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md). - - bool is_opencl_backend_valid = - ::IsOpenCLBackendValid(/*check_fp16_valid = false*/); - std::cout << "is_opencl_backend_valid:" - << (is_opencl_backend_valid ? "true" : "false") << std::endl; - if (is_opencl_backend_valid) { - if (accelerate_opencl != 0) { - // Set opencl kernel binary. - // Large addtitional prepare time is cost due to algorithm selecting and - // building kernel from source code. - // Prepare time can be reduced dramitically after building algorithm file - // and OpenCL kernel binary on the first running. - // The 1st running time will be a bit longer due to the compiling time if - // you don't call `set_opencl_binary_path_name` explicitly. 
- // So call `set_opencl_binary_path_name` explicitly is strongly - // recommended. - - // Make sure you have write permission of the binary path. - // We strongly recommend each model has a unique binary name. - const std::string bin_path = "/data/local/tmp/"; - const std::string bin_name = "lite_opencl_kernel.bin"; - config.set_opencl_binary_path_name(bin_path, bin_name); - - // opencl tune option - // CL_TUNE_NONE: 0 - // CL_TUNE_RAPID: 1 - // CL_TUNE_NORMAL: 2 - // CL_TUNE_EXHAUSTIVE: 3 - const std::string tuned_path = "/data/local/tmp/"; - const std::string tuned_name = "lite_opencl_tuned.bin"; - config.set_opencl_tune(CL_TUNE_NORMAL, tuned_path, tuned_name); - - // opencl precision option - // CL_PRECISION_AUTO: 0, first fp16 if valid, default - // CL_PRECISION_FP32: 1, force fp32 - // CL_PRECISION_FP16: 2, force fp16 - config.set_opencl_precision(CL_PRECISION_FP16); + /* Running loop till the end of the stream */ + string temp; + T found; + while (!ss.eof()) { + + /* extracting word by word from stream */ + ss >> temp; + + /* Checking the given word is integer or not */ + if (stringstream(temp) >> found) + results.emplace_back(found); + + /* To save from space at the end of string */ + temp = ""; } - } else { - std::cout << "*** nb model will be running on cpu. 
***" << std::endl; - // you can give backup cpu nb model instead - // config.set_model_from_file(cpu_nb_model_dir); - } -#endif +} +} + +void UserPersonaInfer::prepare(const std::string& path) { + ///xia_i 186 tgt_generation_mask float32 (1, 1, 33) [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + std::ifstream in(path.c_str()); + std::string line; + std::string current_idx; + while (std::getline(in, line)) { + if (line.empty()) { + break; + } + if (line.back() == '\r') { + line.pop_back(); + } + if (line.empty()) { + break; + } + std::vector strings; + std::istringstream f(line); + std::string s; + while (getline(f, s, '\t')) { + strings.push_back(s); + } + if (current_idx != strings.at(1)) { + _batch.push_back(std::map()); + current_idx = strings[1]; + } + if (strings.at(2) == "lods") { + if (strings.at(3) != "[[0, 1], [0, 1]]") { + throw std::invalid_argument("invalid lod"); + } + continue; + } + auto& input_data = _batch.back()[strings.at(2)]; + + extract_num(strings.at(4), input_data._shape); + if (strings[0] == "lod_i") { + input_data._lod = true; + } + if (strings.at(3) == "int32") { + input_data._type = 0; + extract_num(strings.at(5), input_data._int32_data); + } else if (strings.at(3) == "int64") { + input_data._type = 1; + extract_num(strings.at(5), input_data._int64_data); + } else if (strings.at(3) == "float32") { + input_data._type = 2; + extract_num(strings.at(5), input_data._float32_data); + } else { + throw std::invalid_argument("invalid type"); + } - // NOTE: To load model transformed by model_optimize_tool before - // release/v2.3.0, plese use `set_model_dir` API as listed below. - // config.set_model_dir(model_dir); - config.set_power_mode(static_cast(power_mode)); - config.set_threads(thread_num); - // 2. Create PaddlePredictor by MobileConfig - std::shared_ptr predictor = - CreatePaddlePredictor(config); - - // 3. 
Prepare input data - std::cout << "input_shapes.size():" << input_shapes.size() << std::endl; - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (int i = 0; i < input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; } +} - for (int i = 0; i < input_num; ++i) { - input_data[i] = 1.f; +void UserPersonaInfer::infer_specific_item(paddle::lite_api::PaddlePredictor *predictor){ + static int count = 0; + if (_batch.empty()) { + return; } - } - - // 4. Run predictor - double first_duration{-1}; - for (size_t widx = 0; widx < warmup; ++widx) { - if (widx == 0) { - auto start = GetCurrentUS(); - predictor->Run(); - first_duration = (GetCurrentUS() - start) / 1000.0; - } else { - predictor->Run(); + auto &inputs = _batch[count]; + auto names = predictor->GetInputNames(); + for (auto &name : names) { + auto& input = inputs[name]; + auto tensor = predictor->GetInputByName(name); + tensor->Resize(input._shape); + if (input._type == 0) { + auto input_data = tensor->mutable_data(); + std::copy(input._int32_data.begin(), input._int32_data.end(), input_data); + } else if (input._type == 1) { + auto input_data = tensor->mutable_data(); + std::copy(input._int64_data.begin(), input._int64_data.end(), input_data); + } else if (input._type == 2) { + auto input_data = tensor->mutable_data(); + std::copy(input._float32_data.begin(), input._float32_data.end(), input_data); + } else { + throw std::invalid_argument("invalid name"); + } + if (input._lod) { + tensor->SetLoD(input._lod_data); + } } - } - - double sum_duration = 0.0; // millisecond; - double max_duration = 1e-5; - double min_duration = 1e5; - double avg_duration = -1; - for (size_t ridx = 0; ridx < repeats; ++ridx) { - auto start = GetCurrentUS(); predictor->Run(); - auto duration = (GetCurrentUS() - start) / 1000.0; - sum_duration += duration; - max_duration 
= duration > max_duration ? duration : max_duration; - min_duration = duration < min_duration ? duration : min_duration; - std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration - << " ms" << std::endl; - if (first_duration < 0) { - first_duration = duration; + std::cout << "\n"; + for (int idx = 0; idx != 2; ++idx) { + auto output_tensor = predictor->GetOutput(idx); + auto total_size = shape_production(output_tensor->shape()); + std::cout << "xiarj_" << count << "\t"; + for (int i = 0; i < total_size; ++i) { + if (idx == 0) { + std::cout << output_tensor->data()[i] << "\t"; + } else { + std::cout << output_tensor->data()[i] << "\t"; + } + } + std::cout << "\n"; } - } - avg_duration = sum_duration / static_cast(repeats); - std::cout << "\n======= benchmark summary =======\n" - << "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n" - << "model_dir:" << model_dir << "\n" - << "warmup:" << warmup << "\n" - << "repeats:" << repeats << "\n" - << "power_mode:" << power_mode << "\n" - << "thread_num:" << thread_num << "\n" - << "*** time info(ms) ***\n" - << "1st_duration:" << first_duration << "\n" - << "max_duration:" << max_duration << "\n" - << "min_duration:" << min_duration << "\n" - << "avg_duration:" << avg_duration << "\n"; - - // 5. 
Get output - std::cout << "\n====== output summary ====== " << std::endl; - size_t output_tensor_num = predictor->GetOutputNames().size(); - std::cout << "output tensor num:" << output_tensor_num << std::endl; - - for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { - std::unique_ptr output_tensor = - predictor->GetOutput(tidx); - std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; - auto out_shape = output_tensor->shape(); - auto out_data = output_tensor->data(); - auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); - auto out_std_dev = compute_standard_deviation( - out_data, ShapeProduction(out_shape), true, out_mean); - - std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; - std::cout << "output tensor " << tidx - << " elem num:" << ShapeProduction(out_shape) << std::endl; - std::cout << "output tensor " << tidx - << " standard deviation:" << out_std_dev << std::endl; - std::cout << "output tensor " << tidx << " mean value:" << out_mean - << std::endl; - - // print output - if (print_output_elem) { - for (int i = 0; i < ShapeProduction(out_shape); ++i) { - std::cout << "out[" << tidx << "][" << i - << "]:" << output_tensor->data()[i] << std::endl; - } + std::cout << std::flush; + + if (++count == _batch.size()){ + count = 0; } - } } -int main(int argc, char** argv) { - std::vector str_input_shapes; - std::vector input_shapes{ - {1, 3, 224, 224}}; // shape_t ==> std::vector - - int repeats = 10; - int warmup = 10; - // set arm power mode: - // 0 for big cluster, high performance - // 1 for little cluster - // 2 for all cores - // 3 for no bind - size_t power_mode = 0; - size_t thread_num = 1; - int accelerate_opencl = 1; - int print_output_elem = 0; - - if (argc > 2 && argc < 9) { - std::cerr - << "usage: ./" << argv[0] << "\n" - << " \n" - << " , eg: 1,3,224,224 for 1 input; " - "1,3,224,224:1,5 for 2 inputs\n" - << " , eg: 100\n" - << " , eg: 10\n" - << " , 0: big cluster, high performance\n" - " 1: 
little cluster\n" - " 2: all cores\n" - " 3: no bind\n" - << " , eg: 1 for single thread \n" - << " , this option takes effect only when model " - "can be running on opencl backend.\n" - " 0: disable opencl kernel cache & tuning\n" - " 1: enable opencl kernel cache & tuning\n" - << " , 0: disable print outputs to stdout\n" - " 1: enable print outputs to stdout\n" - << std::endl; - return 0; - } - - std::string model_dir = argv[1]; - if (argc >= 9) { - input_shapes.clear(); - std::string raw_input_shapes = argv[2]; - std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl; - str_input_shapes = split_string(raw_input_shapes); - for (size_t i = 0; i < str_input_shapes.size(); ++i) { - std::cout << "input shape: " << str_input_shapes[i] << std::endl; - input_shapes.push_back(get_shape(str_input_shapes[i])); +void UserPersonaInfer::infer() { + static int idx = 0; + auto predictor = _paddle_predictor.get(); + if (!predictor) { + return; + } + // 3. Prepare input data + + // 4. Run predictor +#ifdef IPTCORE_PADDLE_BENCHMARK + int warmup = 10; + int repeats = 400; + Timer timeInstance; + double first_duration{-1}; + for (size_t widx = 0; widx < warmup; ++widx) { + if (widx == 0) { + timeInstance.startTimer(); + infer_specific_item(predictor); + first_duration = timeInstance.getCostTimer(); + } else { + infer_specific_item(predictor); + } + } + + double sum_duration = 0.0; + double max_duration = 1e-5; + double min_duration = 1e5; + double avg_duration = -1; + for (size_t ridx = 0; ridx < repeats; ++ridx) { + timeInstance.startTimer(); + + infer_specific_item(predictor); + + double duration = timeInstance.getCostTimer(); + sum_duration += duration; + max_duration = duration > max_duration ? duration : max_duration; + min_duration = duration < min_duration ? 
duration : min_duration; +// std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration +// << " ms" << std::endl; + if (first_duration < 0) { + first_duration = duration; + } } + avg_duration = sum_duration / static_cast(repeats); + std::cout << "\n======= benchmark summary =======\n" + << "warmup:" << warmup << "\n" + << "repeats:" << repeats << "\n" + << "*** time info(ms) ***\n" + //<< "1st_duration:" << first_duration << "\n" + << "max_duration:" << max_duration << "\n" + << "min_duration:" << min_duration << "\n" + << "avg_duration:" << avg_duration << "\n"; +#else + infer_specific_item(predictor); +#endif + + // 5. Get output +} - repeats = atoi(argv[3]); - warmup = atoi(argv[4]); - power_mode = atoi(argv[5]); - thread_num = atoi(argv[6]); - accelerate_opencl = atoi(argv[7]); - print_output_elem = atoi(argv[8]); - } - - RunModel(model_dir, - input_shapes, - repeats, - warmup, - power_mode, - thread_num, - accelerate_opencl, - print_output_elem); - - return 0; +int main(int argc, char** argv) { + UserPersonaInfer user_persona_infer; +#ifdef IPTCORE_PADDLE_MOBILE +// user_persona_infer.create_paddle_light_predictor( +// "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\model_x86.nb"); + user_persona_infer.create_paddle_light_predictor( + "./model_naive_buffer_arm.nb"); + std::cout << "xiarj" << std::endl; +#else +// user_persona_infer.create_paddle_full_predictor( +// "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\honor_2_11\\cls_ernie_3.0_tiny_fc_ch_dy_15_3L128H_decrypt_inference_1"); +#endif + //user_persona_infer.prepare("D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\xia.txt"); + user_persona_infer.prepare("./xia.txt"); + user_persona_infer.infer(); + + return 0; } + diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt deleted file mode 100644 index 234ec1c85e3..00000000000 --- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt +++ /dev/null @@ 
-1,73 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(mobilenet_full_api) -set(TARGET mobilenet_full_api) - -# 1. path to Paddle-Lite lib and mklml lib -set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") -set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") - -if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(MSVC_STATIC_CRT ) - if(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - else(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") - endif(MSVC_STATIC_CRT) -endif() - -if (APPLE AND METAL) - message(STATUS "set METAL=ON") - add_definitions("-DMETAL") - find_library(METAL_LIBRARY Metal REQUIRED) - find_library(GRAPHIC CoreGraphics REQUIRED) - find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) -endif() - -# 2. link mklml and Paddle-Lite directory -link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) -include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) - -# 3. 
compile options -if (NOT WIN32) - add_definitions(-std=c++11 -g -O3 -pthread) - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) -endif() - -# 4.add executable output -add_executable(${TARGET} ${TARGET}.cc) -if (WIN32) - set(WITH_STATIC_MKL ) - if(WITH_STATIC_MKL) - set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - - target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib) - target_link_libraries(${TARGET} shlwapi.lib) - target_link_libraries(${TARGET} ${MATH_LIB}) - - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - if(NOT WITH_STATIC_MKL) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - endif() -else() - if (APPLE AND METAL) - target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) - endif() - target_link_libraries(${TARGET} -lpaddle_full_api_shared) - target_link_libraries(${TARGET} -liomp5) - target_link_libraries(${TARGET} -ldl) -endif() diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt deleted file mode 100644 index 3a91bfafbd3..00000000000 --- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt +++ /dev/null @@ -1,73 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(mobilenet_light_api) -set(TARGET mobilenet_light_api) - -# 1. 
path to Paddle-Lite lib and mklml lib -set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") -set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") - -if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(MSVC_STATIC_CRT ) - if(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - else(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") - endif(MSVC_STATIC_CRT) -endif() - -if (APPLE AND METAL) - message(STATUS "set METAL=ON") - add_definitions("-DMETAL") - find_library(METAL_LIBRARY Metal REQUIRED) - find_library(GRAPHIC CoreGraphics REQUIRED) - find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) -endif() - -# 2. link mklml and Paddle-Lite directory -link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) -include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) - -# 3. 
compile options -if (NOT WIN32) - add_definitions(-std=c++11 -g -O3 -pthread) - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) -endif() - -# 4.add executable output -add_executable(${TARGET} ${TARGET}.cc) -if (WIN32) - set(WITH_STATIC_MKL ) - if(WITH_STATIC_MKL) - set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - - target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib) - target_link_libraries(${TARGET} shlwapi.lib) - target_link_libraries(${TARGET} ${MATH_LIB}) - - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - if(NOT WITH_STATIC_MKL) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - endif() -else() - if (APPLE AND METAL) - target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) - endif() - target_link_libraries(${TARGET} -lpaddle_light_api_shared) - target_link_libraries(${TARGET} -liomp5) - target_link_libraries(${TARGET} -ldl) -endif() diff --git a/lite/kernels/host/index_select_compute.cc b/lite/kernels/host/index_select_compute.cc index b65342cd92d..f4ff2b1ad8c 100644 --- a/lite/kernels/host/index_select_compute.cc +++ b/lite/kernels/host/index_select_compute.cc @@ -72,6 +72,19 @@ REGISTER_LITE_KERNEL(index_select, #ifdef LITE_BUILD_EXTRA +REGISTER_LITE_KERNEL(index_select, + kHost, + kAny, + kNCHW, + paddle::lite::kernels::host::Index_selectCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindPaddleOpVersion("index_select", 1) + .Finalize(); + 
REGISTER_LITE_KERNEL(index_select, kHost, kAny, diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc index 97363f2bbe9..2e62c03f938 100644 --- a/lite/kernels/host/unique_compute.cc +++ b/lite/kernels/host/unique_compute.cc @@ -1,19 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include "lite/kernels/host/unique_compute.h" -#include "lite/backends/host/math/concat.h" +#include "lite/core/tensor.h" #include #include @@ -55,11 +41,11 @@ void UniqueFunc(const lite::Tensor* x, if (count != nullptr) { // Resize the count tensor dims to allocate the memory count->Resize({static_cast(uniq.size())}); - IndexT* count_data = count->template mutable(); + IndexT* count_data = count->template mutable_data(); // init count_data to 0 memset(count_data, 0, uniq.size() * sizeof(IndexT)); - if (IndexT == int32_t) { + if (typeid(IndexT).name() == typeid(int32_t).name()) { for (auto i = 0; i < x->numel(); ++i) { const IndexT& index = index_data[i]; count_data[static_cast(index)] += static_cast(1); @@ -108,7 +94,7 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in, if (return_inverse) { auto* inverse = index; - inverse->Resize{{out->numel()}}; + inverse->Resize({out->numel()}); auto inverse_data = inverse->mutable_data(); std::unordered_map inverse_map; for (int64_t i = 0; i < out->numel(); ++i) { @@ -136,11 +122,12 @@ void UniqueFlattendTensorFunc(const 
lite::Tensor& in, } } +template static std::vector Unbind(const lite::Tensor& in) { int64_t size = in.dims()[0]; std::vector tensors(size); for (int64_t i = 0; i < size; ++i) { - tensors[i] = in.Slice(i, i + 1); + tensors[i] = in.Slice(i, i + 1); } return tensors; } @@ -213,26 +200,26 @@ void TransCompute(const Tensor &input, const T *din = input.data(); T *dout = output->mutable_data(); - std::vector old_temp; + std::vector old_temps; int temp = 1; for (int i = 0; i < num_axes; ++i) { - old_temp.push_back(temp); + old_temps.push_back(temp); temp *= in_dims[num_axes - 1 - i]; } - std::vector old_step; + std::vector old_steps; for (int i = 0; i < num_axes; i++) { - old_step.push_back(old_temp[num_axes - 1 - i]); + old_steps.push_back(old_temps[num_axes - 1 - i]); } - std::vector new_temp; + std::vector new_temps; temp = 1; for (int i = 0; i < num_axes; ++i) { - new_temp.push_back(temp); + new_temps.push_back(temp); temp *= out_dims[num_axes - 1 - i]; } - std::vector new_step; + std::vector new_steps; for (int i = 0; i < num_axes; i++) { - new_step.push_back(new_temp[num_axes - 1 - i]); + new_steps.push_back(new_temps[num_axes - 1 - i]); } // std::vector old_steps( @@ -259,8 +246,43 @@ void TransCompute(const Tensor &input, } lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) { - return lite::DDim((src.Slice(0, num_col_dims)).production(), - (src.Slice(num_col_dims, src.size())).production()); + return DDim(std::vector{ + src.Slice(0, num_col_dims).production(), + src.Slice(num_col_dims, src.size()).production()}); +} + +template +void concat_func(const std::vector& input, + const int axis, + lite::Tensor* output) { + size_t num = input.size(); + auto dim_0 = input[0].dims(); + int64_t concat_input_size = 1; + int64_t num_cancats = 1; + for (int i = axis + 1; i < dim_0.size(); i++) { + concat_input_size *= dim_0[i]; + } + for (int i = 0; i < axis; i++) { + num_cancats *= dim_0[i]; + } + + auto* dst_ptr = output->mutable_data(); + const int 
out_concat_axis = output->dims()[axis]; + int64_t offset_concat_axis = 0; + int64_t out_sum = out_concat_axis * concat_input_size; + for (int n = 0; n < num; n++) { + auto dims = input[n].dims(); + auto* src_ptr = input[n].data(); + int64_t in_concat_axis = dims[axis]; + auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; + int64_t in_sum = in_concat_axis * concat_input_size; + for (int i = 0; i < num_cancats; i++) { + std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum); + dout_ptr += out_sum; + src_ptr += in_sum; + } + offset_concat_axis += in_concat_axis; + } } template @@ -278,11 +300,11 @@ void UniqueDimFunc(const lite::Tensor& in, std::iota(permute.begin(), permute.end(), 0); permute[axis] = 0; permute[0] = axis; - std::vector in_trans_dim_vec(in.dims()); + std::vector in_trans_dim_vec(in.dims().Vectorize()); in_trans_dim_vec[axis] = in.dims()[0]; in_trans_dim_vec[0] = in.dims()[axis]; lite::Tensor in_trans; - lite::DDim in_trans_dims = in_trans_dim_vec; + lite::DDim in_trans_dims = DDim(in_trans_dim_vec); in_trans.Resize(in_trans_dims); in_trans.mutable_data(); TransCompute(in, &in_trans, permute); @@ -314,7 +336,7 @@ void UniqueDimFunc(const lite::Tensor& in, lite::Tensor input_sorted; input_sorted.Resize(in_trans_dims); input_sorted.mutable_data(); - InT* input_sorted_data = input_sorted.data(); + InT* input_sorted_data = const_cast(input_sorted.data()); for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { memcpy(input_sorted_data + i * col, in_trans_data + static_cast(sorted_indices_vec[i]) * col, @@ -322,11 +344,11 @@ void UniqueDimFunc(const lite::Tensor& in, } - std::vector input_unbind = Unbind(input_sorted); + std::vector input_unbind = Unbind(input_sorted); std::vector inverse_vec(sorted_indices_vec.size(), 0); std::vector counts_vec(sorted_indices_vec.size(), 0); std::vector indices_vec(sorted_indices_vec.size(), 0); - auto last = UniqueDimImpl::iterator, InT, int32_t>( + auto last = UniqueDimImpl::iterator, InT, IndexT>( 
input_unbind.begin(), input_unbind.end(), sorted_indices_vec, @@ -345,7 +367,7 @@ void UniqueDimFunc(const lite::Tensor& in, std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); out->Resize(out_trans_dims_vec); out->mutable_data(); - lite::host::math::concat_func(input_unbind, 0, &out_trans); + concat_func(input_unbind, 0, &out_trans); TransCompute(out_trans, out, permute); if (return_inverse) { @@ -375,8 +397,7 @@ void UniqueCompute::Run() { auto axis_vec = param.axis; auto is_sorted = param.is_sorted; - // lite_api::PrecisionType index_type = index->precision(); - lite_api::PrecisionType index_type = dtype; + lite_api::PrecisionType index_type = index->precision(); bool index_type_match = index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64); lite_api::PrecisionType type = x->precision(); CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds " @@ -420,21 +441,36 @@ void UniqueCompute::Run() { return; } - if (x->numel() = 0) { - out->template mutable_data(); + if (x->numel() == 0) { + switch (type) { + case PRECISION(kFloat): + output->template mutable_data(); + break; + case PRECISION(kInt32): + output->template mutable_data(); + break; + case PRECISION(kInt64): + output->template mutable_data(); + break; + default: + LOG(FATAL) << "unique does not implement for the " + << "input type:" << static_cast(type); + break; + } + return; } if (axis_vec.empty()) { if (index_type == PRECISION(kInt32)) { switch (type) { case PRECISION(kFloat): - UniqueFlattendTensorFunc(x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); break; case PRECISION(kInt32): - UniqueFlattendTensorFunc(x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); break; case PRECISION(kInt64): - 
UniqueFlattendTensorFunc(x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -444,13 +480,13 @@ void UniqueCompute::Run() { } else { switch (type) { case PRECISION(kFloat): - UniqueFlattendTensorFunc(x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); break; case PRECISION(kInt32): - UniqueFlattendTensorFunc(x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); break; case PRECISION(kInt64): - UniqueFlattendTensorFunc(x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -463,13 +499,13 @@ void UniqueCompute::Run() { if (index_type == PRECISION(kInt32)) { switch (type) { case PRECISION(kFloat): - UniqueDimFunc(x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); break; case PRECISION(kInt32): - UniqueDimFunc(x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); break; case PRECISION(kInt64): - UniqueDimFunc(x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); break; default: LOG(FATAL) << "unique 
does not implement for the " @@ -478,13 +514,13 @@ void UniqueCompute::Run() { } else { switch (type) { case PRECISION(kFloat): - UniqueDimFunc(x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); break; case PRECISION(kInt32): - UniqueDimFunc(x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); break; case PRECISION(kInt64): - UniqueDimFunc(x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -523,8 +559,8 @@ REGISTER_LITE_KERNEL(unique, {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny))}) - .BindOutput("Count", + .BindOutput("Counts", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny))}) - .Finalize(); \ No newline at end of file + .Finalize(); From 41db8dd53dd656f07dee491cd40ead2410561a8a Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Tue, 13 Dec 2022 15:35:07 +0800 Subject: [PATCH 03/10] update unique_op on 12.13 --- lite/core/program.cc | 243 ++++++++++++++++-- .../cxx/mobile_full/mobilenetv1_full_api.cc | 164 ++++++++++-- .../x86_mobilenetv1_full_demo/CMakeLists.txt | 73 ++++++ .../x86_mobilenetv1_light_demo/CMakeLists.txt | 73 ++++++ lite/kernels/host/unique_compute.cc | 78 +++--- lite/operators/unique_op.cc | 8 +- .../unittest_py/__main___cache_dir/model | Bin 0 -> 2164 bytes .../__main___cache_dir/opt_model/model | Bin 0 -> 609 bytes .../__main___cache_dir/opt_model/params | 0 .../unittest_py/__main___cache_dir/params | Bin 0 -> 28 bytes lite/tests/unittest_py/op/statics_data | Bin 0 -> 1185 bytes 
lite/tests/unittest_py/op/test_unique_op.py | 105 ++++++++ 12 files changed, 653 insertions(+), 91 deletions(-) create mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt create mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt create mode 100644 lite/tests/unittest_py/__main___cache_dir/model create mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/model create mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/params create mode 100644 lite/tests/unittest_py/__main___cache_dir/params create mode 100644 lite/tests/unittest_py/op/statics_data create mode 100644 lite/tests/unittest_py/op/test_unique_op.py diff --git a/lite/core/program.cc b/lite/core/program.cc index 8f0c0a5043a..069da1e78eb 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -605,27 +605,9 @@ void RuntimeProgram::Run() { #ifdef LITE_WITH_OPENCL // delegate flush judgement to specify target , it is too heavy for Inst inst.Flush(idx); -#if defined(LITE_WITH_PROFILE) || defined(LITE_WITH_PRECISION_PROFILE) - VLOG(4) << "kernel name " << idx << " " << inst.kernel()->name(); - const auto* op_info = inst.op()->op_info(); - auto var_in_names = op_info->input_names(); - for (int i = 0; i < var_in_names.size(); i++) { - VLOG(4) << "input var_in_names: " << var_in_names[i]; - } - auto var_out_names = op_info->output_names(); - for (int i = 0; i < var_out_names.size(); i++) { - VLOG(4) << "output var_out_names: " << var_out_names[i]; - } -#endif #endif inst.Run(); -#ifdef LITE_WITH_PRECISION_PROFILE - if (inst.op()->Type() != "while") { - precision_profiler_summary += - inst_precision_profiler.GetInstPrecision(&inst); - } -#endif // LITE_WITH_PRECISION_PROFILE } #ifdef LITE_WITH_METAL @@ -815,6 +797,231 @@ void Instruction::Run() { kernel_->Launch(); has_run_ = true; + + + + + + + + + + + + + + +#if 1 + // clang-format off + /* + time_t t; + struct tm* timeinfo; + time(&t); + timeinfo = localtime(&t); + std::cout << "time: " << 
asctime(timeinfo) << std::endl; + */ + std::cout << "***-----------------------------******-----------------------------***" << std::endl; + // get precision + std::string op_name = op_->op_info()->Type(); + std::cout << "op_type: " << op_name << std::endl; + if ((op_->op_info()->Type() != "fetch") && + (op_->op_info()->Type() != "while") && + (op_->op_info()->Type() != "conditional_block")) { + auto op_scope = op_->scope(); + auto out_names = op_->op_info()->output_names(); + auto in_names = op_->op_info()->input_names(); + for (auto& out_name : in_names) { + std::string out_arg_name; + op_->op_info()->GetInputArgname(out_name, &out_arg_name); + //auto type = kernel_->GetInputDeclType(out_arg_name); + // if (type->IsTensor()) { + auto tmp = op_scope->FindVar(out_name); + if (tmp->IsType()) { + const Tensor* tout = op_scope->FindVar(out_name)->GetMutable(); + if (tout->IsInitialized()) { + auto size = tout->numel(); + auto dim = tout->dims(); + double sum = 0.0; + if (tout->precision() == PrecisionType::kFloat) { + const float* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += dout[i]; + } + } else if (tout->precision() == PrecisionType::kFP16) { + const float16_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else if (tout->precision() == PrecisionType::kInt32) { + const int32_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else if (tout->precision() == PrecisionType::kInt64) { + const int64_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else if (tout->precision() == PrecisionType::kInt8) { + const int8_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else { + std::cout << "This data_type is not support: " + << PrecisionToStr(tout->precision()) << std::endl; + } + double avg = sum / static_cast(size); + std::cout << "in_name: " << out_name + << ", type: 
" << PrecisionToStr(tout->precision()) + << ", size: " << size << ", input avg: " << avg; + + std::cout<<", dim size:"<< dim.size() << "["; + for(int i = 0; i < dim.size(); i++) + std::cout << dim[i] << ","; + std::cout<<"]\n"; + } else { + std::cout << out_name << " is not inited." << std::endl; + } + } else if (tmp->IsType>()) { + auto touts = + op_scope->FindVar(out_name)->GetMutable>(); + for (auto t : *touts) { + const Tensor* tout = &t; + if (tout->IsInitialized()) { + auto size = tout->numel(); + const float* dout = tout->data(); + double sum = 0.0; + for (int i = 0; i < size; i++) { + sum += dout[i]; + } + double avg = sum / static_cast(size); + std::cout << "op_type: " << op_name << ", input avg: " << avg + << std::endl; + } else { + std::cout << out_name << " is not inited." << std::endl; + } + } + } + } + for (auto& out_name : out_names) { + std::string out_arg_name; + op_->op_info()->GetOutputArgname(out_name, &out_arg_name); + //auto type = kernel_->GetOutputDeclType(out_arg_name); + std::string op_name = op_->op_info()->Type(); + //if (type->IsTensor()) { + auto tmp = op_scope->FindVar(out_name); + if (tmp->IsType()) { + const Tensor* tout = op_scope->FindVar(out_name)->GetMutable(); + if (tout->IsInitialized()) { + auto size = tout->numel(); + auto dim = tout->dims(); + double sum = 0.0; + if (tout->precision() == PrecisionType::kFloat) { + const float* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += dout[i]; + } + } else if (tout->precision() == PrecisionType::kFP16) { + const float16_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else if (tout->precision() == PrecisionType::kInt32) { + const int32_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else if (tout->precision() == PrecisionType::kInt64) { + const int64_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else if 
(tout->precision() == PrecisionType::kInt8) { + const int8_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + } + } else { + std::cout << "This data_type is not support: " + << PrecisionToStr(tout->precision()) << std::endl; + } + double avg = sum / static_cast(size); + std::cout << "out_name: " << out_name + << ", type: " << PrecisionToStr(tout->precision()) + << ", sum: " << sum << ", output avg: " << avg; + std::cout<<", dim size:"<< dim.size() << "["; + for(int i = 0; i < dim.size(); i++) + std::cout << dim[i] << ","; + std::cout<<"]\n"; + } else { + std::cout << out_name << " is not inited." << std::endl; + } + } else if (tmp->IsType>()) { + auto touts = + op_scope->FindVar(out_name)->GetMutable>(); + for (auto t : *touts) { + const Tensor* tout = &t; + if (tout->IsInitialized()) { + auto size = tout->numel(); + double sum = 0.0; + if (tout->precision() == PrecisionType::kFloat) { + const float* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += dout[i]; + std::cout << dout[i] << ", "; + } + } else if (tout->precision() == PrecisionType::kFP16) { + const float16_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + std::cout << dout[i] << ", "; + } + } else if (tout->precision() == PrecisionType::kInt32) { + const int32_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + std::cout << dout[i] << ", "; + } + } else if (tout->precision() == PrecisionType::kInt64) { + const int64_t* dout = tout->data(); + for (int i = 0; i < size; i++) { + sum += static_cast(dout[i]); + std::cout << dout[i] << ", "; + } + } else { + std::cout << "This data_type is not support: " + << PrecisionToStr(tout->precision()) << std::endl; + } + double avg = sum / static_cast(size); + std::cout << std::endl; + std::cout << "op_type: " << op_name << out_name + << ", type: " << PrecisionToStr(tout->precision()) + << ", output avg: " << avg << std::endl; + } 
else { + std::cout << out_name << " is not inited." << std::endl; + } + } + } + } + std::cout << "***-----------------------------******-----------------------------***" << std::endl; + } +#endif + + + + + + + + + + + + + + + #ifdef LITE_WITH_PROFILE if (first_epoch_for_profiler_) { kernel_->SetIsKernelTest(false); diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 3db0f2c9c93..1759a484175 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -11,13 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include #include #include #include "paddle_api.h" // NOLINT #include "paddle_use_passes.h" // NOLINT - ///////////////////////////////////////////////////////////////////////// // If this demo is linked to static library:libpaddle_api_full_bundled.a // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to @@ -27,9 +25,7 @@ #include "paddle_use_kernels.h" // NOLINT #include "paddle_use_ops.h" // NOLINT #endif - using namespace paddle::lite_api; // NOLINT - DEFINE_string(model_dir, "", "Model dir path. Set it when the model is uncombined format."); @@ -54,13 +50,11 @@ DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(warmup, 10, "warmup times"); DEFINE_int32(repeats, 100, "repeats times"); DEFINE_bool(use_gpu, false, "use opencl backend"); - int64_t ShapeProduction(const shape_t& shape) { int64_t res = 1; for (auto i : shape) res *= i; return res; } - void RunModel() { // 1. 
Set CxxConfig CxxConfig config; @@ -72,7 +66,6 @@ void RunModel() { } config.set_power_mode((paddle::lite_api::PowerMode)FLAGS_power_mode); config.set_threads(FLAGS_threads); - std::vector valid_places; if (FLAGS_use_gpu) { valid_places.emplace_back( @@ -93,51 +86,167 @@ void RunModel() { } else { valid_places.emplace_back(Place{TARGET(kARM), PRECISION(kFloat)}); } - if (FLAGS_prefer_int8_kernel) { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); } config.set_valid_places(valid_places); - // 2. Create PaddlePredictor by CxxConfig std::shared_ptr predictor = CreatePaddlePredictor(config); - // 3. Save the optimized model // WARN: The `predictor->SaveOptimizedModel` method must be executed // before the `predictor->Run` method. Because some kernels' `PrepareForRun` // method maybe change some parameters' values. predictor->SaveOptimizedModel(FLAGS_optimized_model_dir, LiteModelType::kNaiveBuffer); - // 4. Prepare input data - std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize(shape_t({1, 3, 224, 224})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; + const lod_t lodd = {{0,1},{0,1}}; + { + // src_ids + int64_t pre_data[100] = {41, 2, 69, 2, 68, 2, 78, 2, 83, 2, 22, 29, 21, 28, + 27, 18, 8, 2, 788, 342, 6431, 17, 2, 788, 96, 6431, 6622}; + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize(shape_t({1,27,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = pre_data[i]; + } } - - // 5. 
Run predictor - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor->Run(); + { + // pos_ids + int64_t pre_data[100] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; + std::unique_ptr input_tensor(std::move(predictor->GetInput(1))); + input_tensor->Resize(shape_t({1,27,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = pre_data[i]; + } + } + { + // input_mask + std::unique_ptr input_tensor(std::move(predictor->GetInput(2))); + input_tensor->Resize(shape_t({1,27,27})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + } + { + // pos_ids_extra + std::unique_ptr input_tensor(std::move(predictor->GetInput(3))); + input_tensor->Resize(shape_t({1,27,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 0; + } + } + { + // tgt_ids + std::unique_ptr input_tensor(std::move(predictor->GetInput(4))); + input_tensor->Resize(shape_t({1,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 6621; + } + input_tensor->SetLoD(lodd); + } + { + // tgt_pos + std::unique_ptr input_tensor(std::move(predictor->GetInput(5))); + input_tensor->Resize(shape_t({1,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 26; + } + input_tensor->SetLoD(lodd); + } + { + // init_score + std::unique_ptr input_tensor(std::move(predictor->GetInput(6))); + input_tensor->Resize(shape_t({1,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 0; + } + input_tensor->SetLoD(lodd); + } + { + // parent_idx + std::unique_ptr input_tensor(std::move(predictor->GetInput(7))); + 
input_tensor->Resize(shape_t({1,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 0; + } } - - for (int j = 0; j < FLAGS_repeats; ++j) { + { + // tgt_generation_mask + std::unique_ptr input_tensor(std::move(predictor->GetInput(8))); + input_tensor->Resize(shape_t({1,1,27})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + } + { + // max_dec_len + std::unique_ptr input_tensor(std::move(predictor->GetInput(9))); + input_tensor->Resize(shape_t({1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 10; + } + } + { + // tgt_pos_extra + std::unique_ptr input_tensor(std::move(predictor->GetInput(10))); + input_tensor->Resize(shape_t({1,1})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + input_tensor->SetLoD(lodd); + } + { + // cand_ids + int64_t cand[500]={41, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83, 6623, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::unique_ptr input_tensor(std::move(predictor->GetInput(11))); + input_tensor->Resize(shape_t({5,32})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = cand[i]; + } + } + + // 5. Run predictor + for (int j = 0; j < 1; ++j) { predictor->Run(); } - // 6. 
Get output + double sum = 0; std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; + std::cout << "Output0 shape " << output_tensor->shape()[0] <<","<< output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) { - std::cout << "Output[" << i << "]: " << output_tensor->data()[i] - << std::endl; + sum += output_tensor->data()[i] * 1.f; + } + std::cout << "output0 mean is "<shape())<<"\n"; + sum = 0; + std::unique_ptr output_tensor1( + std::move(predictor->GetOutput(0))); + std::cout << "Output1 shape " << output_tensor1->shape()[0] <<","<< output_tensor1->shape()[1] << std::endl; + for (int i = 0; i < ShapeProduction(output_tensor1->shape()); i++) { + sum += output_tensor1->data()[i] * 1.f; } + std::cout << "output1 mean is "<shape())<<"\n"; } - int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir.empty() && @@ -162,7 +271,6 @@ int main(int argc, char** argv) { << " --use_gpu=false bool Use gpu or not.\n"; exit(1); } - RunModel(); return 0; } diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt new file mode 100644 index 00000000000..234ec1c85e3 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +project(mobilenet_full_api) +set(TARGET mobilenet_full_api) + +# 1. 
path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(MSVC_STATIC_CRT ) + if(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + else(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") + endif(MSVC_STATIC_CRT) +endif() + +if (APPLE AND METAL) + message(STATUS "set METAL=ON") + add_definitions("-DMETAL") + find_library(METAL_LIBRARY Metal REQUIRED) + find_library(GRAPHIC CoreGraphics REQUIRED) + find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) +endif() + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. 
compile options +if (NOT WIN32) + add_definitions(-std=c++11 -g -O3 -pthread) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) +endif() + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +if (WIN32) + set(WITH_STATIC_MKL ) + if(WITH_STATIC_MKL) + set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + + target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib) + target_link_libraries(${TARGET} shlwapi.lib) + target_link_libraries(${TARGET} ${MATH_LIB}) + + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + if(NOT WITH_STATIC_MKL) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + endif() +else() + if (APPLE AND METAL) + target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) + endif() + target_link_libraries(${TARGET} -lpaddle_full_api_shared) + target_link_libraries(${TARGET} -liomp5) + target_link_libraries(${TARGET} -ldl) +endif() diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt new file mode 100644 index 00000000000..3a91bfafbd3 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +project(mobilenet_light_api) +set(TARGET mobilenet_light_api) + +# 1. 
path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(MSVC_STATIC_CRT ) + if(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + else(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") + endif(MSVC_STATIC_CRT) +endif() + +if (APPLE AND METAL) + message(STATUS "set METAL=ON") + add_definitions("-DMETAL") + find_library(METAL_LIBRARY Metal REQUIRED) + find_library(GRAPHIC CoreGraphics REQUIRED) + find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) +endif() + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. 
compile options +if (NOT WIN32) + add_definitions(-std=c++11 -g -O3 -pthread) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) +endif() + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +if (WIN32) + set(WITH_STATIC_MKL ) + if(WITH_STATIC_MKL) + set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + + target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib) + target_link_libraries(${TARGET} shlwapi.lib) + target_link_libraries(${TARGET} ${MATH_LIB}) + + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + if(NOT WITH_STATIC_MKL) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release + ) + endif() +else() + if (APPLE AND METAL) + target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) + endif() + target_link_libraries(${TARGET} -lpaddle_light_api_shared) + target_link_libraries(${TARGET} -liomp5) + target_link_libraries(${TARGET} -ldl) +endif() diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc index 2e62c03f938..4c96e7f8c11 100644 --- a/lite/kernels/host/unique_compute.cc +++ b/lite/kernels/host/unique_compute.cc @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "lite/kernels/host/unique_compute.h" #include "lite/core/tensor.h" @@ -8,6 +22,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -18,7 +33,7 @@ template void UniqueFunc(const lite::Tensor* x, lite::Tensor* out, lite::Tensor* index, - lite::Tensor* count) { + lite::Tensor* count = nullptr) { const InT* in_data = x->template data(); IndexT* index_data = index->mutable_data(); @@ -31,6 +46,7 @@ void UniqueFunc(const lite::Tensor* x, auto it = dict.find(in_data[i]); if (it == dict.end()) { dict.emplace(std::make_pair(in_data[i], j)); + uniq.emplace_back(in_data[i]); index_data[i] = static_cast(j); j++; } else { @@ -44,20 +60,11 @@ void UniqueFunc(const lite::Tensor* x, IndexT* count_data = count->template mutable_data(); // init count_data to 0 memset(count_data, 0, uniq.size() * sizeof(IndexT)); - - if (typeid(IndexT).name() == typeid(int32_t).name()) { - for (auto i = 0; i < x->numel(); ++i) { - const IndexT& index = index_data[i]; - count_data[static_cast(index)] += static_cast(1); - } - } else { - for (auto i = 0; i < x->numel(); ++i) { - const IndexT& index = index_data[i]; - count_data[static_cast(index)] += static_cast(1); - } + for (auto i = 0; i < x->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[index] += static_cast(1); } } - out->Resize({static_cast(uniq.size())}); auto out_data = out->mutable_data(); std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); @@ -222,17 +229,6 @@ void TransCompute(const Tensor &input, new_steps.push_back(new_temps[num_axes - 1 - i]); } - // 
std::vector old_steps( - // {static_cast(in_dims[1] * in_dims[2] * in_dims[3]), - // static_cast(in_dims[2] * in_dims[3]), - // static_cast(in_dims[3]), - // 1}); - // std::vector new_steps( - // {static_cast(out_dims[1] * out_dims[2] * out_dims[3]), - // static_cast(out_dims[2] * out_dims[3]), - // static_cast(out_dims[3]), - // 1}); - for (int i = 0; i < count; ++i) { int old_idx = 0; int idx = i; @@ -422,24 +418,24 @@ void UniqueCompute::Run() { break; } } else { - switch (type) { - case PRECISION(kFloat): - UniqueFunc(x, output, index, count); - break; - case PRECISION(kInt32): - UniqueFunc(x, output, index, count); - break; - case PRECISION(kInt64): - UniqueFunc(x, output, index, count); - break; - default: - LOG(FATAL) << "unique does not implement for the " - << "input type:" << static_cast(type); - break; + switch (type) { + case PRECISION(kFloat): + UniqueFunc(x, output, index, count); + break; + case PRECISION(kInt32): + UniqueFunc(x, output, index, count); + break; + case PRECISION(kInt64): + UniqueFunc(x, output, index, count); + break; + default: + LOG(FATAL) << "unique does not implement for the " + << "input type:" << static_cast(type); + break; } - } - return; - } + } + return; + } if (x->numel() == 0) { switch (type) { diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc index 3cab23bbc81..adab6096cd8 100644 --- a/lite/operators/unique_op.cc +++ b/lite/operators/unique_op.cc @@ -35,10 +35,10 @@ bool UniqueOp::CheckShape() const { bool UniqueOp::InferShapeImpl() const { DDim in_dims = param_.X->dims(); - param_.Out->Resize(in_dims); - param_.Index->Resize(in_dims); - param_.Indices->Resize(in_dims); - param_.Counts->Resize(in_dims); + if (param_.Out) param_.Out->Resize(in_dims); + if (param_.Index) param_.Index->Resize(in_dims); + if (param_.Indices) param_.Indices->Resize(in_dims); + if (param_.Counts) param_.Counts->Resize(in_dims); return true; } diff --git a/lite/tests/unittest_py/__main___cache_dir/model 
b/lite/tests/unittest_py/__main___cache_dir/model new file mode 100644 index 0000000000000000000000000000000000000000..ad9c9d92f78368ee4c4713242cba7c0dad4929b5 GIT binary patch literal 2164 zcmZuyO>P@E6!u86EkDW0SV>w(DGEbrfx?>t&H;La7HAjU6a+QHkqB|fG0Bmota^-Y zyUTHUmR=zJNNOxassY2EH}CKJ-uK?nUxVRU{<-_frehjZLX?x<@ai_3UekFce6hqM z-DT5vG%~GkTF*-E`D8r2oDS(?*v~2NPv6t4M)5+hrBS7@>`BPQ(oYCZ7c>~2=NS@C ziQJMmA|>eF_42`8Hv5sTDEobq>fccR=N8d>?`eE7yP|$!RDO1QHv810w>0`SjGRT+!^i>A#lemR@vG&T(@xKi*g zR{42wfu$5{S%|!Me>R&$7tL$o3WGes_JS_qWQ_`G|GY!cX!p$4+#<_cqD0iLbVvT@ zYzDp~;C!^9O>D_|i!jx8xzE!>_Wd9B-=Ai0=;f32OZM0T4&&b2UFXy~!RnPNwT_kt z$^*BDWK711pfurW`g2I{VEFf^kN5wK>1(N!nuPGSZeOA+%R3c=<|YxTcZq{n@xOvM{gEF3yVywv8FMO zE63Ik@#VL4TS|~5=Ph1Wjo!Q9eUHG9P@)#vKS@x+%OLHHru6#ITq$F0DcAAcQAjOQ z>x9ivvWYGcnk<>)b)$rhwtM=n;w#7pShY|r)++-Y+RKla2H!9~4-k6&oo-lK$P^e#$gEO zk!fS==<%9fSgu!LKYMdO!ICuzD2JwO%4CAz@b}GQC;5aH#_>)!02oYbLKXlQfpscobZR!**kR@&-&fs(VAiglrkb@Bb!krC~ zV4`SoUl_!bqK_wJl3YP&ZNdibioh-6tZ*0<-{OnXu@qQ1;@}6Goo-d9&`#qq{wvD= zhs7T4>>qHszF-$vyYsZaSpc%Ac><5reLsM^~5^ymd};&48gaiEr#4uPUhpwo#@&pyLLMm>5MPVZ|ENro#%v zPK0{+wW3JZ4mg_qtbs%EI#tF0yyT%&y!2ruxRQ&Y-abCnecl$aJM8<(@k(dw2u@?3 GLco9K>qUeB literal 0 HcmV?d00001 diff --git a/lite/tests/unittest_py/__main___cache_dir/opt_model/model b/lite/tests/unittest_py/__main___cache_dir/opt_model/model new file mode 100644 index 0000000000000000000000000000000000000000..99d11af804fa2902ea6a97be38c42308ea14546c GIT binary patch literal 609 zcmaJ%P7!*+%`Apph9&o~Qx;bX8nqz#sBAZw_R zGjP*VWTjAGSAsUQZ4&`9CESBn#6>WZOueB-pTPnB2`lm=Fz-r%8{DJ@=on5In=W{s z@R*5rIoKB>pE8Dwalg{#g*@^m-i>!f9WaA5Mw%r5G++eoL#6bQCqO*RZa+3X!v=AD5~X0h9%2uZuWLlA65_Ed}f6FZu5+J%+(* zE@telly>US>aq+_m3q@U?y5M{c8I#@<$(YO^&WS|_mY<`tFN*3)VXN5vh1(#k3SIb Bv*P2@mD5DGN0 z7)=Syj1pP#eJf>Ugq<~0cD#3cUvmWwAGMrw6huQ2_1|=**7LnL`QX9 zCq`#NNn9+WDs)lBu8eMk?nW~?xg=xz<(256>OC2~2)&J_vR-8i%R?Vk?#t*$=&vtc zT7vzfTC>0zikGmJ5uFv4gp`$;QZ zn|0-KX4;D|QngDNqX?sowzBQlyD8U_QTbtz%?gZB17jKE2;<}XU0>LqT-}zFet-$8 
zKanwsFgc2hmhy@)N+#_vjFX;{!_=rT>5f(dfW|fR8LbW4PT9R<$NR~2|5y*)HSk72MsBz+DS29)+ z3@6@fHDe87t=vpKq977x%B`!5S6i3E`uu7e7$#w(2Fll2OGs*9ieVG#G;kBcA-Ed2 znX!f7X<(Y+69NUyMh9&79q##6#GjeX8|L3pXLuNbchZxr@z9{Y~*p723qKQcZMK5Oh3##h2Ojs4E} KLHHR(QRy$IOqiSi literal 0 HcmV?d00001 diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py new file mode 100644 index 00000000000..24fb6b63a3f --- /dev/null +++ b/lite/tests/unittest_py/op/test_unique_op.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +sys.path.append('../') + +from auto_scan_test import AutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig, CxxConfig, TargetType, PrecisionType, DataLayoutType, Place + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st +from functools import partial +import random +import numpy as np + + +class TestUniqueWithCountsOp(AutoScanTest): + def __init__(self, *args, **kwargs): + AutoScanTest.__init__(self, *args, **kwargs) + host_places = [ + Place(TargetType.Host, PrecisionType.FP32, DataLayoutType.NCHW) + ] + self.enable_testing_on_place(places=host_places, thread=[1,4]) + + def is_program_valid(self, + program_config: ProgramConfig, + predictor_config: CxxConfig) -> bool: + return True + + def sample_program_configs(self, draw): + in_shape = draw( + st.lists( + st.integers( + min_value=2, max_value=100), + min_size=1, + max_size=3)) + in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64])) + + def generate_X_data(): + return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype) + + def generate_IndexTensor(): + return np.random.randint(1, 5, size=in_shape).astype(np.int32) + + axis = draw(st.sampled_from([[0, 1, 2], [1], [0, 2], [2, 1], [0, 1]])) + + unique_op = OpConfig( + type = "unique", + input = {"X": ["input_data"]}, + outputs = { + "Out": ["Out_data"], + "Index": ["Index_data"], + "Indices": ["Indices_data"], + "Counts": ["Counts_data"] + }, + attrs={ + "dtype": 2, + "return_index": False, + "return_inverse": False, + "return_counts": False, + "axis": axis, + "is_sorted": False + } + ) + + unique_op.outputs_dtype = {"Out_data": in_dtype} + unique_op.outputs_dtype = {"Index_data": np.int32} + unique_op.outputs_dtype = {"Counts_data":np.int32} + + program_config = ProgramConfig( + ops=[unique_op], + weights={ + "Index_data": + TensorConfig(data_gen=partial(generate_IndexTensor)) + }, + inputs={ + "input_data": 
TensorConfig(data_gen=partial(generate_X_data)) + }, + outputs=["Out_data", "Index_data", "Counts_data"] + ) + return program_config + + def sample_predictor_configs(self): + return self.get_predictor_configs(), [""], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + pass + + def test(self, *args, **kwargs): + self.run_and_statis(quant=False, max_examples=25) + +if __name__ == "__main__": + unittest.main(argv=['']) From 794fb857f296bbbbae530a3927656a41b2a4dc09 Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Tue, 13 Dec 2022 15:54:39 +0800 Subject: [PATCH 04/10] update mobilenetv1_full_api for unique_op --- lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 1759a484175..c1fe58927b5 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -240,7 +240,7 @@ void RunModel() { std::cout << "output0 mean is "<shape())<<"\n"; sum = 0; std::unique_ptr output_tensor1( - std::move(predictor->GetOutput(0))); + std::move(predictor->GetOutput(1))); std::cout << "Output1 shape " << output_tensor1->shape()[0] <<","<< output_tensor1->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor1->shape()); i++) { sum += output_tensor1->data()[i] * 1.f; From 0d81d6b6655cda0c4358bf2f28030bb592a91b2a Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Wed, 14 Dec 2022 12:25:14 +0800 Subject: [PATCH 05/10] update test_unique_op for unique_op --- lite/backends/arm/math/dotprod/gemm_sdot.h | 442 ------------- lite/backends/arm/math/dotprod/gemm_vsdot.h | 54 -- lite/core/program.cc | 2 +- .../cxx/mobile_full/mobilenetv1_full_api.cc | 164 +---- .../cxx/mobile_light/mobilenetv1_light_api.cc | 620 +++++++++--------- .../x86_mobilenetv1_full_demo/CMakeLists.txt | 73 --- 
.../x86_mobilenetv1_light_demo/CMakeLists.txt | 73 --- .../unittest_py/__main___cache_dir/model | Bin 2164 -> 0 bytes .../__main___cache_dir/opt_model/model | Bin 609 -> 0 bytes .../__main___cache_dir/opt_model/params | 0 .../unittest_py/__main___cache_dir/params | Bin 28 -> 0 bytes lite/tests/unittest_py/op/statics_data | Bin 1185 -> 0 bytes lite/tests/unittest_py/op/test_unique_op.py | 5 +- 13 files changed, 356 insertions(+), 1077 deletions(-) delete mode 100644 lite/backends/arm/math/dotprod/gemm_sdot.h delete mode 100644 lite/backends/arm/math/dotprod/gemm_vsdot.h delete mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt delete mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt delete mode 100644 lite/tests/unittest_py/__main___cache_dir/model delete mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/model delete mode 100644 lite/tests/unittest_py/__main___cache_dir/opt_model/params delete mode 100644 lite/tests/unittest_py/__main___cache_dir/params delete mode 100644 lite/tests/unittest_py/op/statics_data diff --git a/lite/backends/arm/math/dotprod/gemm_sdot.h b/lite/backends/arm/math/dotprod/gemm_sdot.h deleted file mode 100644 index 1eea169b15f..00000000000 --- a/lite/backends/arm/math/dotprod/gemm_sdot.h +++ /dev/null @@ -1,442 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -// clang-format off -#define GEMM_SDOT_INT8_KERNEL \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ - "eor v8.16b, v8.16b, v8.16b\n" /* out0 = 0 */ \ - "eor v9.16b, v9.16b, v9.16b\n" /* out1 = 0 */ \ - "eor v10.16b, v10.16b, v10.16b\n" /* out2 = 0 */ \ - "eor v11.16b, v11.16b, v11.16b\n" /* out3 = 0 */ \ - "eor v12.16b, v12.16b, v12.16b\n" /* out4 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ - "eor v13.16b, v13.16b, v13.16b\n" /* out5 = 0 */ \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "eor v14.16b, v14.16b, v14.16b\n" /* out6 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ - "eor v15.16b, v15.16b, v15.16b\n" /* out7 = 0 */ \ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ \ - "eor v16.16b, v16.16b, v16.16b\n" /* out8 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ \ - "eor v17.16b, v17.16b, v17.16b\n" /* out9 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ \ - "eor v18.16b, v18.16b, v18.16b\n" /* out10 = 0 */ \ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ \ - "eor v19.16b, v19.16b, v19.16b\n" /* out11 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ \ - "eor v20.16b, v20.16b, v20.16b\n" /* out12 = 0 */ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ \ - "eor v21.16b, v21.16b, v21.16b\n" /* out13 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ \ - "eor v22.16b, v22.16b, v22.16b\n" /* out14 = 0 */ \ - "eor v23.16b, v23.16b, v23.16b\n" /* out15 = 0 */ \ - "eor v24.16b, v24.16b, v24.16b\n" /* out16 = 0 */ \ - "eor v25.16b, v25.16b, v25.16b\n" /* out17 = 0 */ \ - "eor v26.16b, v26.16b, v26.16b\n" /* out18 = 0 */ \ - "eor v27.16b, v27.16b, v27.16b\n" /* out19 = 0 */ \ - "eor v28.16b, v28.16b, v28.16b\n" /* out20 = 0 */ \ - "eor v29.16b, v29.16b, v29.16b\n" /* out21 = 0 */ \ - "eor v30.16b, v30.16b, v30.16b\n" /* out22 = 0 */ \ - "eor v31.16b, 
v31.16b, v31.16b\n" /* out23 = 0 */ \ - "cbz %w[k], 2f\n" /* check loop count > 0 */ \ - /* main loop, unrool 0*/ \ - "1:\n" /* main loop */ \ -".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ \ -".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ \ -".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ -".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ -".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ -".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ -".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ -".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ -".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ -".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ -".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ \ -".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\ -".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ -".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\ -".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\ -".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\ -".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\ -".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\ -".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ \ - /* unrool 1 */ \ -".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\ -".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\ 
-".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\ - "prfm pldl1keep, [%[a_ptr], #256]\n" \ -".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\ -".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\ -".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\ -".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\ -".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ \ -".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\ -".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\ -".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\ -".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\ -".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\ -".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\ -".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\ -".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\ -".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\ -".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\ -".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\ -".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\ -".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\ -".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\ -".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\ -".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ \ - /* unrool 2*/ \ -".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\ -".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ -".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\ -".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\ -".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\ -".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\ -".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\ -".word 0x4fa1e8dd\n" /* sdot v29.4s, 
v6.16b, v1.4b[3] */\ -".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\ -".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ -".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\ -".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\ -".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\ -".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\ -".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\ -".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ -".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\ -".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\ -".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ \ - /* unrool 3*/ \ -".word 0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\ -".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\ -".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\ -".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\ -".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\ -".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\ -".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\ -".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ -".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\ -".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\ - "prfm pldl1keep, [%[a_ptr], #256]\n" \ -".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\ -".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\ -".word 0x4f83e0d5\n" /* sdot v21.4s, 
v6.16b, v3.4b[0] */\ -".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\ -".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ -".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\ -".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\ -".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\ -".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\ -".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\ -".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\ -".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ \ -".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\ -".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\ - "bne 1b\n" /* Target to use when K is 1 or 2 */ \ - "2:\n" /* process tail*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "beq 3f\n" /*jump to tail = 1*/ \ - /* final unrool 0, unrool 0, tail > 1*/ \ -".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ \ -".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, q3*/ \ -".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ -".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ -".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ -".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ -".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ -".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ -".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ -".word 0x4f81e8bb\n" /* sdot 
v27.4s, v5.16b, v1.4b[2] */\ -".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ \ -".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\ -".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\ -".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\ -".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\ -".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\ -".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\ -".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\ -".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\ - "beq 4f\n" /*jump to tail = 2*/ \ - /* unrool 1, tail > 2*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ \ -".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\ -".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\ -".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\ -".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\ -".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\ -".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\ -".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\ -".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ \ -".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\ -".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\ -".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\ -".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\ -".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\ -".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\ -".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\ -".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ -".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\ -".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\ -".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\ -".word 
0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\ -".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\ -".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\ -".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\ -".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\ - "beq 5f\n" /*jump to tail = 3*/ \ - /* unrool 2, tail = 4*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ \ -".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\ -".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ -".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\ -".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\ -".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\ -".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\ -".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\ -".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\ -".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\ -".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\ -".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\ -".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\ -".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\ -".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\ -".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\ -".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ -".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\ -".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\ -".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\ - /* unrool 3, tail = 4*/ \ -".word 
0x4f82e0a8\n" /* sdot v8.4s, v5.16b, v2.4b[0] */\ -".word 0x4fa2e0ab\n" /* sdot v11.4s, v5.16b, v2.4b[1] */\ -".word 0x4f82e8ae\n" /* sdot v14.4s, v5.16b, v2.4b[2] */\ -".word 0x4fa2e8b1\n" /* sdot v17.4s, v5.16b, v2.4b[3] */\ -".word 0x4f83e0b4\n" /* sdot v20.4s, v5.16b, v3.4b[0] */\ -".word 0x4fa3e0b7\n" /* sdot v23.4s, v5.16b, v3.4b[1] */\ -".word 0x4f83e8ba\n" /* sdot v26.4s, v5.16b, v3.4b[2] */\ -".word 0x4fa3e8bd\n" /* sdot v29.4s, v5.16b, v3.4b[3] */\ -".word 0x4f82e0c9\n" /* sdot v9.4s, v6.16b, v2.4b[0] */\ -".word 0x4fa2e0cc\n" /* sdot v12.4s, v6.16b, v2.4b[1] */\ -".word 0x4f82e8cf\n" /* sdot v15.4s, v6.16b, v2.4b[2] */\ -".word 0x4fa2e8d2\n" /* sdot v18.4s, v6.16b, v2.4b[3] */\ -".word 0x4f83e0d5\n" /* sdot v21.4s, v6.16b, v3.4b[0] */\ -".word 0x4fa3e0d8\n" /* sdot v24.4s, v6.16b, v3.4b[1] */\ -".word 0x4f83e8db\n" /* sdot v27.4s, v6.16b, v3.4b[2] */\ -".word 0x4fa3e8de\n" /* sdot v30.4s, v6.16b, v3.4b[3] */\ -".word 0x4f82e0ea\n" /* sdot v10.4s, v7.16b, v2.4b[0] */\ -".word 0x4fa2e0ed\n" /* sdot v13.4s, v7.16b, v2.4b[1] */\ -".word 0x4f82e8f0\n" /* sdot v16.4s, v7.16b, v2.4b[2] */\ -".word 0x4fa2e8f3\n" /* sdot v19.4s, v7.16b, v2.4b[3] */\ -".word 0x4f83e0f6\n" /* sdot v22.4s, v7.16b, v3.4b[0] */\ -".word 0x4fa3e0f9\n" /* sdot v25.4s, v7.16b, v3.4b[1] */\ -".word 0x4f83e8fc\n" /* sdot v28.4s, v7.16b, v3.4b[2] */\ -".word 0x4fa3e8ff\n" /* sdot v31.4s, v7.16b, v3.4b[3] */\ - "b 11f\n" /* tails==1 final tail*/ \ - "3: \n" /* tail=1*/ \ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ \ -".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ -".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ -".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ -".word 
0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ -".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ -".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ -".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ -".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ -".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ -".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ -".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ -".word 0x4f80e0ca\n" /* sdot v10.4s, v6.16b, v0.4b[0] */\ -".word 0x4fa0e0cd\n" /* sdot v13.4s, v6.16b, v0.4b[1] */\ -".word 0x4f80e8d0\n" /* sdot v16.4s, v6.16b, v0.4b[2] */\ -".word 0x4fa0e8d3\n" /* sdot v19.4s, v6.16b, v0.4b[3] */\ -".word 0x4f81e0d6\n" /* sdot v22.4s, v6.16b, v1.4b[0] */\ -".word 0x4fa1e0d9\n" /* sdot v25.4s, v6.16b, v1.4b[1] */\ -".word 0x4f81e8dc\n" /* sdot v28.4s, v6.16b, v1.4b[2] */\ -".word 0x4fa1e8df\n" /* sdot v31.4s, v6.16b, v1.4b[3] */\ - "b 11f\n" /* tails==2 final tail*/ \ - "4:\n" /* tail = 2*/ \ -".word 0x4f82e0e8\n" /* sdot v8.4s, v7.16b, v2.4b[0] */\ -".word 0x4fa2e0eb\n" /* sdot v11.4s, v7.16b, v2.4b[1] */\ -".word 0x4f82e8ee\n" /* sdot v14.4s, v7.16b, v2.4b[2] */\ -".word 0x4fa2e8f1\n" /* sdot v17.4s, v7.16b, v2.4b[3] */\ -".word 0x4f83e0f4\n" /* sdot v20.4s, v7.16b, v3.4b[0] */\ -".word 0x4fa3e0f7\n" /* sdot v23.4s, v7.16b, v3.4b[1] */\ -".word 0x4f83e8fa\n" /* sdot v26.4s, v7.16b, v3.4b[2] */\ -".word 0x4fa3e8fd\n" /* sdot v29.4s, v7.16b, v3.4b[3] */\ -".word 0x4f82e089\n" /* sdot v9.4s, v4.16b, v2.4b[0] */\ -".word 0x4fa2e08c\n" /* sdot v12.4s, v4.16b, v2.4b[1] */\ -".word 0x4f82e88f\n" /* sdot v15.4s, v4.16b, v2.4b[2] */\ -".word 0x4fa2e892\n" /* sdot v18.4s, v4.16b, v2.4b[3] */\ -".word 0x4f83e095\n" /* sdot v21.4s, v4.16b, v3.4b[0] */\ -".word 0x4fa3e098\n" /* sdot v24.4s, v4.16b, v3.4b[1] */\ -".word 0x4f83e89b\n" /* sdot v27.4s, v4.16b, v3.4b[2] */\ -".word 0x4fa3e89e\n" /* sdot v30.4s, v4.16b, v3.4b[3] */\ -".word 0x4f82e0aa\n" /* sdot v10.4s, v5.16b, v2.4b[0] */\ 
-".word 0x4fa2e0ad\n" /* sdot v13.4s, v5.16b, v2.4b[1] */\ -".word 0x4f82e8b0\n" /* sdot v16.4s, v5.16b, v2.4b[2] */\ -".word 0x4fa2e8b3\n" /* sdot v19.4s, v5.16b, v2.4b[3] */\ -".word 0x4f83e0b6\n" /* sdot v22.4s, v5.16b, v3.4b[0] */\ -".word 0x4fa3e0b9\n" /* sdot v25.4s, v5.16b, v3.4b[1] */\ -".word 0x4f83e8bc\n" /* sdot v28.4s, v5.16b, v3.4b[2] */\ -".word 0x4fa3e8bf\n" /* sdot v31.4s, v5.16b, v3.4b[3] */\ - "b 11f\n" /* tails==3 final tail*/ \ - "5:\n" /* tail = 3*/ \ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ \ -".word 0x4f80e0c8\n" /* sdot v8.4s, v6.16b, v0.4b[0] */\ -".word 0x4fa0e0cb\n" /* sdot v11.4s, v6.16b, v0.4b[1] */\ -".word 0x4f80e8ce\n" /* sdot v14.4s, v6.16b, v0.4b[2] */\ -".word 0x4fa0e8d1\n" /* sdot v17.4s, v6.16b, v0.4b[3] */\ -".word 0x4f81e0d4\n" /* sdot v20.4s, v6.16b, v1.4b[0] */\ -".word 0x4fa1e0d7\n" /* sdot v23.4s, v6.16b, v1.4b[1] */\ -".word 0x4f81e8da\n" /* sdot v26.4s, v6.16b, v1.4b[2] */\ -".word 0x4fa1e8dd\n" /* sdot v29.4s, v6.16b, v1.4b[3] */\ -".word 0x4f80e0e9\n" /* sdot v9.4s, v7.16b, v0.4b[0] */\ -".word 0x4fa0e0ec\n" /* sdot v12.4s, v7.16b, v0.4b[1] */\ -".word 0x4f80e8ef\n" /* sdot v15.4s, v7.16b, v0.4b[2] */\ -".word 0x4fa0e8f2\n" /* sdot v18.4s, v7.16b, v0.4b[3] */\ -".word 0x4f81e0f5\n" /* sdot v21.4s, v7.16b, v1.4b[0] */\ -".word 0x4fa1e0f8\n" /* sdot v24.4s, v7.16b, v1.4b[1] */\ -".word 0x4f81e8fb\n" /* sdot v27.4s, v7.16b, v1.4b[2] */\ -".word 0x4fa1e8fe\n" /* sdot v30.4s, v7.16b, v1.4b[3] */\ -".word 0x4f80e08a\n" /* sdot v10.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08d\n" /* sdot v13.4s, v4.16b, v0.4b[1] */\ -".word 0x4f80e890\n" /* sdot v16.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e893\n" /* sdot v19.4s, v4.16b, v0.4b[3] */\ -".word 0x4f81e096\n" /* sdot v22.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e099\n" /* sdot v25.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89c\n" /* sdot v28.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89f\n" /* sdot v31.4s, v4.16b, v1.4b[3] */\ - "11: \n" /* end */ - -#define 
GEMM_SDOT_INT8_KERNEL_8x8 \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "eor v8.16b, v8.16b, v8.16b \n" /* out0 = 0 */ \ - "eor v11.16b, v11.16b, v11.16b\n" /* out0 = 0 */ \ - "eor v14.16b, v14.16b, v14.16b\n" /* out0 = 0 */ \ - "eor v17.16b, v17.16b, v17.16b\n" /* out0 = 0 */ \ - "eor v20.16b, v20.16b, v20.16b\n" /* out0 = 0 */ \ - "eor v23.16b, v23.16b, v23.16b\n" /* out0 = 0 */ \ - "eor v26.16b, v26.16b, v26.16b\n" /* out0 = 0 */ \ - "eor v29.16b, v29.16b, v29.16b\n" /* out0 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ - "eor v9.16b, v9.16b, v9.16b \n" /* out0 = 0 */ \ - "eor v12.16b, v12.16b, v12.16b\n" /* out0 = 0 */ \ - "eor v15.16b, v15.16b, v15.16b\n" /* out0 = 0 */ \ - "eor v18.16b, v18.16b, v18.16b\n" /* out0 = 0 */ \ - "eor v21.16b, v21.16b, v21.16b\n" /* out0 = 0 */ \ - "eor v24.16b, v24.16b, v24.16b\n" /* out0 = 0 */ \ - "eor v27.16b, v27.16b, v27.16b\n" /* out0 = 0 */ \ - "eor v30.16b, v30.16b, v30.16b\n" /* out0 = 0 */ \ - "1:\n" \ - "ldp q0, q1, [%[a_ptr]], #32\n" \ - "ldp q4, q5, [%[b_ptr]], #32\n" \ -".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ -".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ -".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ -".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload b*/ \ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ -".word 0x4f80e0a9\n" /* sdot v9.4s, v5.16b, v0.4b[0] */\ -".word 0x4fa0e0ac\n" /* sdot v12.4s, v5.16b, v0.4b[1] */\ -".word 0x4f80e8af\n" /* sdot v15.4s, v5.16b, v0.4b[2] */\ -".word 0x4fa0e8b2\n" /* sdot v18.4s, v5.16b, v0.4b[3] */\ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ 
-".word 0x4f81e0b5\n" /* sdot v21.4s, v5.16b, v1.4b[0] */\ -".word 0x4fa1e0b8\n" /* sdot v24.4s, v5.16b, v1.4b[1] */\ -".word 0x4f81e8bb\n" /* sdot v27.4s, v5.16b, v1.4b[2] */\ -".word 0x4fa1e8be\n" /* sdot v30.4s, v5.16b, v1.4b[3] */\ - "subs %w[k], %w[k], #1\n" \ - "bne 1b\n" - -#define GEMM_SDOT_INT8_KERNEL_8x4 \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "eor v8.16b, v8.16b, v8.16b \n" /* out0 = 0 */ \ - "eor v11.16b, v11.16b, v11.16b\n" /* out0 = 0 */ \ - "eor v14.16b, v14.16b, v14.16b\n" /* out0 = 0 */ \ - "eor v17.16b, v17.16b, v17.16b\n" /* out0 = 0 */ \ - "prfm pldl1keep, [%[b_ptr], #32]\n" /* preload b*/ \ - "eor v20.16b, v20.16b, v20.16b\n" /* out0 = 0 */ \ - "eor v23.16b, v23.16b, v23.16b\n" /* out0 = 0 */ \ - "eor v26.16b, v26.16b, v26.16b\n" /* out0 = 0 */ \ - "eor v29.16b, v29.16b, v29.16b\n" /* out0 = 0 */ \ - "1:\n" \ - "ldp q0, q1, [%[a_ptr]], #32\n" \ - "ldr q4, [%[b_ptr]], #16\n" \ -".word 0x4f80e088\n" /* sdot v8.4s, v4.16b, v0.4b[0] */\ -".word 0x4fa0e08b\n" /* sdot v11.4s, v4.16b, v0.4b[1] */\ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ -".word 0x4f80e88e\n" /* sdot v14.4s, v4.16b, v0.4b[2] */\ -".word 0x4fa0e891\n" /* sdot v17.4s, v4.16b, v0.4b[3] */\ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ -".word 0x4f81e094\n" /* sdot v20.4s, v4.16b, v1.4b[0] */\ -".word 0x4fa1e097\n" /* sdot v23.4s, v4.16b, v1.4b[1] */\ - "prfm pldl1keep, [%[b_ptr], #32]\n" /* preload b*/ \ -".word 0x4f81e89a\n" /* sdot v26.4s, v4.16b, v1.4b[2] */\ -".word 0x4fa1e89d\n" /* sdot v29.4s, v4.16b, v1.4b[3] */\ - "subs %w[k], %w[k], #1\n" \ - "bne 1b\n" diff --git a/lite/backends/arm/math/dotprod/gemm_vsdot.h b/lite/backends/arm/math/dotprod/gemm_vsdot.h deleted file mode 100644 index 9929ade9b95..00000000000 --- a/lite/backends/arm/math/dotprod/gemm_vsdot.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// clang-format off -#define GEMM_DOT_INT8_KERNEL \ - "vld1.s8 {q0}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ - "vld1.s8 {d2}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ - "veor.s32 q4, q4, q4 \n" /* out0 = 0 */ \ - "veor.s32 q5, q5, q5 \n" /* out0 = 0 */ \ - "veor.s32 q6, q6, q6 \n" /* out0 = 0 */ \ - "veor.s32 q7, q7, q7 \n" /* out0 = 0 */ \ - "veor.s32 q8, q8, q8 \n" /* out0 = 0 */ \ - "veor.s32 q9, q9, q9 \n" /* out0 = 0 */ \ - "veor.s32 q10, q10, q10 \n" /* out0 = 0 */ \ - "veor.s32 q11, q11, q11 \n" /* out0 = 0 */ \ - "veor.s32 q12, q12, q12 \n" /* out0 = 0 */ \ - "veor.s32 q13, q13, q13 \n" /* out0 = 0 */ \ - "veor.s32 q14, q14, q14 \n" /* out0 = 0 */ \ - "veor.s32 q15, q15, q15 \n" /* out0 = 0 */ \ - "cmp %[k], #0 \n" \ - "beq 2f \n" \ - "1: \n" \ - "vld1.s8 {q2}, [%[b_ptr]]! \n" \ - "vld1.s8 {q3}, [%[b_ptr]]! 
\n" \ -".word 0x8d40fe24\n" /* vsdot.s8 q4, q2, d0[0] */\ -".word 0xcd60fe24\n" /* vsdot.s8 q6, q2, d0[1] */\ -".word 0x0d41fe64\n" /* vsdot.s8 q8, q2, d1[0] */\ -".word 0x4d61fe64\n" /* vsdot.s8 q10, q2, d1[1] */\ -".word 0x8d42fe64\n" /* vsdot.s8 q12, q2, d2[0] */\ -".word 0xcd62fe64\n" /* vsdot.s8 q14, q2, d2[1] */\ -".word 0xad40fe26\n" /* vsdot.s8 q5, q3, d0[0] */\ -".word 0xed60fe26\n" /* vsdot.s8 q7, q3, d0[1] */\ -".word 0x2d41fe66\n" /* vsdot.s8 q9, q3, d1[0] */\ -".word 0x6d61fe66\n" /* vsdot.s8 q11, q3, d1[1] */\ -".word 0xad42fe66\n" /* vsdot.s8 q13, q3, d2[0] */\ -".word 0xed62fe66\n" /* vsdot.s8 q15, q3, d2[1] */\ - "vld1.s8 {q0}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ - "vld1.s8 {d2}, [%[a_ptr]]! \n" /* load a00,a01 to q0, q1*/ \ - "subs %[k], %[k], #1 \n" \ - "bne 1b \n" \ - "2: \n" diff --git a/lite/core/program.cc b/lite/core/program.cc index 069da1e78eb..a6c2698ca37 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -811,7 +811,7 @@ void Instruction::Run() { -#if 1 +#if 0 // clang-format off /* time_t t; diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index c1fe58927b5..3db0f2c9c93 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -11,11 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include #include #include #include "paddle_api.h" // NOLINT #include "paddle_use_passes.h" // NOLINT + ///////////////////////////////////////////////////////////////////////// // If this demo is linked to static library:libpaddle_api_full_bundled.a // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to @@ -25,7 +27,9 @@ #include "paddle_use_kernels.h" // NOLINT #include "paddle_use_ops.h" // NOLINT #endif + using namespace paddle::lite_api; // NOLINT + DEFINE_string(model_dir, "", "Model dir path. Set it when the model is uncombined format."); @@ -50,11 +54,13 @@ DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(warmup, 10, "warmup times"); DEFINE_int32(repeats, 100, "repeats times"); DEFINE_bool(use_gpu, false, "use opencl backend"); + int64_t ShapeProduction(const shape_t& shape) { int64_t res = 1; for (auto i : shape) res *= i; return res; } + void RunModel() { // 1. Set CxxConfig CxxConfig config; @@ -66,6 +72,7 @@ void RunModel() { } config.set_power_mode((paddle::lite_api::PowerMode)FLAGS_power_mode); config.set_threads(FLAGS_threads); + std::vector valid_places; if (FLAGS_use_gpu) { valid_places.emplace_back( @@ -86,167 +93,51 @@ void RunModel() { } else { valid_places.emplace_back(Place{TARGET(kARM), PRECISION(kFloat)}); } + if (FLAGS_prefer_int8_kernel) { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); } config.set_valid_places(valid_places); + // 2. Create PaddlePredictor by CxxConfig std::shared_ptr predictor = CreatePaddlePredictor(config); + // 3. Save the optimized model // WARN: The `predictor->SaveOptimizedModel` method must be executed // before the `predictor->Run` method. Because some kernels' `PrepareForRun` // method maybe change some parameters' values. predictor->SaveOptimizedModel(FLAGS_optimized_model_dir, LiteModelType::kNaiveBuffer); + // 4. 
Prepare input data - const lod_t lodd = {{0,1},{0,1}}; - { - // src_ids - int64_t pre_data[100] = {41, 2, 69, 2, 68, 2, 78, 2, 83, 2, 22, 29, 21, 28, - 27, 18, 8, 2, 788, 342, 6431, 17, 2, 788, 96, 6431, 6622}; - std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize(shape_t({1,27,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = pre_data[i]; - } + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize(shape_t({1, 3, 224, 224})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; } - { - // pos_ids - int64_t pre_data[100] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; - std::unique_ptr input_tensor(std::move(predictor->GetInput(1))); - input_tensor->Resize(shape_t({1,27,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = pre_data[i]; - } - } - { - // input_mask - std::unique_ptr input_tensor(std::move(predictor->GetInput(2))); - input_tensor->Resize(shape_t({1,27,27})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; - } - } - { - // pos_ids_extra - std::unique_ptr input_tensor(std::move(predictor->GetInput(3))); - input_tensor->Resize(shape_t({1,27,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 0; - } - } - { - // tgt_ids - std::unique_ptr input_tensor(std::move(predictor->GetInput(4))); - input_tensor->Resize(shape_t({1,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 6621; - } - input_tensor->SetLoD(lodd); - } - { - // tgt_pos - std::unique_ptr 
input_tensor(std::move(predictor->GetInput(5))); - input_tensor->Resize(shape_t({1,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 26; - } - input_tensor->SetLoD(lodd); - } - { - // init_score - std::unique_ptr input_tensor(std::move(predictor->GetInput(6))); - input_tensor->Resize(shape_t({1,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 0; - } - input_tensor->SetLoD(lodd); - } - { - // parent_idx - std::unique_ptr input_tensor(std::move(predictor->GetInput(7))); - input_tensor->Resize(shape_t({1,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 0; - } - } - { - // tgt_generation_mask - std::unique_ptr input_tensor(std::move(predictor->GetInput(8))); - input_tensor->Resize(shape_t({1,1,27})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; - } - } - { - // max_dec_len - std::unique_ptr input_tensor(std::move(predictor->GetInput(9))); - input_tensor->Resize(shape_t({1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 10; - } - } - { - // tgt_pos_extra - std::unique_ptr input_tensor(std::move(predictor->GetInput(10))); - input_tensor->Resize(shape_t({1,1})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; - } - input_tensor->SetLoD(lodd); - } - { - // cand_ids - int64_t cand[500]={41, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
78, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83, 6623, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - std::unique_ptr input_tensor(std::move(predictor->GetInput(11))); - input_tensor->Resize(shape_t({5,32})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = cand[i]; - } - } - + // 5. Run predictor - for (int j = 0; j < 1; ++j) { + for (int i = 0; i < FLAGS_warmup; ++i) { predictor->Run(); } + + for (int j = 0; j < FLAGS_repeats; ++j) { + predictor->Run(); + } + // 6. Get output - double sum = 0; std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - std::cout << "Output0 shape " << output_tensor->shape()[0] <<","<< output_tensor->shape()[1] << std::endl; + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) { - sum += output_tensor->data()[i] * 1.f; - } - std::cout << "output0 mean is "<shape())<<"\n"; - sum = 0; - std::unique_ptr output_tensor1( - std::move(predictor->GetOutput(1))); - std::cout << "Output1 shape " << output_tensor1->shape()[0] <<","<< output_tensor1->shape()[1] << std::endl; - for (int i = 0; i < ShapeProduction(output_tensor1->shape()); i++) { - sum += output_tensor1->data()[i] * 1.f; + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } - std::cout << "output1 mean is "<shape())<<"\n"; } + int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir.empty() && @@ -271,6 +162,7 @@ int main(int argc, char** argv) { << " --use_gpu=false bool Use gpu or not.\n"; exit(1); } + RunModel(); return 0; } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index e493bebfc50..bb430c8d8f6 100644 --- 
a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -1,336 +1,364 @@ -#include // NOLINT(build/c++11) +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include #include #include +#include #include -#include -#include -#include -#include -#include -#include "paddle_api.h" // NOLINT -#define IPTCORE_PADDLE_MOBILE -#define IPTCORE_PADDLE_BENCHMARK +#include "paddle_api.h" // NOLINT ///////////////////////////////////////////////////////////////////////// -// If this demo is linked to static library:libpaddle_api_full_bundled.a +// If this demo is linked to static library:libpaddle_api_light_bundled.a // , you should include `paddle_use_ops.h` and `paddle_use_kernels.h` to // avoid linking errors such as `unsupport ops or kernels`. 
///////////////////////////////////////////////////////////////////////// -#ifdef IPTCORE_PADDLE_MOBILE -#else -#ifdef _WIN32 -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#endif -#endif +// #include "paddle_use_kernels.h" // NOLINT +// #include "paddle_use_ops.h" // NOLINT -#ifdef IPTCORE_PADDLE_BENCHMARK -class Timer { -private: - std::chrono::high_resolution_clock::time_point inTime, outTime; - -public: - void startTimer() { inTime = std::chrono::high_resolution_clock::now(); } - - // unit millisecond - float getCostTimer() { - outTime = std::chrono::high_resolution_clock::now(); - return static_cast( - std::chrono::duration_cast(outTime - inTime) - .count() / - 1e+3); - } -}; -#endif +using namespace paddle::lite_api; // NOLINT -template -double compute_mean(const T* in, const size_t length) { - double sum = 0.; - for (size_t i = 0; i < length; ++i) { - sum += in[i]; - } - return sum / length; +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; } -template -double compute_standard_deviation(const T* in, - const size_t length, - bool has_mean = false, - double mean = 10000) { - if (!has_mean) { - mean = compute_mean(in, length); +std::string ShapePrint(const std::vector& shapes) { + std::string shapes_str{""}; + for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) { + auto shape = shapes[shape_idx]; + std::string shape_str; + for (auto i : shape) { + shape_str += std::to_string(i) + ","; } + shapes_str += shape_str; + shapes_str += + (shape_idx != 0 && shape_idx == shapes.size() - 1) ? 
"" : " : "; + } + return shapes_str; +} - double variance = 0.; - for (size_t i = 0; i < length; ++i) { - variance += pow((in[i] - mean), 2); - } - variance /= length; - return sqrt(variance); +std::string ShapePrint(const shape_t& shape) { + std::string shape_str{""}; + for (auto i : shape) { + shape_str += std::to_string(i) + " "; + } + return shape_str; } -int64_t shape_production(const paddle::lite_api::shape_t& shape) { - int64_t res = 1; - for (auto i : shape) { - res *= i; +std::vector split_string(const std::string& str_in) { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); } - return res; + } + return str_out; } -class InputData { -public: - int _type = -1; ///int32, int64, float32 - bool _lod = false; - std::vector _shape; - std::vector _int32_data; - std::vector _int64_data; - std::vector _float32_data; - std::vector> _lod_data = {{0, 1}, {0, 1}}; -}; - -class UserPersonaInfer { -public: -#ifdef IPTCORE_PADDLE_MOBILE - void create_paddle_light_predictor(const std::string& model_file); -#else - void create_paddle_full_predictor(const std::string& model_dir); -#endif - void prepare(const std::string& path); - void infer(); -private: - void infer_specific_item(paddle::lite_api::PaddlePredictor *predictor); - std::shared_ptr _paddle_predictor; - std::vector > _batch; -}; - -#ifdef IPTCORE_PADDLE_MOBILE -void UserPersonaInfer::create_paddle_light_predictor(const std::string& model_file) { - // 1. Set MobileConfig - paddle::lite_api::MobileConfig config; - config.set_model_from_file(model_file); - config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH); - // 2. 
Create PaddlePredictor by MobileConfig - _paddle_predictor = - paddle::lite_api::CreatePaddlePredictor(config); -} -#else -void UserPersonaInfer::create_paddle_full_predictor(const std::string& model_dir) { - // 1. Create CxxConfig - paddle::lite_api::CxxConfig config; - config.set_model_dir(model_dir); - config.set_valid_places({paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, - paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); - // 2. Create PaddlePredictor by CxxConfig - _paddle_predictor = - paddle::lite_api::CreatePaddlePredictor(config); +std::vector get_shape(const std::string& str_shape) { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; } -#endif -namespace { -using namespace std; -template -void extract_num(const string &str, vector &results) { - stringstream ss; - - /* Storing the whole string into string stream */ - ss << str; - /* Running loop till the end of the stream */ - string temp; - T found; - while (!ss.eof()) { - - /* extracting word by word from stream */ - ss >> temp; - - /* Checking the given word is integer or not */ - if (stringstream(temp) >> found) - results.emplace_back(found); +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} - /* To save from space at the end of string */ - temp = ""; - } +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); } + 
+inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; } -void UserPersonaInfer::prepare(const std::string& path) { - ///xia_i 186 tgt_generation_mask float32 (1, 1, 33) [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - std::ifstream in(path.c_str()); - std::string line; - std::string current_idx; - while (std::getline(in, line)) { - if (line.empty()) { - break; - } - if (line.back() == '\r') { - line.pop_back(); - } - if (line.empty()) { - break; - } - std::vector strings; - std::istringstream f(line); - std::string s; - while (getline(f, s, '\t')) { - strings.push_back(s); - } - if (current_idx != strings.at(1)) { - _batch.push_back(std::map()); - current_idx = strings[1]; - } - if (strings.at(2) == "lods") { - if (strings.at(3) != "[[0, 1], [0, 1]]") { - throw std::invalid_argument("invalid lod"); - } - continue; - } - auto& input_data = _batch.back()[strings.at(2)]; - - extract_num(strings.at(4), input_data._shape); - if (strings[0] == "lod_i") { - input_data._lod = true; - } - if (strings.at(3) == "int32") { - input_data._type = 0; - extract_num(strings.at(5), input_data._int32_data); - } else if (strings.at(3) == "int64") { - input_data._type = 1; - extract_num(strings.at(5), input_data._int64_data); - } else if (strings.at(3) == "float32") { - input_data._type = 2; - extract_num(strings.at(5), input_data._float32_data); - } else { - throw std::invalid_argument("invalid type"); - } +void RunModel(std::string model_dir, + const std::vector& input_shapes, + size_t repeats, + size_t warmup, + size_t power_mode, + size_t thread_num, + size_t accelerate_opencl, + size_t print_output_elem) { + // 1. 
Set MobileConfig + MobileConfig config; + config.set_model_from_file(model_dir); + +#ifdef METAL + std::string metal_lib_path = "../../../metal/lite.metallib"; + config.set_metal_lib_path(metal_lib_path); + config.set_metal_use_mps(true); +#else + // NOTE: Use android gpu with opencl, you should ensure: + // first, [compile **cpu+opencl** paddlelite + // lib](/~https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md); + // second, [convert and use opencl nb + // model](/~https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md). + + bool is_opencl_backend_valid = + ::IsOpenCLBackendValid(/*check_fp16_valid = false*/); + std::cout << "is_opencl_backend_valid:" + << (is_opencl_backend_valid ? "true" : "false") << std::endl; + if (is_opencl_backend_valid) { + if (accelerate_opencl != 0) { + // Set opencl kernel binary. + // Large addtitional prepare time is cost due to algorithm selecting and + // building kernel from source code. + // Prepare time can be reduced dramitically after building algorithm file + // and OpenCL kernel binary on the first running. + // The 1st running time will be a bit longer due to the compiling time if + // you don't call `set_opencl_binary_path_name` explicitly. + // So call `set_opencl_binary_path_name` explicitly is strongly + // recommended. + + // Make sure you have write permission of the binary path. + // We strongly recommend each model has a unique binary name. 
+ const std::string bin_path = "/data/local/tmp/"; + const std::string bin_name = "lite_opencl_kernel.bin"; + config.set_opencl_binary_path_name(bin_path, bin_name); + + // opencl tune option + // CL_TUNE_NONE: 0 + // CL_TUNE_RAPID: 1 + // CL_TUNE_NORMAL: 2 + // CL_TUNE_EXHAUSTIVE: 3 + const std::string tuned_path = "/data/local/tmp/"; + const std::string tuned_name = "lite_opencl_tuned.bin"; + config.set_opencl_tune(CL_TUNE_NORMAL, tuned_path, tuned_name); + + // opencl precision option + // CL_PRECISION_AUTO: 0, first fp16 if valid, default + // CL_PRECISION_FP32: 1, force fp32 + // CL_PRECISION_FP16: 2, force fp16 + config.set_opencl_precision(CL_PRECISION_FP16); + } + } else { + std::cout << "*** nb model will be running on cpu. ***" << std::endl; + // you can give backup cpu nb model instead + // config.set_model_from_file(cpu_nb_model_dir); + } +#endif + // NOTE: To load model transformed by model_optimize_tool before + // release/v2.3.0, plese use `set_model_dir` API as listed below. + // config.set_model_dir(model_dir); + config.set_power_mode(static_cast(power_mode)); + config.set_threads(thread_num); + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. 
Prepare input data + std::cout << "input_shapes.size():" << input_shapes.size() << std::endl; + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; } -} -void UserPersonaInfer::infer_specific_item(paddle::lite_api::PaddlePredictor *predictor){ - static int count = 0; - if (_batch.empty()) { - return; + for (int i = 0; i < input_num; ++i) { + input_data[i] = 1.f; } - auto &inputs = _batch[count]; - auto names = predictor->GetInputNames(); - for (auto &name : names) { - auto& input = inputs[name]; - auto tensor = predictor->GetInputByName(name); - tensor->Resize(input._shape); - if (input._type == 0) { - auto input_data = tensor->mutable_data(); - std::copy(input._int32_data.begin(), input._int32_data.end(), input_data); - } else if (input._type == 1) { - auto input_data = tensor->mutable_data(); - std::copy(input._int64_data.begin(), input._int64_data.end(), input_data); - } else if (input._type == 2) { - auto input_data = tensor->mutable_data(); - std::copy(input._float32_data.begin(), input._float32_data.end(), input_data); - } else { - throw std::invalid_argument("invalid name"); - } - if (input._lod) { - tensor->SetLoD(input._lod_data); - } + } + + // 4. 
Run predictor + double first_duration{-1}; + for (size_t widx = 0; widx < warmup; ++widx) { + if (widx == 0) { + auto start = GetCurrentUS(); + predictor->Run(); + first_duration = (GetCurrentUS() - start) / 1000.0; + } else { + predictor->Run(); } + } - predictor->Run(); - - std::cout << "\n"; - for (int idx = 0; idx != 2; ++idx) { - auto output_tensor = predictor->GetOutput(idx); - auto total_size = shape_production(output_tensor->shape()); - std::cout << "xiarj_" << count << "\t"; - for (int i = 0; i < total_size; ++i) { - if (idx == 0) { - std::cout << output_tensor->data()[i] << "\t"; - } else { - std::cout << output_tensor->data()[i] << "\t"; - } - } - std::cout << "\n"; - } - std::cout << std::flush; + double sum_duration = 0.0; // millisecond; + double max_duration = 1e-5; + double min_duration = 1e5; + double avg_duration = -1; + for (size_t ridx = 0; ridx < repeats; ++ridx) { + auto start = GetCurrentUS(); - if (++count == _batch.size()){ - count = 0; - } -} + predictor->Run(); -void UserPersonaInfer::infer() { - static int idx = 0; - auto predictor = _paddle_predictor.get(); - if (!predictor) { - return; + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + max_duration = duration > max_duration ? duration : max_duration; + min_duration = duration < min_duration ? duration : min_duration; + std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration + << " ms" << std::endl; + if (first_duration < 0) { + first_duration = duration; } - // 3. Prepare input data - - // 4. 
Run predictor -#ifdef IPTCORE_PADDLE_BENCHMARK - int warmup = 10; - int repeats = 400; - Timer timeInstance; - double first_duration{-1}; - for (size_t widx = 0; widx < warmup; ++widx) { - if (widx == 0) { - timeInstance.startTimer(); - infer_specific_item(predictor); - first_duration = timeInstance.getCostTimer(); - } else { - infer_specific_item(predictor); - } + } + avg_duration = sum_duration / static_cast(repeats); + std::cout << "\n======= benchmark summary =======\n" + << "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n" + << "model_dir:" << model_dir << "\n" + << "warmup:" << warmup << "\n" + << "repeats:" << repeats << "\n" + << "power_mode:" << power_mode << "\n" + << "thread_num:" << thread_num << "\n" + << "*** time info(ms) ***\n" + << "1st_duration:" << first_duration << "\n" + << "max_duration:" << max_duration << "\n" + << "min_duration:" << min_duration << "\n" + << "avg_duration:" << avg_duration << "\n"; + + // 5. Get output + std::cout << "\n====== output summary ====== " << std::endl; + size_t output_tensor_num = predictor->GetOutputNames().size(); + std::cout << "output tensor num:" << output_tensor_num << std::endl; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + std::unique_ptr output_tensor = + predictor->GetOutput(tidx); + std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; + auto out_shape = output_tensor->shape(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); + auto out_std_dev = compute_standard_deviation( + out_data, ShapeProduction(out_shape), true, out_mean); + + std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " elem num:" << ShapeProduction(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " standard deviation:" << out_std_dev << std::endl; + std::cout << "output tensor " << tidx << " mean value:" << out_mean + << std::endl; + + // print 
output + if (print_output_elem) { + for (int i = 0; i < ShapeProduction(out_shape); ++i) { + std::cout << "out[" << tidx << "][" << i + << "]:" << output_tensor->data()[i] << std::endl; + } } - - double sum_duration = 0.0; - double max_duration = 1e-5; - double min_duration = 1e5; - double avg_duration = -1; - for (size_t ridx = 0; ridx < repeats; ++ridx) { - timeInstance.startTimer(); - - infer_specific_item(predictor); - - double duration = timeInstance.getCostTimer(); - sum_duration += duration; - max_duration = duration > max_duration ? duration : max_duration; - min_duration = duration < min_duration ? duration : min_duration; -// std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration -// << " ms" << std::endl; - if (first_duration < 0) { - first_duration = duration; - } - } - avg_duration = sum_duration / static_cast(repeats); - std::cout << "\n======= benchmark summary =======\n" - << "warmup:" << warmup << "\n" - << "repeats:" << repeats << "\n" - << "*** time info(ms) ***\n" - //<< "1st_duration:" << first_duration << "\n" - << "max_duration:" << max_duration << "\n" - << "min_duration:" << min_duration << "\n" - << "avg_duration:" << avg_duration << "\n"; -#else - infer_specific_item(predictor); -#endif - - // 5. 
Get output + } } int main(int argc, char** argv) { - UserPersonaInfer user_persona_infer; -#ifdef IPTCORE_PADDLE_MOBILE -// user_persona_infer.create_paddle_light_predictor( -// "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\model_x86.nb"); - user_persona_infer.create_paddle_light_predictor( - "./model_naive_buffer_arm.nb"); - std::cout << "xiarj" << std::endl; -#else -// user_persona_infer.create_paddle_full_predictor( -// "D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\honor_2_11\\cls_ernie_3.0_tiny_fc_ch_dy_15_3L128H_decrypt_inference_1"); -#endif - //user_persona_infer.prepare("D:\\baidu\\baiduinput\\inputtools\\paddle_lite\\wenxin\\xia.txt"); - user_persona_infer.prepare("./xia.txt"); - user_persona_infer.infer(); - + std::vector str_input_shapes; + std::vector input_shapes{ + {1, 3, 224, 224}}; // shape_t ==> std::vector + + int repeats = 10; + int warmup = 10; + // set arm power mode: + // 0 for big cluster, high performance + // 1 for little cluster + // 2 for all cores + // 3 for no bind + size_t power_mode = 0; + size_t thread_num = 1; + int accelerate_opencl = 1; + int print_output_elem = 0; + + if (argc > 2 && argc < 9) { + std::cerr + << "usage: ./" << argv[0] << "\n" + << " \n" + << " , eg: 1,3,224,224 for 1 input; " + "1,3,224,224:1,5 for 2 inputs\n" + << " , eg: 100\n" + << " , eg: 10\n" + << " , 0: big cluster, high performance\n" + " 1: little cluster\n" + " 2: all cores\n" + " 3: no bind\n" + << " , eg: 1 for single thread \n" + << " , this option takes effect only when model " + "can be running on opencl backend.\n" + " 0: disable opencl kernel cache & tuning\n" + " 1: enable opencl kernel cache & tuning\n" + << " , 0: disable print outputs to stdout\n" + " 1: enable print outputs to stdout\n" + << std::endl; return 0; -} + } + + std::string model_dir = argv[1]; + if (argc >= 9) { + input_shapes.clear(); + std::string raw_input_shapes = argv[2]; + std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl; + str_input_shapes = 
split_string(raw_input_shapes); + for (size_t i = 0; i < str_input_shapes.size(); ++i) { + std::cout << "input shape: " << str_input_shapes[i] << std::endl; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + repeats = atoi(argv[3]); + warmup = atoi(argv[4]); + power_mode = atoi(argv[5]); + thread_num = atoi(argv[6]); + accelerate_opencl = atoi(argv[7]); + print_output_elem = atoi(argv[8]); + } + + RunModel(model_dir, + input_shapes, + repeats, + warmup, + power_mode, + thread_num, + accelerate_opencl, + print_output_elem); + + return 0; +} diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt deleted file mode 100644 index 234ec1c85e3..00000000000 --- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt +++ /dev/null @@ -1,73 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(mobilenet_full_api) -set(TARGET mobilenet_full_api) - -# 1. path to Paddle-Lite lib and mklml lib -set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") -set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") - -if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(MSVC_STATIC_CRT ) - if(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - else(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") - endif(MSVC_STATIC_CRT) -endif() - -if (APPLE AND METAL) - message(STATUS "set METAL=ON") - add_definitions("-DMETAL") - find_library(METAL_LIBRARY Metal REQUIRED) - find_library(GRAPHIC CoreGraphics REQUIRED) - 
find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) -endif() - -# 2. link mklml and Paddle-Lite directory -link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) -include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) - -# 3. compile options -if (NOT WIN32) - add_definitions(-std=c++11 -g -O3 -pthread) - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) -endif() - -# 4.add executable output -add_executable(${TARGET} ${TARGET}.cc) -if (WIN32) - set(WITH_STATIC_MKL ) - if(WITH_STATIC_MKL) - set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - - target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib) - target_link_libraries(${TARGET} shlwapi.lib) - target_link_libraries(${TARGET} ${MATH_LIB}) - - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - if(NOT WITH_STATIC_MKL) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - endif() -else() - if (APPLE AND METAL) - target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) - endif() - target_link_libraries(${TARGET} -lpaddle_full_api_shared) - target_link_libraries(${TARGET} -liomp5) - target_link_libraries(${TARGET} -ldl) -endif() diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt deleted file mode 100644 index 3a91bfafbd3..00000000000 --- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt +++ /dev/null @@ -1,73 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(mobilenet_light_api) -set(TARGET mobilenet_light_api) - -# 1. 
path to Paddle-Lite lib and mklml lib -set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") -set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") - -if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(MSVC_STATIC_CRT ) - if(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - else(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") - endif(MSVC_STATIC_CRT) -endif() - -if (APPLE AND METAL) - message(STATUS "set METAL=ON") - add_definitions("-DMETAL") - find_library(METAL_LIBRARY Metal REQUIRED) - find_library(GRAPHIC CoreGraphics REQUIRED) - find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) -endif() - -# 2. link mklml and Paddle-Lite directory -link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) -include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) - -# 3. 
compile options -if (NOT WIN32) - add_definitions(-std=c++11 -g -O3 -pthread) - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) -endif() - -# 4.add executable output -add_executable(${TARGET} ${TARGET}.cc) -if (WIN32) - set(WITH_STATIC_MKL ) - if(WITH_STATIC_MKL) - set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - - target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib) - target_link_libraries(${TARGET} shlwapi.lib) - target_link_libraries(${TARGET} ${MATH_LIB}) - - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - if(NOT WITH_STATIC_MKL) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_CURRENT_BINARY_DIR}/Release - ) - endif() -else() - if (APPLE AND METAL) - target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) - endif() - target_link_libraries(${TARGET} -lpaddle_light_api_shared) - target_link_libraries(${TARGET} -liomp5) - target_link_libraries(${TARGET} -ldl) -endif() diff --git a/lite/tests/unittest_py/__main___cache_dir/model b/lite/tests/unittest_py/__main___cache_dir/model deleted file mode 100644 index ad9c9d92f78368ee4c4713242cba7c0dad4929b5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2164 zcmZuyO>P@E6!u86EkDW0SV>w(DGEbrfx?>t&H;La7HAjU6a+QHkqB|fG0Bmota^-Y zyUTHUmR=zJNNOxassY2EH}CKJ-uK?nUxVRU{<-_frehjZLX?x<@ai_3UekFce6hqM z-DT5vG%~GkTF*-E`D8r2oDS(?*v~2NPv6t4M)5+hrBS7@>`BPQ(oYCZ7c>~2=NS@C ziQJMmA|>eF_42`8Hv5sTDEobq>fccR=N8d>?`eE7yP|$!RDO1QHv810w>0`SjGRT+!^i>A#lemR@vG&T(@xKi*g zR{42wfu$5{S%|!Me>R&$7tL$o3WGes_JS_qWQ_`G|GY!cX!p$4+#<_cqD0iLbVvT@ zYzDp~;C!^9O>D_|i!jx8xzE!>_Wd9B-=Ai0=;f32OZM0T4&&b2UFXy~!RnPNwT_kt 
z$^*BDWK711pfurW`g2I{VEFf^kN5wK>1(N!nuPGSZeOA+%R3c=<|YxTcZq{n@xOvM{gEF3yVywv8FMO zE63Ik@#VL4TS|~5=Ph1Wjo!Q9eUHG9P@)#vKS@x+%OLHHru6#ITq$F0DcAAcQAjOQ z>x9ivvWYGcnk<>)b)$rhwtM=n;w#7pShY|r)++-Y+RKla2H!9~4-k6&oo-lK$P^e#$gEO zk!fS==<%9fSgu!LKYMdO!ICuzD2JwO%4CAz@b}GQC;5aH#_>)!02oYbLKXlQfpscobZR!**kR@&-&fs(VAiglrkb@Bb!krC~ zV4`SoUl_!bqK_wJl3YP&ZNdibioh-6tZ*0<-{OnXu@qQ1;@}6Goo-d9&`#qq{wvD= zhs7T4>>qHszF-$vyYsZaSpc%Ac><5reLsM^~5^ymd};&48gaiEr#4uPUhpwo#@&pyLLMm>5MPVZ|ENro#%v zPK0{+wW3JZ4mg_qtbs%EI#tF0yyT%&y!2ruxRQ&Y-abCnecl$aJM8<(@k(dw2u@?3 GLco9K>qUeB diff --git a/lite/tests/unittest_py/__main___cache_dir/opt_model/model b/lite/tests/unittest_py/__main___cache_dir/opt_model/model deleted file mode 100644 index 99d11af804fa2902ea6a97be38c42308ea14546c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 609 zcmaJ%P7!*+%`Apph9&o~Qx;bX8nqz#sBAZw_R zGjP*VWTjAGSAsUQZ4&`9CESBn#6>WZOueB-pTPnB2`lm=Fz-r%8{DJ@=on5In=W{s z@R*5rIoKB>pE8Dwalg{#g*@^m-i>!f9WaA5Mw%r5G++eoL#6bQCqO*RZa+3X!v=AD5~X0h9%2uZuWLlA65_Ed}f6FZu5+J%+(* zE@telly>US>aq+_m3q@U?y5M{c8I#@<$(YO^&WS|_mY<`tFN*3)VXN5vh1(#k3SIb Bv*P2@mD5DGN0 z7)=Syj1pP#eJf>Ugq<~0cD#3cUvmWwAGMrw6huQ2_1|=**7LnL`QX9 zCq`#NNn9+WDs)lBu8eMk?nW~?xg=xz<(256>OC2~2)&J_vR-8i%R?Vk?#t*$=&vtc zT7vzfTC>0zikGmJ5uFv4gp`$;QZ zn|0-KX4;D|QngDNqX?sowzBQlyD8U_QTbtz%?gZB17jKE2;<}XU0>LqT-}zFet-$8 zKanwsFgc2hmhy@)N+#_vjFX;{!_=rT>5f(dfW|fR8LbW4PT9R<$NR~2|5y*)HSk72MsBz+DS29)+ z3@6@fHDe87t=vpKq977x%B`!5S6i3E`uu7e7$#w(2Fll2OGs*9ieVG#G;kBcA-Ed2 znX!f7X<(Y+69NUyMh9&79q##6#GjeX8|L3pXLuNbchZxr@z9{Y~*p723qKQcZMK5Oh3##h2Ojs4E} KLHHR(QRy$IOqiSi diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py index 24fb6b63a3f..dc71fcdf799 100644 --- a/lite/tests/unittest_py/op/test_unique_op.py +++ b/lite/tests/unittest_py/op/test_unique_op.py @@ -17,6 +17,7 @@ from auto_scan_test import AutoScanTest, IgnoreReasons from program_config import TensorConfig, ProgramConfig, OpConfig, CxxConfig, TargetType, 
PrecisionType, DataLayoutType, Place +import unittest import hypothesis from hypothesis import given, settings, seed, example, assume @@ -26,7 +27,7 @@ import numpy as np -class TestUniqueWithCountsOp(AutoScanTest): +class TestUniqueOp(AutoScanTest): def __init__(self, *args, **kwargs): AutoScanTest.__init__(self, *args, **kwargs) host_places = [ @@ -58,7 +59,7 @@ def generate_IndexTensor(): unique_op = OpConfig( type = "unique", - input = {"X": ["input_data"]}, + inputs = {"X": ["input_data"]}, outputs = { "Out": ["Out_data"], "Index": ["Index_data"], From 4a8f18be55f38095ce187f26c5f85e7b11953650 Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Wed, 14 Dec 2022 14:10:55 +0800 Subject: [PATCH 06/10] fix test_unique_op --- lite/tests/unittest_py/op/test_unique_op.py | 61 ++++++++++++++------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py index dc71fcdf799..64fc9d7111e 100644 --- a/lite/tests/unittest_py/op/test_unique_op.py +++ b/lite/tests/unittest_py/op/test_unique_op.py @@ -46,50 +46,71 @@ def sample_program_configs(self, draw): st.integers( min_value=2, max_value=100), min_size=1, - max_size=3)) + max_size=1)) in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64])) - + def generate_X_data(): return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype) def generate_IndexTensor(): return np.random.randint(1, 5, size=in_shape).astype(np.int32) + dtype = 2 + is_sorted = draw(st.sampled_from([True, False])) + return_index = draw(st.sampled_from([False])) + if is_sorted: + return_inverse = draw(st.sampled_from([True, False])) + else: + return_inverse = True + return_counts = draw(st.sampled_from([False])) + outputs = [ + "Out_data" + ] + outputs_config = { + "Out": ["Out_data"] + } + outputs_dtype = { + "Out_data": in_dtype + } + if return_inverse: + outputs.append("Index_data") + outputs_config["Index"] = ["Index_data"] + 
outputs_dtype["Index_data"] = np.int32 + if return_index: + outputs.append("Indices_data") + outputs_config["Indices"] = ["Indices_data"] + outputs_dtype["Indices_data"] = np.int32 + if return_counts: + outputs.append("Counts_data") + outputs_config["Counts"] = ["Counts_data"] + outputs_dtype["Counts_data"] = np.int32 + axis = draw(st.sampled_from([[0, 1, 2], [1], [0, 2], [2, 1], [0, 1]])) + axis = [] unique_op = OpConfig( type = "unique", inputs = {"X": ["input_data"]}, - outputs = { - "Out": ["Out_data"], - "Index": ["Index_data"], - "Indices": ["Indices_data"], - "Counts": ["Counts_data"] - }, + outputs = outputs_config, attrs={ "dtype": 2, - "return_index": False, - "return_inverse": False, - "return_counts": False, + "return_index": return_index, + "return_inverse": return_inverse, + "return_counts": return_counts, "axis": axis, - "is_sorted": False + "is_sorted": is_sorted } ) - unique_op.outputs_dtype = {"Out_data": in_dtype} - unique_op.outputs_dtype = {"Index_data": np.int32} - unique_op.outputs_dtype = {"Counts_data":np.int32} + unique_op.outputs_dtype = outputs_dtype program_config = ProgramConfig( ops=[unique_op], - weights={ - "Index_data": - TensorConfig(data_gen=partial(generate_IndexTensor)) - }, + weights={}, inputs={ "input_data": TensorConfig(data_gen=partial(generate_X_data)) }, - outputs=["Out_data", "Index_data", "Counts_data"] + outputs=outputs ) return program_config From 0d6f7edf982e6a34df5104aa8d8c6823eeafbfe9 Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Mon, 19 Dec 2022 11:57:22 +0800 Subject: [PATCH 07/10] update unique_op on 12.19 --- lite/kernels/host/unique_compute.cc | 32 +++++++------ lite/operators/unique_op.cc | 51 +++++++++++++++------ lite/tests/unittest_py/op/test_unique_op.py | 48 +++++++++++-------- 3 files changed, 83 insertions(+), 48 deletions(-) diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc index 4c96e7f8c11..b29101e52ed 100644 --- 
a/lite/kernels/host/unique_compute.cc +++ b/lite/kernels/host/unique_compute.cc @@ -24,6 +24,7 @@ #include #include + namespace paddle { namespace lite { namespace kernels { @@ -33,7 +34,7 @@ template void UniqueFunc(const lite::Tensor* x, lite::Tensor* out, lite::Tensor* index, - lite::Tensor* count = nullptr) { + lite::Tensor* count) { const InT* in_data = x->template data(); IndexT* index_data = index->mutable_data(); @@ -100,10 +101,10 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in, } if (return_inverse) { - auto* inverse = index; - inverse->Resize({out->numel()}); - auto inverse_data = inverse->mutable_data(); + index->Resize({in.numel()}); + auto inverse_data = index->mutable_data(); std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); for (int64_t i = 0; i < out->numel(); ++i) { inverse_map[out_data[i]] = i; } @@ -296,11 +297,11 @@ void UniqueDimFunc(const lite::Tensor& in, std::iota(permute.begin(), permute.end(), 0); permute[axis] = 0; permute[0] = axis; - std::vector in_trans_dim_vec(in.dims().Vectorize()); - in_trans_dim_vec[axis] = in.dims()[0]; - in_trans_dim_vec[0] = in.dims()[axis]; + std::vector in_trans_dims_vec(in.dims().Vectorize()); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; lite::Tensor in_trans; - lite::DDim in_trans_dims = DDim(in_trans_dim_vec); + lite::DDim in_trans_dims = DDim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); in_trans.mutable_data(); TransCompute(in, &in_trans, permute); @@ -356,7 +357,7 @@ void UniqueDimFunc(const lite::Tensor& in, indices_vec.erase(indices_vec.begin() + input_unbind.size(), indices_vec.end()); lite::Tensor out_trans; - std::vector out_trans_dims_vec = in_trans_dim_vec; + std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); out_trans.Resize(out_trans_dims_vec); out_trans.mutable_data(); @@ -367,16 +368,20 @@ void UniqueDimFunc(const lite::Tensor& in, TransCompute(out_trans, out, permute); if 
(return_inverse) { + index->Resize({in.numel()}); TensorFromVector(inverse_vec, index); } if (return_counts) { + count->Resize({out->numel()}); TensorFromVector(counts_vec, count); } if (return_index) { + indices->Resize({out->numel()}); TensorFromVector(indices_vec, indices); } + } void UniqueCompute::Run() { @@ -387,9 +392,9 @@ void UniqueCompute::Run() { auto indices = param.Indices; auto count = param.Counts; auto dtype = param.dtype; - auto return_index = param.return_index; - auto return_inverse = param.return_inverse; - auto return_counts = param.return_counts; + bool return_index = param.return_index; + bool return_inverse = param.return_inverse; + bool return_counts = param.return_counts; auto axis_vec = param.axis; auto is_sorted = param.is_sorted; @@ -399,7 +404,7 @@ void UniqueCompute::Run() { CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds " << static_cast(type) << "but desires to be int32 or int64"; - + if (!is_sorted) { if (index_type == PRECISION(kInt32)) { switch (type) { @@ -560,3 +565,4 @@ REGISTER_LITE_KERNEL(unique, PRECISION(kInt32), DATALAYOUT(kAny))}) .Finalize(); + diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc index adab6096cd8..84f829028a9 100644 --- a/lite/operators/unique_op.cc +++ b/lite/operators/unique_op.cc @@ -21,24 +21,45 @@ namespace operators { bool UniqueOp::CheckShape() const { CHECK_OR_FALSE(param_.X); CHECK_OR_FALSE(param_.Out); - if (param_.return_index) { - CHECK_OR_FALSE(param_.Indices); - } - if (param_.return_inverse) { + if (!param_.is_sorted) { CHECK_OR_FALSE(param_.Index); - } - if (param_.return_counts) { - CHECK_OR_FALSE(param_.Counts) + } else { + if (param_.return_index) { + CHECK_OR_FALSE(param_.Indices); + } + if (param_.return_inverse) { + CHECK_OR_FALSE(param_.Index); + } + if (param_.return_counts) { + CHECK_OR_FALSE(param_.Counts) + } } return true; } bool UniqueOp::InferShapeImpl() const { - DDim in_dims = param_.X->dims(); - if (param_.Out) 
param_.Out->Resize(in_dims); - if (param_.Index) param_.Index->Resize(in_dims); - if (param_.Indices) param_.Indices->Resize(in_dims); - if (param_.Counts) param_.Counts->Resize(in_dims); + if (!param_.is_sorted) { + DDim in_dims = param_.X->dims(); + if (param_.Out) param_.Out->Resize({-1}); + if (param_.Index) param_.Index->Resize(in_dims); + } else { + DDim in_dims = param_.X->dims(); + if (param_.axis.empty()) { + if (param_.Out) param_.Out->Resize(in_dims); + if (param_.return_inverse) param_.Index->Resize(in_dims); + } else { + int axis_value = param_.axis[0]; + if (axis_value < 0) { + axis_value += in_dims.size(); + } + DDim out_dims = in_dims; + out_dims[axis_value] = -1; + if (param_.Out) param_.Out->Resize(out_dims); + if (param_.return_inverse) param_.Index->Resize({in_dims[axis_value]}); + } + if (param_.return_index) param_.Indices->Resize({-1}); + if (param_.return_counts) param_.Counts->Resize({-1}); + } return true; } @@ -50,11 +71,11 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, CHECK(param_.Out) << "Output(Out) of UniqueOp should not be null."; if (opdesc.HasOutput("Index")) { param_.Index = scope->FindMutableTensor(opdesc.Output("Index").front()); - CHECK(param_.Out) << "Output(Index) of UniqueOp should not be null."; + CHECK(param_.Index) << "Output(Index) of UniqueOp should not be null."; } if (opdesc.HasOutput("Indices")) { param_.Indices = scope->FindMutableTensor(opdesc.Output("Indices").front()); - CHECK(param_.Out) << "Output(Indices) of UniqueOp should not be null."; + CHECK(param_.Indices) << "Output(Indices) of UniqueOp should not be null."; } if (opdesc.HasOutput("Counts")) { param_.Counts = scope->FindMutableTensor(opdesc.Output("Counts").front()); @@ -67,7 +88,7 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, if (opdesc.HasAttr("return_index")) { param_.return_index = opdesc.GetAttr("return_index"); } - if (opdesc.HasAttr("return_reverse")) { + if (opdesc.HasAttr("return_inverse")) { param_.return_inverse = 
opdesc.GetAttr("return_inverse"); } if (opdesc.HasAttr("return_counts")) { diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py index 64fc9d7111e..453ac6bd561 100644 --- a/lite/tests/unittest_py/op/test_unique_op.py +++ b/lite/tests/unittest_py/op/test_unique_op.py @@ -47,22 +47,33 @@ def sample_program_configs(self, draw): min_value=2, max_value=100), min_size=1, max_size=1)) + print(in_shape) in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64])) def generate_X_data(): - return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype) - - def generate_IndexTensor(): - return np.random.randint(1, 5, size=in_shape).astype(np.int32) - - dtype = 2 - is_sorted = draw(st.sampled_from([True, False])) - return_index = draw(st.sampled_from([False])) - if is_sorted: - return_inverse = draw(st.sampled_from([True, False])) - else: - return_inverse = True - return_counts = draw(st.sampled_from([False])) + t = np.random.normal(0.0, 5.0, in_shape).astype(in_dtype) + print(t) + return t + + dtype = draw(st.sampled_from([2,3])) + is_sorted = draw(st.booleans()) + return_index = draw(st.booleans()) + return_inverse = draw(st.sampled_from([True])) + return_counts = draw(st.booleans()) + + if is_sorted == False: + return_index = False + return_counts = False + + param_is_sorted = is_sorted + param_return_index = return_index + param_return_inverse = return_inverse + param_return_counts = return_counts + + axis = draw(st.sampled_from([[2], [1], [0], []])) + while len(axis) > 0 and axis[0] >= len(in_shape): + axis[0] = axis[0] - 1 + outputs = [ "Out_data" ] @@ -85,18 +96,15 @@ def generate_IndexTensor(): outputs_config["Counts"] = ["Counts_data"] outputs_dtype["Counts_data"] = np.int32 - axis = draw(st.sampled_from([[0, 1, 2], [1], [0, 2], [2, 1], [0, 1]])) - axis = [] - unique_op = OpConfig( type = "unique", inputs = {"X": ["input_data"]}, outputs = outputs_config, attrs={ - "dtype": 2, - "return_index": return_index, - 
"return_inverse": return_inverse, - "return_counts": return_counts, + "dtype": dtype, + "return_index": param_return_index, + "return_inverse": param_return_inverse, + "return_counts": param_return_counts, "axis": axis, "is_sorted": is_sorted } From 256856055fc42bab5511e9466bd4c9d5f0b24e88 Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Mon, 19 Dec 2022 21:33:45 +0800 Subject: [PATCH 08/10] Fix index_select_op and unique_op --- .../opencl/utils/tune_cache_generated.h | 383 ------------------ lite/core/program.cc | 243 +---------- lite/kernels/host/index_select_compute.cc | 2 + lite/kernels/host/unique_compute.cc | 196 ++++++--- lite/operators/op_params.h | 8 +- lite/operators/unique_op.cc | 8 +- lite/operators/unique_op.h | 4 +- lite/tests/unittest_py/op/test_unique_op.py | 75 ++-- 8 files changed, 204 insertions(+), 715 deletions(-) delete mode 100644 lite/backends/opencl/utils/tune_cache_generated.h diff --git a/lite/backends/opencl/utils/tune_cache_generated.h b/lite/backends/opencl/utils/tune_cache_generated.h deleted file mode 100644 index bb091cce383..00000000000 --- a/lite/backends/opencl/utils/tune_cache_generated.h +++ /dev/null @@ -1,383 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - - -#ifndef FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_ -#define FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_ - -#include "flatbuffers/flatbuffers.h" - -namespace paddle { -namespace lite { -namespace fbs { -namespace opencl { -namespace proto { -namespace TuneCache_ { - -struct TunePair; -struct TunePairBuilder; -struct TunePairT; - -} // namespace TuneCache_ - -struct TuneCache; -struct TuneCacheBuilder; -struct TuneCacheT; - -namespace TuneCache_ { - -bool operator==(const TunePairT &lhs, const TunePairT &rhs); -bool operator!=(const TunePairT &lhs, const TunePairT &rhs); -} // namespace TuneCache_ - -bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs); -bool 
operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs); - -namespace TuneCache_ { - -inline const flatbuffers::TypeTable *TunePairTypeTable(); - -} // namespace TuneCache_ - -inline const flatbuffers::TypeTable *TuneCacheTypeTable(); - -namespace TuneCache_ { - -struct TunePairT : public flatbuffers::NativeTable { - typedef TunePair TableType; - std::string key; - std::vector value; - TunePairT() { - } -}; - -inline bool operator==(const TunePairT &lhs, const TunePairT &rhs) { - return - (lhs.key == rhs.key) && - (lhs.value == rhs.value); -} - -inline bool operator!=(const TunePairT &lhs, const TunePairT &rhs) { - return !(lhs == rhs); -} - - -struct TunePair FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef TunePairT NativeTableType; - typedef TunePairBuilder Builder; - static const flatbuffers::TypeTable *MiniReflectTypeTable() { - return TunePairTypeTable(); - } - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_KEY = 4, - VT_VALUE = 6 - }; - const flatbuffers::String *key() const { - return GetPointer(VT_KEY); - } - flatbuffers::String *mutable_key() { - return GetPointer(VT_KEY); - } - bool KeyCompareLessThan(const TunePair *o) const { - return *key() < *o->key(); - } - int KeyCompareWithValue(const char *val) const { - return strcmp(key()->c_str(), val); - } - const flatbuffers::Vector *value() const { - return GetPointer *>(VT_VALUE); - } - flatbuffers::Vector *mutable_value() { - return GetPointer *>(VT_VALUE); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_KEY) && - verifier.VerifyString(key()) && - VerifyOffset(verifier, VT_VALUE) && - verifier.VerifyVector(value()) && - verifier.EndTable(); - } - TunePairT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; - void UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; - static flatbuffers::Offset 
Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); -}; - -struct TunePairBuilder { - typedef TunePair Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_key(flatbuffers::Offset key) { - fbb_.AddOffset(TunePair::VT_KEY, key); - } - void add_value(flatbuffers::Offset> value) { - fbb_.AddOffset(TunePair::VT_VALUE, value); - } - explicit TunePairBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - TunePairBuilder &operator=(const TunePairBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - fbb_.Required(o, TunePair::VT_KEY); - return o; - } -}; - -inline flatbuffers::Offset CreateTunePair( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset key = 0, - flatbuffers::Offset> value = 0) { - TunePairBuilder builder_(_fbb); - builder_.add_value(value); - builder_.add_key(key); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateTunePairDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const char *key = nullptr, - const std::vector *value = nullptr) { - auto key__ = key ? _fbb.CreateString(key) : 0; - auto value__ = value ? 
_fbb.CreateVector(*value) : 0; - return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair( - _fbb, - key__, - value__); -} - -flatbuffers::Offset CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); - -} // namespace TuneCache_ - -struct TuneCacheT : public flatbuffers::NativeTable { - typedef TuneCache TableType; - std::vector> tune_map; - TuneCacheT() { - } -}; - -inline bool operator==(const TuneCacheT &lhs, const TuneCacheT &rhs) { - return - (lhs.tune_map == rhs.tune_map); -} - -inline bool operator!=(const TuneCacheT &lhs, const TuneCacheT &rhs) { - return !(lhs == rhs); -} - - -struct TuneCache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef TuneCacheT NativeTableType; - typedef TuneCacheBuilder Builder; - static const flatbuffers::TypeTable *MiniReflectTypeTable() { - return TuneCacheTypeTable(); - } - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_TUNE_MAP = 4 - }; - const flatbuffers::Vector> *tune_map() const { - return GetPointer> *>(VT_TUNE_MAP); - } - flatbuffers::Vector> *mutable_tune_map() { - return GetPointer> *>(VT_TUNE_MAP); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffsetRequired(verifier, VT_TUNE_MAP) && - verifier.VerifyVector(tune_map()) && - verifier.VerifyVectorOfTables(tune_map()) && - verifier.EndTable(); - } - TuneCacheT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; - void UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; - static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); -}; - -struct TuneCacheBuilder { - typedef TuneCache Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_tune_map(flatbuffers::Offset>> tune_map) { - 
fbb_.AddOffset(TuneCache::VT_TUNE_MAP, tune_map); - } - explicit TuneCacheBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - TuneCacheBuilder &operator=(const TuneCacheBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - fbb_.Required(o, TuneCache::VT_TUNE_MAP); - return o; - } -}; - -inline flatbuffers::Offset CreateTuneCache( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> tune_map = 0) { - TuneCacheBuilder builder_(_fbb); - builder_.add_tune_map(tune_map); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateTuneCacheDirect( - flatbuffers::FlatBufferBuilder &_fbb, - std::vector> *tune_map = nullptr) { - auto tune_map__ = tune_map ? _fbb.CreateVectorOfSortedTables(tune_map) : 0; - return paddle::lite::fbs::opencl::proto::CreateTuneCache( - _fbb, - tune_map__); -} - -flatbuffers::Offset CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); - -namespace TuneCache_ { - -inline TunePairT *TunePair::UnPack(const flatbuffers::resolver_function_t *_resolver) const { - std::unique_ptr _o = std::unique_ptr(new TunePairT()); - UnPackTo(_o.get(), _resolver); - return _o.release(); -} - -inline void TunePair::UnPackTo(TunePairT *_o, const flatbuffers::resolver_function_t *_resolver) const { - (void)_o; - (void)_resolver; - { auto _e = key(); if (_e) _o->key = _e->str(); } - { auto _e = value(); if (_e) { _o->value.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->value[_i] = _e->Get(_i); } } } -} - -inline flatbuffers::Offset TunePair::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT* _o, const flatbuffers::rehasher_function_t *_rehasher) { - return CreateTunePair(_fbb, _o, _rehasher); -} - -inline flatbuffers::Offset CreateTunePair(flatbuffers::FlatBufferBuilder &_fbb, const TunePairT *_o, const 
flatbuffers::rehasher_function_t *_rehasher) { - (void)_rehasher; - (void)_o; - struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TunePairT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; - auto _key = _fbb.CreateString(_o->key); - auto _value = _fbb.CreateVector(_o->value); - return paddle::lite::fbs::opencl::proto::TuneCache_::CreateTunePair( - _fbb, - _key, - _value); -} - -} // namespace TuneCache_ - -inline TuneCacheT *TuneCache::UnPack(const flatbuffers::resolver_function_t *_resolver) const { - std::unique_ptr _o = std::unique_ptr(new TuneCacheT()); - UnPackTo(_o.get(), _resolver); - return _o.release(); -} - -inline void TuneCache::UnPackTo(TuneCacheT *_o, const flatbuffers::resolver_function_t *_resolver) const { - (void)_o; - (void)_resolver; - { auto _e = tune_map(); if (_e) { _o->tune_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tune_map[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } -} - -inline flatbuffers::Offset TuneCache::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT* _o, const flatbuffers::rehasher_function_t *_rehasher) { - return CreateTuneCache(_fbb, _o, _rehasher); -} - -inline flatbuffers::Offset CreateTuneCache(flatbuffers::FlatBufferBuilder &_fbb, const TuneCacheT *_o, const flatbuffers::rehasher_function_t *_rehasher) { - (void)_rehasher; - (void)_o; - struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TuneCacheT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; - auto _tune_map = _fbb.CreateVector> (_o->tune_map.size(), [](size_t i, _VectorArgs *__va) { return CreateTunePair(*__va->__fbb, __va->__o->tune_map[i].get(), __va->__rehasher); }, &_va ); - return paddle::lite::fbs::opencl::proto::CreateTuneCache( - _fbb, - _tune_map); -} - -namespace TuneCache_ { - -inline const flatbuffers::TypeTable *TunePairTypeTable() { - static const 
flatbuffers::TypeCode type_codes[] = { - { flatbuffers::ET_STRING, 0, -1 }, - { flatbuffers::ET_INT, 1, -1 } - }; - static const char * const names[] = { - "key", - "value" - }; - static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names - }; - return &tt; -} - -} // namespace TuneCache_ - -inline const flatbuffers::TypeTable *TuneCacheTypeTable() { - static const flatbuffers::TypeCode type_codes[] = { - { flatbuffers::ET_SEQUENCE, 1, 0 } - }; - static const flatbuffers::TypeFunction type_refs[] = { - paddle::lite::fbs::opencl::proto::TuneCache_::TunePairTypeTable - }; - static const char * const names[] = { - "tune_map" - }; - static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 1, type_codes, type_refs, nullptr, names - }; - return &tt; -} - -inline const paddle::lite::fbs::opencl::proto::TuneCache *GetTuneCache(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const paddle::lite::fbs::opencl::proto::TuneCache *GetSizePrefixedTuneCache(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline TuneCache *GetMutableTuneCache(void *buf) { - return flatbuffers::GetMutableRoot(buf); -} - -inline bool VerifyTuneCacheBuffer( - flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedTuneCacheBuffer( - flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishTuneCacheBuffer( - flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedTuneCacheBuffer( - flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -inline std::unique_ptr UnPackTuneCache( - const void *buf, - const flatbuffers::resolver_function_t *res = nullptr) { - return std::unique_ptr(GetTuneCache(buf)->UnPack(res)); -} - -inline std::unique_ptr UnPackSizePrefixedTuneCache( - const void 
*buf, - const flatbuffers::resolver_function_t *res = nullptr) { - return std::unique_ptr(GetSizePrefixedTuneCache(buf)->UnPack(res)); -} - -} // namespace proto -} // namespace opencl -} // namespace fbs -} // namespace lite -} // namespace paddle - -#endif // FLATBUFFERS_GENERATED_TUNECACHE_PADDLE_LITE_FBS_OPENCL_PROTO_H_ diff --git a/lite/core/program.cc b/lite/core/program.cc index a6c2698ca37..8f0c0a5043a 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -605,9 +605,27 @@ void RuntimeProgram::Run() { #ifdef LITE_WITH_OPENCL // delegate flush judgement to specify target , it is too heavy for Inst inst.Flush(idx); +#if defined(LITE_WITH_PROFILE) || defined(LITE_WITH_PRECISION_PROFILE) + VLOG(4) << "kernel name " << idx << " " << inst.kernel()->name(); + const auto* op_info = inst.op()->op_info(); + auto var_in_names = op_info->input_names(); + for (int i = 0; i < var_in_names.size(); i++) { + VLOG(4) << "input var_in_names: " << var_in_names[i]; + } + auto var_out_names = op_info->output_names(); + for (int i = 0; i < var_out_names.size(); i++) { + VLOG(4) << "output var_out_names: " << var_out_names[i]; + } +#endif #endif inst.Run(); +#ifdef LITE_WITH_PRECISION_PROFILE + if (inst.op()->Type() != "while") { + precision_profiler_summary += + inst_precision_profiler.GetInstPrecision(&inst); + } +#endif // LITE_WITH_PRECISION_PROFILE } #ifdef LITE_WITH_METAL @@ -797,231 +815,6 @@ void Instruction::Run() { kernel_->Launch(); has_run_ = true; - - - - - - - - - - - - - - -#if 0 - // clang-format off - /* - time_t t; - struct tm* timeinfo; - time(&t); - timeinfo = localtime(&t); - std::cout << "time: " << asctime(timeinfo) << std::endl; - */ - std::cout << "***-----------------------------******-----------------------------***" << std::endl; - // get precision - std::string op_name = op_->op_info()->Type(); - std::cout << "op_type: " << op_name << std::endl; - if ((op_->op_info()->Type() != "fetch") && - (op_->op_info()->Type() != "while") && - 
(op_->op_info()->Type() != "conditional_block")) { - auto op_scope = op_->scope(); - auto out_names = op_->op_info()->output_names(); - auto in_names = op_->op_info()->input_names(); - for (auto& out_name : in_names) { - std::string out_arg_name; - op_->op_info()->GetInputArgname(out_name, &out_arg_name); - //auto type = kernel_->GetInputDeclType(out_arg_name); - // if (type->IsTensor()) { - auto tmp = op_scope->FindVar(out_name); - if (tmp->IsType()) { - const Tensor* tout = op_scope->FindVar(out_name)->GetMutable(); - if (tout->IsInitialized()) { - auto size = tout->numel(); - auto dim = tout->dims(); - double sum = 0.0; - if (tout->precision() == PrecisionType::kFloat) { - const float* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += dout[i]; - } - } else if (tout->precision() == PrecisionType::kFP16) { - const float16_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else if (tout->precision() == PrecisionType::kInt32) { - const int32_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else if (tout->precision() == PrecisionType::kInt64) { - const int64_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else if (tout->precision() == PrecisionType::kInt8) { - const int8_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else { - std::cout << "This data_type is not support: " - << PrecisionToStr(tout->precision()) << std::endl; - } - double avg = sum / static_cast(size); - std::cout << "in_name: " << out_name - << ", type: " << PrecisionToStr(tout->precision()) - << ", size: " << size << ", input avg: " << avg; - - std::cout<<", dim size:"<< dim.size() << "["; - for(int i = 0; i < dim.size(); i++) - std::cout << dim[i] << ","; - std::cout<<"]\n"; - } else { - std::cout << out_name << " is not inited." 
<< std::endl; - } - } else if (tmp->IsType>()) { - auto touts = - op_scope->FindVar(out_name)->GetMutable>(); - for (auto t : *touts) { - const Tensor* tout = &t; - if (tout->IsInitialized()) { - auto size = tout->numel(); - const float* dout = tout->data(); - double sum = 0.0; - for (int i = 0; i < size; i++) { - sum += dout[i]; - } - double avg = sum / static_cast(size); - std::cout << "op_type: " << op_name << ", input avg: " << avg - << std::endl; - } else { - std::cout << out_name << " is not inited." << std::endl; - } - } - } - } - for (auto& out_name : out_names) { - std::string out_arg_name; - op_->op_info()->GetOutputArgname(out_name, &out_arg_name); - //auto type = kernel_->GetOutputDeclType(out_arg_name); - std::string op_name = op_->op_info()->Type(); - //if (type->IsTensor()) { - auto tmp = op_scope->FindVar(out_name); - if (tmp->IsType()) { - const Tensor* tout = op_scope->FindVar(out_name)->GetMutable(); - if (tout->IsInitialized()) { - auto size = tout->numel(); - auto dim = tout->dims(); - double sum = 0.0; - if (tout->precision() == PrecisionType::kFloat) { - const float* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += dout[i]; - } - } else if (tout->precision() == PrecisionType::kFP16) { - const float16_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else if (tout->precision() == PrecisionType::kInt32) { - const int32_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else if (tout->precision() == PrecisionType::kInt64) { - const int64_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else if (tout->precision() == PrecisionType::kInt8) { - const int8_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - } - } else { - std::cout << "This data_type is not support: " - << PrecisionToStr(tout->precision()) << std::endl; - } - double avg = sum / 
static_cast(size); - std::cout << "out_name: " << out_name - << ", type: " << PrecisionToStr(tout->precision()) - << ", sum: " << sum << ", output avg: " << avg; - std::cout<<", dim size:"<< dim.size() << "["; - for(int i = 0; i < dim.size(); i++) - std::cout << dim[i] << ","; - std::cout<<"]\n"; - } else { - std::cout << out_name << " is not inited." << std::endl; - } - } else if (tmp->IsType>()) { - auto touts = - op_scope->FindVar(out_name)->GetMutable>(); - for (auto t : *touts) { - const Tensor* tout = &t; - if (tout->IsInitialized()) { - auto size = tout->numel(); - double sum = 0.0; - if (tout->precision() == PrecisionType::kFloat) { - const float* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += dout[i]; - std::cout << dout[i] << ", "; - } - } else if (tout->precision() == PrecisionType::kFP16) { - const float16_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - std::cout << dout[i] << ", "; - } - } else if (tout->precision() == PrecisionType::kInt32) { - const int32_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - std::cout << dout[i] << ", "; - } - } else if (tout->precision() == PrecisionType::kInt64) { - const int64_t* dout = tout->data(); - for (int i = 0; i < size; i++) { - sum += static_cast(dout[i]); - std::cout << dout[i] << ", "; - } - } else { - std::cout << "This data_type is not support: " - << PrecisionToStr(tout->precision()) << std::endl; - } - double avg = sum / static_cast(size); - std::cout << std::endl; - std::cout << "op_type: " << op_name << out_name - << ", type: " << PrecisionToStr(tout->precision()) - << ", output avg: " << avg << std::endl; - } else { - std::cout << out_name << " is not inited." 
<< std::endl; - } - } - } - } - std::cout << "***-----------------------------******-----------------------------***" << std::endl; - } -#endif - - - - - - - - - - - - - - - #ifdef LITE_WITH_PROFILE if (first_epoch_for_profiler_) { kernel_->SetIsKernelTest(false); diff --git a/lite/kernels/host/index_select_compute.cc b/lite/kernels/host/index_select_compute.cc index f4ff2b1ad8c..1c4be1c8df2 100644 --- a/lite/kernels/host/index_select_compute.cc +++ b/lite/kernels/host/index_select_compute.cc @@ -35,6 +35,8 @@ void Index_selectCompute::Run() { auto index_ddim = index->dims(); auto output_ddim = output->dims(); + if (param.dim < 0) param.dim += input_ddim.size(); + int left = input_ddim.count(0, param.dim); int middle = input_ddim[param.dim]; int right = input_ddim.count(param.dim + 1, input_ddim.size()); diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc index b29101e52ed..5f1458f2bc2 100644 --- a/lite/kernels/host/unique_compute.cc +++ b/lite/kernels/host/unique_compute.cc @@ -17,13 +17,12 @@ #include #include +#include #include #include #include #include #include -#include - namespace paddle { namespace lite { @@ -31,10 +30,10 @@ namespace kernels { namespace host { template -void UniqueFunc(const lite::Tensor* x, - lite::Tensor* out, - lite::Tensor* index, - lite::Tensor* count) { +void UniqueFunc(const lite::Tensor* x, + lite::Tensor* out, + lite::Tensor* index, + lite::Tensor* count) { const InT* in_data = x->template data(); IndexT* index_data = index->mutable_data(); @@ -187,9 +186,10 @@ static ForwardIt UniqueDimImpl(ForwardIt first, return ++result; } -template +template void TensorFromVector(const std::vector& src, lite::Tensor* dst) { auto* src_ptr = static_cast(src.data()); + dst->Resize({static_cast(src.size())}); auto* dst_ptr = static_cast(dst->mutable_data()); auto size = src.size() * sizeof(T); lite::TargetWrapperHost::MemcpySync( @@ -197,16 +197,16 @@ void TensorFromVector(const std::vector& src, lite::Tensor* 
dst) { } template -void TransCompute(const Tensor &input, - Tensor *output, - const std::vector &orders) { +void TransCompute(const Tensor& input, + Tensor* output, + const std::vector& orders) { auto in_dims = input.dims(); auto out_dims = output->dims(); int num_axes = in_dims.size(); int count = in_dims.production(); - const T *din = input.data(); - T *dout = output->mutable_data(); + const T* din = input.data(); + T* dout = output->mutable_data(); std::vector old_temps; int temp = 1; @@ -244,8 +244,8 @@ void TransCompute(const Tensor &input, lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) { return DDim(std::vector{ - src.Slice(0, num_col_dims).production(), - src.Slice(num_col_dims, src.size()).production()}); + src.Slice(0, num_col_dims).production(), + src.Slice(num_col_dims, src.size()).production()}); } template @@ -318,8 +318,8 @@ void UniqueDimFunc(const lite::Tensor& in, sorted_indices_vec.end(), [&](int64_t a, int64_t b) -> bool { for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a*col]; - InT rhs = in_trans_data[i + b*col]; + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; if (lhs < rhs) { return true; } else if (lhs > rhs) { @@ -328,7 +328,7 @@ void UniqueDimFunc(const lite::Tensor& in, } return false; }); - + // sort tensor according to indices lite::Tensor input_sorted; input_sorted.Resize(in_trans_dims); @@ -338,7 +338,6 @@ void UniqueDimFunc(const lite::Tensor& in, memcpy(input_sorted_data + i * col, in_trans_data + static_cast(sorted_indices_vec[i]) * col, col * sizeof(InT)); - } std::vector input_unbind = Unbind(input_sorted); @@ -346,16 +345,17 @@ void UniqueDimFunc(const lite::Tensor& in, std::vector counts_vec(sorted_indices_vec.size(), 0); std::vector indices_vec(sorted_indices_vec.size(), 0); auto last = UniqueDimImpl::iterator, InT, IndexT>( - input_unbind.begin(), - input_unbind.end(), - sorted_indices_vec, - &inverse_vec, - &counts_vec, - &indices_vec); + input_unbind.begin(), 
+ input_unbind.end(), + sorted_indices_vec, + &inverse_vec, + &counts_vec, + &indices_vec); input_unbind.erase(last, input_unbind.end()); counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); - indices_vec.erase(indices_vec.begin() + input_unbind.size(), indices_vec.end()); - + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + lite::Tensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); @@ -368,20 +368,16 @@ void UniqueDimFunc(const lite::Tensor& in, TransCompute(out_trans, out, permute); if (return_inverse) { - index->Resize({in.numel()}); TensorFromVector(inverse_vec, index); } if (return_counts) { - count->Resize({out->numel()}); TensorFromVector(counts_vec, count); } if (return_index) { - indices->Resize({out->numel()}); TensorFromVector(indices_vec, indices); } - } void UniqueCompute::Run() { @@ -399,7 +395,8 @@ void UniqueCompute::Run() { auto is_sorted = param.is_sorted; lite_api::PrecisionType index_type = index->precision(); - bool index_type_match = index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64); + bool index_type_match = + index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64); lite_api::PrecisionType type = x->precision(); CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds " << static_cast(type) @@ -437,10 +434,10 @@ void UniqueCompute::Run() { LOG(FATAL) << "unique does not implement for the " << "input type:" << static_cast(type); break; - } - } - return; - } + } + } + return; + } if (x->numel() == 0) { switch (type) { @@ -458,20 +455,41 @@ void UniqueCompute::Run() { << "input type:" << static_cast(type); break; } - + return; } if (axis_vec.empty()) { if (index_type == PRECISION(kInt32)) { switch (type) { case PRECISION(kFloat): - UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, + output, + 
index, + indices, + count, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt32): - UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, + output, + index, + indices, + count, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt64): - UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, + output, + index, + indices, + count, + return_index, + return_inverse, + return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -481,13 +499,34 @@ void UniqueCompute::Run() { } else { switch (type) { case PRECISION(kFloat): - UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, + output, + index, + indices, + count, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt32): - UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, + output, + index, + indices, + count, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt64): - UniqueFlattendTensorFunc(*x, output, index, indices, count, return_index, return_inverse, return_counts); + UniqueFlattendTensorFunc(*x, + output, + index, + indices, + count, + return_index, + return_inverse, + return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -500,13 +539,37 @@ void UniqueCompute::Run() { if (index_type == PRECISION(kInt32)) { switch (type) { case PRECISION(kFloat): - UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, + output, + index, + indices, + count, + axis, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt32): - UniqueDimFunc(*x, output, 
index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, + output, + index, + indices, + count, + axis, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt64): - UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, + output, + index, + indices, + count, + axis, + return_index, + return_inverse, + return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -515,13 +578,37 @@ void UniqueCompute::Run() { } else { switch (type) { case PRECISION(kFloat): - UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, + output, + index, + indices, + count, + axis, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt32): - UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, + output, + index, + indices, + count, + axis, + return_index, + return_inverse, + return_counts); break; case PRECISION(kInt64): - UniqueDimFunc(*x, output, index, indices, count, axis, return_index, return_inverse, return_counts); + UniqueDimFunc(*x, + output, + index, + indices, + count, + axis, + return_index, + return_inverse, + return_counts); break; default: LOG(FATAL) << "unique does not implement for the " @@ -529,21 +616,15 @@ void UniqueCompute::Run() { } } } -} - +} } // namespace host } // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_KERNEL(unique, - kHost, - kAny, - kAny, - paddle::lite::kernels::host::UniqueCompute, - def) +REGISTER_LITE_KERNEL( + unique, kHost, kAny, kAny, paddle::lite::kernels::host::UniqueCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny), @@ -565,4 +646,3 @@ REGISTER_LITE_KERNEL(unique, PRECISION(kInt32), DATALAYOUT(kAny))}) .Finalize(); - diff --git a/lite/operators/op_params.h 
b/lite/operators/op_params.h index 1b9e121e4b7..a023deee21c 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -2288,12 +2288,12 @@ struct UniqueWithCountsParam : ParamBase { struct UniqueParam : ParamBase { const lite::Tensor* X{}; lite::Tensor* Out{}; - lite::Tensor* Index{}; // the indices in the original input - lite::Tensor* Indices{}; // the indices in the result + lite::Tensor* Index{}; // the indices in the original input + lite::Tensor* Indices{}; // the indices in the result lite::Tensor* Counts{}; int dtype{-1}; - bool return_index{false}; // Indices - bool return_inverse{false}; // Index + bool return_index{false}; // Indices + bool return_inverse{false}; // Index bool return_counts{false}; std::vector axis{}; bool is_sorted{false}; diff --git a/lite/operators/unique_op.cc b/lite/operators/unique_op.cc index 84f829028a9..01155347491 100644 --- a/lite/operators/unique_op.cc +++ b/lite/operators/unique_op.cc @@ -14,6 +14,7 @@ #include "lite/operators/unique_op.h" #include "lite/core/op_registry.h" + namespace paddle { namespace lite { namespace operators { @@ -63,8 +64,7 @@ bool UniqueOp::InferShapeImpl() const { return true; } -bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, - lite::Scope *scope) { +bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.X = scope->FindTensor(opdesc.Input("X").front()); CHECK(param_.X) << "Input(X) of UniqueOp should not be null."; param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front()); @@ -81,7 +81,7 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, param_.Counts = scope->FindMutableTensor(opdesc.Output("Counts").front()); CHECK(param_.Counts) << "Output(Counts) of UniqueOp should not be null."; } - + if (opdesc.HasAttr("dtype")) { param_.dtype = opdesc.GetAttr("dtype"); } @@ -97,7 +97,7 @@ bool UniqueOp::AttachImpl(const cpp::OpDesc &opdesc, param_.axis = opdesc.GetAttr>("axis"); if (opdesc.HasAttr("is_sorted")) { param_.is_sorted = 
opdesc.GetAttr("is_sorted"); - } + } return true; } diff --git a/lite/operators/unique_op.h b/lite/operators/unique_op.h index c9e302b7566..69cb898eae6 100644 --- a/lite/operators/unique_op.h +++ b/lite/operators/unique_op.h @@ -35,7 +35,7 @@ class UniqueOp : public OpLite { bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - + std::string DebugString() const override { return "unique"; } bool InferType() override { @@ -49,4 +49,4 @@ class UniqueOp : public OpLite { } // namespace operators } // namespace lite -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/lite/tests/unittest_py/op/test_unique_op.py b/lite/tests/unittest_py/op/test_unique_op.py index 453ac6bd561..3130967e702 100644 --- a/lite/tests/unittest_py/op/test_unique_op.py +++ b/lite/tests/unittest_py/op/test_unique_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,7 +33,7 @@ def __init__(self, *args, **kwargs): host_places = [ Place(TargetType.Host, PrecisionType.FP32, DataLayoutType.NCHW) ] - self.enable_testing_on_place(places=host_places, thread=[1,4]) + self.enable_testing_on_place(places=host_places, thread=[1, 4]) def is_program_valid(self, program_config: ProgramConfig, @@ -41,48 +41,46 @@ def is_program_valid(self, return True def sample_program_configs(self, draw): - in_shape = draw( - st.lists( - st.integers( - min_value=2, max_value=100), - min_size=1, - max_size=1)) - print(in_shape) - in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64])) - - def generate_X_data(): - t = np.random.normal(0.0, 5.0, in_shape).astype(in_dtype) - print(t) - return t - - dtype = draw(st.sampled_from([2,3])) + dtype = draw(st.sampled_from([2, 3])) is_sorted = draw(st.booleans()) return_index = draw(st.booleans()) return_inverse = draw(st.sampled_from([True])) return_counts = draw(st.booleans()) + in_shape = draw( + st.lists( + st.integers( + min_value=2, max_value=10), min_size=1, max_size=8)) + if is_sorted == False: return_index = False + return_inverse = True return_counts = False + in_shape = draw( + st.lists( + st.integers( + min_value=2, max_value=10), + min_size=1, + max_size=1)) + + in_dtype = draw(st.sampled_from([np.float32, np.int32, np.int64])) + + def generate_X_data(): + return np.random.normal(0.0, 5.0, in_shape).astype(in_dtype) param_is_sorted = is_sorted param_return_index = return_index param_return_inverse = return_inverse param_return_counts = return_counts - - axis = draw(st.sampled_from([[2], [1], [0], []])) + + axis = draw( + st.sampled_from([[], [0], [1], [2], [3], [4], [5], [6], [7], [8]])) while len(axis) > 0 and axis[0] >= len(in_shape): - axis[0] = axis[0] - 1 + axis[0] = axis[0] - 1 - outputs = [ - "Out_data" - ] - outputs_config = { - "Out": ["Out_data"] - } - outputs_dtype = { - "Out_data": in_dtype - } + outputs = ["Out_data"] + outputs_config = {"Out": ["Out_data"]} + outputs_dtype 
= {"Out_data": in_dtype} if return_inverse: outputs.append("Index_data") outputs_config["Index"] = ["Index_data"] @@ -93,13 +91,13 @@ def generate_X_data(): outputs_dtype["Indices_data"] = np.int32 if return_counts: outputs.append("Counts_data") - outputs_config["Counts"] = ["Counts_data"] + outputs_config["Counts"] = ["Counts_data"] outputs_dtype["Counts_data"] = np.int32 unique_op = OpConfig( - type = "unique", - inputs = {"X": ["input_data"]}, - outputs = outputs_config, + type="unique", + inputs={"X": ["input_data"]}, + outputs=outputs_config, attrs={ "dtype": dtype, "return_index": param_return_index, @@ -107,8 +105,7 @@ def generate_X_data(): "return_counts": param_return_counts, "axis": axis, "is_sorted": is_sorted - } - ) + }) unique_op.outputs_dtype = outputs_dtype @@ -118,8 +115,7 @@ def generate_X_data(): inputs={ "input_data": TensorConfig(data_gen=partial(generate_X_data)) }, - outputs=outputs - ) + outputs=outputs) return program_config def sample_predictor_configs(self): @@ -129,7 +125,8 @@ def add_ignore_pass_case(self): pass def test(self, *args, **kwargs): - self.run_and_statis(quant=False, max_examples=25) + self.run_and_statis(quant=False, max_examples=100) + if __name__ == "__main__": - unittest.main(argv=['']) + unittest.main(argv=['']) From 2718edb0a27b326dc0319dc4e0e7b771a43242e7 Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Tue, 20 Dec 2022 18:18:44 +0800 Subject: [PATCH 09/10] complate unique_op From 2e4ed6e2f242f86919c9e030502ed6144ba2f9ab Mon Sep 17 00:00:00 2001 From: Qijian Tian <1741919942@qq.com> Date: Thu, 22 Dec 2022 11:36:22 +0800 Subject: [PATCH 10/10] rename TensorFromVector --- lite/kernels/host/unique_compute.cc | 58 ++++++++--------------------- 1 file changed, 16 insertions(+), 42 deletions(-) diff --git a/lite/kernels/host/unique_compute.cc b/lite/kernels/host/unique_compute.cc index 5f1458f2bc2..b891d1435c9 100644 --- a/lite/kernels/host/unique_compute.cc +++ b/lite/kernels/host/unique_compute.cc @@ 
-13,8 +13,6 @@ // limitations under the License. #include "lite/kernels/host/unique_compute.h" -#include "lite/core/tensor.h" - #include #include #include @@ -23,6 +21,7 @@ #include #include #include +#include "lite/core/tensor.h" namespace paddle { namespace lite { @@ -36,12 +35,9 @@ void UniqueFunc(const lite::Tensor* x, lite::Tensor* count) { const InT* in_data = x->template data(); IndexT* index_data = index->mutable_data(); - int64_t j = 0; - std::unordered_map dict; std::vector uniq; - for (auto i = 0; i < x->numel(); i++) { auto it = dict.find(in_data[i]); if (it == dict.end()) { @@ -53,7 +49,6 @@ void UniqueFunc(const lite::Tensor* x, index_data[i] = static_cast(it->second); } } - if (count != nullptr) { // Resize the count tensor dims to allocate the memory count->Resize({static_cast(uniq.size())}); @@ -84,7 +79,6 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in, out->Resize({static_cast(unique.size())}); auto out_data = out->mutable_data(); std::copy(unique.begin(), unique.end(), out_data); - if (return_index) { indices->Resize({out->numel()}); auto indices_data = indices->mutable_data(); @@ -98,7 +92,6 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in, indices_data[i] = indices_map[out_data[i]]; } } - if (return_inverse) { index->Resize({in.numel()}); auto inverse_data = index->mutable_data(); @@ -111,7 +104,6 @@ void UniqueFlattendTensorFunc(const lite::Tensor& in, inverse_data[i] = inverse_map[in_data[i]]; } } - if (return_counts) { count->Resize({out->numel()}); auto count_data = count->mutable_data(); @@ -162,14 +154,11 @@ static ForwardIt UniqueDimImpl(ForwardIt first, if (first == last) { return last; } - (*inverse_vec)[sorted_indices_vec[0]] = 0; (*counts_vec)[0] = 1; (*indices_vec)[0] = sorted_indices_vec[0]; - ForwardIt begin = first; ForwardIt result = first; - while (++first != last) { int64_t idx_first = std::distance(begin, first); int64_t idx_result = std::distance(begin, result); @@ -187,7 +176,7 @@ static ForwardIt 
UniqueDimImpl(ForwardIt first, } template -void TensorFromVector(const std::vector& src, lite::Tensor* dst) { +void UniqueTensorFromVector(const std::vector& src, lite::Tensor* dst) { auto* src_ptr = static_cast(src.data()); dst->Resize({static_cast(src.size())}); auto* dst_ptr = static_cast(dst->mutable_data()); @@ -197,17 +186,15 @@ void TensorFromVector(const std::vector& src, lite::Tensor* dst) { } template -void TransCompute(const Tensor& input, - Tensor* output, - const std::vector& orders) { +void UniqueTransCompute(const Tensor& input, + Tensor* output, + const std::vector& orders) { auto in_dims = input.dims(); auto out_dims = output->dims(); int num_axes = in_dims.size(); int count = in_dims.production(); - const T* din = input.data(); T* dout = output->mutable_data(); - std::vector old_temps; int temp = 1; for (int i = 0; i < num_axes; ++i) { @@ -218,7 +205,6 @@ void TransCompute(const Tensor& input, for (int i = 0; i < num_axes; i++) { old_steps.push_back(old_temps[num_axes - 1 - i]); } - std::vector new_temps; temp = 1; for (int i = 0; i < num_axes; ++i) { @@ -229,7 +215,6 @@ void TransCompute(const Tensor& input, for (int i = 0; i < num_axes; i++) { new_steps.push_back(new_temps[num_axes - 1 - i]); } - for (int i = 0; i < count; ++i) { int old_idx = 0; int idx = i; @@ -242,16 +227,16 @@ void TransCompute(const Tensor& input, } } -lite::DDim FlattenTo2d(const lite::DDim& src, int num_col_dims) { +lite::DDim UniqueFlattenTo2d(const lite::DDim& src, int num_col_dims) { return DDim(std::vector{ src.Slice(0, num_col_dims).production(), src.Slice(num_col_dims, src.size()).production()}); } template -void concat_func(const std::vector& input, - const int axis, - lite::Tensor* output) { +void UniqueConcatFunc(const std::vector& input, + const int axis, + lite::Tensor* output) { size_t num = input.size(); auto dim_0 = input[0].dims(); int64_t concat_input_size = 1; @@ -262,7 +247,6 @@ void concat_func(const std::vector& input, for (int i = 0; i < axis; i++) { 
num_cancats *= dim_0[i]; } - auto* dst_ptr = output->mutable_data(); const int out_concat_axis = output->dims()[axis]; int64_t offset_concat_axis = 0; @@ -304,9 +288,9 @@ void UniqueDimFunc(const lite::Tensor& in, lite::DDim in_trans_dims = DDim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); in_trans.mutable_data(); - TransCompute(in, &in_trans, permute); + UniqueTransCompute(in, &in_trans, permute); // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - lite::DDim in_trans_flat_dims = FlattenTo2d(in_trans_dims, 1); + lite::DDim in_trans_flat_dims = UniqueFlattenTo2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); // sort indices @@ -328,7 +312,6 @@ void UniqueDimFunc(const lite::Tensor& in, } return false; }); - // sort tensor according to indices lite::Tensor input_sorted; input_sorted.Resize(in_trans_dims); @@ -339,7 +322,6 @@ void UniqueDimFunc(const lite::Tensor& in, in_trans_data + static_cast(sorted_indices_vec[i]) * col, col * sizeof(InT)); } - std::vector input_unbind = Unbind(input_sorted); std::vector inverse_vec(sorted_indices_vec.size(), 0); std::vector counts_vec(sorted_indices_vec.size(), 0); @@ -355,7 +337,6 @@ void UniqueDimFunc(const lite::Tensor& in, counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); indices_vec.erase(indices_vec.begin() + input_unbind.size(), indices_vec.end()); - lite::Tensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); @@ -364,19 +345,16 @@ void UniqueDimFunc(const lite::Tensor& in, std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); out->Resize(out_trans_dims_vec); out->mutable_data(); - concat_func(input_unbind, 0, &out_trans); - TransCompute(out_trans, out, permute); - + UniqueConcatFunc(input_unbind, 0, &out_trans); + UniqueTransCompute(out_trans, out, permute); if (return_inverse) { - TensorFromVector(inverse_vec, index); + UniqueTensorFromVector(inverse_vec, index); } - if (return_counts) { - 
TensorFromVector(counts_vec, count); + UniqueTensorFromVector(counts_vec, count); } - if (return_index) { - TensorFromVector(indices_vec, indices); + UniqueTensorFromVector(indices_vec, indices); } } @@ -393,7 +371,6 @@ void UniqueCompute::Run() { bool return_counts = param.return_counts; auto axis_vec = param.axis; auto is_sorted = param.is_sorted; - lite_api::PrecisionType index_type = index->precision(); bool index_type_match = index_type == PRECISION(kInt32) || index_type == PRECISION(kInt64); @@ -401,7 +378,6 @@ void UniqueCompute::Run() { CHECK_EQ(index_type_match, true) << "Index holds the wrong type, it holds " << static_cast(type) << "but desires to be int32 or int64"; - if (!is_sorted) { if (index_type == PRECISION(kInt32)) { switch (type) { @@ -438,7 +414,6 @@ void UniqueCompute::Run() { } return; } - if (x->numel() == 0) { switch (type) { case PRECISION(kFloat): @@ -455,7 +430,6 @@ void UniqueCompute::Run() { << "input type:" << static_cast(type); break; } - return; } if (axis_vec.empty()) {