Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in cuda device context #3101

Merged
merged 3 commits into from
Jul 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions cmake/external/eigen.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
# for latest version, please get from official website
# URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
# URL_MD5 "1a47e78efe365a97de0c022d127607c3"

# for no-ssl http support, please get from bazel's mirror
# URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz"
# URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7"

# get from github mirror
GIT_REPOSITORY "/~https://github.com/RLovelett/eigen.git"
GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048"
GIT_TAG "master"
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
Expand Down
2 changes: 1 addition & 1 deletion cmake/flags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)

# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
Expand Down
42 changes: 12 additions & 30 deletions paddle/framework/detail/tensor-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,56 +83,38 @@ inline void Tensor::ShareDataWith(const Tensor& src) {

template <typename T>
inline void Tensor::CopyFrom(const Tensor& src,
const platform::CPUDeviceContext& ctx) {
const platform::Place& dst_place) {
src.check_memory_size<T>();
Resize(src.dims());

auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>());

auto dst_place = ctx.GetPlace();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));

auto size = product(src.dims_) * sizeof(T);

if (platform::is_cpu_place(src_place)) {
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifndef PADDLE_ONLY_CPU
else if (platform::is_gpu_place(src_place)) {
else if (platform::is_gpu_place(src_place) &&
platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
}
#endif
}

#ifndef PADDLE_ONLY_CPU
template <typename T>
inline void Tensor::CopyFrom(const Tensor& src,
const platform::CUDADeviceContext& ctx) {
src.check_memory_size<T>();
Resize(src.dims());

auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>());

auto dst_place = ctx.GetPlace();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));

auto size = product(src.dims_) * sizeof(T);

if (platform::is_cpu_place(src_place)) {
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size,
ctx.stream());
} else if (platform::is_gpu_place(src_place)) {
boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
boost::get<platform::GPUPlace>(src_place), src_ptr, size,
ctx.stream());
boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
}
}

#endif
}

template <typename T>
inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
Expand Down
9 changes: 1 addition & 8 deletions paddle/framework/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,7 @@ class Tensor {
* @note CopyFrom supports CPU <-> GPU, GPU <-> GPU.
*/
template <typename T>
inline void CopyFrom(const Tensor& src,
const platform::CPUDeviceContext& ctx);

#ifndef PADDLE_ONLY_CPU
template <typename T>
inline void CopyFrom(const Tensor& src,
const platform::CUDADeviceContext& ctx);
#endif
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);

/**
* @brief Return the slice of the tensor.
Expand Down
18 changes: 9 additions & 9 deletions paddle/framework/tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ TEST(Tensor, CopyFrom) {
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
memcpy(src_ptr, arr, 9 * sizeof(int));

auto* cpu_ctx = new paddle::platform::CPUDeviceContext();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_ctx);
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);

const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
Expand All @@ -208,7 +208,7 @@ TEST(Tensor, CopyFrom) {
}

Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_ctx);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
Expand All @@ -228,12 +228,12 @@ TEST(Tensor, CopyFrom) {
memcpy(src_ptr, arr, 9 * sizeof(int));

// CPU Tensor to GPU Tensor
auto gpu_ctx = new paddle::platform::CUDADeviceContext(0);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_ctx);
auto gpu_place = new paddle::platform::GPUPlace(0);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);

// GPU Tensor to CPU Tensor
auto cpu_ctx = new paddle::platform::CPUDeviceContext();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_ctx);
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);

// Compare Tensors
const int* dst_ptr = dst_tensor.data<int>();
Expand All @@ -245,10 +245,10 @@ TEST(Tensor, CopyFrom) {
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);

// CPU Slice Tensor to GPU Tensor
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_ctx);
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);

// GPU Tensor to CPU Tensor
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_ctx);
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);

// Compare Slice Tensors
const int* slice_ptr = slice_tensor.data<int>();
Expand Down
23 changes: 14 additions & 9 deletions paddle/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,19 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {

CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
SetDeviceId(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_));
// TODO (qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
// here will cause segment fault. We must implement a class derived from
// Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
// later. Please refer to the implementation of class EigenCudaStreamDevice
// in TensorFlow.
//
// We find that CUDA 7 introduces a new option, the per-thread default stream,
// that has two effects. Please refer to https://devblogs.nvidia.com/
// parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
//
// So, we decide to use default stream and add –default-stream per-thread nvcc
// flag. Than, two threads with two CUDADeviceContexts will run parallelly.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

than -> then
parallelly -> simultaneously

eigen_stream_.reset(new Eigen::CudaStreamDevice());
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}

Expand All @@ -64,15 +75,12 @@ CUDADeviceContext::~CUDADeviceContext() {
}
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
}

Place CUDADeviceContext::GetPlace() const { return place_; }

cudaStream_t CUDADeviceContext::stream() const { return stream_; }

void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaStreamSynchronize(0));
}

Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
Expand All @@ -83,7 +91,6 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
if (!cublas_handle_) {
SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
}
return cublas_handle_;
}
Expand All @@ -92,7 +99,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
if (!cudnn_handle_) {
SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
}
return cudnn_handle_;
}
Expand All @@ -104,7 +110,6 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
CURAND_RNG_PSEUDO_DEFAULT));
PADDLE_ENFORCE(
dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
}
return curand_generator_;
}
Expand Down
5 changes: 0 additions & 5 deletions paddle/platform/device_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,6 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const;

/*! \brief Return CUDA stream in the device context. */
cudaStream_t stream() const;

/*! \brief Return place in the device context. */
Place GetPlace() const override;

Expand Down Expand Up @@ -91,8 +88,6 @@ class CUDADeviceContext : public DeviceContext {
private:
uint64_t seed_;

cudaStream_t stream_;

// clang-format off
cudnnHandle_t cudnn_handle_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
Expand Down