From 4a8775b8773399b4a39c1d5d69ddb3f529b1ec6f Mon Sep 17 00:00:00 2001 From: sabreshao Date: Mon, 9 Apr 2018 16:32:09 +0800 Subject: [PATCH] Add HIP support for fluid/memory. Refine with latest cmake. --- paddle/fluid/memory/detail/CMakeLists.txt | 2 + paddle/fluid/memory/detail/buddy_allocator.cc | 2 +- .../fluid/memory/detail/system_allocator.cc | 104 ++++++++++++++++++ paddle/fluid/memory/detail/system_allocator.h | 2 +- .../memory/detail/system_allocator_test.cc | 2 +- paddle/fluid/memory/malloc.cc | 6 +- paddle/fluid/memory/malloc_test.cc | 2 +- paddle/fluid/memory/memcpy.cc | 71 ++++++++++++ paddle/fluid/memory/memcpy.h | 22 ++++ 9 files changed, 206 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index c725dba5e98c20..4bccf2fd70c605 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -2,6 +2,8 @@ cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc) if(${WITH_GPU}) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) +elseif (WITH_AMD_GPU) + hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 4194ba197948b4..2d3d9a5702d2b3 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -177,7 +177,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { } BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) if (system_allocator_->UseGpu()) { if ((total_used_ + total_free_) == 0) { // Compute the maximum allocation size for the first allocation. diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index d5390529163491..47da4462e337e3 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -187,6 +187,110 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_HIP + +void* GPUAllocator::Alloc(size_t* index, size_t size) { + // CUDA documentation doesn't explain if hipMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) return nullptr; + void* p; + int prev_id; + hipGetDevice(&prev_id); + if (prev_id != gpu_id_) { + hipSetDevice(gpu_id_); + } + + hipError_t result = hipMalloc(&p, size); + + if (prev_id != gpu_id_) { + hipSetDevice(prev_id); + } + + if (result == hipSuccess) { + *index = 0; + gpu_alloc_size_ += size; + return p; + } else { + LOG(WARNING) + << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " + "environment variable to a lower value. Current value is " + << FLAGS_fraction_of_gpu_memory_to_use; + return nullptr; + } +} + +void GPUAllocator::Free(void* p, size_t size, size_t index) { + hipError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = hipFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = hipHostFree(p); + } + + if (err != hipSuccess) { + PADDLE_ENFORCE(err, "hipFree failed in GPUAllocator::Free."); + } +} + +bool GPUAllocator::UseGpu() const { return true; } + +// PINNED memory allows direct DMA transfers by the GPU to and from system +// memory. It’s locked to a physical address. +void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size + // of host pinned allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + size_t usable = + paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_; + + if (size > usable) { + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB pinned memory." + << ", available " << usable / 1024.0 / 1024.0 << " MB"; + return nullptr; + } + + void* p; + // PINNED memory is visible to all HIP contexts. + hipError_t result = hipHostMalloc(&p, size); + + if (result == hipSuccess) { + *index = 1; // PINNED memory + cuda_pinnd_alloc_size_ += size; + return p; + } else { + LOG(WARNING) << "hipMallocHost failed."; + return nullptr; + } + + return nullptr; +} + +void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { + hipError_t err; + PADDLE_ASSERT(index == 1); + + PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size); + cuda_pinnd_alloc_size_ -= size; + err = hipHostFree(p); + + if (err != hipSuccess) { + PADDLE_ENFORCE(err, "hipFreeHost failed in GPUPinnedAllocator::Free."); + } +} + +bool CUDAPinnedAllocator::UseGpu() const { return false; } + +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index a0386a2dad1bb7..f4f6504420783f 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 268260142c579e..66686a9c9bc032 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) { TestAllocator(&a, 0); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); TestAllocator(&a, 2048); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c74f62de5c6f5..127864ea4b96d9 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -56,7 +56,7 @@ size_t Used(platform::CPUPlace place) { return GetCPUBuddyAllocator()->Used(); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static BuddyAllocator** as = NULL; @@ -151,7 +151,7 @@ size_t Usage::operator()(const platform::CPUPlace& cpu) const { } size_t Usage::operator()(const platform::CUDAPlace& gpu) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return Used(gpu); #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); @@ -159,7 +159,7 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return Used(cuda_pinned); #else PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); diff --git a/paddle/fluid/memory/malloc_test.cc b/paddle/fluid/memory/malloc_test.cc index d39466ef60c375..d3ba59657d7290 100644 --- a/paddle/fluid/memory/malloc_test.cc +++ b/paddle/fluid/memory/malloc_test.cc @@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { } } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) size_t align(size_t size, paddle::platform::CUDAPlace place) { size += sizeof(paddle::memory::detail::MemoryBlock::Desc); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index eddcaab8befda8..05d14475bcdff0 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -97,5 +97,76 @@ void Copy( #endif +#ifdef PADDLE_WITH_HIP +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, hipStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, hipStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, hipStream_t stream) { + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CPUPlace src_place, const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CUDAPlace src_place, const void* src, size_t num, + hipStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num, + hipStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); +} + +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 7b2b8eb0662fb1..290f44f8016294 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -53,6 +53,28 @@ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, cudaStream_t stream); +#endif + +#ifdef PADDLE_WITH_HIP + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + hipStream_t stream); + #endif } // namespace memory } // namespace paddle