From 4a8775b8773399b4a39c1d5d69ddb3f529b1ec6f Mon Sep 17 00:00:00 2001
From: sabreshao <sabre.shao@amd.com>
Date: Mon, 9 Apr 2018 16:32:09 +0800
Subject: [PATCH] Add HIP support for fluid/memory. Refine with latest cmake.

---
 paddle/fluid/memory/detail/CMakeLists.txt     |   2 +
 paddle/fluid/memory/detail/buddy_allocator.cc |   2 +-
 .../fluid/memory/detail/system_allocator.cc   | 104 ++++++++++++++++++
 paddle/fluid/memory/detail/system_allocator.h |   2 +-
 .../memory/detail/system_allocator_test.cc    |   2 +-
 paddle/fluid/memory/malloc.cc                 |   6 +-
 paddle/fluid/memory/malloc_test.cc            |   2 +-
 paddle/fluid/memory/memcpy.cc                 |  71 ++++++++++++
 paddle/fluid/memory/memcpy.h                  |  22 ++++
 9 files changed, 206 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index c725dba5e98c20..4bccf2fd70c605 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -2,6 +2,8 @@ cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
 
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
+elseif (WITH_AMD_GPU)
+  hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 4194ba197948b4..2d3d9a5702d2b3 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -177,7 +177,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
       // Compute the maximum allocation size for the first allocation.
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index d5390529163491..47da4462e337e3 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -187,6 +187,110 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 
 #endif
 
+#ifdef PADDLE_WITH_HIP
+
+void* GPUAllocator::Alloc(size_t* index, size_t size) {
+  // CUDA documentation doesn't explain if hipMalloc returns nullptr
+  // if size is 0.  We just make sure it does.
+  if (size <= 0) return nullptr;
+  void* p;
+  int prev_id;
+  hipGetDevice(&prev_id);
+  if (prev_id != gpu_id_) {
+    hipSetDevice(gpu_id_);
+  }
+
+  hipError_t result = hipMalloc(&p, size);
+
+  if (prev_id != gpu_id_) {
+    hipSetDevice(prev_id);
+  }
+
+  if (result == hipSuccess) {
+    *index = 0;
+    gpu_alloc_size_ += size;
+    return p;
+  } else {
+    LOG(WARNING)
+        << "Cannot malloc " << size / 1024.0 / 1024.0
+        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
+           "environment variable to a lower value. Current value is "
+        << FLAGS_fraction_of_gpu_memory_to_use;
+    return nullptr;
+  }
+}
+
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  hipError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = hipFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = hipHostFree(p);
+  }
+
+  if (err != hipSuccess) {
+    PADDLE_ENFORCE(err, "hipFree failed in GPUAllocator::Free.");
+  }
+}
+
+bool GPUAllocator::UseGpu() const { return true; }
+
+// PINNED memory allows direct DMA transfers by the GPU to and from system
+// memory. It’s locked to a physical address.
+void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
+  if (size <= 0) return nullptr;
+
+  // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
+  // of host pinned allocation. Allocates too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  size_t usable =
+      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
+
+  if (size > usable) {
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB pinned memory."
+                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
+    return nullptr;
+  }
+
+  void* p;
+  // PINNED memory is visible to all HIP contexts.
+  hipError_t result = hipHostMalloc(&p, size);
+
+  if (result == hipSuccess) {
+    *index = 1;  // PINNED memory
+    cuda_pinnd_alloc_size_ += size;
+    return p;
+  } else {
+    LOG(WARNING) << "hipMallocHost failed.";
+    return nullptr;
+  }
+
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  hipError_t err;
+  PADDLE_ASSERT(index == 1);
+
+  PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
+  cuda_pinnd_alloc_size_ -= size;
+  err = hipHostFree(p);
+
+  if (err != hipSuccess) {
+    PADDLE_ENFORCE(err, "hipFreeHost failed in GPUPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return false; }
+
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index a0386a2dad1bb7..f4f6504420783f 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 };
 
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
 class GPUAllocator : public SystemAllocator {
  public:
   explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index 268260142c579e..66686a9c9bc032 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
   TestAllocator(&a, 0);
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
 TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a(0);
   TestAllocator(&a, 2048);
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 0c74f62de5c6f5..127864ea4b96d9 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -56,7 +56,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static BuddyAllocator** as = NULL;
@@ -151,7 +151,7 @@ size_t Usage::operator()(const platform::CPUPlace& cpu) const {
 }
 
 size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
   return Used(gpu);
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
@@ -159,7 +159,7 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
 }
 
 size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
   return Used(cuda_pinned);
 #else
   PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
diff --git a/paddle/fluid/memory/malloc_test.cc b/paddle/fluid/memory/malloc_test.cc
index d39466ef60c375..d3ba59657d7290 100644
--- a/paddle/fluid/memory/malloc_test.cc
+++ b/paddle/fluid/memory/malloc_test.cc
@@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   }
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
 
 size_t align(size_t size, paddle::platform::CUDAPlace place) {
   size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index eddcaab8befda8..05d14475bcdff0 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -97,5 +97,76 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
 
 #endif
 
+#ifdef PADDLE_WITH_HIP
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPlace>(
+    platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
+    const void* src, size_t num, hipStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CPUPlace>(
+    platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
+    const void* src, size_t num, hipStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPlace>(
+    platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
+    const void* src, size_t num, hipStream_t stream) {
+  if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
+    platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CPUPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPlace src_place, const void* src, size_t num,
+    hipStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
+    hipStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream);
+}
+
+#endif
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h
index 7b2b8eb0662fb1..290f44f8016294 100644
--- a/paddle/fluid/memory/memcpy.h
+++ b/paddle/fluid/memory/memcpy.h
@@ -53,6 +53,28 @@ template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
           cudaStream_t stream);
 
+#endif
+
+#ifdef PADDLE_WITH_HIP
+
+/**
+ * \brief   Copy memory from one place to another place.
+ *
+ * \param[in]  DstPlace Destination allocation place (CPU or GPU).
+ * \param[in]  dst      Destination memory address.
+ * \param[in]  SrcPlace Source allocation place (CPU or GPU).
+ * \param[in]  src      Source memory address.
+ * \param[in]  num      memory size in bytes to copy.
+ * \param[in]  stream   CUDA stream.
+ *
+ * \note    For GPU memory copy, CUDA stream need to be specified
+ *          for asynchronously memory copy.
+ *
+ */
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          hipStream_t stream);
+
 #endif
 }  // namespace memory
 }  // namespace paddle