Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions paddle/fluid/memory/detail/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)

if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
elseif (WITH_AMD_GPU)
hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
else(${WITH_GPU})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
endif(${WITH_GPU})
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/memory/detail/buddy_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
}

BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
Expand Down
104 changes: 104 additions & 0 deletions paddle/fluid/memory/detail/system_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,110 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }

#endif

#ifdef PADDLE_WITH_HIP

void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if hipMalloc returns nullptr
// if size is 0. We just make sure it does.
if (size <= 0) return nullptr;
void* p;
int prev_id;
hipGetDevice(&prev_id);
if (prev_id != gpu_id_) {
hipSetDevice(gpu_id_);
}

hipError_t result = hipMalloc(&p, size);

if (prev_id != gpu_id_) {
hipSetDevice(prev_id);
}

if (result == hipSuccess) {
*index = 0;
gpu_alloc_size_ += size;
return p;
} else {
LOG(WARNING)
<< "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
"environment variable to a lower value. Current value is "
<< FLAGS_fraction_of_gpu_memory_to_use;
return nullptr;
}
}

void GPUAllocator::Free(void* p, size_t size, size_t index) {
hipError_t err;

if (index == 0) {
PADDLE_ASSERT(gpu_alloc_size_ >= size);
gpu_alloc_size_ -= size;
err = hipFree(p);
} else {
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
err = hipHostFree(p);
}

if (err != hipSuccess) {
PADDLE_ENFORCE(err, "hipFree failed in GPUAllocator::Free.");
}
}

bool GPUAllocator::UseGpu() const { return true; }

// PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;

// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable =
paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;

if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB pinned memory."
<< ", available " << usable / 1024.0 / 1024.0 << " MB";
return nullptr;
}

void* p;
// PINNED memory is visible to all HIP contexts.
hipError_t result = hipHostMalloc(&p, size);

if (result == hipSuccess) {
*index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
return p;
} else {
LOG(WARNING) << "hipMallocHost failed.";
return nullptr;
}

return nullptr;
}

void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
hipError_t err;
PADDLE_ASSERT(index == 1);

PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
cuda_pinnd_alloc_size_ -= size;
err = hipHostFree(p);

if (err != hipSuccess) {
PADDLE_ENFORCE(err, "hipFreeHost failed in GPUPinnedAllocator::Free.");
}
}

bool CUDAPinnedAllocator::UseGpu() const { return false; }

#endif

} // namespace detail
} // namespace memory
} // namespace paddle
2 changes: 1 addition & 1 deletion paddle/fluid/memory/detail/system_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator {
virtual bool UseGpu() const;
};

#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
class GPUAllocator : public SystemAllocator {
public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/memory/detail/system_allocator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
TestAllocator(&a, 0);
}

#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a(0);
TestAllocator(&a, 2048);
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/memory/malloc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used();
}

#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))

BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
Expand Down Expand Up @@ -151,15 +151,15 @@ size_t Usage::operator()(const platform::CPUPlace& cpu) const {
}

size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
return Used(gpu);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}

size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
return Used(cuda_pinned);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/memory/malloc_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
}
}

#ifdef PADDLE_WITH_CUDA
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))

size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
Expand Down
71 changes: 71 additions & 0 deletions paddle/fluid/memory/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -97,5 +97,76 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(

#endif

#ifdef PADDLE_WITH_HIP
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, hipStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream);
}

template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, hipStream_t stream) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream);
}

template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, hipStream_t stream) {
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream);
} else {
platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
stream);
}
}

template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
platform::CPUPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}

template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CPUPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}

template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}

template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num,
hipStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream);
}

template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
hipStream_t stream) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream);
}

#endif

} // namespace memory
} // namespace paddle
22 changes: 22 additions & 0 deletions paddle/fluid/memory/memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,28 @@ template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream);

#endif

#ifdef PADDLE_WITH_HIP

/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or GPU).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or GPU).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream CUDA stream.
*
* \note For GPU memory copy, CUDA stream need to be specified
* for asynchronously memory copy.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
hipStream_t stream);

#endif
} // namespace memory
} // namespace paddle