From 84c8cdeb535113f910d272885395c615923d4efd Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Fri, 7 Jan 2022 15:17:59 -0800
Subject: [PATCH] Use address-ordered first fit for the pinned memory pool
 (#9989)

The current `PinnedMemoryPool` always allocates from the largest free
buffer, which may cause more fragmentation than necessary. Address-ordered
first fit has been shown to fragment memory slightly less than best fit*,
and it is slightly cheaper to implement. It is also used by some popular
allocators such as jemalloc. This seems like a low-risk change, and I do
see a few percent of performance improvement on my desktop with Q50,
although the effect is less obvious on the DGX-2. (A minimal standalone
sketch of the strategy appears after the patch below.)

* Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation
problem: Solved? ACM SIGPLAN Notices, 34(3), 26-36.

Authors:
  - Rong Ou (https://github.com/rongou)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/9989
---
 .../java/ai/rapids/cudf/PinnedMemoryPool.java | 83 ++++++++++---------
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
index 865a668156f..6eee935748e 100644
--- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
+++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,9 @@
 import java.util.Comparator;
 import java.util.Iterator;
 import java.util.Objects;
-import java.util.PriorityQueue;
+import java.util.Optional;
+import java.util.SortedSet;
+import java.util.TreeSet;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -43,19 +45,21 @@ public final class PinnedMemoryPool implements AutoCloseable {
   private static Future<?> initFuture = null;
 
   private final long pinnedPoolBase;
-  private final PriorityQueue<MemorySection> freeHeap = new PriorityQueue<>(new SortedBySize());
+  private final SortedSet<MemorySection> freeHeap = new TreeSet<>(new SortedByAddress());
   private int numAllocatedSections = 0;
   private long availableBytes;
 
   private static class SortedBySize implements Comparator<MemorySection> {
     @Override
     public int compare(MemorySection s0, MemorySection s1) {
-      // We want the largest ones first...
-      int ret = Long.compare(s1.size, s0.size);
-      if (ret == 0) {
-        ret = Long.compare(s0.baseAddress, s1.baseAddress);
-      }
-      return ret;
+      return Long.compare(s0.size, s1.size);
+    }
+  }
+
+  private static class SortedByAddress implements Comparator<MemorySection> {
+    @Override
+    public int compare(MemorySection s0, MemorySection s1) {
+      return Long.compare(s0.baseAddress, s1.baseAddress);
     }
   }
 
@@ -162,6 +166,7 @@ private static void freeInternal(MemorySection section) {
 
   /**
    * Initialize the pool.
+   *
    * @param poolSize size of the pool to initialize.
    */
   public static synchronized void initialize(long poolSize) {
@@ -170,8 +175,9 @@ public static synchronized void initialize(long poolSize) {
 
   /**
    * Initialize the pool.
+   *
    * @param poolSize size of the pool to initialize.
-   * @param gpuId gpu id to set to get memory pool from, -1 means to use default
+   * @param gpuId    gpu id to set to get memory pool from, -1 means to use default
    */
   public static synchronized void initialize(long poolSize, int gpuId) {
     if (isInitialized()) {
@@ -207,11 +213,12 @@ public static synchronized void shutdown() {
   /**
    * Factory method to create a pinned host memory buffer.
+   *
    * @param bytes size in bytes to allocate
    * @return newly created buffer or null if insufficient pinned memory
    */
   public static HostMemoryBuffer tryAllocate(long bytes) {
-    HostMemoryBuffer result  = null;
+    HostMemoryBuffer result = null;
     PinnedMemoryPool pool = getSingleton();
     if (pool != null) {
       result = pool.tryAllocateInternal(bytes);
     }
@@ -222,6 +229,7 @@ public static HostMemoryBuffer tryAllocate(long bytes) {
   /**
    * Factory method to create a host buffer but preferably pointing to pinned memory.
    * It is not guaranteed that the returned buffer will be pointer to pinned memory.
+   *
    * @param bytes size in bytes to allocate
    * @return newly created buffer
    */
@@ -235,6 +243,7 @@ public static HostMemoryBuffer allocate(long bytes) {
 
   /**
    * Get the number of bytes free in the pinned memory pool.
+   *
    * @return amount of free memory in bytes or 0 if the pool is not initialized
    */
   public static long getAvailableBytes() {
@@ -246,7 +255,7 @@ public static long getAvailableBytes() {
   }
 
   private PinnedMemoryPool(long poolSize, int gpuId) {
-    if (gpuId > -1 ) {
+    if (gpuId > -1) {
       // set the gpu device to use
       Cuda.setDevice(gpuId);
       Cuda.freeZero();
@@ -269,20 +278,28 @@ private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) {
     }
     // Align the allocation
     long alignedBytes = ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT;
-    MemorySection largest = freeHeap.peek();
-    if (largest.size < alignedBytes) {
-      log.debug("Insufficient pinned memory. {} needed, {} found", alignedBytes, largest.size);
+    Optional<MemorySection> firstFit = freeHeap.stream()
+        .filter(section -> section.size >= alignedBytes)
+        .findFirst();
+    if (!firstFit.isPresent()) {
+      if (log.isDebugEnabled()) {
+        MemorySection largest = freeHeap.stream()
+            .max(new SortedBySize())
+            .orElse(new MemorySection(0, 0));
+        log.debug("Insufficient pinned memory. {} needed, {} found", alignedBytes, largest.size);
+      }
       return null;
     }
+    MemorySection first = firstFit.get();
     log.debug("Allocating {}/{} bytes pinned from {} FREE COUNT {} OUTSTANDING COUNT {}",
-        bytes, alignedBytes, largest, freeHeap.size(), numAllocatedSections);
-    freeHeap.remove(largest);
+        bytes, alignedBytes, first, freeHeap.size(), numAllocatedSections);
+    freeHeap.remove(first);
     MemorySection allocated;
-    if (largest.size == alignedBytes) {
-      allocated = largest;
+    if (first.size == alignedBytes) {
+      allocated = first;
     } else {
-      allocated = largest.splitOff(alignedBytes);
-      freeHeap.add(largest);
+      allocated = first.splitOff(alignedBytes);
+      freeHeap.add(first);
     }
     numAllocatedSections++;
     availableBytes -= allocated.size;
@@ -293,25 +310,15 @@ private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) {
 
   private synchronized void free(MemorySection section) {
     log.debug("Freeing {} with {} outstanding {}", section, freeHeap, numAllocatedSections);
-    // This looks inefficient, but in reality it will only walk through the heap about 2 times.
-    // Because we keep entries up to date, each new entry will at most combine with one above it
-    // and one below it. That will happen in a single pass through the heap. We do a second pass
-    // simply out of an abundance of caution.
-    // Adding it in will be a log(N) operation because it is a heap.
     availableBytes += section.size;
-    boolean anyReplaced;
-    do {
-      anyReplaced = false;
-      Iterator<MemorySection> it = freeHeap.iterator();
-      while(it.hasNext()) {
-        MemorySection current = it.next();
-        if (section.canCombine(current)) {
-          it.remove();
-          anyReplaced = true;
-          section.combineWith(current);
-        }
+    Iterator<MemorySection> it = freeHeap.iterator();
+    while(it.hasNext()) {
+      MemorySection current = it.next();
+      if (section.canCombine(current)) {
+        it.remove();
+        section.combineWith(current);
       }
-    } while(anyReplaced);
+    }
     freeHeap.add(section);
     numAllocatedSections--;
     log.debug("After freeing {} outstanding {}", freeHeap, numAllocatedSections);
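
Illustrative sketch (not part of the patch): a minimal, self-contained
address-ordered first-fit free list showing the strategy the patch adopts.
The class FirstFitFreeList and everything in it are hypothetical names
invented for this sketch, not cudf APIs; the real pool tracks MemorySection
objects, aligns sizes, and hands out pinned host memory as in the diff
above. This sketch coalesces on free via TreeSet neighbor lookups, whereas
the patched free() does one linear pass over the set; both rely on the same
property that an address-ordered free list can merge a freed range with at
most its two immediate neighbors, so a single pass always suffices.

import java.util.Optional;
import java.util.TreeSet;

/** Toy address-ordered first-fit allocator over one [base, base + size) range. */
public class FirstFitFreeList {
  /** A free range covering addresses [base, base + size). */
  private static final class Section {
    final long base;
    final long size;
    Section(long base, long size) { this.base = base; this.size = size; }
  }

  // Free sections sorted by start address, like the patch's
  // TreeSet<MemorySection> with the SortedByAddress comparator.
  private final TreeSet<Section> freeSections =
      new TreeSet<>((a, b) -> Long.compare(a.base, b.base));

  public FirstFitFreeList(long base, long size) {
    freeSections.add(new Section(base, size));
  }

  /** First fit: take the lowest-addressed section that is large enough. */
  public Optional<Long> allocate(long size) {
    Section fit = null;
    for (Section s : freeSections) {   // iterates in ascending address order
      if (s.size >= size) {
        fit = s;
        break;
      }
    }
    if (fit == null) {
      return Optional.empty();         // no free section is large enough
    }
    freeSections.remove(fit);
    if (fit.size > size) {
      // Split: return the unused tail of the section to the free list.
      freeSections.add(new Section(fit.base + size, fit.size - size));
    }
    return Optional.of(fit.base);
  }

  /** Free with coalescing: only the two address neighbors can be adjacent. */
  public void free(long base, long size) {
    Section s = new Section(base, size);
    Section prev = freeSections.floor(s);    // nearest section at or below base
    if (prev != null && prev.base + prev.size == s.base) {
      freeSections.remove(prev);
      s = new Section(prev.base, prev.size + s.size);
    }
    Section next = freeSections.ceiling(s);  // nearest section above base
    if (next != null && s.base + s.size == next.base) {
      freeSections.remove(next);
      s = new Section(s.base, s.size + next.size);
    }
    freeSections.add(s);
  }
}

Always reusing the lowest-addressed fit keeps live allocations packed toward
the start of the pool, so higher-addressed frees have a better chance of
merging back into large contiguous sections; that is the fragmentation
behavior reported by Johnstone & Wilson which motivates this change.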