Skip to content

Commit

Permalink
Add env param KV_CACHE_LOCATION to control the KV cache memory NUMA node location
Browse files Browse the repository at this point in the history

Usage:
Before you run the instance:
export KV_CACHE_LOCATION=<memory_numa_node_id_to_use_for_kv_cache>
(Note: a literal "#" would start a shell comment, so use a plain numeric node id.)

By default, the KV cache is placed on the same NUMA node as the other parts of the instance.
  • Loading branch information
a3213105 committed Jun 28, 2024
1 parent 6656c54 commit e80f266
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 20 deletions.
5 changes: 4 additions & 1 deletion src/common/kvcache_mgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
this->headNum_ = headNum;
this->headSize_ = headSize;
this->layers_ = layers;
// The KV Cache location configured in "KV_CACHE_LOCATION"
this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1;
}

~KVCacheMgrImpl() {
Expand Down Expand Up @@ -89,7 +91,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
// User specified maxSeqLen needs to be <= model's configured maxSeqLen
auto maxLen = maxSeqLen > 0 ? std::min(maxSeqLen, maxSeqLen_) : maxSeqLen_;
for (int i = 0; i < 2 * layers_; ++i) {
cache[i].resize(maxLen, 1, headNum_, headSize_);
cache[i].resize(maxLen, 1, headNum_, headSize_, this->allocNode);
}

sequenceCaches.insert({seqID, cache});
Expand Down Expand Up @@ -186,6 +188,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
int headNum_;
int headSize_;
int layers_;
int allocNode;
};

class KVCacheMgr {
Expand Down
36 changes: 23 additions & 13 deletions src/common/kvcache_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "allocator.h"
#include "bfloat16.h"
#include "float16.h"
#include "numa_allocator.h"

extern bool kvTrans();

Expand Down Expand Up @@ -67,32 +68,40 @@ template <typename T>
class KVCacheTensor {
public:
// Default-construct an empty tensor: no buffers are allocated until resize().
// allocSize / scalesAllocSize track the byte sizes handed to the NUMA
// allocator so the matching xft_numa_free() can be called later.
KVCacheTensor()
    : maxSeqLen(0)
    , batchSize(0)
    , headNum(0)
    , headSize(0)
    , data(nullptr)
    , allocSize(0)
    , scales(nullptr)
    , scalesAllocSize(0) {}

~KVCacheTensor() {
    // Buffers were obtained via xft_numa_alloc_onnode(), so they must be
    // released with xft_numa_free() (which requires the original allocation
    // size in bytes) -- never with plain free(), which would mismatch the
    // allocator.
    if (this->data) { xft_numa_free(this->data, allocSize); }
    if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); }
}

void resize(int maxSeqLen, int batchSize, int headNum, int headSize) {
void resize(int maxSeqLen, int batchSize, int headNum, int headSize, int allocNode) {
this->maxSeqLen = maxSeqLen;
this->batchSize = batchSize;
this->headNum = headNum;
this->headSize = headSize;

uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize;
uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize * sizeof(T);
if (requiredSize > allocSize) {
if (this->data) { free(this->data); }
this->data = (T *)xft::alloc(requiredSize * sizeof(T));
if (this->data) { xft_numa_free(this->data, allocSize); }
this->data = (T *)xft_numa_alloc_onnode(requiredSize, allocNode);
if (!this->data) {
printf("Failed to alloc mem for KV Cache [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize);
exit(-1);
}
allocSize = requiredSize;
}

if (this->scales) { free(this->scales); }
this->scales = (float *)xft::alloc((uint64_t)maxSeqLen * batchSize * headNum * sizeof(float));
requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * sizeof(float);
if (requiredSize > scalesAllocSize) {
if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); }
this->scales = (float *)xft_numa_alloc_onnode(requiredSize, allocNode);
if (!this->scales) {
printf("Failed to alloc mem for KV Cache scales [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize);
exit(-1);
}
scalesAllocSize = requiredSize;
}
}

int getBatchSize() const { return batchSize; }
Expand Down Expand Up @@ -188,15 +197,15 @@ class KVCacheTensor {
* initSeqLen: initial sequence length, which is the prompt token size
* accSeqLen: accumulated sequence length
*/
void reorder(int *idx, int size, int initSeqLen, int accSeqLen) {
void reorder(int *idx, int size, int initSeqLen, int accSeqLen, int allocNode) {
const int cols = this->getHeadNum() * this->getHeadSize();
const int batchSize = this->getBatchSize();

T *pdata = this->data + initSeqLen * batchSize * cols;

// Temporary buffer used for reorder
T *extraKeyBuf = (T *)xft::alloc((batchSize - 1) * cols * sizeof(T));

uint64_t requiredSize = (uint64_t)(batchSize - 1) * cols * sizeof(T);
T *extraKeyBuf = (T *)xft_numa_alloc_onnode(requiredSize, allocNode);
for (int seq = initSeqLen; seq < accSeqLen; ++seq) { // Reorder is not needed for the first few lines
int extraBufIdx = 0;
int remapped[batchSize];
Expand Down Expand Up @@ -260,7 +269,7 @@ class KVCacheTensor {
pdata += batchSize * cols;
}

free(extraKeyBuf);
xft_numa_free(extraKeyBuf, requiredSize);
}

private:
Expand Down Expand Up @@ -327,4 +336,5 @@ class KVCacheTensor {

// The scale factor for each head (if T is int8)
float *scales;
uint64_t scalesAllocSize;
};
14 changes: 8 additions & 6 deletions src/models/kvcache_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,19 @@

/**
 * Resize the per-layer K/V cache tensors.
 *
 * The target NUMA node for the cache memory is taken from the
 * KV_CACHE_LOCATION environment variable; -1 when unset (default placement).
 * NOTE(review): the env var is re-read on every resize() call, and atoi()
 * silently yields 0 (node 0) for non-numeric values -- confirm both are
 * intended.
 *
 * @param maxSeqLen     maximum sequence length
 * @param batchSize     batch size (prefix caches always use batch 1)
 * @param headsPerSplit number of heads handled by this split
 * @param headSize      per-head hidden size
 * @param prefix        true to (lazily create and) resize the shared prefix
 *                      caches instead of the regular per-sequence caches
 */
template <typename KVCacheT>
void KVCacheManager<KVCacheT>::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) {
    // The KV Cache location configured in "KV_CACHE_LOCATION".
    // Call getenv() once and reuse the pointer instead of querying twice.
    const char *loc = getenv("KV_CACHE_LOCATION");
    this->allocNode = loc ? atoi(loc) : -1;

    if (prefix && this->cachedPrefixKeys == nullptr) {
        // Lazily allocate the prefix tensors on first prefix resize.
        this->cachedPrefixKeys = new KVCacheTensor<KVCacheT>[layers];
        this->cachedPrefixValues = new KVCacheTensor<KVCacheT>[layers];
    }
    for (int i = 0; i < this->layers; ++i) {
        if (prefix) {
            this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode);
            this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode);
        } else {
            this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode);
            this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode);
        }
    }
}
Expand Down Expand Up @@ -100,10 +102,10 @@ void KVCacheManager<KVCacheT>::reorderCache(int *idx, int size, int initSeqLen,
int layer = i / 2;
if (i % 2 == 0) {
KVCacheTensor<KVCacheT> &keyTensor = this->getKey(layer);
keyTensor.reorder(idx, size, initSeqLen, accSeqLen);
keyTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode);
} else {
KVCacheTensor<KVCacheT> &valueTensor = this->getValue(layer);
valueTensor.reorder(idx, size, initSeqLen, accSeqLen);
valueTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode);
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/models/kvcache_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class KVCacheManager {
void reorderCache(int *idx, int size, int initSeqLen, int accSeqLen);

private:
int allocNode;
int layers; // how many layers
KVCacheTensor<KVCacheT> *cachedKeys; // all accumulated keys
KVCacheTensor<KVCacheT> *cachedValues; // all accumulated values
Expand Down

0 comments on commit e80f266

Please sign in to comment.