Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add env param KV_CACHE_LOCATION to control kv cache memory numanode location #462

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions src/common/kvcache_mgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
// limitations under the License.
// ============================================================================
#pragma once

#include <unordered_map>
#include <vector>

#include "environment.h"
#include "kvcache_tensor.h"
#include <unordered_map>

namespace xft {

Expand All @@ -41,6 +42,8 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
this->headNum_ = headNum;
this->headSize_ = headSize;
this->layers_ = layers;
// The KV Cache location configured in "KV_CACHE_LOCATION"
this->allocNode = Env::getInstance().getPrimitiveCacheM();
a3213105 marked this conversation as resolved.
Show resolved Hide resolved
}

~KVCacheMgrImpl() {
Expand Down Expand Up @@ -89,7 +92,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
// User specified maxSeqLen needs to be <= model's configured maxSeqLen
auto maxLen = maxSeqLen > 0 ? std::min(maxSeqLen, maxSeqLen_) : maxSeqLen_;
for (int i = 0; i < 2 * layers_; ++i) {
cache[i].resize(maxLen, 1, headNum_, headSize_);
cache[i].resize(maxLen, 1, headNum_, headSize_, this->allocNode);
}

sequenceCaches.insert({seqID, cache});
Expand Down Expand Up @@ -186,6 +189,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
int headNum_;
int headSize_;
int layers_;
int allocNode;
};

class KVCacheMgr {
Expand Down
36 changes: 23 additions & 13 deletions src/common/kvcache_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "allocator.h"
#include "bfloat16.h"
#include "float16.h"
#include "numa_allocator.h"

extern bool kvTrans();

Expand Down Expand Up @@ -67,32 +68,40 @@ template <typename T>
class KVCacheTensor {
public:
KVCacheTensor()
: maxSeqLen(0), batchSize(0), headNum(0), headSize(0), data(nullptr), allocSize(0), scales(nullptr) {}
: maxSeqLen(0), batchSize(0), headNum(0), headSize(0), data(nullptr), allocSize(0), scales(nullptr), scalesAllocSize(0) {}

~KVCacheTensor() {
if (this->data) { free(this->data); }
if (this->scales) { free(this->scales); }
if (this->data) { xft_numa_free(this->data, allocSize); }
if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); }
}

void resize(int maxSeqLen, int batchSize, int headNum, int headSize) {
void resize(int maxSeqLen, int batchSize, int headNum, int headSize, int allocNode) {
this->maxSeqLen = maxSeqLen;
this->batchSize = batchSize;
this->headNum = headNum;
this->headSize = headSize;

uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize;
uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize * sizeof(T);
if (requiredSize > allocSize) {
if (this->data) { free(this->data); }
this->data = (T *)xft::alloc(requiredSize * sizeof(T));
if (this->data) { xft_numa_free(this->data, allocSize); }
this->data = (T *)xft_numa_alloc_onnode(requiredSize, allocNode);
if (!this->data) {
printf("Failed to alloc mem for KV Cache [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize);
exit(-1);
}
allocSize = requiredSize;
}

if (this->scales) { free(this->scales); }
this->scales = (float *)xft::alloc((uint64_t)maxSeqLen * batchSize * headNum * sizeof(float));
requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * sizeof(float);
if (requiredSize > scalesAllocSize) {
if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); }
this->scales = (float *)xft_numa_alloc_onnode(requiredSize, allocNode);
if (!this->scales) {
printf("Failed to alloc mem for KV Cache scales [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize);
exit(-1);
}
scalesAllocSize = requiredSize;
}
}

int getBatchSize() const { return batchSize; }
Expand Down Expand Up @@ -188,15 +197,15 @@ class KVCacheTensor {
* initSeqLen: initial sequence length, which is the prompt token size
* accSeqLen: accumulated sequence length
*/
void reorder(int *idx, int size, int initSeqLen, int accSeqLen) {
void reorder(int *idx, int size, int initSeqLen, int accSeqLen, int allocNode) {
const int cols = this->getHeadNum() * this->getHeadSize();
const int batchSize = this->getBatchSize();

T *pdata = this->data + initSeqLen * batchSize * cols;

// Temporary buffer used for reorder
T *extraKeyBuf = (T *)xft::alloc((batchSize - 1) * cols * sizeof(T));

uint64_t requiredSize = (uint64_t)(batchSize - 1) * cols * sizeof(T);
T *extraKeyBuf = (T *)xft_numa_alloc_onnode(requiredSize, allocNode);
for (int seq = initSeqLen; seq < accSeqLen; ++seq) { // Reorder is not needed for the first few lines
int extraBufIdx = 0;
int remapped[batchSize];
Expand Down Expand Up @@ -260,7 +269,7 @@ class KVCacheTensor {
pdata += batchSize * cols;
}

free(extraKeyBuf);
xft_numa_free(extraKeyBuf, requiredSize);
}

private:
Expand Down Expand Up @@ -327,4 +336,5 @@ class KVCacheTensor {

// The scale factor for each head (if T is int8)
float *scales;
uint64_t scalesAllocSize;
};
17 changes: 10 additions & 7 deletions src/models/kvcache_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#include "kvcache_manager.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include "allocator.h"
#include "bfloat16.h"
#include "environment.h"
#include "float16.h"
#include "kvcache_manager.h"

template <typename KVCacheT>
void KVCacheManager<KVCacheT>::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) {
// The KV Cache location configured in "KV_CACHE_LOCATION"
this->allocNode = Env::getInstance().getPrimitiveCacheM();
if (prefix && this->cachedPrefixKeys == nullptr) {
this->cachedPrefixKeys = new KVCacheTensor<KVCacheT>[layers];
this->cachedPrefixValues = new KVCacheTensor<KVCacheT>[layers];
}
for (int i = 0; i < this->layers; ++i) {
if (prefix) {
this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize);
this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize);
this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode);
this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode);
} else {
this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize);
this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize);
this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode);
this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode);
}
}
}
Expand Down Expand Up @@ -100,10 +103,10 @@ void KVCacheManager<KVCacheT>::reorderCache(int *idx, int size, int initSeqLen,
int layer = i / 2;
if (i % 2 == 0) {
KVCacheTensor<KVCacheT> &keyTensor = this->getKey(layer);
keyTensor.reorder(idx, size, initSeqLen, accSeqLen);
keyTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode);
} else {
KVCacheTensor<KVCacheT> &valueTensor = this->getValue(layer);
valueTensor.reorder(idx, size, initSeqLen, accSeqLen);
valueTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode);
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/models/kvcache_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class KVCacheManager {
void reorderCache(int *idx, int size, int initSeqLen, int accSeqLen);

private:
int allocNode;
int layers; // how many layers
KVCacheTensor<KVCacheT> *cachedKeys; // all accumulated keys
KVCacheTensor<KVCacheT> *cachedValues; // all accumulated values
Expand Down
15 changes: 15 additions & 0 deletions src/utils/environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ class Env {
// Get Primitive Cache M: returns the current value of the primitiveCacheM member.
int getPrimitiveCacheM() { return primitiveCacheM; }

// get KV Cache Location
int getKVCacheLocation() { return primitiveCacheM; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be kvCacheLocation? and initKVCacheLocation is never called?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed


private:
Env() {
// init Verbose
Expand Down Expand Up @@ -281,4 +284,16 @@ class Env {
primitiveCacheM = 256;
}
}

// KV_CACHE_LOCATION
// Stays at -1 unless "KV_CACHE_LOCATION" holds a valid non-negative integer.
int kvCacheLocation = -1;

// Reads the "KV_CACHE_LOCATION" environment variable and stores it in
// kvCacheLocation when it is a well-formed, non-negative decimal integer that
// fits in an int. An unset, empty, non-numeric, negative or out-of-range value
// leaves kvCacheLocation untouched.
void initKVCacheLocation() {
    const char *raw = getenv("KV_CACHE_LOCATION");
    if (raw == nullptr) { return; }

    // strtol (unlike atoi) reports where parsing stopped, so input such as
    // "abc" or "1x" is rejected instead of silently becoming 0 or 1.
    char *end = nullptr;
    const long parsed = strtol(raw, &end, 10);
    const bool sawDigits = (end != raw);

    // Tolerate trailing whitespace (e.g. a stray newline from a script).
    while (*end == ' ' || *end == '\t' || *end == '\n' || *end == '\r') {
        ++end;
    }
    const bool consumedAll = sawDigits && (*end == '\0');

    // Round-trip through int to discard values that do not fit.
    const int candidate = static_cast<int>(parsed);
    if (consumedAll && parsed >= 0 && static_cast<long>(candidate) == parsed) {
        kvCacheLocation = candidate;
    }
}
};
Loading