Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IQ1_M: 1.75 bpw quantization #6302

Merged
merged 24 commits into from
Mar 26, 2024
Merged
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
2a2d66d
iq1_m: basics
Kawrakow Mar 22, 2024
ac8b3dd
iq1_m: basics-2
Kawrakow Mar 22, 2024
1df37b6
iq1_m: CUDA dequantize works
Kawrakow Mar 22, 2024
282f278
iq1_m: separate shifts for each group of 8 in a block
Kawrakow Mar 23, 2024
308c50d
iq1_m: go to 3-bit scales
Kawrakow Mar 23, 2024
64b9dfd
iq1_m: scalar dot product
Kawrakow Mar 23, 2024
a139de5
iq1_m: AVX2 dot product
Kawrakow Mar 23, 2024
379fdb6
iq1_m: very slightly faster AVX2 dot product
Kawrakow Mar 24, 2024
8009b6d
iq1_m: ARM_NEON dot product
Kawrakow Mar 24, 2024
0e36afa
iq1_m: Metal - dequantize works, dot product does not
Kawrakow Mar 25, 2024
19fb974
iq1_m: Metal now works
Kawrakow Mar 25, 2024
abc1d4f
iq1_m: minor
Kawrakow Mar 25, 2024
dff85a8
iq1_m: checking pure iq1_m quantization
Kawrakow Mar 25, 2024
f664692
iq1_m: slightly faster ARM_NEON dot product
Kawrakow Mar 25, 2024
b1d1c26
iq1_m: faster ARM_NEON dot product
Kawrakow Mar 25, 2024
78ce561
iq1_m: another minor ARM_NEON dot product improvement
Kawrakow Mar 25, 2024
3d9c21f
iq1_m: small PPL improvement via super-block scale adjustment
Kawrakow Mar 25, 2024
480d6d6
iq1_m: adapt to CUDA refactoring
Kawrakow Mar 25, 2024
62dd11f
iq1_m: remove unused variable
Kawrakow Mar 25, 2024
22fa121
iq1_m: add to backend-ops tests
Kawrakow Mar 25, 2024
b68f32b
iq1_m: fix Windows ARM
Kawrakow Mar 26, 2024
9a5786e
iq1_m: use common definition of iq1m_scale_t
Kawrakow Mar 26, 2024
cdb2d65
cuda: assert -> NO_DEVICE_CODE
Kawrakow Mar 26, 2024
6e4cef5
iq1_M: PR comments
Kawrakow Mar 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
iq1_m: separate shifts for each group of 8 in a block
We get
PPL(LLaMA-v2-7B ) = 9.2810
PPL(LLaMA-v2-13B) = 6.8105

Not bad, but slightly higher than
  sqrt(PPL(IQ1_S) * PPL(IQ2_XXS))
which is the expected outcome given that IQ1_M is
halfway between IQ1_S and IQ2_XXS in terms of bpw.
From this, we would expect
 PPL = 9.14 for LLaMA-v2-7B
 PPL = 6.63 for LLaMA-v2-13B
  • Loading branch information
Kawrakow committed Mar 25, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 282f2788af220d4f94668332d9692482651fe2c8
1 change: 1 addition & 0 deletions ggml-common.h
Original file line number Diff line number Diff line change
@@ -1059,6 +1059,7 @@ GGML_TABLE_END()

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
#define IQ1M_DELTA 0.125f
#if defined(GGML_COMMON_IMPL_C)
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
226 changes: 223 additions & 3 deletions ggml-quants.c
Original file line number Diff line number Diff line change
@@ -11913,20 +11913,240 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
return nrow * nblock * sizeof(block_iq1_s);
}

// Quantizes one row of `n` floats from `x` into IQ1_M blocks written to `vy`.
//
// Parameters:
//   x             - input row; `n` values, `n` must be a multiple of QK_K (asserted).
//   vy            - output buffer, written as an array of n/QK_K `block_iq1_m`.
//   n             - number of values in the row.
//   quant_weights - per-value importance weights, indexed like `x`; must be non-NULL (asserted).
//   scales        - scratch, one float per sub-block (QK_K/IQ1M_BLOCK_SIZE entries).
//   weight        - scratch, IQ1M_BLOCK_SIZE floats.
//   pairs         - scratch, 2*IQ1M_BLOCK_SIZE floats. Used as interleaved (value, original index)
//                   records; the index is stored as an `int` in the odd float slots.
//   L             - scratch, IQ1M_BLOCK_SIZE quant levels (0, 1 or 2).
//   index         - scratch, one grid index per group of 8 values (IQ1M_BLOCK_SIZE/8 entries).
//   shifts        - scratch, one shift selector (0..3) per sub-block.
//
// Requires that the IQ1_M grid/map/neighbour tables have been initialised (asserted).
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
        float    * scales,
        float    * weight,
        float    * pairs,
        int8_t   * L,
        uint16_t * index,
        int8_t   * shifts) {

    const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);

    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
    const int      * kmap_q2xs       = iq2_data[gindex].map;
    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;

    GGML_ASSERT(quant_weights   && "missing quantization weights");
    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(n%QK_K == 0);

    block_iq1_m * y = vy;

    const int nbl = n/QK_K;

    const int block_size = IQ1M_BLOCK_SIZE;

    // Quant values for the three levels when the group is shifted by +delta (x_p) or -delta (x_m).
    const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
    const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
    // Bits OR-ed into qh for each shift selector: 0x08 marks the first group of 8 as "minus",
    // 0x80 marks the second group of 8 as "minus".
    const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};

    // Original positions live in the odd slots of `pairs`, next to the values being sorted.
    int * idx = (int *)(pairs + 1);

    float sumqx[4], sumq2[4];

    for (int ibl = 0; ibl < nbl; ++ibl) {

        y[ibl].d = GGML_FP32_TO_FP16(0.f);
        memset(y[ibl].qs, 0, QK_K/8);
        memset(y[ibl].qh, 0, QK_K/16);
        memset(y[ibl].scales, 0, QK_K/32);

        float max_scale = 0;

        const float * xbl = x + QK_K*ibl;
        float sumx2 = 0;
        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
        float sigma2 = 2*sumx2/QK_K;

        for (int ib = 0; ib < QK_K/block_size; ++ib) {
            const float * xb = xbl + block_size*ib;
            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
            float max = fabsf(xb[0]);
            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
            if (!max) {
                scales[ib] = 0;
                // The packing loop at the end of the super-block reads shifts[ib] for every
                // sub-block, so it must be defined here too. Leaving it unset would index
                // `masks` with an uninitialized/stale value.
                shifts[ib] = 0;
                memset(L, 1, block_size);
                continue;
            }
            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
            // With just 3 allowed quant values per shift, we can search exhaustively for the two
            // boundaries that split the values xb[i] into 3 groups. To do so, we sort the values
            // in ascending order and, for every split (i1, i2) and every combination of shifts for
            // the two groups of 8, accumulate sum[w*q*x] and sum[w*q*q] to quickly get the optimum
            // scale and the score of that candidate.
            for (int j = 0; j < block_size; ++j) {
                pairs[2*j] = xb[j];
                idx[2*j] = j;
            }
            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
            float best_score = 0, scale = max;
            int besti1 = -1, besti2 = -1, best_k = -1;
            // Shift selector k (first group of 8, second group of 8):
            // 0: +, +
            // 1: +, -
            // 2: -, +
            // 3: -, -
            for (int i1 = 0; i1 <= block_size; ++i1) {
                for (int i2 = i1; i2 <= block_size; ++i2) {
                    memset(sumqx, 0, 4*sizeof(float));
                    memset(sumq2, 0, 4*sizeof(float));
                    for (int j = 0; j < block_size; ++j) {
                        const int i = idx[2*j];
                        // Sorted positions [0, i1) get level 0, [i1, i2) level 1, [i2, block_size) level 2.
                        const int q = j < i1 ? 0 : j < i2 ? 1 : 2;
                        if (i < block_size/2) {
                            // Value belongs to the first group of 8: candidates 0,1 use x_p, 2,3 use x_m.
                            sumqx[0] += weight[i]*x_p[q]*xb[i];
                            sumqx[1] += weight[i]*x_p[q]*xb[i];
                            sumqx[2] += weight[i]*x_m[q]*xb[i];
                            sumqx[3] += weight[i]*x_m[q]*xb[i];
                            sumq2[0] += weight[i]*x_p[q]*x_p[q];
                            sumq2[1] += weight[i]*x_p[q]*x_p[q];
                            sumq2[2] += weight[i]*x_m[q]*x_m[q];
                            sumq2[3] += weight[i]*x_m[q]*x_m[q];
                        } else {
                            // Value belongs to the second group of 8: candidates 0,2 use x_p, 1,3 use x_m.
                            sumqx[0] += weight[i]*x_p[q]*xb[i];
                            sumqx[2] += weight[i]*x_p[q]*xb[i];
                            sumqx[1] += weight[i]*x_m[q]*xb[i];
                            sumqx[3] += weight[i]*x_m[q]*xb[i];
                            sumq2[0] += weight[i]*x_p[q]*x_p[q];
                            sumq2[2] += weight[i]*x_p[q]*x_p[q];
                            sumq2[1] += weight[i]*x_m[q]*x_m[q];
                            sumq2[3] += weight[i]*x_m[q]*x_m[q];
                        }
                    }
                    for (int k = 0; k < 4; ++k) {
                        if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
                            scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
                            besti1 = i1; besti2 = i2; best_k = k;
                        }
                    }
                }
            }
            GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
            for (int j =      0; j < besti1;     ++j) L[idx[2*j]] = 0;
            for (int j = besti1; j < besti2;     ++j) L[idx[2*j]] = 1;
            for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
            if (scale < 0) {
                // A negative scale is equivalent to mirroring the levels and flipping both shifts.
                for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
                scale = -scale;
                best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
            }
            const float * xx;
            bool all_on_grid = true;
            for (int k = 0; k < block_size/8; ++k) {
                if (k == 0) xx = best_k < 2 ? x_p : x_m;
                else xx = best_k%2 == 0 ? x_p : x_m;
                // Pack the 8 two-bit levels of this group into a key for the grid map.
                uint16_t u = 0;
                for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
                int grid_index = kmap_q2xs[u];
                if (grid_index < 0) {
                    // Not a grid point: a negative map entry encodes the offset of the neighbour list.
                    all_on_grid = false;
                    const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
                    grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
                    GGML_ASSERT(grid_index >= 0);
                }
                index[k] = grid_index;
            }
            if (!all_on_grid) {
                // At least one group was snapped to a neighbour, so re-fit the scale to the final grid points.
                float sumqx_f = 0, sumq2_f = 0;
                for (int k = 0; k < block_size/8; ++k) {
                    if (k == 0) xx = best_k < 2 ? x_p : x_m;
                    else xx = best_k%2 == 0 ? x_p : x_m;
                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
                    for (int j = 0; j < 8; ++j) {
                        float w = weight[8*k + j];
                        float q = xx[(pg[j] - 1)/2];
                        sumqx_f += w*q*xb[8*k+j];
                        sumq2_f += w*q*q;
                    }
                }
                if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
            }
            // Low 8 bits of each grid index go to qs; the remaining high bits share one qh byte
            // (first group in the low nibble, second group in the high nibble).
            y[ibl].qs[2*ib + 0] = index[0] & 255;
            y[ibl].qs[2*ib + 1] = index[1] & 255;
            y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
            GGML_ASSERT(scale >= 0);
            scales[ib] = scale;
            shifts[ib] = best_k;
            max_scale = MAX(max_scale, scale);
        }

        if (!max_scale) {
            continue;
        }

        float d = max_scale/31;
        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
        float id = 1/d;
        // Two 4-bit sub-block scales are packed per byte; the shift bits are merged into qh.
        for (int ib = 0; ib < QK_K/block_size; ib += 2) {
            int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
            l1 = MAX(0, MIN(15, l1));
            int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
            l2 = MAX(0, MIN(15, l2));
            y[ibl].scales[ib/2] = l1 | (l2 << 4);
            y[ibl].qh[ib+0] |= masks[shifts[ib+0]];
            y[ibl].qh[ib+1] |= masks[shifts[ib+1]];
        }
    }
}

size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
float scales[QK_K/IQ1M_BLOCK_SIZE];
float weight[IQ1M_BLOCK_SIZE];
int8_t L[IQ1M_BLOCK_SIZE];
float sumx[IQ1M_BLOCK_SIZE+1];
float sumw[IQ1M_BLOCK_SIZE+1];
float pairs[2*IQ1M_BLOCK_SIZE];
uint16_t index[IQ1M_BLOCK_SIZE/8];
int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_iq1_impl(GGML_TYPE_IQ1_M, src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
src += n_per_row;
qrow += nblock*sizeof(block_iq1_m);
}