From d3e1886eecb5e6557517bdbb3c1803f2ffd595ed Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Sat, 15 Apr 2023 20:29:05 +0200
Subject: [PATCH] ggml_cpy: use the work buffer instead of alloca when
 quantizing

---
 ggml.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index f2cb593d027f7a..84ad11f63a257a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5466,8 +5466,7 @@ static void ggml_compute_forward_dup_f16(
                 size_t id = 0;
                 uint8_t * dst_ptr = (uint8_t *) dst->data;
                 size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
-                // todo: use work buffer
-                float * src0_f32 = (float *) alloca(ne00 * sizeof(float));
+                float * src0_f32 = (float *) params->wdata;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
                     for (int i02 = 0; i02 < ne02; i02++) {
@@ -10227,9 +10226,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         struct ggml_tensor * node = cgraph->nodes[i];
 
         switch (node->op) {
+            case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
                     node->n_tasks = 1;
+
+                    size_t cur = 0;
+                    if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0];
+                    }
+
+                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_ADD:
                 {
@@ -10322,7 +10329,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 {
                     node->n_tasks = n_threads;
                 } break;
-            case GGML_OP_CPY:
             case GGML_OP_CONT:
             case GGML_OP_RESHAPE:
             case GGML_OP_VIEW:
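
For readers outside the ggml tree, the pattern applied by this patch is: the planning loop in ggml_graph_compute() computes the maximum per-op scratch size (work_size) up front, and each op then reads its scratch from params->wdata instead of calling alloca() on the compute thread's stack. Below is a minimal, self-contained sketch of that pattern, not ggml code; names such as compute_params, plan_work_size and quantize_row_stub are invented for illustration.

/* Sketch only: size a shared work buffer during planning, then hand each
 * op a pointer into it instead of calling alloca(). Hypothetical names. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct compute_params {
    void  *wdata;   /* shared scratch buffer, sized during planning */
    size_t wsize;
};

/* planning pass: take the maximum scratch any node will need */
static size_t plan_work_size(const size_t *row_elems, int n_nodes) {
    size_t work_size = 0;
    for (int i = 0; i < n_nodes; i++) {
        size_t cur = row_elems[i] * sizeof(float); /* one f32 row per op */
        if (cur > work_size) work_size = cur;
    }
    return work_size;
}

/* compute pass: reuse params->wdata where the old code did alloca(ne00 * sizeof(float)) */
static void quantize_row_stub(struct compute_params *params, const float *src, size_t ne00) {
    float *src_f32 = (float *) params->wdata;      /* was: alloca(...) */
    memcpy(src_f32, src, ne00 * sizeof(float));
    /* ... quantize src_f32 into the destination row here ... */
    printf("quantized %zu values via a %zu-byte work buffer\n", ne00, params->wsize);
}

int main(void) {
    const size_t rows[] = { 32, 4096, 128 };
    struct compute_params params;

    params.wsize = plan_work_size(rows, 3);
    params.wdata = malloc(params.wsize);

    static float src[4096];
    quantize_row_stub(&params, src, 4096);

    free(params.wdata);
    return 0;
}

The benefit mirrors the patch: the scratch allocation is done once and bounded by the planning pass, rather than growing the stack per row via alloca() inside a hot loop.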