ggml backends interface, ggml-cuda refactor #2239

Closed · wants to merge 21 commits
118 changes: 86 additions & 32 deletions ggml-backend.c
@@ -373,36 +373,37 @@ void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_t

struct ggml_graph_split * split = &splits->splits[splits->n_splits];

// check if the split is on the same backend as the previous one
// FIXME: need to check all the inputs
if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) {
if (splits->n_splits == 0) {
// always add the first split
int i = 0;
while (inputs[i] != NULL) {
GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS);
split->src_inputs[i] = *inputs[i];
split->dst_inputs[i] = *inputs[i];
i++;
}
split->src_inputs[i] = NULL;
split->dst_inputs[i] = NULL;
} else {
// add to the previous split
char name[GGML_MAX_NAME - 2];
int n = vsnprintf(name, sizeof(name), fmt, args);
char new_name[GGML_MAX_NAME];
snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name);
strcpy(splits->splits[splits->n_splits - 1].name, new_name);
return;

if (splits->n_splits == 0) {
// always add the first split
int i = 0;
while (inputs[i] != NULL) {
GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS);
split->src_inputs[i] = *inputs[i];
split->dst_inputs[i] = *inputs[i];
i++;
}
split->src_inputs[i] = NULL;
split->dst_inputs[i] = NULL;
split->ctx = ctx;
}
// check if the split is on the same context as the previous one
else if (splits->n_splits > 0 && splits->splits[splits->n_splits - 1].ctx == ctx) {
// add to the previous split
char name[GGML_MAX_NAME - 2];
int n = vsnprintf(name, sizeof(name), fmt, args);
char new_name[GGML_MAX_NAME];
snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name);
strcpy(splits->splits[splits->n_splits - 1].name, new_name);
return;
} else {
// add a new split
int i = 0;
while (inputs[i] != NULL) {
GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS);
split->src_inputs[i] = *inputs[i];
split->dst_inputs[i] = ggml_dup_tensor(ctx, *inputs[i]);
ggml_format_name(split->dst_inputs[i], "%s (split output)", split->src_inputs[i]->name);
// TODO: maybe support different layouts in ggml_backend_cpy_tensor instead
for (int j = 0; j < GGML_MAX_DIMS; j++) {
split->dst_inputs[i]->nb[j] = split->src_inputs[i]->nb[j];
@@ -413,6 +414,7 @@ }
}
split->src_inputs[i] = NULL;
split->dst_inputs[i] = NULL;
split->ctx = ctx;
}

vsnprintf(split->name, GGML_MAX_NAME, fmt, args);
@@ -493,7 +495,8 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) {
// copy the input tensor to the backend
uint64_t copy_start_us = ggml_time_us();
for (int j = 0; split->src_inputs[j] != NULL; j++) {
//printf("\tcopying tensor %d (%s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_nbytes(split->src_inputs[j]));
//printf("\tcopying tensor %d (%s) (%s -> %s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_backend_name(split->src_inputs[j]->backend), ggml_backend_name(split->dst_inputs[j]->backend), ggml_nbytes(split->src_inputs[j]));
//printf("%p %p\n", split->src_inputs[j], split->dst_inputs[j]);
ggml_backend_tensor_copy(split->src_inputs[j], split->dst_inputs[j]);
}
// ggml_backend_synchronize(split->dst_inputs[0]->backend);
@@ -705,32 +708,83 @@ void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) {

#endif

void ggml_graph_allocate_tensors(struct ggml_cgraph * graph) {
ggml_graph_allocate_tensors_n(&graph, 1);
void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx) {
ggml_graph_allocate_tensors_n(&graph, 1, ctx);
}

void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs) {
}
static bool ggml_is_view(struct ggml_tensor * t) {
Collaborator: Is this name intentional? It seems confusing considering the collision with GGML_OP_VIEW.

Collaborator (Author): I use "view" generally to mean any operation that shares the memory of its parent tensor. I think the name makes sense, but I am open to suggestions.
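For illustration, a minimal sketch of that notion of "view", assuming the mainline ggml API from before this refactor (ggml_init with mem_size/mem_buffer/no_alloc; on this branch ggml_init_params instead takes a buffer and an alloc_mode, so this would need adapting): ops such as GGML_OP_RESHAPE or GGML_OP_VIEW return a tensor whose data points into its parent's storage rather than owning any, which is the case ggml_is_view() detects so the allocator can resolve the pointer instead of reserving new memory.

#include <assert.h>
#include "ggml.h"

int main(void) {
    // Pre-refactor init params; this PR replaces mem_size/no_alloc with a
    // ggml_buffer and an alloc_mode enum, so this is illustrative only.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);

    // "Views" in the sense used above: results that alias the parent's memory.
    struct ggml_tensor * r = ggml_reshape_1d(ctx, a, 16);
    struct ggml_tensor * v = ggml_view_1d(ctx, a, 8, 4 * ggml_element_size(a));

    assert(r->data == a->data);                                      // reshape shares storage
    assert(v->data == (char *) a->data + 4 * ggml_element_size(a));  // view points into it

    ggml_free(ctx);
    return 0;
}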

return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
}

void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, struct ggml_context * ctx) {
struct ggml_buffer * buffer = ggml_get_buffer(ctx);
for (int i = 0; i < n_graphs; i++) {
struct ggml_cgraph * graph = graphs[i];
for (int j = 0; j < graph->n_leafs; j++) {
struct ggml_tensor * leaf = graph->leafs[j];
GGML_ASSERT(leaf->backend == buffer->backend_buffer->backend);
if (leaf->data == NULL) {
//printf("allocating leaf %s\n", leaf->name);
ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, leaf);
}
}

for (int j = 0; j < graph->n_nodes; j++) {
struct ggml_tensor * node = graph->nodes[j];
GGML_ASSERT(node->backend == buffer->backend_buffer->backend);
if (node->data == NULL) {
if (ggml_is_view(node)) {
size_t offset;
memcpy(&offset, node->op_params, sizeof(size_t));
switch(node->op) {
case GGML_OP_VIEW:
//printf("view %s (%s), offset %zu\n", node->name, ggml_op_name(node->op), offset);
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else {
//printf("allocating tensor %s\n", node->name);
ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
}
}
}
}
//printf("\n\n\n");
}

void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) {
bool visited[GGML_MAX_SPLITS] = {false};
for (int i = 0; i < splits->n_splits; i++) {
if (!visited[i]) {
struct ggml_graph_split * split = &splits->splits[i];
struct ggml_backend * backend = split->dst_inputs[0]->backend; // not great
struct ggml_context * ctx = split->ctx;
struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS];
int num_graphs = 0;
for (int j = i; j < splits->n_splits; j++) {
if (splits->splits[j].dst_inputs[0]->backend == backend) {
backend_graphs[num_graphs++] = splits->splits[j].graph;
if (splits->splits[j].ctx == ctx) {
backend_graphs[num_graphs] = splits->splits[j].graph;
visited[j] = true;
num_graphs++;
// TODO: need to ensure that the output tensors are never freed
// maybe this can be done automatically in ggml_graph_calc_compute_buffer_size by assuming that n_childs == 0 => output tensor
// maybe this can be done automatically in ggml_graph_allocate_tensors_n by assuming that n_childs == 0 => output tensor
}
}
ggml_graph_allocate_tensors_n(backend_graphs, num_graphs);
//printf("allocating tensors for %s [%d graphs/%d splits]\n", ggml_backend_name(ggml_get_buffer(ctx)->backend_buffer->backend), num_graphs, splits->n_splits);
ggml_graph_allocate_tensors_n(backend_graphs, num_graphs, ctx);
}
}
//printf("done allocating tensors\n");
}

11 changes: 6 additions & 5 deletions ggml-backend.h
@@ -126,9 +126,10 @@ extern "C" {

struct ggml_graph_split {
char name[GGML_MAX_NAME];
struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_cgraph * graph;
struct ggml_context * ctx;
struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_cgraph * graph;
};

// TODO: this shouldn't be fixed size, allocate from ggml_context
@@ -153,8 +154,8 @@ extern "C" {
GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);

// graph tensor allocator
GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph);
GGML_API void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs);
GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx);
GGML_API void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, struct ggml_context * ctx);
GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);

#ifdef __cplusplus
2 changes: 2 additions & 0 deletions ggml-cuda.cu
@@ -1752,6 +1752,8 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend * backend, const ggm

//ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

//printf("get tensor %s %p\n", tensor->name, tensor->data);

CUDA_CHECK(cudaMemcpyAsync(data, (const char*)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStream_main));

UNUSED(backend);
46 changes: 27 additions & 19 deletions ggml.c
@@ -3936,7 +3936,7 @@ struct ggml_context {

struct ggml_buffer * buffer;

bool no_alloc;
enum ggml_alloc_mode alloc_mode;

int n_objects;

@@ -4292,7 +4292,7 @@ static inline int ggml_up(int n, int m) {
struct ggml_init_params ggml_init_params_default(void) {
struct ggml_init_params default_params = {
/*.buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.alloc_mode =*/ GGML_ALLOC_IMMEDIATE,
/*.compute_type =*/ GGML_TYPE_F32
};
return default_params;
@@ -4386,7 +4386,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.mem_size =*/ params.buffer->mem_size,
/*.mem_buffer =*/ params.buffer->mem_buffer,
/*.buffer =*/ params.buffer,
/*.no_alloc =*/ params.no_alloc,
/*.alloc_mode =*/ params.alloc_mode,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
@@ -4435,8 +4435,8 @@ size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}

void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
ctx->no_alloc = no_alloc;
void ggml_set_alloc_mode(struct ggml_context * ctx, enum ggml_alloc_mode alloc_mode) {
ctx->alloc_mode = alloc_mode;
}

void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
@@ -4467,8 +4467,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
return max_size;
}

struct ggml_backend * ggml_get_ctx_backend(struct ggml_context * ctx) {
return ctx->buffer->backend_buffer->backend;
struct ggml_buffer * ggml_get_buffer(const struct ggml_context * ctx) {
return ctx->buffer;
}

////////////////////////////////////////////////////////////////////////////////
@@ -4520,7 +4520,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
ggml_assert_aligned(result);

*result = (struct ggml_tensor) {
/*.backend =*/ ggml_get_ctx_backend(ctx),
/*.backend =*/ ctx->buffer->backend_buffer->backend,
/*.type =*/ type,
/*.n_dims =*/ n_dims,
/*.ne =*/ { 1, 1, 1, 1 },
@@ -4537,7 +4537,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
/*.pad =*/ { 0 },
/*.padding =*/ { 0 },
};

for (int i = 0; i < n_dims; i++) {
@@ -4550,14 +4550,10 @@ struct ggml_tensor * ggml_new_tensor_impl(
result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
}

if (data == NULL && !ctx->no_alloc) {
ggml_backend_buffer_tensor_alloc(ctx->buffer->backend_buffer, result);
if (data == NULL && ctx->alloc_mode == GGML_ALLOC_IMMEDIATE) {
ggml_backend_buffer_tensor_alloc(ctx->buffer->backend_buffer, result);
}

// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
//ggml_assert_aligned(result->data);


ctx->n_objects++;

return result;
@@ -6387,7 +6383,7 @@ struct ggml_tensor * ggml_view_1d(
is_node = true;
}

struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, a->data ? (char *) a->data + offset : NULL);
ggml_format_name(result, "%s (view)", a->name);

ggml_set_op_params(result, &offset, sizeof(offset));
@@ -6418,7 +6414,7 @@ struct ggml_tensor * ggml_view_2d(

const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data ? (char *) a->data + offset : NULL);
ggml_format_name(result, "%s (view)", a->name);

ggml_set_op_params(result, &offset, sizeof(offset));
@@ -6455,7 +6451,7 @@ struct ggml_tensor * ggml_view_3d(

const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data ? (char *) a->data + offset : NULL);
ggml_format_name(result, "%s (view)", a->name);

ggml_set_op_params(result, &offset, sizeof(offset));
@@ -6494,7 +6490,7 @@ struct ggml_tensor * ggml_view_4d(

const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data ? (char *) a->data + offset : NULL);
ggml_format_name(result, "%s (view)", a->name);

ggml_set_op_params(result, &offset, sizeof(offset));
@@ -6885,6 +6881,18 @@ struct ggml_tensor * ggml_rope_inplace(
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
}

struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, false);
}

struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,