
Commit b665a51

added more presets for Nemo-based models

* updated Readme, will come back to it soon

MaggotHATE committed Aug 16, 2024
1 parent 6e084a5
Showing 16 changed files with 147,916 additions and 147,494 deletions.
Makefile (7 changes: 3 additions & 4 deletions)
@@ -444,14 +444,14 @@ OBJS_GGUF = \
ifdef OPENBLAS64
CXXFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
-LDFLAGS += $(shell pkg-config --libs openblas64)
+LDFLAGS += $(shell pkg-config --libs openblas64) --static
OBJS_GGUF += $(TMP)t_ggml-blas.o
endif # GGML_OPENBLAS

ifdef OPENBLAS
CXXFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
CFLAGS += $(shell pkg-config --cflags-only-other openblas)
-LDFLAGS += $(shell pkg-config --libs openblas)
+LDFLAGS += $(shell pkg-config --libs openblas) --static
OBJS_GGUF += $(TMP)t_ggml-blas.o
endif # GGML_OPENBLAS

@@ -563,7 +563,6 @@ endif
CXXFLAGS_CL += -lclblast -lOpenCL
CXXFLAGS_UI_CL += -lclblast -lOpenCL

-
#OBJS_GGUF_CL = $(TMP)cl_ggml-quants.o $(TMP)cl_ggml-opencl-gguf.o $(TMP)cl_ggml.o $(TMP)cl_ggml-alloc.o $(TMP)cl_ggml-backend.o $(TMP)cl_llama.o $(TMP)cl_sampling.o $(TMP)cl_common.o $(TMP)cl_grammar-parser.o
OBJS_GGUF_CL = \
$(TMP)clt_ggml.o \
@@ -583,7 +582,7 @@ OBJS_GGUF_CL = \
$(TMP)clt_unicode.o \
$(TMP)clt_unicode-data.o \
$(TMP)clt_sgemm.o

$(TMP)clt_ggml-opencl-gguf.o: $(ggmlsrc_f)/ggml-opencl.cpp $(ggmlsrc_f)/ggml-opencl.h
$(CXX) $(CXXFLAGS_CL) -c $< -o $@

README.md (1 change: 1 addition & 0 deletions)
@@ -58,6 +58,7 @@ Libraries:
* `make chat_cl` for Clblast build
* `make chat_vk` for Vulkan build
* `make chatTest`, `make chatTest_cl` and `make chatTest_vk` for building the debugging program
+* for CPU-only builds use `OPENBLAS64=1` to enable OpenBLAS (helps with prompt processing)
* if your GPU/iGPU don't support Vulkan, compile with SDL2=1
* if you need Windows console for debugging, compile with CONW=1
* see more in makefile
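Usage note (not part of the diff): with this flag, a CPU-only binary would presumably be built as `make chat OPENBLAS64=1`, taking `chat` as the base target implied by the list above; pkg-config must be able to locate `openblas64`, since the Makefile hunk above queries it for compile flags and libraries.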
base/common.cpp (6 changes: 0 additions & 6 deletions)
@@ -1294,12 +1294,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
return text;
}

-bool llama_should_add_bos_token(const llama_model * model) {
-const int add_bos = llama_add_bos_token(model);
-
-return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
//
// YAML utils
//
base/common.h (4 changes: 0 additions & 4 deletions)
@@ -217,10 +217,6 @@ std::string llama_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
//
// YAML utils
//
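Editor's sketch, not part of this commit: the two hunks above drop `llama_should_add_bos_token()` from common, and a caller that still wants the old fallback behavior could inline the deleted logic. This assumes `llama_add_bos_token()` still returns -1 when the model metadata does not specify a value, as in the removed body.

#include <stdbool.h>
#include "llama.h"

// Sketch only: the deleted helper's logic, inlined at the call site.
// -1 from llama_add_bos_token() means "not specified in metadata".
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model);
    if (add_bos != -1) {
        return add_bos != 0; // explicit metadata value wins
    }
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; // SPM models default to BOS
}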
base/ggml/ggml-backend.c (13 changes: 5 additions & 8 deletions)
@@ -1018,10 +1018,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
#define GGML_SCHED_MAX_BACKENDS 16
#endif

-#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 2048
-#endif
-
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#endif
@@ -1125,7 +1121,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
}

#if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
@@ -1549,7 +1546,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
GGML_ASSERT(sched->splits != NULL);
}
-GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
split = &sched->splits[i_split];
split->backend_id = node_backend_id;
split->i_start = i;
@@ -1865,13 +1861,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

-const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer = malloc(sched->context_buffer_size);

const int initial_splits_capacity = 16;
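Editor's sketch, not from the commit: dropping the fixed `GGML_SCHED_MAX_SPLITS` cap works because a split always starts at a graph node, so a graph of N nodes can produce at most N splits; the hunks above size the backend-id arrays and the context buffer from that bound instead of the old constant. The worst-case arithmetic, with `max_split_inputs` standing in for `GGML_SCHED_MAX_SPLIT_INPUTS` (defined as `GGML_MAX_SRC` earlier in the file):

#include <stddef.h>

// Sketch of the new worst-case sizing once the fixed split cap is gone.
// Each split may pull in up to max_split_inputs input tensors; the
// factor of 2 mirrors the allocation in ggml_backend_sched_new above.
static size_t sched_nodes_size(size_t graph_size, size_t max_split_inputs) {
    const size_t max_splits = graph_size; // one split per node at most
    return graph_size + max_splits * max_split_inputs * 2;
}

The practical effect is that very large graphs no longer trip the removed GGML_ASSERT; the cost is that these buffers now scale with the graph size rather than a constant.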
(diffs for the remaining 11 of the 16 changed files were not loaded)
