Show warning when using a quantized base model

ggerganov · Apr 15, 2023 · 061f1ce · 061f1ce
1 parent ecd4827
commit 061f1ce
Showing 1 changed file with 10 additions and 3 deletions.
diff --git a/llama.cpp b/llama.cpp
@@ -1842,9 +1842,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, false));
     }
 
-    fprintf(stderr, "%s: ", __func__);
-
     // read tensors and apply
+    bool warned = false;
     int n_tensors = 0;
     while (true) {
         int32_t n_dims;
@@ -1937,6 +1936,14 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 base_t = dest_t;
             }
 
+            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+                if (!warned) {
+                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                                    "use a f16 or f32 base model with --lora-base\n", __func__);
+                    warned = true;
+                }
+            }
+
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
@@ -1973,7 +1980,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();
 
             n_tensors++;
-            if (n_tensors % 8 == 0)
+            if (n_tensors % 4 == 0)
                 fprintf(stderr, ".");
         }
     }