Control printing information using NEURAL_SPEED_VERBOSE (#26)
zhenwei-intel authored Jan 4, 2024
1 parent 9be307f commit a8d9e7d
Showing 20 changed files with 38 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scripts/models/cpp_graph_inference.sh
@@ -136,7 +136,7 @@ function main() {
 ## run inference
 export LANG=en_US.UTF-8
 export LC_ALL=en_US.UTF-8
-OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
+NEURAL_SPEED_VERBOSE=1 OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
 $infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p "$prompt" 2>&1 | tee ${WORKING_DIR}/${logs_file} || true &
 monitor
 done
5 changes: 1 addition & 4 deletions CMakeLists.txt
@@ -76,10 +76,7 @@ if(NE_BUILD_TESTS)
 add_compile_definitions(NE_BUILD_TESTS)
 endif()

-option(NE_PROFILING "neural_engine: use Profiling" OFF)
-if (NE_PROFILING)
-  add_compile_definitions(NE_PERF)
-endif()
+add_compile_definitions(NE_PERF)
 option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
 if (NE_BEAM_SEARCH_VERBOSE)
   add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)
9 changes: 9 additions & 0 deletions README.md
@@ -434,3 +434,12 @@ stopping_criteria = StoppingCriteriaList(

 outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria)
 ```
+
+### 6. Verbose Mode
+
+Enable verbose mode and control tracing information using the `NEURAL_SPEED_VERBOSE` environment variable.
+
+Available modes:
+- 0: Print all tracing information: both evaluation time and operator profiling.
+- 1: Print evaluation time only (the time taken for each evaluation).
+- 2: Profile individual operators only, to identify performance bottlenecks within the model.
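For illustration, here is a minimal usage sketch of the new variable (not part of the commit; it assumes `model` and `inputs` are set up as in the earlier README examples):

```python
import os

# Select mode "1" (evaluation time only) before generating.
# "0" prints everything; "2" prints operator profiling only; unset disables tracing.
os.environ["NEURAL_SPEED_VERBOSE"] = "1"

# Assuming `model` and `inputs` were created as in the README examples above,
# per-evaluation timings are printed after each generate() round:
# outputs = model.generate(inputs)
```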
2 changes: 2 additions & 0 deletions neural_speed/__init__.py
@@ -197,6 +197,8 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
 streamer.end()

 self.generate_round += 1
+if os.getenv("NEURAL_SPEED_VERBOSE") and os.getenv("NEURAL_SPEED_VERBOSE") in ["1", "0"]:
+    self.model.print_time()
 return ret

 def is_token_end(self):
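As a side note, the truthiness check in the added guard is redundant with the membership test; a hedged simplification sketch (illustrative only, not part of the commit):

```python
import os

def _should_print_time() -> bool:
    # Membership in ("0", "1") already implies the variable is set and
    # non-empty, so the separate os.getenv(...) truthiness check can be dropped.
    return os.getenv("NEURAL_SPEED_VERBOSE") in ("0", "1")
```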
3 changes: 2 additions & 1 deletion neural_speed/application/main_pybind.cpp
@@ -90,7 +90,7 @@ class Model {
 curr_input_ids.resize(params.batch_size);
 generate_count = 0;
 }
-
+void print_time() { model_print_timings(ctx); }
 static size_t np_bestla_qpack(py::array_t<int8_t> src_w, py::array_t<float> src_scales, py::array_t<int8_t> src_zeros,
                               py::array_t<int32_t> g_idx, py::array_t<int8_t> dst, const std::string& weight_dtype,
                               const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -689,5 +689,6 @@ PYBIND11_MODULE(qwen_cpp, m)
py::arg("src_w"), py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym",
py::arg("group_size") = 32, py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8",
py::arg("threads") = 8)
.def("print_time", &Model::print_time)
.def("reinit", &Model::reinit);
}
4 changes: 3 additions & 1 deletion neural_speed/application/main_run.cpp
@@ -765,7 +765,9 @@ int main(int argc, char** argv) { // NOLINT
 model_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 }

-model_print_timings(ctx);
+if (ns_log_level() == 0 || ns_log_level() == 1) {
+  model_print_timings(ctx);
+}
 model_free(ctx);

 return 0;
5 changes: 1 addition & 4 deletions neural_speed/models/baichuan/baichuan.cpp
@@ -282,12 +282,9 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/bloom/bloom.cpp
@@ -260,12 +260,9 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/chatglm/chatglm.cpp
@@ -288,12 +288,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/chatglm/chatglm2.cpp
@@ -314,12 +314,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_cached;
5 changes: 1 addition & 4 deletions neural_speed/models/falcon/falcon.cpp
@@ -322,12 +322,9 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/gptj/gptj.cpp
@@ -481,12 +481,9 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_cached;
5 changes: 1 addition & 4 deletions neural_speed/models/gptneox/gptneox.cpp
@@ -363,12 +363,9 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/llama/llama.cpp
@@ -394,12 +394,9 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_cached;
6 changes: 6 additions & 0 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -53,6 +53,12 @@
 #include "models/model_utils/util.h"
 #include "models/models.h"

+int64_t ns_log_level() {
+  const char* log_level = getenv("NEURAL_SPEED_VERBOSE");
+  if (log_level == nullptr) return -1;
+  return std::stoi(log_level);
+}
+
 //
 // kv cache
 //
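For reference, the mode-to-output mapping this helper enables across the files in this commit, sketched in Python (illustrative only; the authoritative checks are the C++ ones shown in these diffs):

```python
import os

def ns_log_level() -> int:
    # Mirrors the C++ helper above: -1 when NEURAL_SPEED_VERBOSE is unset.
    # Like std::stoi, int() raises on a non-numeric value.
    value = os.getenv("NEURAL_SPEED_VERBOSE")
    return -1 if value is None else int(value)

# Wiring introduced by this commit:
#   levels 0 and 1 -> evaluation timings (model_print_timings)
#   levels 0 and 2 -> per-operator profiling (ne_graph_profiling)
prints_eval_time = ns_log_level() in (0, 1)
profiles_operators = ns_log_level() in (0, 2)
```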
2 changes: 2 additions & 0 deletions neural_speed/models/model_utils/model_utils.h
@@ -50,6 +50,8 @@
 #define MODEL_SESSION_MAGIC MODEL_FILE_MAGIC_GGSN
 #define MODEL_SESSION_VERSION 1

+int64_t ns_log_level();
+
 void model_load_internal(const std::string& fname, model_archs arch, model_context* ctx, int n_gpu_layers,
                          bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback,
                          void* progress_callback_user_data);
5 changes: 1 addition & 4 deletions neural_speed/models/mpt/mpt.cpp
@@ -305,12 +305,9 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/opt/opt.cpp
@@ -322,12 +322,9 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/qwen/qwen.cpp
@@ -339,12 +339,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/starcoder/starcoder.cpp
@@ -393,12 +393,9 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
