From 2c46267aac45086c9c22099da7a92254956bd0d6 Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Thu, 4 Jan 2024 11:10:28 +0800
Subject: [PATCH 1/2] Control printing information using NEURAL_SPEED_VERBOSE

Signed-off-by: zhenwei-intel
---
 CMakeLists.txt                                  |  5 +----
 README.md                                       | 15 +++++++++++++++
 neural_speed/__init__.py                        |  2 ++
 neural_speed/application/main_pybind.cpp        |  3 ++-
 neural_speed/application/main_run.cpp           |  4 +++-
 neural_speed/models/baichuan/baichuan.cpp       |  5 +----
 neural_speed/models/bloom/bloom.cpp             |  5 +----
 neural_speed/models/chatglm/chatglm.cpp         |  5 +----
 neural_speed/models/chatglm/chatglm2.cpp        |  5 +----
 neural_speed/models/falcon/falcon.cpp           |  5 +----
 neural_speed/models/gptj/gptj.cpp               |  5 +----
 neural_speed/models/gptneox/gptneox.cpp         |  5 +----
 neural_speed/models/llama/llama.cpp             |  5 +----
 neural_speed/models/model_utils/model_utils.cpp |  9 +++++++++
 neural_speed/models/model_utils/model_utils.h   |  2 ++
 neural_speed/models/mpt/mpt.cpp                 |  5 +----
 neural_speed/models/opt/opt.cpp                 |  5 +----
 neural_speed/models/qwen/qwen.cpp               |  5 +----
 neural_speed/models/starcoder/starcoder.cpp     |  5 +----
 19 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69366f3a9..808d247fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,10 +76,7 @@ if(NE_BUILD_TESTS)
     add_compile_definitions(NE_BUILD_TESTS)
 endif()
 
-option(NE_PROFILING "neural_engine: use Profiling" OFF)
-if (NE_PROFILING)
-    add_compile_definitions(NE_PERF)
-endif()
+add_compile_definitions(NE_PERF)
 option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
 if (NE_BEAM_SEARCH_VERBOSE)
     add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)
diff --git a/README.md b/README.md
index 1be563b17..dd2d87c60 100644
--- a/README.md
+++ b/README.md
@@ -434,3 +434,18 @@ stopping_criteria = StoppingCriteriaList(
 
 outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria)
 ```
+
+### 6. Verbose Mode
+
+Enable verbose mode and control tracing information with the `NEURAL_SPEED_VERBOSE` environment variable.
+
+Available modes:
+- 0: Print all tracing information: evaluation time and operator profiling.
+- 1: Print evaluation time only (the time taken for each evaluation).
+- 2: Profile individual operators only, to help identify performance bottlenecks within the model.
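+
+For example, to print only per-evaluation timings while generating (a minimal sketch; `run.py` stands in for any script that calls `model.generate`):
+
+```bash
+NEURAL_SPEED_VERBOSE=1 python run.py
+```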
\ No newline at end of file
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 78a700e0e..8fd0a181b 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -189,6 +189,8 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
             streamer.end()
 
         self.generate_round += 1
+        if os.getenv("NEURAL_SPEED_VERBOSE") in ["0", "1"]:
+            self.model.print_time()
         return ret
 
     def is_token_end(self):
diff --git a/neural_speed/application/main_pybind.cpp b/neural_speed/application/main_pybind.cpp
index 8cf92bbad..83774c023 100644
--- a/neural_speed/application/main_pybind.cpp
+++ b/neural_speed/application/main_pybind.cpp
@@ -90,7 +90,7 @@ class Model {
     curr_input_ids.resize(params.batch_size);
     generate_count = 0;
   }
-
+  void print_time() { model_print_timings(ctx); }
   static size_t np_bestla_qpack(py::array_t src_w, py::array_t src_scales, py::array_t src_zeros,
                                 py::array_t g_idx, py::array_t dst, const std::string& weight_dtype,
                                 const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -689,5 +689,6 @@ PYBIND11_MODULE(qwen_cpp, m)
            py::arg("src_w"), py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym",
            py::arg("group_size") = 32, py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8",
            py::arg("threads") = 8)
+      .def("print_time", &Model::print_time)
       .def("reinit", &Model::reinit);
 }
diff --git a/neural_speed/application/main_run.cpp b/neural_speed/application/main_run.cpp
index 6c1a09d2a..b48d9d192 100644
--- a/neural_speed/application/main_run.cpp
+++ b/neural_speed/application/main_run.cpp
@@ -765,7 +765,9 @@ int main(int argc, char** argv) {  // NOLINT
     model_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
   }
 
-  model_print_timings(ctx);
+  if (ns_log_level() == 0 || ns_log_level() == 1) {
+    model_print_timings(ctx);
+  }
   model_free(ctx);
 
   return 0;
diff --git a/neural_speed/models/baichuan/baichuan.cpp b/neural_speed/models/baichuan/baichuan.cpp
index 608902f9d..589d64016 100644
--- a/neural_speed/models/baichuan/baichuan.cpp
+++ b/neural_speed/models/baichuan/baichuan.cpp
@@ -282,12 +282,9 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/bloom/bloom.cpp b/neural_speed/models/bloom/bloom.cpp
index 3f7ce5958..d09bcdbc7 100644
--- a/neural_speed/models/bloom/bloom.cpp
+++ b/neural_speed/models/bloom/bloom.cpp
@@ -260,12 +260,9 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/chatglm/chatglm.cpp b/neural_speed/models/chatglm/chatglm.cpp
index 243cccb52..68836e5cb 100644
--- a/neural_speed/models/chatglm/chatglm.cpp
+++ b/neural_speed/models/chatglm/chatglm.cpp
@@ -288,12 +288,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/chatglm/chatglm2.cpp b/neural_speed/models/chatglm/chatglm2.cpp
index cd9c384a7..03acfd19b 100644
--- a/neural_speed/models/chatglm/chatglm2.cpp
+++ b/neural_speed/models/chatglm/chatglm2.cpp
@@ -314,12 +314,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
diff --git a/neural_speed/models/falcon/falcon.cpp b/neural_speed/models/falcon/falcon.cpp
index e866fd94a..35bfe9026 100644
--- a/neural_speed/models/falcon/falcon.cpp
+++ b/neural_speed/models/falcon/falcon.cpp
@@ -322,12 +322,9 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/gptj/gptj.cpp b/neural_speed/models/gptj/gptj.cpp
index 1ee917510..34a33ad3f 100644
--- a/neural_speed/models/gptj/gptj.cpp
+++ b/neural_speed/models/gptj/gptj.cpp
@@ -481,12 +481,9 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
diff --git a/neural_speed/models/gptneox/gptneox.cpp b/neural_speed/models/gptneox/gptneox.cpp
index 93784e709..5a7f36582 100644
--- a/neural_speed/models/gptneox/gptneox.cpp
+++ b/neural_speed/models/gptneox/gptneox.cpp
@@ -363,12 +363,9 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp
index 9759b4a6e..e88fcd85c 100644
--- a/neural_speed/models/llama/llama.cpp
+++ b/neural_speed/models/llama/llama.cpp
@@ -394,12 +394,9 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
diff --git a/neural_speed/models/model_utils/model_utils.cpp b/neural_speed/models/model_utils/model_utils.cpp
index dbcb29b8c..af9480152 100644
--- a/neural_speed/models/model_utils/model_utils.cpp
+++ b/neural_speed/models/model_utils/model_utils.cpp
@@ -53,6 +53,15 @@
 #include "models/model_utils/util.h"
 #include "models/models.h"
 
+// Parse NEURAL_SPEED_VERBOSE: returns -1 when the variable is unset or not a
+// number, otherwise the requested verbosity level.
+int64_t ns_log_level() {
+  const char* log_level = getenv("NEURAL_SPEED_VERBOSE");
+  if (log_level == nullptr) return -1;
+  char* end = nullptr;
+  const int64_t level = strtol(log_level, &end, 10);
+  return end == log_level ? -1 : level;
+}
+
 //
 // kv cache
 //
diff --git a/neural_speed/models/model_utils/model_utils.h b/neural_speed/models/model_utils/model_utils.h
index 1873c3893..ead9e0cf8 100644
--- a/neural_speed/models/model_utils/model_utils.h
+++ b/neural_speed/models/model_utils/model_utils.h
@@ -50,6 +50,8 @@
 #define MODEL_SESSION_MAGIC MODEL_FILE_MAGIC_GGSN
 #define MODEL_SESSION_VERSION 1
 
+int64_t ns_log_level();
+
 void model_load_internal(const std::string& fname, model_archs arch, model_context* ctx, int n_gpu_layers,
                          bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback,
                          void* progress_callback_user_data);
diff --git a/neural_speed/models/mpt/mpt.cpp b/neural_speed/models/mpt/mpt.cpp
index fcff0cae6..cc5470112 100644
--- a/neural_speed/models/mpt/mpt.cpp
+++ b/neural_speed/models/mpt/mpt.cpp
@@ -305,12 +305,9 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/opt/opt.cpp b/neural_speed/models/opt/opt.cpp
index 5aa02578d..d1f8736b0 100644
--- a/neural_speed/models/opt/opt.cpp
+++ b/neural_speed/models/opt/opt.cpp
@@ -322,12 +322,9 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp
index ea98e0785..4fce7776e 100644
--- a/neural_speed/models/qwen/qwen.cpp
+++ b/neural_speed/models/qwen/qwen.cpp
@@ -339,12 +339,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/starcoder/starcoder.cpp b/neural_speed/models/starcoder/starcoder.cpp
index 6c8c34353..f3493158c 100644
--- a/neural_speed/models/starcoder/starcoder.cpp
+++ b/neural_speed/models/starcoder/starcoder.cpp
@@ -393,12 +393,9 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;

From dfbeb93af59d4e2646cb203919fe1c39c642d276 Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Thu, 4 Jan 2024 11:20:05 +0800
Subject: [PATCH 2/2] fix test

Signed-off-by: zhenwei-intel
---
 .github/workflows/scripts/models/cpp_graph_inference.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/models/cpp_graph_inference.sh b/.github/workflows/scripts/models/cpp_graph_inference.sh
index a80a2be5a..f7632830d 100644
--- a/.github/workflows/scripts/models/cpp_graph_inference.sh
+++ b/.github/workflows/scripts/models/cpp_graph_inference.sh
@@ -136,7 +136,7 @@ function main() {
         ## run inference
         export LANG=en_US.UTF-8
         export LC_ALL=en_US.UTF-8
-        OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
+        NEURAL_SPEED_VERBOSE=1 OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
             $infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p "$prompt" 2>&1 | tee ${WORKING_DIR}/${logs_file} || true & monitor
     done
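
To exercise the new switch locally, a minimal sketch mirroring the CI invocation above (the binary and model file names are placeholders, not taken from this series; the flags are the ones used by the CI script):

```bash
# NEURAL_SPEED_VERBOSE semantics after this series:
#   unset (-1) -> quiet; 0 -> timings + per-operator profile;
#   1 -> timings only; 2 -> per-operator profile only
NEURAL_SPEED_VERBOSE=1 ./run_llama --seed 1234 -t 8 -c 512 -n 32 -m llama-int4.bin -p "Once upon a time"
NEURAL_SPEED_VERBOSE=2 ./run_llama --seed 1234 -t 8 -c 512 -n 32 -m llama-int4.bin -p "Once upon a time"
```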