From 2c46267aac45086c9c22099da7a92254956bd0d6 Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Thu, 4 Jan 2024 11:10:28 +0800
Subject: [PATCH 1/2] Control printing information using NEURAL_SPEED_VERBOSE

Signed-off-by: zhenwei-intel
---
 CMakeLists.txt                                  |  5 +----
 README.md                                       | 15 +++++++++++++++
 neural_speed/__init__.py                        |  2 ++
 neural_speed/application/main_pybind.cpp        |  3 ++-
 neural_speed/application/main_run.cpp           |  4 +++-
 neural_speed/models/baichuan/baichuan.cpp       |  5 +----
 neural_speed/models/bloom/bloom.cpp             |  5 +----
 neural_speed/models/chatglm/chatglm.cpp         |  5 +----
 neural_speed/models/chatglm/chatglm2.cpp        |  5 +----
 neural_speed/models/falcon/falcon.cpp           |  5 +----
 neural_speed/models/gptj/gptj.cpp               |  5 +----
 neural_speed/models/gptneox/gptneox.cpp         |  5 +----
 neural_speed/models/llama/llama.cpp             |  5 +----
 neural_speed/models/model_utils/model_utils.cpp |  9 +++++++++
 neural_speed/models/model_utils/model_utils.h   |  2 ++
 neural_speed/models/mpt/mpt.cpp                 |  5 +----
 neural_speed/models/opt/opt.cpp                 |  5 +----
 neural_speed/models/qwen/qwen.cpp               |  5 +----
 neural_speed/models/starcoder/starcoder.cpp     |  5 +----
 19 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69366f3a9..808d247fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,10 +76,7 @@ if(NE_BUILD_TESTS)
     add_compile_definitions(NE_BUILD_TESTS)
 endif()
 
-option(NE_PROFILING "neural_engine: use Profiling" OFF)
-if (NE_PROFILING)
-    add_compile_definitions(NE_PERF)
-endif()
+add_compile_definitions(NE_PERF)
 option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
 if (NE_BEAM_SEARCH_VERBOSE)
     add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)
diff --git a/README.md b/README.md
index 1be563b17..dd2d87c60 100644
--- a/README.md
+++ b/README.md
@@ -434,3 +434,18 @@ stopping_criteria = StoppingCriteriaList(
 
 outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria)
 ```
+
+### 6. Verbose Mode
+
+Enable verbose mode and control tracing information with the `NEURAL_SPEED_VERBOSE` environment variable.
+
+Available modes:
+- 0: Print all tracing information: evaluation time and operator profiling.
+- 1: Print evaluation time only (the time taken for each evaluation).
+- 2: Profile individual operators only, to help identify performance bottlenecks within the model.
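+
+For example, to print only per-evaluation timings while generating (a minimal sketch; `run.py` stands in for any script that calls `model.generate`):
+
+```bash
+NEURAL_SPEED_VERBOSE=1 python run.py
+```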
\ No newline at end of file
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 78a700e0e..8fd0a181b 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -189,6 +189,8 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
             streamer.end()
 
         self.generate_round += 1
+        if os.getenv("NEURAL_SPEED_VERBOSE") in ["0", "1"]:
+            self.model.print_time()
         return ret
 
     def is_token_end(self):
diff --git a/neural_speed/application/main_pybind.cpp b/neural_speed/application/main_pybind.cpp
index 8cf92bbad..83774c023 100644
--- a/neural_speed/application/main_pybind.cpp
+++ b/neural_speed/application/main_pybind.cpp
@@ -90,7 +90,7 @@ class Model {
     curr_input_ids.resize(params.batch_size);
     generate_count = 0;
   }
-
+  void print_time() { model_print_timings(ctx); }
   static size_t np_bestla_qpack(py::array_t src_w, py::array_t src_scales, py::array_t src_zeros,
                                 py::array_t g_idx, py::array_t dst, const std::string& weight_dtype,
                                 const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -689,5 +689,6 @@ PYBIND11_MODULE(qwen_cpp, m)
            py::arg("src_w"), py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym",
            py::arg("group_size") = 32, py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8",
            py::arg("threads") = 8)
+      .def("print_time", &Model::print_time)
       .def("reinit", &Model::reinit);
 }
diff --git a/neural_speed/application/main_run.cpp b/neural_speed/application/main_run.cpp
index 6c1a09d2a..b48d9d192 100644
--- a/neural_speed/application/main_run.cpp
+++ b/neural_speed/application/main_run.cpp
@@ -765,7 +765,9 @@ int main(int argc, char** argv) {  // NOLINT
     model_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
   }
 
-  model_print_timings(ctx);
+  if (ns_log_level() == 0 || ns_log_level() == 1) {
+    model_print_timings(ctx);
+  }
   model_free(ctx);
 
   return 0;
diff --git a/neural_speed/models/baichuan/baichuan.cpp b/neural_speed/models/baichuan/baichuan.cpp
index 608902f9d..589d64016 100644
--- a/neural_speed/models/baichuan/baichuan.cpp
+++ b/neural_speed/models/baichuan/baichuan.cpp
@@ -282,12 +282,9 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/bloom/bloom.cpp b/neural_speed/models/bloom/bloom.cpp
index 3f7ce5958..d09bcdbc7 100644
--- a/neural_speed/models/bloom/bloom.cpp
+++ b/neural_speed/models/bloom/bloom.cpp
@@ -260,12 +260,9 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/chatglm/chatglm.cpp b/neural_speed/models/chatglm/chatglm.cpp
index 243cccb52..68836e5cb 100644
--- a/neural_speed/models/chatglm/chatglm.cpp
+++ b/neural_speed/models/chatglm/chatglm.cpp
@@ -288,12 +288,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/chatglm/chatglm2.cpp b/neural_speed/models/chatglm/chatglm2.cpp
index cd9c384a7..03acfd19b 100644
--- a/neural_speed/models/chatglm/chatglm2.cpp
+++ b/neural_speed/models/chatglm/chatglm2.cpp
@@ -314,12 +314,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
diff --git a/neural_speed/models/falcon/falcon.cpp b/neural_speed/models/falcon/falcon.cpp
index e866fd94a..35bfe9026 100644
--- a/neural_speed/models/falcon/falcon.cpp
+++ b/neural_speed/models/falcon/falcon.cpp
@@ -322,12 +322,9 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/gptj/gptj.cpp b/neural_speed/models/gptj/gptj.cpp
index 1ee917510..34a33ad3f 100644
--- a/neural_speed/models/gptj/gptj.cpp
+++ b/neural_speed/models/gptj/gptj.cpp
@@ -481,12 +481,9 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
diff --git a/neural_speed/models/gptneox/gptneox.cpp b/neural_speed/models/gptneox/gptneox.cpp
index 93784e709..5a7f36582 100644
--- a/neural_speed/models/gptneox/gptneox.cpp
+++ b/neural_speed/models/gptneox/gptneox.cpp
@@ -363,12 +363,9 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp
index 9759b4a6e..e88fcd85c 100644
--- a/neural_speed/models/llama/llama.cpp
+++ b/neural_speed/models/llama/llama.cpp
@@ -394,12 +394,9 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
diff --git a/neural_speed/models/model_utils/model_utils.cpp b/neural_speed/models/model_utils/model_utils.cpp
index dbcb29b8c..af9480152 100644
--- a/neural_speed/models/model_utils/model_utils.cpp
+++ b/neural_speed/models/model_utils/model_utils.cpp
@@ -53,6 +53,15 @@
 #include "models/model_utils/util.h"
 #include "models/models.h"
 
+// Parse NEURAL_SPEED_VERBOSE: returns -1 when the variable is unset or not a
+// number, otherwise the requested verbosity level.
+int64_t ns_log_level() {
+  const char* log_level = getenv("NEURAL_SPEED_VERBOSE");
+  if (log_level == nullptr) return -1;
+  char* end = nullptr;
+  const int64_t level = strtol(log_level, &end, 10);
+  return end == log_level ? -1 : level;
+}
+
 //
 // kv cache
 //
diff --git a/neural_speed/models/model_utils/model_utils.h b/neural_speed/models/model_utils/model_utils.h
index 1873c3893..ead9e0cf8 100644
--- a/neural_speed/models/model_utils/model_utils.h
+++ b/neural_speed/models/model_utils/model_utils.h
@@ -50,6 +50,8 @@
 #define MODEL_SESSION_MAGIC MODEL_FILE_MAGIC_GGSN
 #define MODEL_SESSION_VERSION 1
 
+int64_t ns_log_level();
+
 void model_load_internal(const std::string& fname, model_archs arch, model_context* ctx, int n_gpu_layers,
                          bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback,
                          void* progress_callback_user_data);
diff --git a/neural_speed/models/mpt/mpt.cpp b/neural_speed/models/mpt/mpt.cpp
index fcff0cae6..cc5470112 100644
--- a/neural_speed/models/mpt/mpt.cpp
+++ b/neural_speed/models/mpt/mpt.cpp
@@ -305,12 +305,9 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/opt/opt.cpp b/neural_speed/models/opt/opt.cpp
index 5aa02578d..d1f8736b0 100644
--- a/neural_speed/models/opt/opt.cpp
+++ b/neural_speed/models/opt/opt.cpp
@@ -322,12 +322,9 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp
index ea98e0785..4fce7776e 100644
--- a/neural_speed/models/qwen/qwen.cpp
+++ b/neural_speed/models/qwen/qwen.cpp
@@ -339,12 +339,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
diff --git a/neural_speed/models/starcoder/starcoder.cpp b/neural_speed/models/starcoder/starcoder.cpp
index 6c8c34353..f3493158c 100644
--- a/neural_speed/models/starcoder/starcoder.cpp
+++ b/neural_speed/models/starcoder/starcoder.cpp
@@ -393,12 +393,9 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;

From dfbeb93af59d4e2646cb203919fe1c39c642d276 Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Thu, 4 Jan 2024 11:20:05 +0800
Subject: [PATCH 2/2] fix test

Signed-off-by: zhenwei-intel
---
 .github/workflows/scripts/models/cpp_graph_inference.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/models/cpp_graph_inference.sh b/.github/workflows/scripts/models/cpp_graph_inference.sh
index a80a2be5a..f7632830d 100644
--- a/.github/workflows/scripts/models/cpp_graph_inference.sh
+++ b/.github/workflows/scripts/models/cpp_graph_inference.sh
@@ -136,7 +136,7 @@ function main() {
         ## run inference
         export LANG=en_US.UTF-8
         export LC_ALL=en_US.UTF-8
-        OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
+        NEURAL_SPEED_VERBOSE=1 OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
             $infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p "$prompt" 2>&1 | tee ${WORKING_DIR}/${logs_file} || true & monitor
     done
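
To exercise the new switch locally, a minimal sketch mirroring the CI invocation above (the binary and model file names are placeholders, not taken from this series; the flags are the ones used by the CI script):

```bash
# NEURAL_SPEED_VERBOSE semantics after this series:
#   unset (-1) -> quiet; 0 -> timings + per-operator profile;
#   1 -> timings only; 2 -> per-operator profile only
NEURAL_SPEED_VERBOSE=1 ./run_llama --seed 1234 -t 8 -c 512 -n 32 -m llama-int4.bin -p "Once upon a time"
NEURAL_SPEED_VERBOSE=2 ./run_llama --seed 1234 -t 8 -c 512 -n 32 -m llama-int4.bin -p "Once upon a time"
```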