Control printing information using NEURAL_SPEED_VERBOSE (#26)
zhenwei-intel authored Jan 4, 2024
1 parent 9be307f commit a8d9e7d
Showing 20 changed files with 38 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scripts/models/cpp_graph_inference.sh
@@ -136,7 +136,7 @@ function main() {
 ## run inference
 export LANG=en_US.UTF-8
 export LC_ALL=en_US.UTF-8
-OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
+NEURAL_SPEED_VERBOSE=1 OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
 $infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p "$prompt" 2>&1 | tee ${WORKING_DIR}/${logs_file} || true &
 monitor
 done
5 changes: 1 addition & 4 deletions CMakeLists.txt
@@ -76,10 +76,7 @@ if(NE_BUILD_TESTS)
 add_compile_definitions(NE_BUILD_TESTS)
 endif()

-option(NE_PROFILING "neural_engine: use Profiling" OFF)
-if (NE_PROFILING)
-  add_compile_definitions(NE_PERF)
-endif()
+add_compile_definitions(NE_PERF)
 option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
 if (NE_BEAM_SEARCH_VERBOSE)
   add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)
9 changes: 9 additions & 0 deletions README.md
@@ -434,3 +434,12 @@ stopping_criteria = StoppingCriteriaList(

 outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria)
 ```
+
+### 6. Verbose Mode
+
+Enable verbose mode and control tracing information using the `NEURAL_SPEED_VERBOSE` environment variable.
+
+Available modes:
+- 0: Print all tracing information: both evaluation time and operator profiling.
+- 1: Print evaluation time only (the time taken for each evaluation).
+- 2: Profile individual operators only, to identify performance bottlenecks within the model.
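For illustration, here is a minimal usage sketch of the new variable (not part of the commit; it assumes `model` and `inputs` are set up as in the earlier README examples):

```python
import os

# Select mode "1" (evaluation time only) before generating.
# "0" prints everything; "2" prints operator profiling only; unset disables tracing.
os.environ["NEURAL_SPEED_VERBOSE"] = "1"

# Assuming `model` and `inputs` were created as in the README examples above,
# per-evaluation timings are printed after each generate() round:
# outputs = model.generate(inputs)
```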
2 changes: 2 additions & 0 deletions neural_speed/__init__.py
@@ -197,6 +197,8 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
 streamer.end()

 self.generate_round += 1
+if os.getenv("NEURAL_SPEED_VERBOSE") and os.getenv("NEURAL_SPEED_VERBOSE") in ["1", "0"]:
+    self.model.print_time()
 return ret

 def is_token_end(self):
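As a side note, the truthiness check in the added guard is redundant with the membership test; a hedged simplification sketch (illustrative only, not part of the commit):

```python
import os

def _should_print_time() -> bool:
    # Membership in ("0", "1") already implies the variable is set and
    # non-empty, so the separate os.getenv(...) truthiness check can be dropped.
    return os.getenv("NEURAL_SPEED_VERBOSE") in ("0", "1")
```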
3 changes: 2 additions & 1 deletion neural_speed/application/main_pybind.cpp
@@ -90,7 +90,7 @@ class Model {
 curr_input_ids.resize(params.batch_size);
 generate_count = 0;
 }
-
+void print_time() { model_print_timings(ctx); }
 static size_t np_bestla_qpack(py::array_t<int8_t> src_w, py::array_t<float> src_scales, py::array_t<int8_t> src_zeros,
                               py::array_t<int32_t> g_idx, py::array_t<int8_t> dst, const std::string& weight_dtype,
                               const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -689,5 +689,6 @@ PYBIND11_MODULE(qwen_cpp, m)
py::arg("src_w"), py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym",
py::arg("group_size") = 32, py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8",
py::arg("threads") = 8)
.def("print_time", &Model::print_time)
.def("reinit", &Model::reinit);
}
4 changes: 3 additions & 1 deletion neural_speed/application/main_run.cpp
@@ -765,7 +765,9 @@ int main(int argc, char** argv) { // NOLINT
 model_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 }

-model_print_timings(ctx);
+if (ns_log_level() == 0 || ns_log_level() == 1) {
+  model_print_timings(ctx);
+}
 model_free(ctx);

 return 0;
5 changes: 1 addition & 4 deletions neural_speed/models/baichuan/baichuan.cpp
@@ -282,12 +282,9 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/bloom/bloom.cpp
@@ -260,12 +260,9 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/chatglm/chatglm.cpp
@@ -288,12 +288,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/chatglm/chatglm2.cpp
@@ -314,12 +314,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_cached;
5 changes: 1 addition & 4 deletions neural_speed/models/falcon/falcon.cpp
@@ -322,12 +322,9 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/gptj/gptj.cpp
@@ -481,12 +481,9 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_cached;
5 changes: 1 addition & 4 deletions neural_speed/models/gptneox/gptneox.cpp
@@ -363,12 +363,9 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/llama/llama.cpp
@@ -394,12 +394,9 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_cached;
6 changes: 6 additions & 0 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -53,6 +53,12 @@
 #include "models/model_utils/util.h"
 #include "models/models.h"

+int64_t ns_log_level() {
+  const char* log_level = getenv("NEURAL_SPEED_VERBOSE");
+  if (log_level == nullptr) return -1;
+  return std::stoi(log_level);
+}
+
 //
 // kv cache
 //
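For reference, the mode-to-output mapping this helper enables across the files in this commit, sketched in Python (illustrative only; the authoritative checks are the C++ ones shown in these diffs):

```python
import os

def ns_log_level() -> int:
    # Mirrors the C++ helper above: -1 when NEURAL_SPEED_VERBOSE is unset.
    # Like std::stoi, int() raises on a non-numeric value.
    value = os.getenv("NEURAL_SPEED_VERBOSE")
    return -1 if value is None else int(value)

# Wiring introduced by this commit:
#   levels 0 and 1 -> evaluation timings (model_print_timings)
#   levels 0 and 2 -> per-operator profiling (ne_graph_profiling)
prints_eval_time = ns_log_level() in (0, 1)
profiles_operators = ns_log_level() in (0, 2)
```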
2 changes: 2 additions & 0 deletions neural_speed/models/model_utils/model_utils.h
@@ -50,6 +50,8 @@
 #define MODEL_SESSION_MAGIC MODEL_FILE_MAGIC_GGSN
 #define MODEL_SESSION_VERSION 1

+int64_t ns_log_level();
+
 void model_load_internal(const std::string& fname, model_archs arch, model_context* ctx, int n_gpu_layers,
                          bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback,
                          void* progress_callback_user_data);
5 changes: 1 addition & 4 deletions neural_speed/models/mpt/mpt.cpp
@@ -305,12 +305,9 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/opt/opt.cpp
@@ -322,12 +322,9 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/qwen/qwen.cpp
@@ -339,12 +339,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/starcoder/starcoder.cpp
@@ -393,12 +393,9 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
 ne_build_forward_expand(&gf, inpL);
 ne_graph_compute(ctx0, &gf);

-#ifdef NE_PERF
-bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-if (engine_profiling_) {
+if (ns_log_level() == 0 || ns_log_level() == 2) {
   ne_graph_profiling(&gf);
 }
-#endif

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
