This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Control printing information using NEURAL_SPEED_VERBOSE #26

Merged: 2 commits, Jan 4, 2024
2 changes: 1 addition & 1 deletion .github/workflows/scripts/models/cpp_graph_inference.sh
@@ -136,7 +136,7 @@ function main() {
         ## run inference
         export LANG=en_US.UTF-8
         export LC_ALL=en_US.UTF-8
-        OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
+        NEURAL_SPEED_VERBOSE=1 OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
             $infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p "$prompt" 2>&1 | tee ${WORKING_DIR}/${logs_file} || true &
         monitor
     done
5 changes: 1 addition & 4 deletions CMakeLists.txt
@@ -76,10 +76,7 @@ if(NE_BUILD_TESTS)
   add_compile_definitions(NE_BUILD_TESTS)
 endif()
 
-option(NE_PROFILING "neural_engine: use Profiling" OFF)
-if (NE_PROFILING)
-  add_compile_definitions(NE_PERF)
-endif()
+add_compile_definitions(NE_PERF)
 option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
 if (NE_BEAM_SEARCH_VERBOSE)
   add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)
9 changes: 9 additions & 0 deletions README.md
@@ -434,3 +434,12 @@ stopping_criteria = StoppingCriteriaList(
 
 outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria)
 ```
+
+### 6. Verbose Mode
+
+Enable verbose mode and control tracing information using the `NEURAL_SPEED_VERBOSE` environment variable.
+
+Available modes:
+- 0: Print all tracing information: comprehensive output, including evaluation time and operator profiling.
+- 1: Print evaluation time only: the time taken for each evaluation.
+- 2: Profile individual operators only, to identify performance bottlenecks within the model.
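As a usage sketch (not part of the diff), assuming the Python `Model` API from `neural_speed/__init__.py` and a placeholder model type/path; the variable only needs to be present in the environment before `generate()` runs:

```python
import os

# "0" = all tracing, "1" = evaluation time, "2" = operator profiling.
os.environ["NEURAL_SPEED_VERBOSE"] = "1"

from neural_speed import Model

model = Model()
model.init_from_bin("llama", "llama-2-7b-q4_0.bin")  # placeholder type/path
# outputs = model.generate(inputs)  # eval time prints after each round
```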
2 changes: 2 additions & 0 deletions neural_speed/__init__.py
@@ -189,6 +189,8 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
             streamer.end()
 
         self.generate_round += 1
+        if os.getenv("NEURAL_SPEED_VERBOSE") and os.getenv("NEURAL_SPEED_VERBOSE") in ["1", "0"]:
+            self.model.print_time()
         return ret
 
     def is_token_end(self):
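A side note, sketched outside the diff: `os.getenv` returns `None` for an unset variable, and `None in ["1", "0"]` is already `False`, so the guard above could be reduced to the membership test alone. A minimal equivalent:

```python
import os

def should_print_time() -> bool:
    # Unset (None) and any value other than "0"/"1" both yield False,
    # matching the two-step check in the hunk above.
    return os.getenv("NEURAL_SPEED_VERBOSE") in ("0", "1")
```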
3 changes: 2 additions & 1 deletion neural_speed/application/main_pybind.cpp
@@ -90,7 +90,7 @@ class Model {
     curr_input_ids.resize(params.batch_size);
     generate_count = 0;
   }
-
+  void print_time() { model_print_timings(ctx); }
   static size_t np_bestla_qpack(py::array_t<int8_t> src_w, py::array_t<float> src_scales, py::array_t<int8_t> src_zeros,
                                 py::array_t<int32_t> g_idx, py::array_t<int8_t> dst, const std::string& weight_dtype,
                                 const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -689,5 +689,6 @@ PYBIND11_MODULE(qwen_cpp, m)
       py::arg("src_w"), py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym",
       py::arg("group_size") = 32, py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8",
       py::arg("threads") = 8)
+      .def("print_time", &Model::print_time)
       .def("reinit", &Model::reinit);
 }
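With the binding in place, the timer is reachable from Python. A hypothetical direct call, assuming the pybind11 class is exported as `Model` in the per-model module (the hunk above is from the `qwen_cpp` build):

```python
import qwen_cpp  # compiled pybind11 extension; one such module per model

model = qwen_cpp.Model()
# ... model initialization and generation calls omitted for brevity ...
model.print_time()  # forwards to model_print_timings(ctx) on the C++ side
```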
4 changes: 3 additions & 1 deletion neural_speed/application/main_run.cpp
@@ -765,7 +765,9 @@ int main(int argc, char** argv) { // NOLINT
     model_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
   }
 
-  model_print_timings(ctx);
+  if (ns_log_level() == 0 || ns_log_level() == 1) {
+    model_print_timings(ctx);
+  }
   model_free(ctx);
 
   return 0;
5 changes: 1 addition & 4 deletions neural_speed/models/baichuan/baichuan.cpp
@@ -282,12 +282,9 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
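This same four-line change is repeated in every model backend that follows (bloom, chatglm, chatglm2, falcon, gptj, gptneox, llama, mpt, opt, qwen, starcoder). Combined with the 0/1 gate in main_run.cpp above, the level-to-report mapping works out as in this illustrative sketch:

```python
def reports_for(level: int) -> list[str]:
    """Reports enabled by a NEURAL_SPEED_VERBOSE level: the 0/1 gate lives
    in main_run.cpp, the 0/2 gate in each model's eval function."""
    reports = []
    if level in (0, 1):
        reports.append("evaluation timings (model_print_timings)")
    if level in (0, 2):
        reports.append("per-operator profiling (ne_graph_profiling)")
    return reports  # level 0 enables both; unset (-1) enables neither
```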
5 changes: 1 addition & 4 deletions neural_speed/models/bloom/bloom.cpp
@@ -260,12 +260,9 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/chatglm/chatglm.cpp
@@ -288,12 +288,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/chatglm/chatglm2.cpp
@@ -314,12 +314,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
5 changes: 1 addition & 4 deletions neural_speed/models/falcon/falcon.cpp
@@ -322,12 +322,9 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/gptj/gptj.cpp
@@ -481,12 +481,9 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
5 changes: 1 addition & 4 deletions neural_speed/models/gptneox/gptneox.cpp
@@ -363,12 +363,9 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/llama/llama.cpp
@@ -394,12 +394,9 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_cached;
6 changes: 6 additions & 0 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -53,6 +53,12 @@
 #include "models/model_utils/util.h"
 #include "models/models.h"
 
+int64_t ns_log_level() {
+  const char* log_level = getenv("NEURAL_SPEED_VERBOSE");
+  if (log_level == nullptr) return -1;
+  return std::stoi(log_level);
+}
+
 //
 // kv cache
 //
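One caveat worth flagging: `std::stoi` throws on non-numeric input, so a value like `NEURAL_SPEED_VERBOSE=yes` would raise `std::invalid_argument` and terminate the process. A Python sketch of the same contract (unset maps to -1) that instead tolerates malformed values, for illustration only:

```python
import os

def ns_log_level() -> int:
    """Mirrors the C++ helper: -1 when NEURAL_SPEED_VERBOSE is unset."""
    value = os.getenv("NEURAL_SPEED_VERBOSE")
    if value is None:
        return -1
    try:
        return int(value)
    except ValueError:
        return -1  # std::stoi in the C++ version would throw here
```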
2 changes: 2 additions & 0 deletions neural_speed/models/model_utils/model_utils.h
@@ -50,6 +50,8 @@
 #define MODEL_SESSION_MAGIC MODEL_FILE_MAGIC_GGSN
 #define MODEL_SESSION_VERSION 1
 
+int64_t ns_log_level();
+
 void model_load_internal(const std::string& fname, model_archs arch, model_context* ctx, int n_gpu_layers,
                          bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback,
                          void* progress_callback_user_data);
5 changes: 1 addition & 4 deletions neural_speed/models/mpt/mpt.cpp
@@ -305,12 +305,9 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/opt/opt.cpp
@@ -322,12 +322,9 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/qwen/qwen.cpp
@@ -339,12 +339,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;
5 changes: 1 addition & 4 deletions neural_speed/models/starcoder/starcoder.cpp
@@ -393,12 +393,9 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
   ne_build_forward_expand(&gf, inpL);
   ne_graph_compute(ctx0, &gf);
 
-#ifdef NE_PERF
-  bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
-  if (engine_profiling_) {
+  if (ns_log_level() == 0 || ns_log_level() == 2) {
     ne_graph_profiling(&gf);
   }
-#endif
 
   // update kv token count
   lctx.model.kv_self.n = n_past + N;