./bin/perplexity --hellaswag -f ../hellaswag_text_data/hellaswag_val_full.txt -m ../models/L2_7B/ggml-model-f16.gguf -t 1 -ngl 100 --hellaswag-tasks 400
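For reference, a gloss of the flags above (shell comments only; descriptions paraphrase the llama.cpp perplexity example's usage):

  # --hellaswag            compute the HellaSwag score instead of raw perplexity
  # -f <file>              HellaSwag validation data converted to the expected text format
  # -m <model>             LLaMA-2 7B in GGUF format with f16 weights
  # -t 1                   one CPU thread; the heavy lifting is offloaded to the GPU
  # -ngl 100               offload up to 100 layers to the GPU (all 35 offloadable layers here)
  # --hellaswag-tasks 400  score 400 of the validation tasks rather than the full set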
main: build = 1065 (154725c)
main: seed = 1692977821
ggml_init_cublas: found 1 CUDA devices:
Device 0: NVIDIA GeForce RTX 4080, compute capability 8.9
llama_model_loader: loaded meta data with 14 key-value pairs and 291 tensors from ../models/L2_7B/ggml-model-f16.gguf (version GGUF V1 (latest))
llama_model_loader: - tensor 0: token_embd.weight f16 [ 4096, 32000, 1, 1 ]
llama_model_loader: - tensor 1: output_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 2: output.weight f16 [ 4096, 32000, 1, 1 ]
llama_model_loader: - tensor 3: blk.0.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 4: blk.0.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 5: blk.0.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 6: blk.0.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 7: blk.0.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 8: blk.0.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 9: blk.0.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 10: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 11: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 12: blk.1.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 13: blk.1.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 14: blk.1.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 15: blk.1.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 16: blk.1.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 17: blk.1.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 18: blk.1.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 19: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 20: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 21: blk.2.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 22: blk.2.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 23: blk.2.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 24: blk.2.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 25: blk.2.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 26: blk.2.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 27: blk.2.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 28: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 29: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 30: blk.3.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 31: blk.3.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 32: blk.3.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 33: blk.3.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 34: blk.3.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 35: blk.3.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 36: blk.3.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 37: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 38: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 39: blk.4.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 40: blk.4.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 41: blk.4.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 42: blk.4.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 43: blk.4.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 44: blk.4.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 45: blk.4.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 46: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 47: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 48: blk.5.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 49: blk.5.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 50: blk.5.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 51: blk.5.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 52: blk.5.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 53: blk.5.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 54: blk.5.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 55: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 56: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 57: blk.6.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 58: blk.6.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 59: blk.6.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 60: blk.6.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 61: blk.6.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 62: blk.6.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 63: blk.6.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 64: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 65: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 66: blk.7.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 67: blk.7.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 68: blk.7.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 69: blk.7.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 70: blk.7.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 71: blk.7.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 72: blk.7.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 73: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 74: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 75: blk.8.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 76: blk.8.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 77: blk.8.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 78: blk.8.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 79: blk.8.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 80: blk.8.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 81: blk.8.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 82: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 83: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 84: blk.9.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 85: blk.9.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 86: blk.9.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 87: blk.9.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 88: blk.9.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 89: blk.9.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 90: blk.9.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 91: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 92: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 93: blk.10.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 94: blk.10.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 95: blk.10.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 96: blk.10.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 97: blk.10.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 98: blk.10.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 99: blk.10.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 100: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 101: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 102: blk.11.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 103: blk.11.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 104: blk.11.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 105: blk.11.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 106: blk.11.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 107: blk.11.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 108: blk.11.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 109: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 110: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 111: blk.12.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 112: blk.12.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 113: blk.12.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 114: blk.12.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 115: blk.12.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 116: blk.12.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 117: blk.12.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 118: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 119: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 120: blk.13.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 121: blk.13.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 122: blk.13.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 123: blk.13.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 124: blk.13.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 125: blk.13.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 126: blk.13.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 127: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 128: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 129: blk.14.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 130: blk.14.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 131: blk.14.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 132: blk.14.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 133: blk.14.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 134: blk.14.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 135: blk.14.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 136: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 137: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 138: blk.15.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 139: blk.15.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 140: blk.15.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 141: blk.15.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 142: blk.15.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 143: blk.15.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 144: blk.15.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 145: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 146: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 147: blk.16.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 148: blk.16.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 149: blk.16.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 150: blk.16.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 151: blk.16.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 152: blk.16.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 153: blk.16.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 154: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 155: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 156: blk.17.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 157: blk.17.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 158: blk.17.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 159: blk.17.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 160: blk.17.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 161: blk.17.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 162: blk.17.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 163: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 164: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 165: blk.18.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 166: blk.18.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 167: blk.18.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 168: blk.18.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 169: blk.18.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 170: blk.18.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 171: blk.18.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 172: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 173: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 174: blk.19.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 175: blk.19.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 176: blk.19.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 177: blk.19.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 178: blk.19.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 179: blk.19.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 180: blk.19.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 181: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 182: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 183: blk.20.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 184: blk.20.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 185: blk.20.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 186: blk.20.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 187: blk.20.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 188: blk.20.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 189: blk.20.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 190: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 191: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 192: blk.21.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 193: blk.21.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 194: blk.21.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 195: blk.21.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 196: blk.21.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 197: blk.21.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 198: blk.21.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 199: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 200: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 201: blk.22.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 202: blk.22.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 203: blk.22.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 204: blk.22.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 205: blk.22.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 206: blk.22.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 207: blk.22.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 208: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 209: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 210: blk.23.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 211: blk.23.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 212: blk.23.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 213: blk.23.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 214: blk.23.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 215: blk.23.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 216: blk.23.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 217: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 218: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 219: blk.24.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 220: blk.24.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 221: blk.24.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 222: blk.24.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 223: blk.24.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 224: blk.24.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 225: blk.24.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 226: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 227: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 228: blk.25.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 229: blk.25.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 230: blk.25.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 231: blk.25.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 232: blk.25.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 233: blk.25.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 234: blk.25.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 235: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 236: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 237: blk.26.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 238: blk.26.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 239: blk.26.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 240: blk.26.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 241: blk.26.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 242: blk.26.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 243: blk.26.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 244: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 245: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 246: blk.27.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 247: blk.27.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 248: blk.27.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 249: blk.27.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 250: blk.27.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 251: blk.27.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 252: blk.27.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 253: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 254: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 255: blk.28.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 256: blk.28.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 257: blk.28.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 258: blk.28.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 259: blk.28.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 260: blk.28.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 261: blk.28.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 262: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 263: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 264: blk.29.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 265: blk.29.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 266: blk.29.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 267: blk.29.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 268: blk.29.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 269: blk.29.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 270: blk.29.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 271: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 272: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 273: blk.30.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 274: blk.30.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 275: blk.30.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 276: blk.30.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 277: blk.30.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 278: blk.30.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 279: blk.30.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 280: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 281: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 282: blk.31.attn_q.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 283: blk.31.attn_k.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 284: blk.31.attn_v.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 285: blk.31.attn_output.weight f16 [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 286: blk.31.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 287: blk.31.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 288: blk.31.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 289: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 290: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - kv 0: general.architecture str
llama_model_loader: - kv 1: general.name str
llama_model_loader: - kv 2: llama.context_length u32
llama_model_loader: - kv 3: llama.embedding_length u32
llama_model_loader: - kv 4: llama.block_count u32
llama_model_loader: - kv 5: llama.feed_forward_length u32
llama_model_loader: - kv 6: llama.rope.dimension_count u32
llama_model_loader: - kv 7: llama.attention.head_count u32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32
llama_model_loader: - kv 10: tokenizer.ggml.model str
llama_model_loader: - kv 11: tokenizer.ggml.tokens arr
llama_model_loader: - kv 12: tokenizer.ggml.scores arr
llama_model_loader: - kv 13: tokenizer.ggml.token_type arr
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type f16: 226 tensors
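The tensor counts are consistent with the 32-block layout listed above; a quick check (plain arithmetic, nothing llama.cpp-specific):

  echo $(( 32*2 + 1 ))   # 65  f32 tensors: attn_norm + ffn_norm per block, plus output_norm
  echo $(( 32*7 + 2 ))   # 226 f16 tensors: 7 weight matrices per block, plus token_embd and output
  echo $(( 65 + 226 ))   # 291 tensors total, matching the loader header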
llm_load_print_meta: format = GGUF V1 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 32
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: n_ff = 11008
llm_load_print_meta: freq_base = 10000.0
llm_load_print_meta: freq_scale = 1
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = mostly F16 (guessed)
llm_load_print_meta: model size = 6.74 B
llm_load_print_meta: general.name = LLaMA
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
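The reported model size of 6.74 B can be reproduced from the dimensions above; a sketch assuming the standard LLaMA-2 7B layout with untied input/output embeddings:

  # 2 embedding matrices + output_norm + 32 blocks of (4 attention mats + 3 FFN mats + 2 norms)
  echo $(( 2*4096*32000 + 4096 + 32*(4*4096*4096 + 3*4096*11008 + 2*4096) ))
  # 6738415616 ≈ 6.74 B parameters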
llm_load_tensors: ggml ctx size = 0.09 MB
llm_load_tensors: using CUDA for GPU acceleration
llm_load_tensors: mem required = 250.09 MB (+ 256.00 MB per state)
llm_load_tensors: offloading 32 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloading v cache to GPU
llm_load_tensors: offloading k cache to GPU
llm_load_tensors: offloaded 35/35 layers to GPU
llm_load_tensors: VRAM used: 12860 MB
...................................................................................................
llama_new_context_with_model: kv self size = 256.00 MB
llama_new_context_with_model: compute buffer total size = 71.91 MB
llama_new_context_with_model: VRAM scratch buffer: 70.50 MB
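These buffer sizes check out, assuming f16 K/V entries and that token_embd stays in host RAM (which would account for the 250.09 MB "mem required" figure above):

  echo $(( 2 * 32 * 512 * 4096 * 2 / (1024*1024) ))         # 256 MB KV cache: K and V, n_layer * n_ctx * n_embd, 2 bytes each
  echo $(( (6738415616 - 4096*32000) * 2 / (1024*1024) ))   # ≈ 12602 MB of f16 weights on the GPU
  # 12602 MB weights + 256 MB KV cache ≈ 12858 MB, matching "VRAM used: 12860 MB"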
system_info: n_threads = 1 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 |
hellaswag_score : loaded 10042 tasks from prompt.
hellaswag_score : selecting 400 randomized tasks.
hellaswag_score : calculating hellaswag score over selected tasks.
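The table below is a running score: as I understand the implementation, each task's four candidate endings are scored by mean per-token log-likelihood (hence acc_norm), and the column reports the cumulative percentage of tasks whose correct ending scored highest. The first miss lands at task 6:

  echo "scale=8; 100*5/6" | bc   # 83.33333333, matching row 6 (5 of the first 6 tasks correct)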
task acc_norm
1 100.00000000
2 100.00000000
3 100.00000000
4 100.00000000
5 100.00000000
6 83.33333333
7 71.42857143
8 62.50000000
9 55.55555556
10 60.00000000
11 54.54545455
12 50.00000000
13 46.15384615
14 42.85714286
15 46.66666667
16 50.00000000
17 52.94117647
18 55.55555556
19 57.89473684
20 60.00000000
21 61.90476190
22 63.63636364
23 65.21739130
24 66.66666667
25 68.00000000
26 69.23076923
27 66.66666667
28 67.85714286
29 65.51724138
30 66.66666667
31 67.74193548
32 68.75000000
33 66.66666667
34 67.64705882
35 68.57142857
36 69.44444444
37 67.56756757
38 68.42105263
39 69.23076923
40 70.00000000
41 70.73170732
42 71.42857143
43 72.09302326
44 72.72727273
45 71.11111111
46 69.56521739
47 70.21276596
48 70.83333333
49 71.42857143
50 72.00000000
51 72.54901961
52 73.07692308
53 73.58490566
54 74.07407407
55 72.72727273
56 71.42857143
57 70.17543860
58 70.68965517
59 71.18644068
60 71.66666667
61 70.49180328
62 70.96774194
63 71.42857143
64 70.31250000
65 70.76923077
66 71.21212121
67 71.64179104
68 70.58823529
69 71.01449275
70 70.00000000
71 69.01408451
72 69.44444444
73 68.49315068
74 68.91891892
75 68.00000000
76 68.42105263
77 67.53246753
78 67.94871795
79 68.35443038
80 68.75000000
81 69.13580247
82 69.51219512
83 68.67469880
84 69.04761905
85 69.41176471
86 69.76744186
87 70.11494253
88 70.45454545
89 70.78651685
90 70.00000000
91 70.32967033
92 69.56521739
93 69.89247312
94 70.21276596
95 70.52631579
96 70.83333333
97 71.13402062
98 70.40816327
99 70.70707071
100 71.00000000
101 70.29702970
102 69.60784314
103 69.90291262
104 70.19230769
105 69.52380952
106 69.81132075
107 69.15887850
108 69.44444444
109 68.80733945
110 69.09090909
111 68.46846847
112 68.75000000
113 68.14159292
114 68.42105263
115 67.82608696
116 68.10344828
117 68.37606838
118 67.79661017
119 68.06722689
120 68.33333333
121 68.59504132
122 68.85245902
123 69.10569106
124 68.54838710
125 68.00000000
126 68.25396825
127 68.50393701
128 67.96875000
129 67.44186047
130 67.69230769
131 67.93893130
132 67.42424242
133 67.66917293
134 67.91044776
135 68.14814815
136 68.38235294
137 68.61313869
138 68.84057971
139 69.06474820
140 69.28571429
141 68.79432624
142 68.30985915
143 67.83216783
144 68.05555556
145 68.27586207
146 68.49315068
147 68.70748299
148 68.91891892
149 69.12751678
150 69.33333333
151 68.87417219
152 68.42105263
153 68.62745098
154 68.18181818
155 68.38709677
156 68.58974359
157 68.78980892
158 68.35443038
159 68.55345912
160 68.75000000
161 68.94409938
162 69.13580247
163 68.71165644
164 68.90243902
165 69.09090909
166 69.27710843
167 69.46107784
168 69.64285714
169 69.23076923
170 69.41176471
171 69.59064327
172 69.76744186
173 69.94219653
174 70.11494253
175 70.28571429
176 70.45454545
177 70.62146893
178 70.78651685
179 70.94972067
180 71.11111111
181 70.71823204
182 70.87912088
183 71.03825137
184 71.19565217
185 70.81081081
186 70.96774194
187 70.58823529
188 70.74468085
189 70.89947090
190 71.05263158
191 70.68062827
192 70.83333333
193 70.98445596
194 71.13402062
195 71.28205128
196 70.91836735
197 71.06598985
198 70.70707071
199 70.85427136
200 70.50000000
201 70.14925373
202 70.29702970
203 70.44334975
204 70.09803922
205 69.75609756
206 69.90291262
207 70.04830918
208 70.19230769
209 70.33492823
210 70.47619048
211 70.61611374
212 70.75471698
213 70.42253521
214 70.56074766
215 70.23255814
216 70.37037037
217 70.50691244
218 70.64220183
219 70.77625571
220 70.90909091
221 70.58823529
222 70.27027027
223 70.40358744
224 70.08928571
225 70.22222222
226 70.35398230
227 70.48458150
228 70.61403509
229 70.74235808
230 70.43478261
231 70.56277056
232 70.68965517
233 70.38626609
234 70.51282051
235 70.63829787
236 70.33898305
237 70.46413502
238 70.58823529
239 70.29288703
240 70.41666667
241 70.12448133
242 70.24793388
243 69.95884774
244 69.67213115
245 69.79591837
246 69.51219512
247 69.63562753
248 69.35483871
249 69.47791165
250 69.60000000
251 69.72111554
252 69.44444444
253 69.56521739
254 69.68503937
255 69.80392157
256 69.53125000
257 69.64980545
258 69.37984496
259 69.11196911
260 68.84615385
261 68.96551724
262 69.08396947
263 68.82129278
264 68.93939394
265 69.05660377
266 69.17293233
267 69.28838951
268 69.40298507
269 69.51672862
270 69.62962963
271 69.74169742
272 69.85294118
273 69.59706960
274 69.70802920
275 69.81818182
276 69.92753623
277 70.03610108
278 70.14388489
279 70.25089606
280 70.35714286
281 70.46263345
282 70.56737589
283 70.67137809
284 70.42253521
285 70.52631579
286 70.27972028
287 70.38327526
288 70.48611111
289 70.58823529
290 70.68965517
291 70.79037801
292 70.89041096
293 70.64846416
294 70.74829932
295 70.84745763
296 70.94594595
297 70.70707071
298 70.80536913
299 70.90301003
300 70.66666667
301 70.76411960
302 70.86092715
303 70.95709571
304 71.05263158
305 71.14754098
306 70.91503268
307 70.68403909
308 70.45454545
309 70.55016181
310 70.32258065
311 70.41800643
312 70.51282051
313 70.60702875
314 70.70063694
315 70.79365079
316 70.88607595
317 70.66246057
318 70.75471698
319 70.84639498
320 70.93750000
321 71.02803738
322 71.11801242
323 70.89783282
324 70.98765432
325 71.07692308
326 71.16564417
327 71.25382263
328 71.34146341
329 71.42857143
330 71.51515152
331 71.29909366
332 71.38554217
333 71.17117117
334 71.25748503
335 71.34328358
336 71.42857143
337 71.51335312
338 71.59763314
339 71.68141593
340 71.76470588
341 71.55425220
342 71.63742690
343 71.72011662
344 71.80232558
345 71.88405797
346 71.67630058
347 71.75792507
348 71.55172414
349 71.63323782
350 71.42857143
351 71.50997151
352 71.59090909
353 71.67138810
354 71.75141243
355 71.83098592
356 71.91011236
357 71.70868347
358 71.50837989
359 71.58774373
360 71.66666667
361 71.46814404
362 71.27071823
363 71.07438017
364 70.87912088
365 70.95890411
366 71.03825137
367 71.11716621
368 71.19565217
369 71.00271003
370 70.81081081
371 70.88948787
372 70.96774194
373 70.77747989
374 70.85561497
375 70.66666667
376 70.74468085
377 70.82228117
378 70.63492063
379 70.71240106
380 70.78947368
381 70.60367454
382 70.41884817
383 70.49608355
384 70.57291667
385 70.64935065
386 70.72538860
387 70.54263566
388 70.61855670
389 70.69408740
390 70.76923077
391 70.58823529
392 70.66326531
393 70.73791349
394 70.81218274
395 70.88607595
396 70.95959596
397 70.78085642
398 70.85427136
399 70.92731830
400 71.00000000
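The final row implies that 284 of the 400 selected tasks were answered correctly:

  echo "scale=8; 100*284/400" | bc   # 71.00000000, the acc_norm reported at task 400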
llama_print_timings: load time = 1073.18 ms
llama_print_timings: sample time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_print_timings: prompt eval time = 189769.47 ms / 75755 tokens ( 2.51 ms per token, 399.19 tokens per second)
llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_print_timings: total time = 196732.03 ms
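The throughput figures are internally consistent; e.g. the prompt-eval rate:

  echo "scale=2; 75755 * 1000 / 189769.47" | bc   # ≈ 399.19 tokens per second over 75755 tokens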