NVIDIA · Tushar-ml · May 2, 2024 · May 4, 2024 · May 28, 2024
diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py
@@ -571,9 +571,9 @@ def get_tllm_linear_weight(weight,
                            postfix='weight'):
     results = {}
     if use_weight_only:
-        v = weight.t().contiguous()
+        v = weight.t().contiguous().cpu()
         processed_torch_weights, torch_weight_scales = \
-            torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix(
+            torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
                 v, plugin_weight_only_quant_type)
         results[prefix + postfix] = processed_torch_weights
         results[prefix + 'per_channel_scale'] = torch_weight_scales
@@ -979,7 +979,7 @@ def convert_hf_llama(hf_model,
 
     tok = time.time()
     t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
-    print(f'Weights loaded. Total time: {t}')
+    logger.info(f'Weights loaded. Total time: {t}')
     return weights
 
 
@@ -989,7 +989,7 @@ def convert_hf_llama(hf_model,
     # which is included in tensorrt_llm Python package. Otherwise, the convert
     # script does not need to import tensorrt_llm. Will remove it after reimplementing
     # the op with PyTorch.
-    print(tensorrt_llm.__version__)
+    logger.info(tensorrt_llm.__version__)
     args = parse_arguments()
     world_size = args.tp_size * args.pp_size
 
@@ -1188,8 +1188,20 @@ def load_medusa_hf(medusa_path: str,
                                    mapping=Mapping(),
                                    dtype='float32'):
                     logger.info("Loading Medusa heads' weights ...")
+                    is_ckpt_safetensors = False
+
                     ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
-                    state_dict = torch.load(ckpt_file, map_location="cpu")
+                    if not ckpt_file.exists():
+                        ckpt_file = Path(medusa_path) / "medusa_lm_head.safetensors"
+                        is_ckpt_safetensors = True
+
+                    if is_ckpt_safetensors:
+                        logger.info("Safetensors Found ...")
+                        from safetensors.torch import load_file
+                        state_dict = load_file(ckpt_file)
+                    else:
+                        state_dict = torch.load(ckpt_file, map_location="cpu")
+
                     torch_dtype = str_dtype_to_torch(dtype)
                     weights = {}
 
@@ -1198,10 +1210,13 @@ def load_medusa_hf(medusa_path: str,
                             w = state_dict[f"{h}.{l}.linear.weight"].clone().to(
                                 torch_dtype)
 
-                            weights[
-                                'medusa_heads.{}.medusa_layers.{}.linear.weight'
-                                .format(h, l)] = split(w, mapping.tp_size,
-                                                       mapping.tp_rank)
+                            split_v = split(w, mapping.tp_size, mapping.tp_rank)
+                            weights.update(
+                                get_tllm_linear_weight(split_v,
+                                                       f'medusa_heads.{h}.medusa_layers.{l}.linear.',
+                                                       None, args.use_weight_only, plugin_weight_only_quant_type
+                                                       )
+                            )
 
                             b = state_dict[f"{h}.{l}.linear.bias"].clone().to(
                                 torch_dtype)
@@ -1266,4 +1281,4 @@ def load_medusa_hf(medusa_path: str,
 
     tok = time.time()
     t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
-    print(f'Total time of converting checkpoints: {t}')
+    logger.info(f'Total time of converting checkpoints: {t}')