[Tools] Concatenate folder paths in a more secure way.
Signed-off-by: Duyi-Wang <[email protected]>
Duyi-Wang committed Nov 3, 2023
1 parent 7e448f6 commit 185ee3d
Showing 4 changed files with 50 additions and 36 deletions.
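
The change replaces manual string concatenation of directory and file names with os.path.join, which inserts the platform's path separator and does not depend on the caller supplying a trailing slash. A minimal sketch of the difference (not part of the commit; the output directory is a hypothetical example):

import os

saved_dir = "/tmp/converted_model"  # hypothetical output directory

concatenated = saved_dir + "model.wte.bin"         # '/tmp/converted_modelmodel.wte.bin'
joined = os.path.join(saved_dir, "model.wte.bin")  # '/tmp/converted_model/model.wte.bin'
print(concatenated)
print(joined)

In the converter scripts below, the same substitution is applied to every path built from saved_dir: config.ini and the per-tensor .bin files.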
14 changes: 8 additions & 6 deletions tools/chatglm2_convert.py
@@ -16,7 +16,7 @@
from transformers import AutoTokenizer, AutoModel

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(dir_path + "/../../../..")
sys.path.append(os.path.join(dir_path, "../../../.."))
sys.path.append(dir_path)


@@ -33,7 +33,7 @@ def split_and_convert_process(
i, saved_dir, factor, key, args, val, old_name, dtype, num_attention_heads, multi_query_group_num, kv_channels
):
def save_val(val, key, tp_num=None):
path = saved_dir + "/model." + key
path = os.path.join(saved_dir, "model." + key)
if tp_num is not None:
path += "." + str(tp_num)
path += ".bin"
@@ -151,7 +151,7 @@ def split_and_convert(args):
multi_query_group_num = config["chatglm2"]["kv_head_num"] = str(hf_config["multi_query_group_num"])
config["chatglm2"]["pad_id"] = str(hf_config["pad_token_id"])

with open(saved_dir + "/config.ini", "w") as configfile:
with open(os.path.join(saved_dir, "config.ini"), "w") as configfile:
config.write(configfile)
except Exception as e:
print("Fail to save the config in config.ini.", str(e))
@@ -201,13 +201,15 @@ def split_and_convert(args):
pool = multiprocessing.Pool(args.processes)
for name, param in model_named_parameters.items():
if name == "transformer.embedding.word_embeddings.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.wte.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(os.path.join(saved_dir, "model.wte.bin"))
elif name == "transformer.encoder.final_layernorm.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.final_layernorm.weight.bin"
os.path.join(saved_dir, "model.final_layernorm.weight.bin")
)
elif name == "transformer.output_layer.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.lm_head.weight.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
os.path.join(saved_dir, "model.lm_head.weight.bin")
)
else:
starmap_args = []
for i in range(len(huggingface_model_name_pattern)):
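A side note on the sys.path hunk at the top of each file: os.path.join does not normalize the "../../../.." components, so the appended entry is semantically the same as the old concatenated form; the operating system resolves the parent references when modules are imported. A small illustration (not part of the commit; the checkout location is hypothetical):

import os

dir_path = "/repo/examples/model/python/tools"  # hypothetical location of the conversion script
entry = os.path.join(dir_path, "../../../..")   # '/repo/examples/model/python/tools/../../../..'
print(os.path.normpath(entry))                  # '/repo' -- normpath would collapse it eagerly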
20 changes: 11 additions & 9 deletions tools/chatglm_convert.py
@@ -16,7 +16,7 @@
from transformers import AutoTokenizer, AutoModel

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(dir_path + "/../../../..")
sys.path.append(os.path.join(dir_path, "../../../.."))
sys.path.append(dir_path)


@@ -31,7 +31,7 @@ def get_weight_data_type(data_type):

def split_and_convert_process(i, saved_dir, factor, key, args, val, old_name, dtype):
def save_val(val, key, tp_num=None):
path = saved_dir + "/model." + key
path = os.path.join(saved_dir, "model." + key)
if tp_num is not None:
path += "." + str(tp_num)
path += ".bin"
@@ -130,7 +130,7 @@ def split_and_convert(args):
config["chatglm"]["start_id"] = str(hf_config["bos_token_id"])
config["chatglm"]["end_id"] = str(hf_config["eos_token_id"])
config["chatglm"]["weight_data_type"] = args.weight_data_type
with open(saved_dir + "/config.ini", "w") as configfile:
with open(os.path.join(saved_dir, "config.ini"), "w") as configfile:
config.write(configfile)
except Exception as e:
print("Fail to save the config in config.ini.", str(e))
@@ -181,17 +181,19 @@ def split_and_convert(args):
pool = multiprocessing.Pool(args.processes)
for name, param in model_named_parameters.items():
if name == "transformer.word_embeddings.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.wte.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(os.path.join(saved_dir, "model.wte.bin"))
elif name == "transformer.final_layernorm.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.final_layernorm.weight.bin"
os.path.join(saved_dir, "model.final_layernorm.weight.bin")
)
elif name == "transformer.final_layernorm.bias":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.final_layernorm.bias.bin"
os.path.join(saved_dir, "model.final_layernorm.bias.bin")
)
elif name == "lm_head.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.lm_head.weight.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
os.path.join(saved_dir, "model.lm_head.weight.bin")
)
else:
starmap_args = []
for i in range(len(huggingface_model_name_pattern)):
@@ -222,8 +224,8 @@ def split_and_convert(args):
torch.multiprocessing.set_sharing_strategy("file_system")

parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-saved_dir', '-o', type=str, help='file name of output file', required=True)
parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True)
parser.add_argument("-saved_dir", "-o", type=str, help="file name of output file", required=True)
parser.add_argument("-in_file", "-i", type=str, help="file name of input checkpoint file", required=True)
parser.add_argument("-processes", "-p", type=int, help="processes to spawn for conversion (default: 8)", default=8)
parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])

28 changes: 17 additions & 11 deletions tools/llama_convert.py
@@ -16,7 +16,7 @@
from transformers import LlamaForCausalLM, LlamaTokenizer

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(dir_path + "/../../../..")
sys.path.append(os.path.join(dir_path, "../../../.."))
sys.path.append(dir_path)


@@ -29,12 +29,14 @@ def get_weight_data_type(data_type):
assert False, f"Invalid weight data type {data_type}"


def split_and_convert_process(i, saved_dir, factor, key, args, val, old_name, dtype, num_attention_heads, num_key_value_heads):
def split_and_convert_process(
i, saved_dir, factor, key, args, val, old_name, dtype, num_attention_heads, num_key_value_heads
):
def save_val(val, key, tp_num=None):
if key.startswith("model."):
path = saved_dir + "/" + key
path = os.path.join(saved_dir, key)
else:
path = saved_dir + "/model." + key
path = os.path.join(saved_dir, "model." + key)

if tp_num is not None:
path += "." + str(tp_num)
@@ -114,7 +116,9 @@ def split_and_convert(args):
try:
config["llama"]["model_name"] = "llama" if hf_config["_name_or_path"] == "" else hf_config["_name_or_path"]
num_attention_heads = config["llama"]["head_num"] = str(hf_config["num_attention_heads"])
num_key_value_heads = config["llama"]["kv_head_num"] = str(hf_config.get("num_key_value_heads", num_attention_heads))
num_key_value_heads = config["llama"]["kv_head_num"] = str(
hf_config.get("num_key_value_heads", num_attention_heads)
)

hidden_size = hf_config["hidden_size"]
config["llama"]["size_per_head"] = str(hidden_size // hf_config["num_attention_heads"])
@@ -129,7 +133,7 @@
config["llama"]["start_id"] = str(hf_config["bos_token_id"])
config["llama"]["end_id"] = str(hf_config["eos_token_id"])
config["llama"]["weight_data_type"] = args.weight_data_type
with open(saved_dir + "/config.ini", "w") as configfile:
with open(os.path.join(saved_dir, "config.ini"), "w") as configfile:
config.write(configfile)
except Exception as e:
print("Fail to save the config in config.ini.", str(e))
@@ -181,16 +185,18 @@ def split_and_convert(args):
pool = multiprocessing.Pool(args.processes)
for name, param in model_named_parameters.items():
if name == "model.embed_tokens.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.wte.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(os.path.join(saved_dir, "model.wte.bin"))
elif name == "model.norm.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.final_layernorm.weight.bin"
os.path.join(saved_dir, "model.final_layernorm.weight.bin")
)
# elif name == 'model.final_layernorm.bias':
# param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
# saved_dir + "model.final_layernorm.bias.bin")
# os.path.join(saved_dir, "model.final_layernorm.bias.bin"))
elif name == "lm_head.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.lm_head.weight.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
os.path.join(saved_dir, "model.lm_head.weight.bin")
)
else:
starmap_args = []
for i in range(len(hf_model_name_pattern)):
@@ -208,7 +214,7 @@ def split_and_convert(args):
name,
np_weight_data_type,
num_attention_heads,
num_key_value_heads
num_key_value_heads,
)
)
pool.starmap_async(split_and_convert_process, starmap_args)
24 changes: 14 additions & 10 deletions tools/opt_convert.py
@@ -31,7 +31,7 @@
from transformers.models.opt.modeling_opt import OPTAttention, OPTDecoderLayer

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(dir_path + "/../../../..")
sys.path.append(os.path.join(dir_path, "../../../.."))
sys.path.append(dir_path)


@@ -47,7 +47,7 @@ def get_weight_data_type(data_type):
# def split_and_convert_process(i, saved_dir, factor, key, args, val, capture_dict, old_name, dtype):
def split_and_convert_process(i, saved_dir, factor, key, args, val, old_name, dtype):
def save_val(val, key, tp_num=None):
path = saved_dir + "/model." + key
path = os.path.join(saved_dir, "model." + key)
if tp_num is not None:
path += "." + str(tp_num)
path += ".bin"
@@ -148,7 +148,7 @@ def split_and_convert(args):
config["gpt"]["end_id"] = str(hf_config["eos_token_id"])
config["gpt"]["weight_data_type"] = args.weight_data_type
# config['gpt']['int8'] = str(save_int8) # really useful?
with open(saved_dir + "/config.ini", "w") as configfile:
with open(os.path.join(saved_dir, "config.ini"), "w") as configfile:
config.write(configfile)
except:
print(f"Fail to save the config in config.ini.")
@@ -216,31 +216,35 @@ def split_and_convert(args):
for name, param in model_named_parameters.items():
if name == "model.decoder.embed_positions.weight":
param[padding_offset:, ...].detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.wpe.bin"
os.path.join(saved_dir, "model.wpe.bin")
)

elif name == "model.decoder.embed_tokens.weight":
if "model.decoder.project_in.weight" in model_named_parameters.keys():
project_in = model_named_parameters["model.decoder.project_in.weight"]
project_out = model_named_parameters["model.decoder.project_out.weight"]
torch.matmul(param, project_in).detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.wte.bin"
os.path.join(saved_dir, "model.wte.bin")
)
torch.matmul(param, project_out).detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.lm_head.weight.bin"
os.path.join(saved_dir, "model.lm_head.weight.bin")
)

else:
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.wte.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.lm_head.weight.bin")
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
os.path.join(saved_dir, "model.wte.bin")
)
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
os.path.join(saved_dir, "model.lm_head.weight.bin")
)

elif name == "model.decoder.final_layer_norm.weight":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.final_layernorm.weight.bin"
os.path.join(saved_dir, "model.final_layernorm.weight.bin")
)
elif name == "model.decoder.final_layer_norm.bias":
param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
saved_dir + "model.final_layernorm.bias.bin"
os.path.join(saved_dir, "model.final_layernorm.bias.bin")
)
elif "project_in" in name or "project_out" in name:
continue
