-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflatten.py
69 lines (54 loc) · 2.09 KB
/
flatten.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import argparse
import json
import shutil
import yaml
from logging import getLogger,config
from pathlib import Path
from tqdm import tqdm
def main(args):
input_root_dirname:str=args.input_root_dirname
output_dirname:str=args.output_dirname
log_filepath:str=args.log_filepath
#Set up logger
with open("./logging_config.yaml","r",encoding="utf-8") as r:
logging_config=yaml.safe_load(r)
config.dictConfig(logging_config)
logger=getLogger(__name__)
logger.debug(args)
#Get list of all "text" files in the input directory
#AFAIK, most combolists are comprised of text files,
#but you may want some extra processing in case the combolist
#contains other formats of data, such as JSON and SQL
input_root_dir=Path(input_root_dirname)
input_files=input_root_dir.glob("**/*.txt")
input_files=[f for f in input_files if f.is_file()]
input_files.sort()
logger.info(f"{len(input_files)} files exist in the input directory")
#Create output directory
output_dir=Path(output_dirname)
output_dir.mkdir(exist_ok=True,parents=True)
#Flatten
logger.info("Start flattening the files...")
flattening_results:list[dict]=[]
for idx,input_file in enumerate(tqdm(input_files)):
#Copy the file
output_file=output_dir.joinpath(f"{idx}.txt")
shutil.copy(input_file,output_file)
#Store the correspondence of the input and the output files
flatten_result={
"index": idx,
"input_filepath": str(input_file)
}
flattening_results.append(flatten_result)
logger.info("Finished flattening the files")
#Output flatten results to a log file
log_file=Path(log_filepath)
with log_file.open("w",encoding="utf-8") as w:
json.dump(flattening_results,w,ensure_ascii=False)
if __name__=="__main__":
parser=argparse.ArgumentParser()
parser.add_argument("-i","--input-root-dirname",type=str)
parser.add_argument("-o","--output-dirname",type=str)
parser.add_argument("-l","--log-filepath",type=str)
args=parser.parse_args()
main(args)