-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvertsub.py
executable file
·45 lines (40 loc) · 1.49 KB
/
convertsub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import argparse
import os
import re

import chardet
import opencc
from tqdm import tqdm
# Module-level OpenCC converter built from the 't2s' configuration
# (Traditional Chinese to Simplified Chinese). It is created once at import
# time and reused by convert_chinese() for every line.
converter = opencc.OpenCC('t2s')
def utffix(file):
    """Rewrite *file* in place as BOM-less UTF-8 if it is not already.

    The bytes are first decoded strictly as UTF-8. Only when that fails is
    chardet asked to guess the encoding, so a file that is already valid
    UTF-8 (including an empty or pure-ASCII one) is never re-decoded through
    a statistical guess.

    Args:
        file: Path of the file to normalise.

    Raises:
        ValueError: If the content is not valid UTF-8 and chardet cannot
            suggest an encoding. The file is left untouched in that case.
    """
    with open(file, 'rb') as f:
        content = f.read()

    utf8_bom = b'\xef\xbb\xbf'
    try:
        # 'utf-8-sig' accepts plain UTF-8 and drops a leading BOM if present.
        text = content.decode('utf-8-sig')
    except UnicodeDecodeError:
        encoding = chardet.detect(content)['encoding']
        if encoding is None:
            # chardet reports None when it cannot identify the encoding;
            # decoding with None would fail with an unhelpful TypeError.
            raise ValueError(
                f'Could not detect the encoding of {file!r}'
            ) from None
        # Lossy on purpose: undecodable bytes become U+FFFD instead of
        # aborting the conversion.
        text = content.decode(encoding, errors='replace')
    else:
        if not content.startswith(utf8_bom):
            # Already clean UTF-8: nothing to rewrite.
            return

    # Write bytes rather than text so existing line endings are preserved
    # exactly (text mode would translate '\n' to the platform line separator).
    with open(file, 'wb') as f:
        f.write(text.encode('utf-8'))
def convert_chinese(text):
    """Return *text* converted by the module-level OpenCC converter.

    Text containing no character in the range U+4E00-U+9FA5 is returned
    unchanged without calling the converter.
    """
    has_chinese = re.search('[\u4e00-\u9fa5]', text) is not None
    # Only hand the text to OpenCC when there is something to convert.
    return converter.convert(text) if has_chinese else text
if __name__ == "__main__":
    # Command-line entry point: every .srt path given on the command line is
    # normalised to UTF-8 and converted to Simplified Chinese in place.
    parser = argparse.ArgumentParser()
    parser.add_argument('file', nargs="+")
    args = parser.parse_args()

    for file in args.file:
        # Paths that do not end in '.srt' are skipped silently.
        if not file.endswith('.srt'):
            continue

        # Fix the encoding first so the file can be read back as UTF-8.
        utffix(file)
        with open(file, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()

        # Convert everything in memory BEFORE reopening the file for writing:
        # mode 'w' truncates immediately, so a failure part-way through the
        # conversion would otherwise leave the subtitle file empty or partial.
        filename = os.path.basename(file)
        converted_lines = [
            # strip() drops surrounding whitespace (including the line
            # ending); a single '\n' is appended to every line.
            convert_chinese(line.strip()) + '\n'
            for line in tqdm(lines, desc=f'Converting {filename}')
        ]

        with open(file, 'w', encoding='utf-8') as outfile:
            outfile.writelines(converted_lines)