-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathencoding
executable file
·84 lines (78 loc) · 3.12 KB
/
encoding
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
"""
:author: 5km (十里)
:url: https://www.smslit.top
:copyright: © 2018 5km <[email protected]>
:license: MIT, see LICENSE for more details.
"""
import os
import argparse
from chardet.universaldetector import UniversalDetector
def getargs(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. It is forwarded
            unchanged to ``parse_args``; with the default ``None`` argparse
            falls back to the process command line, so existing zero-argument
            callers behave as before.

    Returns:
        The ``argparse.Namespace`` from ``parse_args`` with the attributes:
        ``files`` (a list of one or more paths), ``encoding`` (``None`` when
        ``-e`` is absent, ``'UTF-8'`` when ``-e`` is given without a value,
        otherwise the supplied value) and ``outdir`` (``None`` when ``-o`` is
        absent).
    """
    parser = argparse.ArgumentParser(description='这是一个检测或转换文件编码的工具')
    parser.add_argument('files', nargs='+', help='指定一个或多个文件的路径')
    # nargs='?' + const lets a bare "-e" mean "convert to UTF-8" while leaving
    # the attribute at None when the flag is not given at all.
    parser.add_argument(
        '-e',
        '--encoding',
        nargs='?',
        const='UTF-8',
        help='''指定目标编码,可选择的编码:
ASCII, (Default) UTF-8 (with or without a BOM), UTF-16 (with a BOM),
UTF-32 (with a BOM), Big5, GB2312/GB18030, EUC-TW, HZ-GB-2312,
ISO-2022-CN, EUC-JP, SHIFT_JIS, ISO-2022-JP, ISO-2022-KR, KOI8-R,
MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251, ISO-8859-2,
windows-1250, EUC-KR, ISO-8859-5, windows-1251, ISO-8859-1, windows-1252,
ISO-8859-7, windows-1253, ISO-8859-8, windows-1255, TIS-620''')
    parser.add_argument('-o', '--outdir', help='指定转换文件的输出目录')
    return parser.parse_args(argv)
def detcect_encoding(filepath):
    """Detect the character encoding of a file.

    The file is opened in binary mode and fed line by line to a
    ``UniversalDetector``; reading stops as soon as the detector reports that
    it is done.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A ``(fileencoding, confidence)`` tuple. ``fileencoding`` is the
        encoding name from the detector result, or ``'unknown'`` when the
        detector reports ``None``. ``confidence`` is the detector confidence
        multiplied by 100; for the ``'unknown'`` case a fixed confidence of
        0.99 is used before scaling.
    """
    detector = UniversalDetector()
    detector.reset()
    # The context manager guarantees the handle is closed, including when the
    # loop exits early or the detector raises.
    with open(filepath, 'rb') as stream:
        for chunk in stream:
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    result = detector.result
    fileencoding = result['encoding']
    confidence = result['confidence']
    if fileencoding is None:
        # Sentinel pair kept as-is: callers compare against 'unknown'.
        fileencoding = 'unknown'
        confidence = 0.99
    return fileencoding, confidence * 100
# Conversion only happens when the detection confidence (in percent) is
# strictly greater than this value.
_MIN_CONFIDENCE = 75


def _transcode(src, dst, src_encoding, dst_encoding):
    """Re-encode the text of ``src`` from ``src_encoding`` into ``dst``.

    ``src`` and ``dst`` may be the same path (in-place conversion).
    Undecodable or unencodable characters are replaced rather than raising.

    Raises:
        LookupError: if either encoding name is not a codec known to Python.
            This is raised before ``dst`` is opened, so an unsupported target
            encoding never truncates an existing file.
    """
    # newline='' turns off newline translation so the original line endings
    # survive the round trip.
    with open(src, 'r', encoding=src_encoding, errors='replace',
              newline='') as fin:
        text = fin.read()
    # Encode first, open second: opening the target for writing truncates it,
    # which must not happen if the codec lookup is going to fail.
    payload = text.encode(dst_encoding, errors='replace')
    with open(dst, 'wb') as fout:
        fout.write(payload)


def main():
    """Detect the encoding of each given file and optionally convert it.

    For every path in ``args.files`` the detected encoding and confidence are
    printed. When a target encoding was requested, the detection is not
    'unknown' and the confidence exceeds ``_MIN_CONFIDENCE``, the file is
    re-encoded, either into ``args.outdir`` (under its base name) or in
    place when no output directory is in effect.
    """
    args = getargs()
    # A path that exists but is not a directory is just as unusable as a
    # missing one, so test for a directory specifically.
    if args.outdir and not os.path.isdir(args.outdir):
        answer = input(
            f'[-] 无效的导出路径: {args.outdir} [-]\n要用转码后的文件直接替换源文件吗? y or n\n')
        # Overwriting the sources is destructive: require a reply that starts
        # with y/Y rather than any reply that merely contains a "y".
        if not answer.strip().lower().startswith('y'):
            raise SystemExit(1)
        args.outdir = None

    for file in args.files:
        if not os.path.isfile(file):
            print(f'[-] 无效的文件路径: {file} [-]')
            continue
        encoding, confidence = detcect_encoding(file)
        print(f'[+] {file}: 编码 -> {encoding} (置信度 {confidence}%) [+]')
        if (not args.encoding or encoding == 'unknown'
                or confidence <= _MIN_CONFIDENCE):
            continue
        # Encoding names are compared case-insensitively so that a difference
        # in letter case alone does not trigger a pointless rewrite.
        if args.encoding.lower() == encoding.lower():
            print(f'[*] {file} 已经是 {encoding} 编码了,无需转换![*]')
            continue
        if args.outdir:
            # Join with the base name only: os.path.join discards outdir when
            # the second component is absolute, which would silently
            # overwrite the source file instead of writing into outdir.
            outpath = os.path.join(args.outdir, os.path.basename(file))
        else:
            outpath = file
        try:
            _transcode(file, outpath, encoding, args.encoding)
        except LookupError as err:
            # Unknown codec name (detected or requested): report it and keep
            # processing the remaining files.
            print(f'[-] 不支持的编码, 已跳过 {file}: {err} [-]')
            continue
        print(f'[+] 转码成功: {file}({encoding}) -> {outpath}({args.encoding}) [+] ')


if __name__ == '__main__':
    main()