-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_dataset.py
executable file
·79 lines (66 loc) · 1.8 KB
/
prepare_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
from lxml import etree as et
from tqdm import tqdm
from prepare_dataset import *
# Name of the images dataset directory and of the ground-truth file.
# 'gt_train.txt' is the name prepare_dataset produces for dataset_type 'train'.
IMAGES_DATASET = 'my_images'
GT_PATH = 'gt_train.txt'

# Single-character code -> label text. prepare_dataset searches this mapping
# by value (the text of an XML <name> element) and writes the matching key.
# Insertion order is kept as-is because that lookup returns the first match.
words_dict = {
    # Letter codes whose label is Persian text.
    'A': 'الف', 'B': 'ب', 'P': 'پ', 'T': 'ت',
    'Y': 'ث', 'Z': 'ز', 'X': 'ش', 'E': 'ع',
    'F': 'ف', 'K': 'ک', 'G': 'گ',
    # Codes whose label is the same Latin letter.
    'D': 'D', 'S': 'S',
    # Remaining letter codes with Persian labels.
    'J': 'ج', 'W': 'د', 'C': 'س', 'U': 'ص',
    'R': 'ط', 'Q': 'ق', 'L': 'ل', 'M': 'م',
    'N': 'ن', 'V': 'و', 'H': 'ه', 'I': 'ی',
    # Digits map to themselves.
    **{digit: digit for digit in '0123456789'},
    # Persian word for "wheelchair".
    '@': 'ویلچر',
}
def prepare_dataset(images_dataset_path: str, dataset_type: str) -> None:
    """Build a ground-truth text file from the XML annotations in a directory.

    Every ``*.xml`` file directly inside ``images_dataset_path`` produces one
    line in ``gt_<dataset_type>.txt`` (created in the current working
    directory, overwriting any existing file)::

        <dataset_type>/<xml file stem>.jpg<TAB><codes>

    ``<codes>`` is the concatenation, in document order, of the
    ``words_dict`` key for the text of each ``<name>`` element found anywhere
    in the XML tree. ``<name>`` texts that are empty or not present among the
    ``words_dict`` values contribute nothing.

    Args:
        images_dataset_path: Directory containing the XML annotation files.
        dataset_type: Used both in the output file name and as the directory
            prefix of each image path written to it.

    Raises:
        OSError: If the directory cannot be listed or the output file cannot
            be opened.
        lxml.etree.XMLSyntaxError: If an annotation file is not valid XML.
    """
    # Invert the mapping once instead of scanning it for every <name> element.
    # setdefault keeps the first key for a given label, i.e. the same result a
    # first-match scan over words_dict.items() would give.
    latin_by_persian = {}
    for latin_code, persian_label in words_dict.items():
        latin_by_persian.setdefault(persian_label, latin_code)

    # os.listdir returns entries in arbitrary order; sort so that the output
    # file is reproducible. Listing before opening the output also avoids
    # leaving an empty gt file behind when the directory does not exist.
    xml_files = sorted(
        entry for entry in os.listdir(images_dataset_path)
        if entry.endswith('.xml')
    )

    parser = et.XMLParser(encoding='utf-8')
    # The context manager closes the file even if parsing raises midway.
    with open(f'gt_{dataset_type}.txt', 'w', encoding='utf-8') as gt_file:
        for file in tqdm(xml_files):
            # splitext removes only the final extension, so a stem that itself
            # contains dots (e.g. "img.001.xml") is kept intact.
            xml_name = os.path.splitext(file)[0]
            root = et.parse(os.path.join(images_dataset_path, file), parser).getroot()

            codes = []
            for name_node in root.iter('name'):
                persian_word = name_node.text.strip() if name_node.text else ''
                # Unknown or empty labels are skipped (empty contribution).
                codes.append(latin_by_persian.get(persian_word, ''))

            image_path = os.path.join(dataset_type, f'{xml_name}.jpg')
            gt_file.write(image_path + '\t' + ''.join(codes) + '\n')
if __name__ == '__main__':
    # An empty directory path makes os.listdir raise FileNotFoundError, so run
    # against the configured dataset directory. The 'train' dataset type makes
    # the output file name (gt_train.txt) equal to GT_PATH.
    prepare_dataset(IMAGES_DATASET, 'train')