-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtrainLexicon.py
158 lines (142 loc) · 8.96 KB
/
trainLexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-
import re
import json
import pickle
import sys
#Pass parameter to identify which lexicon dataset to load
#Possible lexicons: wiktionary, zargan
#Returns dictionary with example format below:
#wordDict = {"word_1":["kok","word_2"], "word_2":["kok","word_2"], "dif_word_1":["kok","diff_word_1"]}
def loadWord(dataset):
if dataset == "wiktionary":
with open('Datasets/wiktionary.pkl','rb') as f:
[wordList] = pickle.load(f)
wordDict = {}
for i in range(len(wordList)):
word = wordList[i].replace("â","a")
wordDict[findID(wordDict, word)] = ["kok",findID(wordDict, word)]
return wordDict
elif dataset == "zargan":
with open('Datasets/zargan.pkl','rb') as f:
zarganDict = pickle.load(f)
wordDict = {}
for word, valueList in zarganDict.items():
word = word.replace("â","a")
wordDict[findID(wordDict, word)] = ["kok",findID(wordDict, word)]
return wordDict
#finds correct id for given word
#for example if there are 3 different word for "gul" and you want to add one more
#it returns gul_4 to differentiate different versions of words
def findID(wordDict, word):
ct = 1
while word+"_"+str(ct) in wordDict:
ct = ct+1
return word+"_"+str(ct)
#This function generates new word types based on ablouts on lemmas
#To generate negative words, we add -ma -me suffixes to verbs
#For example: gelmek -> gelmemek, gitmek -> gitmemek
#To handle softening of consonants, I generate softened version of each word
#For lemmas end with [p,ç,t,k] except verbs, the last consonant can be transformed into [b,c,d,[gğ]] based on the suffix it had
#g is possible only when word ends with nk.
#I generate all possible transformation and I check if softening matches in checkSuffixValidation
#For example: kitap -> kitab, küçük -> küçüğ(correct one), renk -> rengi
#To handle "becoming close(narrow) vowel", I have used explanation in TDK site.(http://www.tdk.gov.tr/index.php?option=com_content&view=article&id=194:Unlu-Daralmasi&catid=50:yazm-kurallar&Itemid=132)
#When verbs whose last wovel is "a" or "e" take "-yor" suffix, their last wovel transforms into "ı" or "i"
#Exception is for verb demek and yemek
#For example izlemek -> izli, dinlememek -> dinlemi
#Dropping a vowel
#In Turkish, when some words with two syllables take suffix starting with vowel, word's last vowel drops.
#List of these words are added to the code
#For example: alın -> aln, zehir -> zehri, oğul -> oğlu
#Zero Infinitive
#In Turkish, verbs lexicon form, generally, is with to infinitive. This suffix is "-mak" or "-mek".
#For example, meaning of gelmek is "to come. However, we use root form of gelmek which is gel when we use verbs with different time forms. For example, present continuous form of gelmek is geliyor(without -mek part)
#Last action transforms verbs to the zero infinitive format
#For example: gelmek -> gel, almak -> al
def generate(wordDict, olay):
newDict = {}
if olay == "olumsuzluk eki":
for kelime, valueList in wordDict.items():
ind = kelime[kelime.index("_")+1:]
kelime = kelime[:kelime.index("_")]
if kelime.endswith("mak"):
kelime = kelime[:-3]+"mamak"
newDict[findID(wordDict, kelime)] = ["olumsuzluk",kelime+"_"+ind]
if kelime.endswith("mek"):
kelime = kelime[:-3]+"memek"
newDict[findID(wordDict, kelime)] = ["olumsuzluk",kelime+"_"+ind]
elif olay == "unsuz yumusamasi":
for kelime, valueList in wordDict.items():
ind = kelime[kelime.index("_")+1:]
kelime = kelime[:kelime.index("_")]
if not (kelime.endswith("mak") or kelime.endswith("mek")):
if kelime.endswith("p"):
newDict[findID(wordDict, kelime[:-1]+"b")] = ["unsuz yumusamasi", kelime+"_"+ind]
elif kelime.endswith("ç"):
newDict[findID(wordDict, kelime[:-1]+"c")] = ["unsuz yumusamasi", kelime+"_"+ind]
elif kelime.endswith("t"):
newDict[findID(wordDict, kelime[:-1]+"d")] = ["unsuz yumusamasi", kelime+"_"+ind]
elif kelime.endswith("k"):
if kelime.endswith("nk"):
newDict[findID(wordDict, kelime[:-1]+"g")] = ["unsuz yumusamasi", kelime+"_"+ind]
else:
newDict[findID(wordDict, kelime[:-1]+"ğ")] = ["unsuz yumusamasi", kelime+"_"+ind]
elif olay == "unlu daralmasi":
for kelime, valueList in wordDict.items():
ind = kelime[kelime.index("_")+1:]
kelime = kelime[:kelime.index("_")]
if kelime == "demek":
newDict[findID(wordDict, "di")] = ["unlu daralmasi", kelime+"_"+ind]
elif kelime == "yemek":
newDict[findID(wordDict, "yi")] = ["unlu daralmasi", kelime+"_"+ind]
elif kelime.endswith("amak"):
newDict[findID(wordDict, kelime[:-4]+"ı")] = ["unlu daralmasi", kelime+"_"+ind]
newDict[findID(wordDict, kelime[:-4]+"u")] = ["unlu daralmasi", kelime+"_"+ind]
elif kelime.endswith("emek"):
newDict[findID(wordDict, kelime[:-4]+"i")] = ["unlu daralmasi", kelime+"_"+ind]
newDict[findID(wordDict, kelime[:-4]+"ü")] = ["unlu daralmasi", kelime+"_"+ind]
elif olay == "unlu dusmesi":
for kelime, valueList in wordDict.items():
ind = kelime[kelime.index("_")+1:]
kelime = kelime[:kelime.index("_")]
dusmeDict = {'akis': 'aks', 'akıl': 'akl', 'alın': 'aln','asıl': 'asl','asır': 'asr','atıf': 'atf', 'avuç': 'avc','azim': 'azm', 'ağız': 'ağz', 'bahis': 'bahs','bağır': 'bağr','beniz': 'benz','beyin': 'beyn','boyun': 'boyn','burun': 'burn','böğür': 'böğr','cebir': 'cebr','cezir': 'cezr','cisim': 'cism','devir': 'devr','emir': 'emr','fasıl': 'fasl','fecir': 'fecr','fesih': 'fesh','fetih': 'feth','fikir': 'fikr','geniz': 'genz','gönül': 'gönl','göğüs': 'göğs','hacim': 'hacm','haciz': 'hacz','hapis': 'haps','hatır': 'hatr','hayır': 'hayr','hazım': 'hazm','hüküm': 'hükm','hüzün': 'hüzn','hısım': 'hısm','hışım': 'hışm','isim': 'ism','izin': 'izn','kabir': 'kabr','kahır': 'kahr','karın': 'karn','kasır': 'kasr','kasıt': 'kast','kayıp': 'kayb','kayıt': 'kayd','kesir': 'kesr','keyif': 'keyf','keşif': 'keşf','kibir': 'kibr','koyun': 'koyn','kusur': 'kusr','kutup': 'kutp','küfür': 'küfr','kısım': 'kısm','metin': 'metn','misil': 'misl','mühür': 'mühr','nabız': 'nabz','nakil': 'nakl','nakit': 'nakt','nefis': 'nefs','nehir': 'nehr','nesil': 'nesl','nutuk': 'nutk','omuz': 'omz','oğul': 'oğl','rehin': 'rehn','resim': 'resm','ritim': 'ritm','sabır': 'sabr','seyir': 'seyr','tavır': 'tavr','ufuk': 'ufk','umur': 'umr','vakit': 'vakt','vakıf': 'vakf','vasıf': 'vasf','zikir': 'zikr','zulüm': 'zulm','ömür': 'ömr','özür': 'özr','şahıs': 'şahs','şehir': 'şehr','şekil': 'şekl','şükür': 'şükr', 'zehir':'zehr'}
if kelime in dusmeDict:
newDict[findID(wordDict,dusmeDict[kelime])] = ["unlu dusmesi", kelime+"_"+ind]
elif olay == "fiil":
for kelime, valueList in wordDict.items():
ind = kelime[kelime.index("_")+1:]
kelime = kelime[:kelime.index("_")]
if kelime.endswith("mek") or kelime.endswith("mak"):
newDict[findID(wordDict, kelime[:-3])] = ["fiil", kelime+"_"+ind]
return newDict
#Appending two dictionary might have conflicts because there are same words with different meanings or features.
#If both dictionaries include "kelime_index", one of them changes to the "kelime_index+1"
def appendDict(revisedDict, newDict):
for kelime, valueList in newDict.items():
if kelime in revisedDict:
revisedDict[findID(revisedDict, kelime[:kelime.index("_")])] = valueList
else:
revisedDict[kelime] = valueList
return revisedDict
dataset="zargan"
if(len(sys.argv)>1):
dataset = sys.argv[1]
wordDict = loadWord(dataset)
wordDict = appendDict(wordDict, (generate(wordDict,"olumsuzluk eki")))
print("Negative verbs are added.")
wordDict = appendDict(wordDict, (generate(wordDict,"fiil")))
print("Zero infinitive forms of verbs are added.")
revisedDict = dict(wordDict)
for olay in ["unsuz yumusamasi","unlu daralmasi","unlu dusmesi"]:
newDict = {}
newDict = generate(wordDict,olay)
revisedDict = appendDict(revisedDict,newDict)
if olay=="unsuz yumusamasi":
print("Consonant softening forms are added.")
if olay=="unlu daralmasi":
print("Becoming close vowel forms are added.")
if olay=="unsuz yumusamasi":
print("Dropping vowel forms are added.")
with open('revisedDict.pkl', 'wb') as f:
pickle.dump(revisedDict, f)
print("Transformed lexicon is saved to revisedDict.pkl")