@inproceedings{park-etal-2016-korean,
title = "{K}orean Language Resources for Everyone",
author = "Park, Jungyeul and
Hong, Jeen-Pyo and
Cha, Jeong-Won",
booktitle = "Proceedings of the 30th Pacific Asia Conference on Language, Information and Computation: Oral Papers",
month = oct,
year = "2016",
address = "Seoul, South Korea",
url = "https://aclanthology.org/Y16-2002",
pages = "49--58",
}
@inproceedings{uresova-etal-2020-synsemclass,
title = {{S}yn{S}em{C}lass Linked Lexicon: Mapping Synonymy between Languages},
author = {Urešová, Zdeňka and Fučíková, Eva and Hajičová, Eva and Hajič, Jan},
editor = {Kernerman, Ilan and
Krek, Simon and
McCrae, John P. and
Gracia, Jorge and
Ahmadi, Sina and
Kabashi, Besim},
booktitle = {Proceedings of the 2020 Globalex Workshop on Linked Lexicography},
month = may,
year = {2020},
address = {Marseille, France},
publisher = {European Language Resources Association},
url = {https://aclanthology.org/2020.globalex-1.2},
pages = {10--19},
abstract = {This paper reports on an extended version of a synonym verb class lexicon, newly called SynSemClass (formerly CzEngClass). This lexicon stores cross-lingual semantically similar verb senses in synonym classes extracted from a richly annotated parallel corpus, the Prague Czech-English Dependency Treebank. When building the lexicon, we make use of predicate-argument relations (valency) and link them to semantic roles; in addition, each entry is linked to several external lexicons of more or less {``}semantic{''} nature, namely FrameNet, WordNet, VerbNet, OntoNotes and PropBank, and Czech VALLEX. The aim is to provide a linguistic resource that can be used to compare semantic roles and their syntactic properties and features across languages within and across synonym groups (classes, or {'}synsets{'}), as well as gold standard data for automatic NLP experiments with such synonyms, such as synonym discovery, feature mapping, etc. However, perhaps the most important goal is to eventually build an event type ontology that can be referenced and used as a human-readable and human-understandable {``}database{''} for all types of events, processes and states. While the current paper describes primarily the content of the lexicon, we are also presenting a preliminary design of a format compatible with Linked Data, on which we are hoping to get feedback during discussions at the workshop. Once the resource (in whichever form) is applied to corpus annotation, deep analysis will be possible using such combined resources as training data.},
language = {English},
ISBN = {979-10-95546-46-7},
}
@inproceedings{uresova-etal-2022-making,
title = "Making a Semantic Event-type Ontology Multilingual",
author = "Urešová, Zdeňka and
Zaczynska, Karolina and
Bourgonje, Peter and
Fučíková, Eva and
Rehm, Georg and
Hajič, Jan",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.142",
pages = "1332--1343",
abstract = "We present an extension of the SynSemClass Event-type Ontology, originally conceived as a bilingual Czech-English resource. We added German entries to the classes representing the concepts of the ontology. Having a different starting point than the original work (unannotated parallel corpus without links to a valency lexicon and, of course, different existing lexical resources), it was a challenge to adapt the annotation guidelines, the data model and the tools used for the original version. We describe the process and results of working in such a setup. We also show the next steps to adapt the annotation process, data structures and formats and tools necessary to make the addition of a new language in the future more smooth and efficient, and possibly to allow for various teams to work on SynSemClass extensions to many languages concurrently. We also present the latest release which contains the results of adding German, freely available for download as well as for online access.",
}
@article{czengvallex,
author = {Urešová, Zdeňka and Fučíková, Eva and Šindlerová, Jana},
year = {2016},
month = apr,
pages = {17--50},
title = {CzEngVallex: a Bilingual Czech-English Valency Lexicon},
volume = {105},
journal = {The Prague Bulletin of Mathematical Linguistics},
doi = {10.1515/pralin-2016-0001}
}
@misc{EngVallex20,
title={EngVallex - English Valency Lexicon 2.0},
author={Cinková, Silvie and Fučíková, Eva and Šindlerová, Jana and Hajič, Jan},
url = {http://hdl.handle.net/11234/1-3526},
note = {{LINDAT}\slash{CLARIAH-CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}),
Faculty of Mathematics and Physics, Charles University},
copyright={Creative Commons - Attribution-{NonCommercial}-{ShareAlike} 4.0 International ({CC} {BY}-{NC}-{SA} 4.0)},
year={2021}
}
@techreport{ruppenhofer2016framenet,
title={FrameNet II: Extended theory and practice},
author={Ruppenhofer, Josef and Ellsworth, Michael and Schwarzer-Petruck, Myriam and Johnson, Christopher R and Scheffczyk, Jan},
year={2016},
institution={International Computer Science Institute}
}
@inproceedings{kingsbury2002treebank,
title = "From {T}ree{B}ank to {P}rop{B}ank",
author = "Kingsbury, Paul and
Palmer, Martha",
editor = "Gonz{\'a}lez Rodr{\'\i}guez, Manuel and
Suarez Araujo, Carmen Paz",
booktitle = "Proceedings of the Third International Conference on Language Resources and Evaluation ({LREC}{'}02)",
month = may,
year = "2002",
address = "Las Palmas, Canary Islands - Spain",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2002/pdf/283.pdf",
}
@article{SSC_Spanish,
author = {Cristina Fernández-Alcaina and Eva Fučíková and Jan Hajič and Zdeňka Urešová},
doi = {10.2478/jazcas-2023-0033},
url = {https://doi.org/10.2478/jazcas-2023-0033},
title = {Spanish Synonyms as Part of a Multilingual Event-Type Ontology},
journal = {Journal of Linguistics/Jazykovedný časopis},
number = {1},
volume = {74},
year = {2023},
pages = {153--162}
}
@inproceedings{ssc_start,
author = {Zdeňka Urešová and Eva Fučíková and Eva Hajičová and Jan Hajič},
year = 2017,
title = {Syntactic-Semantic Classes of Context-Sensitive Synonyms Based on a Bilingual Corpus},
booktitle = {Proceedings of 8th Language and Technology Conference},
pages = {201--205},
publisher = {Fundacja Uniwersytetu im. Adama Mickiewicza w Poznaniu},
address = {Poznań, Poland},
isbn = {978-83-64864-94-0},
url = {https://ufal.mff.cuni.cz/~hajic/2017/docs/LTC_17.pdf}
}
@misc{pcedt,
title = {Prague Czech-English Dependency Treebank 2.0},
author = {Haji{\v c}, Jan and Haji{\v c}ov{\'a}, Eva and Panevov{\'a}, Jarmila and Sgall, Petr and Cinkov{\'a}, Silvie and Fu{\v c}{\'{\i}}kov{\'a}, Eva and Mikulov{\'a}, Marie and Pajas, Petr and Popelka, Jan and Semeck{\'y}, Ji{\v r}{\'{\i}} and {\v S}indlerov{\'a}, Jana and {\v S}t{\v e}p{\'a}nek, Jan and Toman, Josef and Ure{\v s}ov{\'a}, Zde{\v n}ka and {\v Z}abokrtsk{\'y}, Zden{\v e}k},
url = {http://hdl.handle.net/11858/00-097C-0000-0015-8DAF-4},
note = {{LINDAT}\slash{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University},
copyright = {{CC}-{BY}-{NC}-{SA} + {LDC99T42}},
year = {2012}
}
@inproceedings{banon-etal-2020-paracrawl,
title = "{P}ara{C}rawl: Web-Scale Acquisition of Parallel Corpora",
author = "Ba{\~n}{\'o}n, Marta and
Chen, Pinzhen and
Haddow, Barry and
Heafield, Kenneth and
Hoang, Hieu and
Espl{\`a}-Gomis, Miquel and
Forcada, Mikel L. and
Kamran, Amir and
Kirefu, Faheem and
Koehn, Philipp and
Ortiz Rojas, Sergio and
Pla Sempere, Leopoldo and
Ram{\'\i}rez-S{\'a}nchez, Gema and
Sarr{\'\i}as, Elsa and
Strelec, Marek and
Thompson, Brian and
Waites, William and
Wiggins, Dion and
Zaragoza, Jaume",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.417",
doi = "10.18653/v1/2020.acl-main.417",
pages = "4555--4567",
abstract = "We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems.",
}
@inproceedings{taule-etal-2008-ancora,
title = "{A}n{C}ora: Multilevel Annotated Corpora for {C}atalan and {S}panish",
author = "Taul{\'e}, Mariona and
Mart{\'\i}, M. Ant{\`o}nia and
Recasens, Marta",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Tapias, Daniel",
booktitle = "Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)",
month = may,
year = "2008",
address = "Marrakech, Morocco",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2008/pdf/35_paper.pdf",
abstract = "This paper presents AnCora, a multilingual corpus annotated at different linguistic levels consisting of 500,000 words in Catalan (AnCora-Ca) and in Spanish (AnCora-Es). At present AnCora is the largest multilayer annotated corpus of these languages freely available from \url{http://clic.ub.edu/ancora}. The two corpora consist mainly of newspaper texts annotated at different levels of linguistic description: morphological (PoS and lemmas), syntactic (constituents and functions), and semantic (argument structures, thematic roles, semantic verb classes, named entities, and WordNet nominal senses). All resulting layers are independent of each other, thus making easier the data management. The annotation was performed manually, semiautomatically, or fully automatically, depending on the encoded linguistic information. The development of these basic resources constituted a primary objective, since there was a lack of such resources for these languages. A second goal was the definition of a consistent methodology that can be followed in further annotations. The current versions of AnCora have been used in several international evaluation competitions",
}
@inproceedings{SSC_LLM_Suggestions,
title = "Extending an Event-type Ontology: Adding Verbs and Classes Using Fine-tuned {LLM}s Suggestions",
author = "Strakov{\'a}, Jana and
Fu{\v{c}}{\'\i}kov{\'a}, Eva and
Haji{\v{c}}, Jan and
Ure{\v{s}}ov{\'a}, Zde{\v{n}}ka",
editor = "Prange, Jakob and
Friedrich, Annemarie",
booktitle = "Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.law-1.9",
doi = "10.18653/v1/2023.law-1.9",
pages = "85--95",
abstract = "In this project, we have investigated the use of advanced machine learning methods, specifically fine-tuned large language models, for pre-annotating data for a lexical extension task, namely adding descriptive words (verbs) to an existing (but incomplete, as of yet) ontology of event types. Several research questions have been focused on, from the investigation of a possible heuristics to provide at least hints to annotators which verbs to include and which are outside the current version of the ontology, to the possible use of the automatic scores to help the annotators to be more efficient in finding a threshold for identifying verbs that cannot be assigned to any existing class and therefore they are to be used as seeds for a new class. We have also carefully examined the correlation of the automatic scores with the human annotation. While the correlation turned out to be strong, its influence on the annotation proper is modest due to its near linearity, even though the mere fact of such pre-annotation leads to relatively short annotation times.",
}
@article{word_sense_disambiguation,
author = {Navigli, Roberto},
title = {Word sense disambiguation: A survey},
year = {2009},
issue_date = {February 2009},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {41},
number = {2},
issn = {0360-0300},
url = {https://doi.org/10.1145/1459352.1459355},
doi = {10.1145/1459352.1459355},
abstract = {Word sense disambiguation (WSD) is the ability to identify the meaning of words in context in a computational manner. WSD is considered an AI-complete problem, that is, a task whose solution is at least as hard as the most difficult problems in artificial intelligence. We introduce the reader to the motivations for solving the ambiguity of words and provide a description of the task. We overview supervised, unsupervised, and knowledge-based approaches. The assessment of WSD systems is discussed in the context of the Senseval/Semeval campaigns, aiming at the objective evaluation of systems participating in several different disambiguation tasks. Finally, applications, open problems, and future directions are discussed.},
journal = {ACM Comput. Surv.},
month = feb,
articleno = {10},
numpages = {69},
keywords = {word sense discrimination, sense annotation, semantic annotation, lexical semantics, lexical ambiguity, Word sense disambiguation, WSD}
}
@inproceedings{chung2021rethinking,
title={Rethinking Embedding Coupling in Pre-trained Language Models},
author={Hyung Won Chung and Thibault Fevry and Henry Tsai and Melvin Johnson and Sebastian Ruder},
booktitle={International Conference on Learning Representations},
year={2021},
url={https://openreview.net/forum?id=xpFFI_NtgpW}
}
@inproceedings{devlin2018bert,
title = "{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding",
author = "Devlin, Jacob and
Chang, Ming-Wei and
Lee, Kenton and
Toutanova, Kristina",
editor = "Burstein, Jill and
Doran, Christy and
Solorio, Thamar",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1423",
doi = "10.18653/v1/N19-1423",
pages = "4171--4186",
abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).",
}
@article{lin-focal-loss,
author={Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Dollár, Piotr},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
title={Focal Loss for Dense Object Detection},
year={2020},
volume={42},
number={2},
pages={318--327},
keywords={Detectors;Training;Object detection;Entropy;Proposals;Convolutional neural networks;Feature extraction;Computer vision;object detection;machine learning;convolutional neural networks},
doi={10.1109/TPAMI.2018.2858826}}
@misc{velar-plossive,
author={Tavin and Nardog},
title={Voiceless velar plosive},
year={2019},
url={https://commons.wikimedia.org/wiki/File:Voiceless_velar_plosive.svg},
note={This work is licensed under the Creative Commons
Attribution-ShareAlike 4.0 International License.
To view a copy of this license, visit
\url{https://creativecommons.org/licenses/by-sa/4.0/deed.en}. Our derivative is also licensed as such.}
}
@article{brown-etal-1992-class,
title = "Class-Based \textit{n}-gram Models of Natural Language",
author = "Brown, Peter F. and
Della Pietra, Vincent J. and
deSouza, Peter V. and
Lai, Jenifer C. and
Mercer, Robert L.",
journal = "Computational Linguistics",
volume = "18",
number = "4",
year = "1992",
url = "https://aclanthology.org/J92-4003",
pages = "467--480",
}
@inbook{opus,
author = {Tiedemann, Jörg},
year = {2009},
pages = {237--248},
title = {News from OPUS—A Collection of Multilingual Parallel Corpora with Tools and Interfaces},
volume = {5},
isbn = {9789027248251},
booktitle = {Recent Advances in Natural Language Processing},
doi = {10.1075/cilt.309.19tie}
}
@inproceedings{straka-2018-udpipe,
title = "{UDP}ipe 2.0 Prototype at {C}o{NLL} 2018 {UD} Shared Task",
author = "Straka, Milan",
booktitle = "Proceedings of the {C}o{NLL} 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies",
month = oct,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/K18-2020",
doi = "10.18653/v1/K18-2020",
pages = "197--207",
}
@inproceedings{nivre2016universal,
title = "{U}niversal {D}ependencies v1: A Multilingual Treebank Collection",
author = "Nivre, Joakim and
de Marneffe, Marie-Catherine and
Ginter, Filip and
Goldberg, Yoav and
Haji{\v{c}}, Jan and
Manning, Christopher D. and
McDonald, Ryan and
Petrov, Slav and
Pyysalo, Sampo and
Silveira, Natalia and
Tsarfaty, Reut and
Zeman, Daniel",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1262",
pages = "1659--1666",
abstract = "Cross-linguistically consistent annotation is necessary for sound comparative evaluation and cross-lingual learning experiments. It is also useful for multilingual system development and comparative linguistic studies. Universal Dependencies is an open community effort to create cross-linguistically consistent treebank annotation for many languages within a dependency-based lexicalist framework. In this paper, we describe v1 of the universal guidelines, the underlying design principles, and the currently available treebanks for 33 languages.",
}
@inproceedings{jalili-sabet-etal-2020-simalign,
title = "{S}im{A}lign: High Quality Word Alignments without Parallel Training Data using Static and Contextualized Embeddings",
author = {Jalili Sabet, Masoud and
Dufter, Philipp and
Yvon, Fran{\c{c}}ois and
Sch{\"u}tze, Hinrich},
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.findings-emnlp.147",
pages = "1627--1643",
}
@inproceedings{lai-etal-2022-cross,
title = "Cross-Align: Modeling Deep Cross-lingual Interactions for Word Alignment",
author = "Lai, Siyu and
Yang, Zhen and
Meng, Fandong and
Chen, Yufeng and
Xu, Jinan and
Zhou, Jie",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.244",
pages = "3715--3725",
}
@inproceedings{dou2021word,
title = "Word Alignment by Fine-tuning Embeddings on Parallel Corpora",
author = "Dou, Zi-Yi and
Neubig, Graham",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.181",
doi = "10.18653/v1/2021.eacl-main.181",
pages = "2112--2128",
abstract = "Word alignment over parallel corpora has a wide variety of applications, including learning translation lexicons, cross-lingual transfer of language processing tools, and automatic evaluation or analysis of translation outputs. The great majority of past work on word alignment has worked by performing unsupervised learning on parallel text. Recently, however, other work has demonstrated that pre-trained contextualized word embeddings derived from multilingually trained language models (LMs) prove an attractive alternative, achieving competitive results on the word alignment task even in the absence of explicit training on parallel data. In this paper, we examine methods to marry the two approaches: leveraging pre-trained LMs but fine-tuning them on parallel text with objectives designed to improve alignment quality, and proposing methods to effectively extract alignments from these fine-tuned models. We perform experiments on five language pairs and demonstrate that our model can consistently outperform previous state-of-the-art models of all varieties. In addition, we demonstrate that we are able to train multilingual word aligners that can obtain robust performance on different language pairs.",
}
@inproceedings{xlmr,
title = "Unsupervised Cross-lingual Representation Learning at Scale",
author = "Conneau, Alexis and
Khandelwal, Kartikay and
Goyal, Naman and
Chaudhary, Vishrav and
Wenzek, Guillaume and
Guzm{\'a}n, Francisco and
Grave, Edouard and
Ott, Myle and
Zettlemoyer, Luke and
Stoyanov, Veselin",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.747",
doi = "10.18653/v1/2020.acl-main.747",
pages = "8440--8451",
abstract = "This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6{\%} average accuracy on XNLI, +13{\%} average F1 score on MLQA, and +2.4{\%} F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7{\%} in XNLI accuracy for Swahili and 11.4{\%} for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.",
}
@inproceedings{behzad2023effect,
title = "The Effect of Alignment Correction on Cross-Lingual Annotation Projection",
author = "Behzad, Shabnam and
Ebner, Seth and
Marone, Marc and
Van Durme, Benjamin and
Yarmohammadi, Mahsa",
editor = "Prange, Jakob and
Friedrich, Annemarie",
booktitle = "Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.law-1.24",
doi = "10.18653/v1/2023.law-1.24",
pages = "244--251",
abstract = "Cross-lingual annotation projection is a practical method for improving performance on low resource structured prediction tasks. An important step in annotation projection is obtaining alignments between the source and target texts, which enables the mapping of annotations across the texts. By manually correcting automatically generated alignments, we examine the impact of alignment quality{---}automatic, manual, and mixed{---}on downstream performance for two information extraction tasks and quantify the trade-off between annotation effort and model performance.",
}