@misc{Brown2020,
title={Language Models are Few-Shot Learners},
author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
year={2020},
eprint={2005.14165},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2005.14165},
}
@misc{Chiang2022,
title={Overcoming a Theoretical Limitation of Self-Attention},
author={David Chiang and Peter Cholak},
year={2022},
eprint={2202.12172},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2202.12172},
}
@misc{Clark2020,
title={Transformers as Soft Reasoners over Language},
author={Peter Clark and Oyvind Tafjord and Kyle Richardson},
year={2020},
eprint={2002.05867},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2002.05867},
}
@inproceedings{Csordas2021,
title = "The Devil is in the Detail: Simple Tricks Improve Systematic Generalization of Transformers",
author = "Csord{\'a}s, R{\'o}bert and
Irie, Kazuki and
Schmidhuber, J{\"u}rgen",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.49",
doi = "10.18653/v1/2021.emnlp-main.49",
pages = "619--634",
abstract = "Recently, many datasets have been proposed to test the systematic generalization ability of neural networks. The companion baseline Transformers, typically trained with default hyper-parameters from standard tasks, are shown to fail dramatically. Here we demonstrate that by revisiting model configurations as basic as scaling of embeddings, early stopping, relative positional embedding, and Universal Transformer variants, we can drastically improve the performance of Transformers on systematic generalization. We report improvements on five popular datasets: SCAN, CFQ, PCFG, COGS, and Mathematics dataset. Our models improve accuracy from 50{\%} to 85{\%} on the PCFG productivity split, and from 35{\%} to 81{\%} on COGS. On SCAN, relative positional embedding largely mitigates the EOS decision problem (Newman et al., 2020), yielding 100{\%} accuracy on the length split with a cutoff at 26. Importantly, performance differences between these models are typically invisible on the IID data split. This calls for proper generalization validation sets for developing neural networks that generalize systematically. We publicly release the code to reproduce our results.",
}
%"The Devil is in the Detail: Simple Tricks Improve Systematic Generalization of Transformers", Csordás 2021 % https://aclanthology.org/2021.emnlp-main.49/
@misc{Csordas2022,
title={The Neural Data Router: Adaptive Control Flow in Transformers Improves Systematic Generalization},
author={Róbert Csordás and Kazuki Irie and Jürgen Schmidhuber},
year={2022},
eprint={2110.07732},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2110.07732},
}
%"Universal Transformers", Dehghani et al 2019 % https://arxiv.org/pdf/1807.03819
@misc{Dehghani2019,
title={Universal Transformers},
author={Mostafa Dehghani and Stephan Gouws and Oriol Vinyals and Jakob Uszkoreit and Łukasz Kaiser},
year={2019},
eprint={1807.03819},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1807.03819},
}
@article{FodorPylyshyn1988,
title = {Connectionism and cognitive architecture: A critical analysis},
journal = {Cognition},
volume = {28},
number = {1},
pages = {3-71},
year = {1988},
issn = {0010-0277},
doi = {https://doi.org/10.1016/0010-0277(88)90031-5},
url = {https://www.sciencedirect.com/science/article/pii/0010027788900315},
author = {Jerry A. Fodor and Zenon W. Pylyshyn},
abstract = {This paper explores differences between Connectionist proposals for cognitive architecture and the sorts of models that have traditionally been assumed in cognitive science. We claim that the major distinction is that, while both Connectionist and Classical architectures postulate representational mental states, the latter but not the former are committed to a symbol-level of representation, or to a ‘language of thought’: i.e., to representational states that have combinatorial syntactic and semantic structure. Several arguments for combinatorial structure in mental representations are then reviewed. These include arguments based on the ‘systematicity’ of mental representation: i.e., on the fact that cognitive capacities always exhibit certain symmetries, so that the ability to entertain a given thought implies the ability to entertain thoughts with semantically related contents. We claim that such arguments make a powerful case that mind/brain architecture is not Connectionist at the cognitive level. We then consider the possibility that Connectionism may provide an account of the neural (or ‘abstract neurological’) structures in which Classical cognitive architecture is implemented. We survey a number of the standard arguments that have been offered in favor of Connectionism, and conclude that they are coherent only on this interpretation.
}
}
@misc{Friedman2023,
title={Learning Transformer Programs},
author={Dan Friedman and Alexander Wettig and Danqi Chen},
year={2023},
eprint={2306.01128},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2306.01128},
}
@inproceedings{KimLinzen2020,
title = "{COGS}: A Compositional Generalization Challenge Based on Semantic Interpretation",
author = "Kim, Najoung and
Linzen, Tal",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.731",
doi = "10.18653/v1/2020.emnlp-main.731",
pages = "9087--9105",
abstract = "Natural language is characterized by compositionality: the meaning of a complex expression is constructed from the meanings of its constituent parts. To facilitate the evaluation of the compositional abilities of language processing architectures, we introduce COGS, a semantic parsing dataset based on a fragment of English. The evaluation portion of COGS contains multiple systematic gaps that can only be addressed by compositional generalization; these include new combinations of familiar syntactic structures, or new combinations of familiar words and familiar structures. In experiments with Transformers and LSTMs, we found that in-distribution accuracy on the COGS test set was near-perfect (96{--}99{\%}), but generalization accuracy was substantially lower (16{--}35{\%}) and showed high sensitivity to random seed (+-6{--}8{\%}). These findings indicate that contemporary standard NLP models are limited in their compositional generalization capacity, and position COGS as a good way to measure progress.",
}
@misc{Ranzato2015,
title={Sequence Level Training with Recurrent Neural Networks},
author={Marc'Aurelio Ranzato and Sumit Chopra and Michael Auli and Wojciech Zaremba},
year={2016},
eprint={1511.06732},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1511.06732},
}
@inproceedings{Reddy2017,
title = "Universal Semantic Parsing",
author = {Reddy, Siva and
T{\"a}ckstr{\"o}m, Oscar and
Petrov, Slav and
Steedman, Mark and
Lapata, Mirella},
editor = "Palmer, Martha and
Hwa, Rebecca and
Riedel, Sebastian",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1009",
doi = "10.18653/v1/D17-1009",
pages = "89--101",
abstract = "Universal Dependencies (UD) offer a uniform cross-lingual syntactic representation, with the aim of advancing multilingual applications. Recent work shows that semantic parsing can be accomplished by transforming syntactic dependencies to logical forms. However, this work is limited to English, and cannot process dependency graphs, which allow handling complex phenomena such as control. In this work, we introduce UDepLambda, a semantic interface for UD, which maps natural language to logical forms in an almost language-independent fashion and can process dependency graphs. We perform experiments on question answering against Freebase and provide German and Spanish translations of the WebQuestions and GraphQuestions datasets to facilitate multilingual evaluation. Results show that UDepLambda outperforms strong baselines across languages and datasets. For English, it achieves a 4.9 F1 point improvement over the state-of-the-art on GraphQuestions.",
}
@article{Strobl2024,
title={What Formal Languages Can Transformers Express? A Survey},
volume={12},
ISSN={2307-387X},
url={http://dx.doi.org/10.1162/tacl_a_00663},
DOI={10.1162/tacl_a_00663},
journal={Transactions of the Association for Computational Linguistics},
publisher={MIT Press},
author={Strobl, Lena and Merrill, William and Weiss, Gail and Chiang, David and Angluin, Dana},
year={2024},
pages={543--561},
}
@misc{Weiss2021,
title={Thinking Like Transformers},
author={Gail Weiss and Yoav Goldberg and Eran Yahav},
year={2021},
eprint={2106.06981},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2106.06981},
}
@misc{Zaremba2016,
title={Reinforcement Learning Neural Turing Machines - Revised},
author={Wojciech Zaremba and Ilya Sutskever},
year={2016},
eprint={1505.00521},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1505.00521},
}
@misc{Wu2023,
title={ReCOGS: How Incidental Details of a Logical Form Overshadow an Evaluation of Semantic Interpretation},
author={Zhengxuan Wu and Christopher D. Manning and Christopher Potts},
year={2024},
eprint={2303.13716},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2303.13716},
}
@misc{Zhou2024,
title={What Algorithms can Transformers Learn? A Study in Length Generalization},
author={Hattie Zhou and Arwen Bradley and Etai Littwin and Noam Razin and Omid Saremi and Josh Susskind and Samy Bengio and Preetum Nakkiran},
year={2023},
eprint={2310.16028},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2310.16028},
}
@article{newell1956logic,
title={The logic theory machine--A complex information processing system},
author={Newell, Allen and Simon, Herbert},
journal={IRE Transactions on Information Theory},
volume={2},
number={3},
pages={61--79},
year={1956},
publisher={IEEE}
}
@misc{mccarthy1959programs,
title={Programs with common sense},
author={McCarthy, John},
year={1959},
address={London}
}
@misc{delétang2023neuralnetworkschomskyhierarchy,
title={Neural Networks and the Chomsky Hierarchy},
author={Grégoire Delétang and Anian Ruoss and Jordi Grau-Moya and Tim Genewein and Li Kevin Wenliang and Elliot Catt and Chris Cundy and Marcus Hutter and Shane Legg and Joel Veness and Pedro A. Ortega},
year={2023},
eprint={2207.02098},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2207.02098},
}
@article{Perez2021,
author = {Jorge Pérez and Pablo Barceló and Javier Marinkovic},
title = {Attention is Turing-Complete},
journal = {Journal of Machine Learning Research},
year = {2021},
volume = {22},
number = {75},
pages = {1--35},
url = {http://jmlr.org/papers/v22/20-302.html}
}
@misc{merrill2024expressivepowertransformerschain,
title={The Expressive Power of Transformers with Chain of Thought},
author={William Merrill and Ashish Sabharwal},
year={2024},
eprint={2310.07923},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2310.07923},
}
@misc{lindner2023tracrcompiledtransformerslaboratory,
title={Tracr: Compiled Transformers as a Laboratory for Interpretability},
author={David Lindner and János Kramár and Sebastian Farquhar and Matthew Rahtz and Thomas McGrath and Vladimir Mikulik},
year={2023},
eprint={2301.05062},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2301.05062},
}
@incollection{fuzzingbook2023:GrammarCoverageFuzzer,
author = {Andreas Zeller and Rahul Gopinath and Marcel B{\"o}hme and Gordon Fraser and Christian Holler},
booktitle = {The Fuzzing Book},
title = {Grammar Coverage},
year = {2023},
publisher = {CISPA Helmholtz Center for Information Security},
howpublished = {\url{https://www.fuzzingbook.org/html/GrammarCoverageFuzzer.html}},
note = {Retrieved 2023-11-11 18:18:06+01:00},
url = {https://www.fuzzingbook.org/html/GrammarCoverageFuzzer.html},
urldate = {2023-11-11 18:18:06+01:00}
}
@misc{klinger2024compositionalprogramgenerationfewshot,
title={Compositional Program Generation for Few-Shot Systematic Generalization},
author={Tim Klinger and Luke Liu and Soham Dan and Maxwell Crouse and Parikshit Ram and Alexander Gray},
year={2024},
eprint={2309.16467},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2309.16467},
}
@misc{tenney2019bertrediscoversclassicalnlp,
title={BERT Rediscovers the Classical NLP Pipeline},
author={Ian Tenney and Dipanjan Das and Ellie Pavlick},
year={2019},
eprint={1905.05950},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1905.05950},
}
@article{lake2023human,
title={Human-like systematic generalization through a meta-learning neural network},
author={Lake, Brenden M and Baroni, Marco},
journal={Nature},
volume={623},
number={7985},
pages={115--121},
year={2023},
publisher={Nature Publishing Group}
}
@article{linzen2016assessing,
title={Assessing the ability of LSTMs to learn syntax-sensitive dependencies},
author={Linzen, Tal and Dupoux, Emmanuel and Goldberg, Yoav},
journal={Transactions of the Association for Computational Linguistics},
volume={4},
pages={521--535},
year={2016},
publisher={MIT Press}
}
@book{jespersen1913modernenglishgrammar1954reprint,
title={A Modern English Grammar on Historical Principles, Part II: Syntax (First Volume)},
author={Jespersen, Otto},
year={1954},
publisher={Bradford and Dickens}
}
@misc{agreementwithnearestlanguagelog,
title={Agreement with nearest},
author={Arnold Zwicky},
year={2008},
howpublished={Language Log},
url={https://languagelog.ldc.upenn.edu/nll/?p=839},
}
@misc{petty2024impactdepthcompositionalgeneralization,
title={The Impact of Depth on Compositional Generalization in Transformer Language Models},
author={Jackson Petty and Sjoerd van Steenkiste and Ishita Dasgupta and Fei Sha and Dan Garrette and Tal Linzen},
year={2024},
eprint={2310.19956},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.19956},
}
@misc{vanschijndel2019quantitydoesntbuyquality,
title={Quantity doesn't buy quality syntax with neural language models},
author={Marten van Schijndel and Aaron Mueller and Tal Linzen},
year={2019},
eprint={1909.00111},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1909.00111},
}
@misc{goldberg2019assessingbertssyntacticabilities,
title={Assessing BERT's Syntactic Abilities},
author={Yoav Goldberg},
year={2019},
eprint={1901.05287},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1901.05287},
}
@misc{li2023slogstructuralgeneralizationbenchmark,
title={SLOG: A Structural Generalization Benchmark for Semantic Parsing},
author={Bingzhi Li and Lucia Donatelli and Alexander Koller and Tal Linzen and Yuekun Yao and Najoung Kim},
year={2023},
eprint={2310.15040},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.15040},
}
@inproceedings{hewitt-manning-2019-structural,
title = "{A} Structural Probe for Finding Syntax in Word Representations",
author = "Hewitt, John and
Manning, Christopher D.",
editor = "Burstein, Jill and
Doran, Christy and
Solorio, Thamar",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1419/",
doi = "10.18653/v1/N19-1419",
pages = "4129--4138",
abstract = "Recent work has improved our ability to detect linguistic knowledge in word representations. However, current methods for detecting syntactic knowledge do not test whether syntax trees are represented in their entirety. In this work, we propose a structural probe, which evaluates whether syntax trees are embedded in a linear transformation of a neural network`s word representation space. The probe identifies a linear transformation under which squared L2 distance encodes the distance between words in the parse tree, and one in which squared L2 norm encodes depth in the parse tree. Using our probe, we show that such transformations exist for both ELMo and BERT but not in baselines, providing evidence that entire syntax trees are embedded implicitly in deep models' vector geometry."
}