@misc{Brown2020,
title={Language Models are Few-Shot Learners},
author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
year={2020},
eprint={2005.14165},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2005.14165},
}
@misc{Chiang2022,
title={Overcoming a Theoretical Limitation of Self-Attention},
author={David Chiang and Peter Cholak},
year={2022},
eprint={2202.12172},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2202.12172},
}
@misc{Clark2020,
title={Transformers as Soft Reasoners over Language},
author={Peter Clark and Oyvind Tafjord and Kyle Richardson},
year={2020},
eprint={2002.05867},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2002.05867},
}
@inproceedings{Csordas2021,
title = "The Devil is in the Detail: Simple Tricks Improve Systematic Generalization of Transformers",
author = "Csord{\'a}s, R{\'o}bert and
Irie, Kazuki and
Schmidhuber, J{\"u}rgen",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.49",
doi = "10.18653/v1/2021.emnlp-main.49",
pages = "619--634",
abstract = "Recently, many datasets have been proposed to test the systematic generalization ability of neural networks. The companion baseline Transformers, typically trained with default hyper-parameters from standard tasks, are shown to fail dramatically. Here we demonstrate that by revisiting model configurations as basic as scaling of embeddings, early stopping, relative positional embedding, and Universal Transformer variants, we can drastically improve the performance of Transformers on systematic generalization. We report improvements on five popular datasets: SCAN, CFQ, PCFG, COGS, and Mathematics dataset. Our models improve accuracy from 50{\%} to 85{\%} on the PCFG productivity split, and from 35{\%} to 81{\%} on COGS. On SCAN, relative positional embedding largely mitigates the EOS decision problem (Newman et al., 2020), yielding 100{\%} accuracy on the length split with a cutoff at 26. Importantly, performance differences between these models are typically invisible on the IID data split. This calls for proper generalization validation sets for developing neural networks that generalize systematically. We publicly release the code to reproduce our results.",
}
%"The Devil is in the Detail: Simple Tricks Improve Systematic Generalization of Transformers", Csordás 2021 % https://aclanthology.org/2021.emnlp-main.49/
@misc{Csordas2022,
title={The Neural Data Router: Adaptive Control Flow in Transformers Improves Systematic Generalization},
author={Róbert Csordás and Kazuki Irie and Jürgen Schmidhuber},
year={2022},
eprint={2110.07732},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2110.07732},
}
%"Universal Transformers", Dehghani et al 2019 % https://arxiv.org/pdf/1807.03819
@misc{Dehghani2019,
title={Universal Transformers},
author={Mostafa Dehghani and Stephan Gouws and Oriol Vinyals and Jakob Uszkoreit and Łukasz Kaiser},
year={2019},
eprint={1807.03819},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1807.03819},
}
@article{FodorPylyshyn1988,
title = {Connectionism and cognitive architecture: A critical analysis},
journal = {Cognition},
volume = {28},
number = {1},
pages = {3-71},
year = {1988},
issn = {0010-0277},
doi = {https://doi.org/10.1016/0010-0277(88)90031-5},
url = {https://www.sciencedirect.com/science/article/pii/0010027788900315},
author = {Jerry A. Fodor and Zenon W. Pylyshyn},
abstract = {This paper explores differences between Connectionist proposals for cognitive architecture and the sorts of models that have traditionally been assumed in cognitive science. We claim that the major distinction is that, while both Connectionist and Classical architectures postulate representational mental states, the latter but not the former are committed to a symbol-level of representation, or to a ‘language of thought’: i.e., to representational states that have combinatorial syntactic and semantic structure. Several arguments for combinatorial structure in mental representations are then reviewed. These include arguments based on the ‘systematicity’ of mental representation: i.e., on the fact that cognitive capacities always exhibit certain symmetries, so that the ability to entertain a given thought implies the ability to entertain thoughts with semantically related contents. We claim that such arguments make a powerful case that mind/brain architecture is not Connectionist at the cognitive level. We then consider the possibility that Connectionism may provide an account of the neural (or ‘abstract neurological’) structures in which Classical cognitive architecture is implemented. We survey a number of the standard arguments that have been offered in favor of Connectionism, and conclude that they are coherent only on this interpretation.
}
}
@misc{Friedman2023,
title={Learning Transformer Programs},
author={Dan Friedman and Alexander Wettig and Danqi Chen},
year={2023},
eprint={2306.01128},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2306.01128},
}
@inproceedings{KimLinzen2020,
title = "{COGS}: A Compositional Generalization Challenge Based on Semantic Interpretation",
author = "Kim, Najoung and
Linzen, Tal",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.731",
doi = "10.18653/v1/2020.emnlp-main.731",
pages = "9087--9105",
abstract = "Natural language is characterized by compositionality: the meaning of a complex expression is constructed from the meanings of its constituent parts. To facilitate the evaluation of the compositional abilities of language processing architectures, we introduce COGS, a semantic parsing dataset based on a fragment of English. The evaluation portion of COGS contains multiple systematic gaps that can only be addressed by compositional generalization; these include new combinations of familiar syntactic structures, or new combinations of familiar words and familiar structures. In experiments with Transformers and LSTMs, we found that in-distribution accuracy on the COGS test set was near-perfect (96{--}99{\%}), but generalization accuracy was substantially lower (16{--}35{\%}) and showed high sensitivity to random seed (+-6{--}8{\%}). These findings indicate that contemporary standard NLP models are limited in their compositional generalization capacity, and position COGS as a good way to measure progress.",
}
@misc{Ranzato2015,
title={Sequence Level Training with Recurrent Neural Networks},
author={Marc'Aurelio Ranzato and Sumit Chopra and Michael Auli and Wojciech Zaremba},
year={2016},
eprint={1511.06732},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1511.06732},
}
@inproceedings{Reddy2017,
title = "Universal Semantic Parsing",
author = {Reddy, Siva and
T{\"a}ckstr{\"o}m, Oscar and
Petrov, Slav and
Steedman, Mark and
Lapata, Mirella},
editor = "Palmer, Martha and
Hwa, Rebecca and
Riedel, Sebastian",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1009",
doi = "10.18653/v1/D17-1009",
pages = "89--101",
abstract = "Universal Dependencies (UD) offer a uniform cross-lingual syntactic representation, with the aim of advancing multilingual applications. Recent work shows that semantic parsing can be accomplished by transforming syntactic dependencies to logical forms. However, this work is limited to English, and cannot process dependency graphs, which allow handling complex phenomena such as control. In this work, we introduce UDepLambda, a semantic interface for UD, which maps natural language to logical forms in an almost language-independent fashion and can process dependency graphs. We perform experiments on question answering against Freebase and provide German and Spanish translations of the WebQuestions and GraphQuestions datasets to facilitate multilingual evaluation. Results show that UDepLambda outperforms strong baselines across languages and datasets. For English, it achieves a 4.9 F1 point improvement over the state-of-the-art on GraphQuestions.",
}
@article{Strobl2024,
title={What Formal Languages Can Transformers Express? A Survey},
volume={12},
ISSN={2307-387X},
url={http://dx.doi.org/10.1162/tacl_a_00663},
DOI={10.1162/tacl_a_00663},
journal={Transactions of the Association for Computational Linguistics},
publisher={MIT Press},
author={Strobl, Lena and Merrill, William and Weiss, Gail and Chiang, David and Angluin, Dana},
year={2024},
pages={543--561},
}
@misc{Weiss2021,
title={Thinking Like Transformers},
author={Gail Weiss and Yoav Goldberg and Eran Yahav},
year={2021},
eprint={2106.06981},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2106.06981},
}
@misc{Zaremba2016,
title={Reinforcement Learning Neural Turing Machines - Revised},
author={Wojciech Zaremba and Ilya Sutskever},
year={2016},
eprint={1505.00521},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1505.00521},
}
@misc{Wu2023,
title={ReCOGS: How Incidental Details of a Logical Form Overshadow an Evaluation of Semantic Interpretation},
author={Zhengxuan Wu and Christopher D. Manning and Christopher Potts},
year={2024},
eprint={2303.13716},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2303.13716},
}
@misc{Zhou2024,
title={What Algorithms can Transformers Learn? A Study in Length Generalization},
author={Hattie Zhou and Arwen Bradley and Etai Littwin and Noam Razin and Omid Saremi and Josh Susskind and Samy Bengio and Preetum Nakkiran},
year={2023},
eprint={2310.16028},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2310.16028},
}
@article{newell1956logic,
title={The logic theory machine--A complex information processing system},
author={Newell, Allen and Simon, Herbert},
journal={IRE Transactions on Information Theory},
volume={2},
number={3},
pages={61--79},
year={1956},
publisher={IEEE}
}
@misc{mccarthy1959programs,
title={Programs with common sense},
author={McCarthy, John},
year={1959},
address={London}
}
@misc{delétang2023neuralnetworkschomskyhierarchy,
title={Neural Networks and the Chomsky Hierarchy},
author={Grégoire Delétang and Anian Ruoss and Jordi Grau-Moya and Tim Genewein and Li Kevin Wenliang and Elliot Catt and Chris Cundy and Marcus Hutter and Shane Legg and Joel Veness and Pedro A. Ortega},
year={2023},
eprint={2207.02098},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2207.02098},
}
@article{Perez2021,
author = {Jorge Pérez and Pablo Barceló and Javier Marinkovic},
title = {Attention is Turing-Complete},
journal = {Journal of Machine Learning Research},
year = {2021},
volume = {22},
number = {75},
pages = {1--35},
url = {http://jmlr.org/papers/v22/20-302.html}
}
@misc{merrill2024expressivepowertransformerschain,
title={The Expressive Power of Transformers with Chain of Thought},
author={William Merrill and Ashish Sabharwal},
year={2024},
eprint={2310.07923},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2310.07923},
}
@misc{lindner2023tracrcompiledtransformerslaboratory,
title={Tracr: Compiled Transformers as a Laboratory for Interpretability},
author={David Lindner and János Kramár and Sebastian Farquhar and Matthew Rahtz and Thomas McGrath and Vladimir Mikulik},
year={2023},
eprint={2301.05062},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2301.05062},
}
@incollection{fuzzingbook2023:GrammarCoverageFuzzer,
author = {Andreas Zeller and Rahul Gopinath and Marcel B{\"o}hme and Gordon Fraser and Christian Holler},
booktitle = {The Fuzzing Book},
title = {Grammar Coverage},
year = {2023},
publisher = {CISPA Helmholtz Center for Information Security},
howpublished = {\url{https://www.fuzzingbook.org/html/GrammarCoverageFuzzer.html}},
note = {Retrieved 2023-11-11 18:18:06+01:00},
url = {https://www.fuzzingbook.org/html/GrammarCoverageFuzzer.html},
urldate = {2023-11-11 18:18:06+01:00}
}
@misc{klinger2024compositionalprogramgenerationfewshot,
title={Compositional Program Generation for Few-Shot Systematic Generalization},
author={Tim Klinger and Luke Liu and Soham Dan and Maxwell Crouse and Parikshit Ram and Alexander Gray},
year={2024},
eprint={2309.16467},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2309.16467},
}
@misc{tenney2019bertrediscoversclassicalnlp,
title={BERT Rediscovers the Classical NLP Pipeline},
author={Ian Tenney and Dipanjan Das and Ellie Pavlick},
year={2019},
eprint={1905.05950},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1905.05950},
}
@article{lake2023human,
title={Human-like systematic generalization through a meta-learning neural network},
author={Lake, Brenden M and Baroni, Marco},
journal={Nature},
volume={623},
number={7985},
pages={115--121},
year={2023},
publisher={Nature Publishing Group}
}
@article{linzen2016assessing,
title={Assessing the ability of LSTMs to learn syntax-sensitive dependencies},
author={Linzen, Tal and Dupoux, Emmanuel and Goldberg, Yoav},
journal={Transactions of the Association for Computational Linguistics},
volume={4},
pages={521--535},
year={2016},
publisher={MIT Press}
}
@book{jespersen1913modernenglishgrammar1954reprint,
title={A Modern English Grammar on Historical Principles, Part II: Syntax (First Volume)},
author={Jespersen, Otto},
year={1954},
publisher={Bradford and Dickens}
}
@misc{agreementwithnearestlanguagelog,
title={Agreement with nearest},
author={Arnold Zwicky},
year={2008},
howpublished={Language Log},
url={https://languagelog.ldc.upenn.edu/nll/?p=839},
}
@misc{petty2024impactdepthcompositionalgeneralization,
title={The Impact of Depth on Compositional Generalization in Transformer Language Models},
author={Jackson Petty and Sjoerd van Steenkiste and Ishita Dasgupta and Fei Sha and Dan Garrette and Tal Linzen},
year={2024},
eprint={2310.19956},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.19956},
}
@misc{vanschijndel2019quantitydoesntbuyquality,
title={Quantity doesn't buy quality syntax with neural language models},
author={Marten van Schijndel and Aaron Mueller and Tal Linzen},
year={2019},
eprint={1909.00111},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1909.00111},
}
@misc{goldberg2019assessingbertssyntacticabilities,
title={Assessing BERT's Syntactic Abilities},
author={Yoav Goldberg},
year={2019},
eprint={1901.05287},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1901.05287},
}
@misc{li2023slogstructuralgeneralizationbenchmark,
title={SLOG: A Structural Generalization Benchmark for Semantic Parsing},
author={Bingzhi Li and Lucia Donatelli and Alexander Koller and Tal Linzen and Yuekun Yao and Najoung Kim},
year={2023},
eprint={2310.15040},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.15040},
}
@inproceedings{hewitt-manning-2019-structural,
title = "{A} Structural Probe for Finding Syntax in Word Representations",
author = "Hewitt, John and
Manning, Christopher D.",
editor = "Burstein, Jill and
Doran, Christy and
Solorio, Thamar",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1419/",
doi = "10.18653/v1/N19-1419",
pages = "4129--4138",
abstract = "Recent work has improved our ability to detect linguistic knowledge in word representations. However, current methods for detecting syntactic knowledge do not test whether syntax trees are represented in their entirety. In this work, we propose a structural probe, which evaluates whether syntax trees are embedded in a linear transformation of a neural network`s word representation space. The probe identifies a linear transformation under which squared L2 distance encodes the distance between words in the parse tree, and one in which squared L2 norm encodes depth in the parse tree. Using our probe, we show that such transformations exist for both ELMo and BERT but not in baselines, providing evidence that entire syntax trees are embedded implicitly in deep models' vector geometry."
}