sources.bib


% EMNLP 2014 paper. The booktitle carries no leading "In" because
% bibliography styles prepend that word themselves; authors use the
% unambiguous "Last, First" form.
@inproceedings{hosseini14learningto,
  author    = {Hosseini, Mohammad Javad and Hajishirzi, Hannaneh and
               Etzioni, Oren and Kushman, Nate},
  title     = {Learning to solve arithmetic word problems with verb
               categorization},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in
               Natural Language Processing (EMNLP)},
  year      = {2014}
}

% MATH is the dataset's name; it is brace-protected in the title so
% sentence-casing styles keep it uppercase.
@article{hendrycks2021measuring,
  author  = {Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and
             Arora, Akul and Basart, Steven and Tang, Eric and Song,
             Dawn and Steinhardt, Jacob},
  title   = {Measuring mathematical problem solving with the {MATH}
             dataset},
  journal = {arXiv preprint arXiv:2103.03874},
  year    = {2021}
}

% APPS is the benchmark's name; it is brace-protected in the title so
% sentence-casing styles keep it uppercase. The trailing "and others"
% is the BibTeX token that styles render as "et al.".
@article{hendrycks2021measuringcode,
  author  = {Hendrycks, Dan and Basart, Steven and Kadavath, Saurav and
             Mazeika, Mantas and Arora, Akul and Guo, Ethan and Burns,
             Collin and Puranik, Samir and He, Horace and Song, Dawn and
             others},
  title   = {Measuring coding challenge competence with {APPS}},
  journal = {arXiv preprint arXiv:2105.09938},
  year    = {2021}
}

% "English" is brace-protected so sentence-casing styles do not
% lowercase the proper noun.
@inproceedings{miao2020diverse,
  author    = {Miao, Shen-Yun and Liang, Chao-Chun and Su, Keh-Yih},
  title     = {A Diverse Corpus for Evaluating and Developing {English}
               Math Word Problem Solvers},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association
               for Computational Linguistics},
  pages     = {975--984},
  year      = {2020}
}

% The doi field holds the bare identifier; styles add the resolver
% prefix themselves. "Stack Overflow" is brace-protected so
% sentence-casing styles keep the site name capitalised.
@inproceedings{yin2018mining,
  author    = {Yin, Pengcheng and Deng, Bowen and Chen, Edgar and
               Vasilescu, Bogdan and Neubig, Graham},
  title     = {Learning to Mine Aligned Code and Natural Language Pairs
               from {Stack Overflow}},
  booktitle = {International Conference on Mining Software Repositories},
  series    = {MSR},
  pages     = {476--486},
  year      = {2018},
  publisher = {ACM},
  doi       = {10.1145/3196398.3196408}
}

@article{saxton2019analysing,
  author  = {Saxton, David and Grefenstette, Edward and
             Hill, Felix and Kohli, Pushmeet},
  title   = {Analysing mathematical reasoning abilities of
             neural models},
  year    = {2019},
  journal = {arXiv preprint arXiv:1904.01557},
}

% The subtitle after the question mark starts with a capital letter and
% is brace-protected, since sentence-casing styles only preserve
% capitals at the very start of a title or after a colon.
@inproceedings{huang2016well,
  author    = {Huang, Danqing and Shi, Shuming and Lin, Chin-Yew and Yin,
               Jian and Ma, Wei-Ying},
  title     = {How well do computers solve math word problems?
               {Large-scale} dataset construction and evaluation},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association
               for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {887--896},
  year      = {2016}
}

% DRAW is the dataset's name; it is brace-protected in the title so
% sentence-casing styles keep it uppercase.
% NOTE(review): "Citeseer" looks like the indexing service this record
% was exported from rather than the institution that issued the
% report -- confirm and replace with the actual issuer.
@techreport{upadhyay2015draw,
  author      = {Upadhyay, Shyam and Chang, Ming-Wei},
  title       = {{DRAW}: A challenging and diverse algebra word problem
                 set},
  institution = {Citeseer},
  year        = {2015}
}

@article{cobbe2021training,
  author  = {Cobbe, Karl and Kosaraju, Vineet and
             Bavarian, Mohammad and Hilton, Jacob and
             Nakano, Reiichiro and Hesse, Christopher and
             Schulman, John},
  title   = {Training verifiers to solve math word problems},
  year    = {2021},
  journal = {arXiv preprint arXiv:2110.14168},
}

% MathQA is brace-protected so sentence-casing styles keep the
% camel-case dataset name intact.
@article{amini2019mathqa,
  author  = {Amini, Aida and Gabriel, Saadia and Lin, Peter and
             Koncel-Kedziorski, Rik and Choi, Yejin and Hajishirzi,
             Hannaneh},
  title   = {{MathQA}: Towards Interpretable Math Word Problem Solving
             with Operation-Based Formalisms},
  journal = {arXiv preprint arXiv:1905.13319},
  year    = {2019}
}

@article{austin2021program,
  author  = {Austin, Jacob and Odena, Augustus and Nye, Maxwell and
             Bosma, Maarten and Michalewski, Henryk and
             Dohan, David and Jiang, Ellen and Cai, Carrie and
             Terry, Michael and Le, Quoc and others},
  title   = {Program synthesis with large language models},
  year    = {2021},
  journal = {arXiv preprint arXiv:2108.07732},
}

% Quotation marks are written as LaTeX quote ligatures instead of
% Unicode curly quotes, which classic 8-bit BibTeX does not handle.
% The quoted phrases sit inside brace groups so their capitalisation
% survives sentence-casing styles.
@inproceedings{zhou2019going,
  author    = {Zhou, Ben and Khashabi, Daniel and Ning, Qiang and Roth,
               Dan},
  title     = {{``Going on a vacation''} takes longer than {``Going for a
               walk''}: A Study of Temporal Commonsense Understanding},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in
               Natural Language Processing and the 9th International Joint
               Conference on Natural Language Processing (EMNLP-IJCNLP)},
  pages     = {3363--3369},
  year      = {2019}
}

@inproceedings{roy2015solving,
  author    = {Roy, Subhro and Roth, Dan},
  title     = {Solving General Arithmetic Word Problems},
  year      = {2015},
  pages     = {1743--1752},
  booktitle = {Proceedings of the 2015 Conference on Empirical
               Methods in Natural Language Processing},
}

% NumerSense is brace-protected so sentence-casing styles keep the
% camel-case name intact.
@inproceedings{lin2020birds,
  author    = {Lin, Bill Yuchen and Lee, Seyeon and Khanna, Rahul and
               Ren, Xiang},
  title     = {Birds have four legs?! {NumerSense}: Probing Numerical
               Commonsense Knowledge of Pre-Trained Language Models},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in
               Natural Language Processing (EMNLP)},
  pages     = {6862--6868},
  year      = {2020}
}

% NumGLUE is brace-protected so sentence-casing styles keep the
% mixed-case benchmark name intact.
@inproceedings{mishra2022numglue,
  author    = {Mishra, Swaroop and Mitra, Arindam and Varshney, Neeraj
               and Sachdeva, Bhavdeep and Clark, Peter and Baral, Chitta
               and Kalyan, Ashwin},
  title     = {{NumGLUE}: A Suite of Fundamental yet Challenging
               Mathematical Reasoning Tasks},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association
               for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {3505--3523},
  year      = {2022}
}

@inproceedings{kushman2014learning,
  author    = {Kushman, Nate and Artzi, Yoav and
               Zettlemoyer, Luke and Barzilay, Regina},
  title     = {Learning to automatically solve algebra word
               problems},
  year      = {2014},
  pages     = {271--281},
  booktitle = {Proceedings of the 52nd Annual Meeting of the
               Association for Computational Linguistics (Volume 1:
               Long Papers)},
}

@article{roy2015reasoning,
  author    = {Roy, Subhro and Vieira, Tim and Roth, Dan},
  title     = {Reasoning about quantities in natural language},
  year      = {2015},
  journal   = {Transactions of the Association for Computational
               Linguistics},
  publisher = {MIT Press},
  volume    = {3},
  pages     = {1--13},
}

@article{koncel2015parsing,
  author    = {Koncel-Kedziorski, Rik and Hajishirzi, Hannaneh and
               Sabharwal, Ashish and Etzioni, Oren and
               Ang, Siena Dumas},
  title     = {Parsing algebraic word problems into equations},
  year      = {2015},
  journal   = {Transactions of the Association for Computational
               Linguistics},
  publisher = {MIT Press},
  volume    = {3},
  pages     = {585--597},
}

@inproceedings{patel_etal_2021_nlp,
  author    = {Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
  title     = {Are {NLP} Models really able to Solve Simple Math Word
               Problems?},
  booktitle = {Proceedings of the 2021 Conference of the North American
               Chapter of the Association for Computational Linguistics:
               Human Language Technologies},
  pages     = {2080--2094},
  year      = {2021},
  month     = jun,
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  doi       = {10.18653/v1/2021.naacl-main.168},
  url       = {https://aclanthology.org/2021.naacl-main.168},
  abstract  = {The problem of designing NLP solvers for math word
               problems (MWP) has seen sustained research activity and
               steady gains in the test accuracy. Since existing solvers
               achieve high performance on the benchmark datasets for
               elementary level MWPs containing one-unknown arithmetic
               word problems, such problems are often considered
               {``}solved{''} with the bulk of research attention moving
               to more complex MWPs. In this paper, we restrict our
               attention to English MWPs taught in grades four and lower.
               We provide strong evidence that the existing MWP solvers
               rely on shallow heuristics to achieve high performance on
               the benchmark datasets. To this end, we show that MWP
               solvers that do not have access to the question asked in
               the MWP can still solve a large fraction of MWPs.
               Similarly, models that treat MWPs as bag-of-words can also
               achieve surprisingly high accuracy. Further, we introduce a
               challenge dataset, SVAMP, created by applying carefully
               chosen variations over examples sampled from existing
               datasets. The best accuracy achieved by state-of-the-art
               models is substantially lower on SVAMP, thus showing that
               much remains to be done even for the simplest of the MWPs.}
}