nlpwdl-bibliography.bib

@inproceedings{Wang.et.al.2019.NeurIPS,
	author    = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
	title     = {{SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}},
	booktitle = {Proceedings of the 33rd International Conference on Neural Information Processing Systems},
	pages     = {3266--3280},
	publisher = {Curran Associates, Inc.},
	address   = {Vancouver, Canada},
	year      = {2019},
}


@inproceedings{Levesque.et.al.2012,
	author    = {Levesque, Hector J. and Davis, Ernest and Morgenstern, Leora},
	title     = {{The Winograd Schema Challenge}},
	booktitle = {Proceedings of the Thirteenth International Conference on Principles of Knowledge Representation and Reasoning},
	pages     = {552--561},
	publisher = {Association for the Advancement of Artificial Intelligence},
	address   = {Rome, Italy},
	year      = {2012},
}

@article{Dagan.et.al.2009.NLE,
	author  = {Dagan, Ido and Dolan, Bill and Magnini, Bernardo and Roth, Dan},
	title   = {{Recognizing textual entailment: Rational, evaluation and approaches}},
	journal = {Natural Language Engineering},
	volume  = {15},
	number  = {4},
	pages   = {1--27},
	year    = {2009},
	doi     = {10.1017/S1351324909990209},
}

@inproceedings{Maas.et.al.2011,
	author    = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
	title     = {{Learning Word Vectors for Sentiment Analysis}},
	booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
	pages     = {142--150},
	publisher = {Association for Computational Linguistics},
	address   = {Portland, Oregon},
	year      = {2011},
	url       = {https://aclanthology.org/P11-1015},
}


@inproceedings{Bowman.et.al.2015,
	author    = {Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher and Manning, Christopher D.},
	title     = {{A large annotated corpus for learning natural language inference}},
	booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
	pages     = {632--642},
	publisher = {Association for Computational Linguistics},
	address   = {Lisbon, Portugal},
	year      = {2015},
	doi       = {10.18653/v1/D15-1075},
	url       = {http://aclweb.org/anthology/D15-1075},
}

@article{Habernal.et.al.2023.AILaw,
	author = {\textbf{Habernal}, \textbf{Ivan} and Faber, Daniel and Recchia, Nicola and Bretthauer, Sebastian and Gurevych, Iryna and Spiecker genannt Döhmann, Indra and Burchard, Christoph},
	title = {{Mining Legal Arguments in Court Decisions}},
	journal = {Artificial Intelligence and Law},
	year = {2023},
	doi = {10.1007/s10506-023-09361-y}
}

@article{Artstein.Poesio.2008.CoLi,
	author  = {Artstein, Ron and Poesio, Massimo},
	title   = {{Inter-Coder Agreement for Computational Linguistics}},
	journal = {Computational Linguistics},
	volume  = {34},
	number  = {4},
	pages   = {555--596},
	year    = {2008},
	doi     = {10.1162/coli.07-034-R2},
}


@inproceedings{TjongKimSang.DeMeulder.2003,
	author    = {{Tjong Kim Sang}, Erik F. and {De Meulder}, Fien},
	title     = {{Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition}},
	booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003},
	pages     = {142--147},
	year      = {2003},
	url       = {https://aclanthology.org/W03-0419},
}


@inproceedings{Clark.et.al.2019.NAACL,
	author    = {Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
	title     = {{BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions}},
	booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
	pages     = {2924--2936},
	publisher = {Association for Computational Linguistics},
	address   = {Minneapolis, Minnesota},
	year      = {2019},
	doi       = {10.18653/v1/N19-1300},
}


@inproceedings{Khashabi.et.al.2018.NAACL,
	author    = {Khashabi, Daniel and Chaturvedi, Snigdha and Roth, Michael and Upadhyay, Shyam and Roth, Dan},
	title     = {{Looking Beyond the Surface: A Challenge Set for Reading Comprehension over Multiple Sentences}},
	booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
	pages     = {252--262},
	publisher = {Association for Computational Linguistics},
	address   = {New Orleans, LA},
	year      = {2018},
	doi       = {10.18653/v1/N18-1023},
}


@inproceedings{Bojar.et.al.2018.WMT,
	author    = {Bojar, Ondřej and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Koehn, Philipp and Monz, Christof},
	title     = {{Findings of the 2018 Conference on Machine Translation (WMT18)}},
	booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
	volume    = {2},
	pages     = {272--303},
	publisher = {Association for Computational Linguistics},
	address   = {Brussels, Belgium},
	year      = {2018},
	doi       = {10.18653/v1/W18-6401},
}


@book{Koehn.2020,
	author    = {Koehn, Philipp},
	title     = {Neural Machine Translation},
	publisher = {Cambridge University Press},
	year      = 2020,
	note      = {(not freely available)},
}

@inproceedings{Hermann.et.al.2015.NeurIPS,
	author    = {Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil},
	title     = {{Teaching Machines to Read and Comprehend}},
	booktitle = {Proceedings of NeurIPS},
	pages     = {1--9},
	publisher = {Curran Associates, Inc.},
	year      = {2015},
}


@article{Raffel.et.al.2020.JMLR,
	author   = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
	title    = {{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}},
	journal  = {Journal of Machine Learning Research},
	volume   = {21},
	number   = {140},
	pages    = {1--67},
	year     = {2020},
	keywords = {attention-,multi-task learning,natural language processing,transfer learning},
}

@book{Japkowicz.Shah.2011,
	author    = {Japkowicz, Nathalie and Shah, Mohak},
	title     = {{Evaluating Learning Algorithms: A Classification Perspective}},
	publisher = {Cambridge University Press},
	year      = 2011,
	note      = {(not freely available)}
}

@inproceedings{Papineni.et.al.2002.ACL,
	author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
	title     = {{BLEU: a Method for Automatic Evaluation of Machine Translation}},
	booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
	pages     = {311--318},
	publisher = {Association for Computational Linguistics},
	address   = {Philadelphia, PA},
	year      = {2002},
	doi       = {10.3115/1073083.1073135},
}

@inproceedings{Lin.2004,
	author    = {Lin, Chin-Yew},
	title     = {{ROUGE}: A Package for Automatic Evaluation of Summaries},
	booktitle = {Text Summarization Branches Out},
	pages     = {74--81},
	publisher = {Association for Computational Linguistics},
	address   = {Barcelona, Spain},
	year      = {2004},
	url       = {https://aclanthology.org/W04-1013}
}

@inproceedings{Plank.2022.EMNLP,
	author    = {Plank, Barbara},
	title     = {{The “Problem” of Human Label Variation: On Ground Truth in Data, Modeling and Evaluation}},
	booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
	pages     = {10671--10682},
	publisher = {Association for Computational Linguistics},
	address   = {Abu Dhabi, United Arab Emirates},
	year      = {2022},
	url       = {https://aclanthology.org/2022.emnlp-main.731},
}


@inproceedings{Geva.et.al.2019.EMNLP,
	author    = {Geva, Mor and Goldberg, Yoav and Berant, Jonathan},
	title     = {{Are We Modeling the Task or the Annotator? An Investigation of Annotator Bias in Natural Language Understanding Datasets}},
	booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
	pages     = {1161--1166},
	publisher = {Association for Computational Linguistics},
	address   = {Hong Kong, China},
	year      = {2019},
	doi       = {10.18653/v1/D19-1107},
}


@article{Sai.et.al.2023.CSUR,
	author  = {Sai, Ananya B. and Mohankumar, Akash Kumar and Khapra, Mitesh M.},
	title   = {{A Survey of Evaluation Metrics Used for NLG Systems}},
	journal = {ACM Computing Surveys},
	volume  = {55},
	number  = {2},
	pages   = {1--39},
	year    = {2023},
	doi     = {10.1145/3485766},
}


@inproceedings{Habernal.et.al.2018.NAACL.ARCT,
	author    = {\textbf{Habernal}, \textbf{Ivan} and Wachsmuth, Henning and Gurevych, Iryna and Stein, Benno},
	title     = {{The Argument Reasoning Comprehension Task: Identification and Reconstruction of Implicit Warrants}},
	booktitle = {Proceedings of NAACL},
	pages     = {1930--1940},
	address   = {New Orleans, LA},
	year      = {2018},
	url       = {http://aclweb.org/anthology/N18-1175},
}


@inproceedings{Niven.Kao.2019.ACL,
	author    = {Niven, Timothy and Kao, Hung-Yu},
	title     = {{Probing Neural Network Comprehension of Natural Language Arguments}},
	booktitle = {Proceedings of ACL},
	pages     = {4658--4664},
	address   = {Florence, Italy},
	year      = {2019},
	url       = {https://www.aclweb.org/anthology/P19-1459},
}


@article{Forman.Scholz.2009.SIGKDD,
	author          = {Forman, George and Scholz, Martin},
	title           = {{Apples-to-Apples in Cross-Validation Studies: Pitfalls in Classifier Performance Measurement}},
	journal         = {ACM SIGKDD Explorations Newsletter},
	volume          = {12},
	number          = {1},
	pages           = {49--57},
	year            = {2010},
	annote          = {fundamental article for reporting f-measure},
	mendeley-groups = {evaluation},
}


@article{Sokolova.Lapalme.2009,
	author    = {Sokolova, Marina and Lapalme, Guy},
	title     = {{A systematic analysis of performance measures for classification tasks}},
	journal   = {Information Processing and Management},
	volume    = {45},
	number    = {4},
	pages     = {427--437},
	publisher = {Elsevier Ltd},
	year      = {2009},
	doi       = {10.1016/j.ipm.2009.03.002},
}


@inproceedings{caglayan-etal-2020-curious,
	author    = {Caglayan, Ozan and Madhyastha, Pranava and Specia, Lucia},
	title     = {Curious Case of Language Generation Evaluation Metrics: A Cautionary Tale},
	booktitle = {Proceedings of COLING},
	pages     = {2322--2328},
	year      = {2020},
	doi       = {10.18653/v1/2020.coling-main.210}
}


@inproceedings{Rajpurkar.et.al.2018.ACL,
	author    = {Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
	title     = {{Know What You Don't Know: Unanswerable Questions for SQuAD}},
	booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
	pages     = {784--789},
	publisher = {Association for Computational Linguistics},
	address   = {Melbourne, Australia},
	year      = {2018},
	doi       = {10.18653/v1/P18-2124},
}


@inproceedings{Zhang.et.al.2018.ACL,
	author    = {Zhang, Saizheng and Dinan, Emily and Urbanek, Jack and Szlam, Arthur and Kiela, Douwe and Weston, Jason},
	title     = {{Personalizing Dialogue Agents: I have a dog, do you have pets too?}},
	booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
	pages     = {2204--2213},
	publisher = {Association for Computational Linguistics},
	address   = {Melbourne, Australia},
	year      = {2018},
	doi       = {10.18653/v1/P18-1205},
}


@book{Deisenroth.et.al.2021.book,
	author    = {Deisenroth, Marc Peter and Faisal, Aldo and Ong, Cheng Soon},
	title     = {Mathematics for Machine Learning},
	publisher = {Cambridge University Press},
	year      = {2021},
	url       = {https://mml-book.com},
}

@book{Koller.Friedman.2009.book,
	author    = {Koller, Daphne and Friedman, Nir},
	title     = {Probabilistic Graphical Models: Principles and Techniques},
	publisher = {MIT Press},
	year      = 2009
}

@book{Goodfellow.et.al.2016.book,
	author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
	title     = {Deep Learning},
	publisher = {MIT Press},
	year      = {2016},
	url       = {https://www.deeplearningbook.org},
}

@inproceedings{Iacobacci.et.al.2015.ACL,
	author    = {Iacobacci, Ignacio and Pilehvar, Mohammad Taher and Navigli, Roberto},
	title     = {{SensEmbed: Learning Sense Embeddings for Word and Relational Similarity}},
	booktitle = {Proceedings of ACL},
	pages     = {95--105},
	publisher = {Association for Computational Linguistics},
	address   = {Beijing, China},
	year      = {2015},
	doi       = {10.3115/v1/P15-1010},
}


@inproceedings{Upadhyay.et.al.2016.ACL,
	author    = {Upadhyay, Shyam and Faruqui, Manaal and Dyer, Chris and Roth, Dan},
	title     = {{Cross-lingual Models of Word Embeddings: An Empirical Comparison}},
	booktitle = {Proceedings of ACL},
	pages     = {1661--1670},
	address   = {Berlin, Germany},
	year      = {2016},
	doi       = {10.18653/v1/P16-1157},
}

@inproceedings{Glavas.et.al.2019.ACL,
	author    = {Glava{\v{s}}, Goran and Litschko, Robert and Ruder, Sebastian and Vuli{\'{c}}, Ivan},
	title     = {{How to (Properly) Evaluate Cross-Lingual Word Embeddings: On Strong Baselines, Comparative Analyses, and Some Misconceptions}},
	booktitle = {Proceedings of ACL},
	pages     = {710--721},
	address   = {Florence, Italy},
	year      = {2019},
	doi       = {10.18653/v1/P19-1070},
}


@inproceedings{Vulic.Moens.2015.ACL,
	author    = {Vuli{\'{c}}, Ivan and Moens, Marie-Francine},
	title     = {{Bilingual Word Embeddings from Non-Parallel Document-Aligned Data Applied to Bilingual Lexicon Induction}},
	booktitle = {Proceedings of ACL (Volume 2: Short Papers)},
	pages     = {719--725},
	address   = {Beijing, China},
	year      = {2015},
	doi       = {10.3115/v1/P15-2118},
}


@inproceedings{Artetxe.et.al.2017.ACL,
	author    = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
	title     = {{Learning bilingual word embeddings with (almost) no bilingual data}},
	booktitle = {Proceedings of ACL},
	pages     = {451--462},
	address   = {Vancouver, Canada},
	year      = {2017},
	doi       = {10.18653/v1/P17-1042},
}

@inproceedings{Ling.et.al.2015.NAACL,
	author    = {Ling, Wang and Dyer, Chris and Black, Alan W and Trancoso, Isabel},
	title     = {{Two/Too Simple Adaptations of Word2Vec for Syntax Problems}},
	booktitle = {Proceedings of NAACL},
	pages     = {1299--1304},
	address   = {Denver, Colorado},
	year      = {2015},
	doi       = {10.3115/v1/N15-1142},
}


@inproceedings{Levy.Goldberg.2014.ACL,
	author    = {Levy, Omer and Goldberg, Yoav},
	title     = {{Dependency-Based Word Embeddings}},
	booktitle = {Proceedings of ACL},
	pages     = {302--308},
	address   = {Baltimore, MD, USA},
	year      = {2014},
	doi       = {10.3115/v1/P14-2050},
}

@article{Bojanowski.et.al.2017.TACL,
	author  = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
	title   = {{Enriching Word Vectors with Subword Information}},
	journal = {Transactions of the ACL},
	volume  = {5},
	pages   = {135--146},
	year    = {2017},
	doi     = {10.1162/tacl_a_00051},
}


@inproceedings{Madasu.AnveshRao.2019.EMNLP,
	author    = {Madasu, Avinash and {Anvesh Rao}, Vijjini},
	title     = {{Sequential Learning of Convolutional Features for Effective Text Classification}},
	booktitle = {Proceedings of EMNLP-IJCNLP},
	pages     = {5657--5666},
	publisher = {Association for Computational Linguistics},
	address   = {Hong Kong, China},
	year      = {2019},
	doi       = {10.18653/v1/D19-1567},
}


@inproceedings{Kim.2014.EMNLP,
	author    = {Kim, Yoon},
	title     = {{Convolutional Neural Networks for Sentence Classification}},
	booktitle = {Proceedings of EMNLP},
	pages     = {1746--1751},
	publisher = {Association for Computational Linguistics},
	address   = {Doha, Qatar},
	year      = {2014},
	doi       = {10.3115/v1/D14-1181},
}

@inproceedings{Devlin.et.al.2019.NAACL,
	author    = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
	title     = {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}},
	booktitle = {Proceedings of NAACL},
	pages     = {4171--4186},
	publisher = {Association for Computational Linguistics},
	address   = {Minneapolis, Minnesota},
	year      = {2019},
	doi       = {10.18653/v1/N19-1423},
}

@inproceedings{Gururangan.et.al.2018.NAACL.short,
	author    = {Gururangan, Suchin and Swayamdipta, Swabha and Levy, Omer and Schwartz, Roy and Bowman, Samuel and Smith, Noah A.},
	title     = {{Annotation Artifacts in Natural Language Inference Data}},
	booktitle = {Proceedings of NAACL},
	pages     = {107--112},
	publisher = {Association for Computational Linguistics},
	address   = {New Orleans, LA},
	year      = {2018},
	doi       = {10.18653/v1/N18-2017},
}

@article{Goldberg.2016,
	author  = {Goldberg, Yoav},
	title   = {{A Primer on Neural Network Models for Natural Language Processing}},
	journal = {Journal of Artificial Intelligence Research},
	volume  = {57},
	pages   = {345--420},
	year    = {2016},
	doi     = {10.1613/jair.4992},
}


@inproceedings{Gehring.et.al.2017a.ICML,
	author    = {Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N.},
	editor    = {Precup, Doina and Teh, Yee Whye},
	title     = {{Convolutional Sequence to Sequence Learning}},
	booktitle = {Proceedings of the 34th International Conference on Machine Learning},
	pages     = {1243--1252},
	publisher = {PMLR},
	address   = {Sydney, Australia},
	year      = {2017},
}


@inproceedings{Krishnan.Manning.2006,
	author    = {Krishnan, Vijay and Manning, Christopher D.},
	title     = {{An Effective Two-Stage Model for Exploiting Non-Local Dependencies in Named Entity Recognition}},
	booktitle = {Proceedings of ACL},
	pages     = {1121--1128},
	publisher = {Association for Computational Linguistics},
	address   = {Sydney, Australia},
	year      = {2006},
	doi       = {10.3115/1220175.1220316},
}


@inproceedings{artemova-etal-2021-teaching,
	author    = {Artemova, Ekaterina and Apishev, Murat and Kirianov, Denis and Sarkisyan, Veronica and Aksenov, Sergey and Serikov, Oleg},
	title     = {Teaching a Massive Open Online Course on Natural Language Processing},
	booktitle = {Proceedings of the Fifth Workshop on Teaching NLP},
	pages     = {13--27},
	publisher = {Association for Computational Linguistics},
	address   = {Online},
	year      = {2021},
	url       = {https://www.aclweb.org/anthology/2021.teachingnlp-1.2}
}


@inproceedings{Vaswani.et.al.2017,
	author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
	title     = {{Attention Is All You Need}},
	booktitle = {Advances in Neural Information Processing Systems 30},
	pages     = {5998--6008},
	publisher = {Curran Associates, Inc.},
	address   = {Long Beach, CA, USA},
	year      = {2017},
}

@article{Koehn.2017,
	author  = {Koehn, Philipp},
	title   = {Neural Machine Translation},
	journal = {arXiv preprint},
	date    = {2017},
	url     = {http://arxiv.org/abs/1709.07809},
}


@inproceedings{Schuster.Nakajima.2012,
	author    = {Schuster, Mike and Nakajima, Kaisuke},
	title     = {{Japanese and Korean voice search}},
	booktitle = {2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
	pages     = {5149--5152},
	publisher = {IEEE},
	address   = {Kyoto, Japan},
	year      = {2012},
	doi       = {10.1109/ICASSP.2012.6289079},
}

@article{Wu.et.al.2016.GoogleMT,
	author  = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V. and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and Klingner, Jeff and Shah, Apurva and Johnson, Melvin and Liu, Xiaobing and Kaiser, {\L}ukasz and Gouws, Stephan and Kato, Yoshikiyo and Kudo, Taku and Kazawa, Hideto and Stevens, Keith and Kurian, George and Patil, Nishant and Wang, Wei and Young, Cliff and Smith, Jason and Riesa, Jason and Rudnick, Alex and Vinyals, Oriol and Corrado, Greg and Hughes, Macduff and Dean, Jeffrey},
	title   = {{Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation}},
	journal = {arXiv preprint},
	pages   = {1--23},
	year    = {2016},
	url     = {http://arxiv.org/abs/1609.08144},
}


@inproceedings{Sennrich.et.al.2016.ACL,
	author    = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
	title     = {{Neural Machine Translation of Rare Words with Subword Units}},
	booktitle = {Proceedings of ACL},
	pages     = {1715--1725},
	publisher = {Association for Computational Linguistics},
	address   = {Berlin, Germany},
	year      = {2016},
	doi       = {10.18653/v1/P16-1162},
}

@article{Caruana.1997,
	author  = {Caruana, Rich},
	title   = {{Multi-task Learning}},
	journal = {Machine Learning},
	volume  = {28},
	number  = {1},
	pages   = {41--75},
	year    = {1997},
	doi     = {10.1023/A:1007379606734},
}

@inproceedings{Sogaard.Goldberg.2016,
	author    = {S{\o}gaard, Anders and Goldberg, Yoav},
	title     = {{Deep multi-task learning with low level tasks supervised at lower layers}},
	booktitle = {Proceedings of ACL},
	pages     = {231--235},
	publisher = {Association for Computational Linguistics},
	address   = {Berlin, Germany},
	year      = {2016},
	doi       = {10.18653/v1/P16-2038},
}

@inproceedings{Conneau.et.al.2017.EMNLP,
	author    = {Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and Barrault, Lo{\"{i}}c and Bordes, Antoine},
	title     = {{Supervised Learning of Universal Sentence Representations from Natural Language Inference Data}},
	booktitle = {Proceedings of EMNLP},
	pages     = {670--680},
	address   = {Copenhagen, Denmark},
	year      = {2017},
}

@article{Rogers.et.al.2020.BERT,
	author  = {Rogers, Anna and Kovaleva, Olga and Rumshisky, Anna},
	title   = {{A Primer in BERTology: What We Know About How BERT Works}},
	journal = {Transactions of the Association for Computational Linguistics},
	volume  = {8},
	pages   = {842--866},
	year    = {2020},
	doi     = {10.1162/tacl_a_00349},
}


@inproceedings{Kingma.Ba.2015,
	author    = {Kingma, Diederik P. and Ba, Jimmy Lei},
	editor    = {Bengio, Yoshua and LeCun, Yann},
	title     = {{Adam: A Method for Stochastic Optimization}},
	booktitle = {3rd International Conference on Learning Representations, ICLR 2015},
	pages     = {1--15},
	address   = {San Diego, CA, USA},
	year      = {2015},
	url       = {https://arxiv.org/abs/1412.6980}
}

@article{Bengio.et.al.2003.JMLR,
	author  = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
	title   = {{A Neural Probabilistic Language Model}},
	journal = {Journal of Machine Learning Research},
	volume  = {3},
	pages   = {1137--1155},
	year    = {2003},
	url     = {https://research.jmlr.org/papers/v3/bengio03a.html}
}


@book{Kun.2020,
	author  = {Kun, Jeremy},
	title   = {A Programmer’s Introduction to Mathematics},
	edition = {2},
	year    = 2020,
	url     = {https://pimbook.org}
}

@book{Goldberg.2017,
	author    = {Goldberg, Yoav},
	title     = {Neural Network Methods for Natural Language Processing},
	publisher = {Morgan \& Claypool},
	year      = 2017
}


@inproceedings{Kudo.Richardson.2018.EMNLP,
	author    = {Kudo, Taku and Richardson, John},
	title     = {{S}entence{P}iece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing},
	booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
	pages     = {66--71},
	publisher = {Association for Computational Linguistics},
	address   = {Brussels, Belgium},
	year      = {2018},
	doi       = {10.18653/v1/D18-2012}
}

@article{kudo2018subword,
	author        = {Kudo, Taku},
	title         = {Subword regularization: Improving neural network translation models with multiple subword candidates},
	journal       = {arXiv preprint},
	year          = {2018},
	eprint        = {1804.10959},
	archiveprefix = {arXiv},
	url           = {http://arxiv.org/abs/1804.10959}
}

@inproceedings{bahdanau2014neural,
	author = {Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio},
	editor = {Yoshua Bengio and Yann LeCun},
	title = {{Neural Machine Translation by Jointly Learning to Align and Translate}},
	booktitle = {3rd International Conference on Learning Representations (ICLR)},
	address = {San Diego, CA, USA},
	year = 2015
}

@book{Murphy.2012,
	author    = {Murphy, Kevin},
	title     = {Machine Learning: a Probabilistic Perspective},
	publisher = {MIT Press},
	year      = {2012},
}

@inproceedings{Mikolov.et.al.2013.ICLR,
	author    = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
	editor    = {Bengio, Yoshua and LeCun, Yann},
	title     = {{Efficient estimation of word representations in vector space}},
	booktitle = {1st International Conference on Learning Representations ICLR, Workshop Track Proceedings},
	pages     = {1--12},
	address   = {Scottsdale, Arizona, USA},
	year      = {2013},
}


@article{Caliskan.et.al.2017.science,
	author  = {Caliskan, Aylin and Bryson, Joanna J. and Narayanan, Arvind},
	title   = {Semantics derived automatically from language corpora contain human-like biases},
	journal = {Science},
	volume  = {356},
	number  = {6334},
	pages   = {183--186},
	year    = {2017},
	month   = apr,
	doi     = {10.1126/science.aal4230},
}

@inproceedings{Kuzi.et.al.2016.CIKM,
	author    = {Kuzi, Saar and Shtok, Anna and Kurland, Oren},
	title     = {{Query Expansion Using Word Embeddings}},
	booktitle = {Proceedings of the 25th ACM International on Conference on Information and Knowledge Management},
	pages     = {1929--1932},
	publisher = {ACM},
	address   = {Indianapolis, IN},
	year      = {2016},
	doi       = {10.1145/2983323.2983876},
}

@misc{Phuong.Hutter.2022,
	author = {Mary Phuong and Marcus Hutter},
	title = {Formal Algorithms for Transformers},
	year = {2022},
	archiveprefix = {arXiv},
	eprint = {2207.09238}
}

@inproceedings{izsak-etal-2021-train,
	author = {Izsak, Peter and Berchansky, Moshe and Levy, Omer},
	title = {How to Train {BERT} with an Academic Budget},
	booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
	pages = {10644--10652},
	publisher = {Association for Computational Linguistics},
	address = {Online and Punta Cana, Dominican Republic},
	year = {2021},
	doi = {10.18653/v1/2021.emnlp-main.831}
}

@misc{Hendrycks.Gimpel.2016.arXiv,
	author        = {Hendrycks, Dan and Gimpel, Kevin},
	title         = {{Gaussian Error Linear Units (GELUs)}},
	journal       = {arXiv preprint},
	pages         = {1--10},
	year          = {2016},
	archivePrefix = {arXiv},
	arxivId       = {1606.08415},
	eprint        = {1606.08415},
	url           = {http://arxiv.org/abs/1606.08415},
}

@report{Radford.et.al.2018.GPT1.report,
	author = {Alec Radford and Karthik Narasimhan and Tim Salimans and Ilya Sutskever},
	title = {{Improving Language Understanding by Generative Pre-Training}},
	type = {Technical report},
	institution = {OpenAI},
	year = {2018},
	url = {https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf}
}

@report{Radford.et.al.2019.GPT2.report,
	author = {Alec Radford and Jeffrey Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever},
	title = {{Language Models are Unsupervised Multitask Learners}},
	type = {Technical report},
	institution = {OpenAI},
	year = {2019},
	url = {https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}
}

@inproceedings{Liu.et.al.2018.ICLR,
	author = {Peter J Liu and Mohammad Saleh and Etienne Pot and Ben Goodrich and Ryan Sepassi and Łukasz Kaiser and Noam Shazeer},
	title = {Generating Wikipedia by Summarizing Long Sequences},
	booktitle = {Proceedings of the 6th International Conference on Learning Representations},
	address = {Vancouver, BC, Canada},
	year = {2018},
	url = {https://openreview.net/forum?id=Hyg0vbWC-}
}

@article{Brown.et.al.2020.GPT3,
	author  = {Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
	title   = {Language Models are Few-Shot Learners},
	journal = {arXiv preprint},
	year    = 2020,
	url     = {http://arxiv.org/abs/2005.14165}
}

@article{Touvron.et.al.2023.llama2,
	author = {Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and Bikel, Dan and Blecher, Lukas and Ferrer, Cristian Canton and Chen, Moya and Cucurull, Guillem and Esiobu, David and Fernandes, Jude and Fu, Jeremy and Fu, Wenyin and Fuller, Brian and Gao, Cynthia and Goswami, Vedanuj and Goyal, Naman and Hartshorn, Anthony and Hosseini, Saghar and Hou, Rui and Inan, Hakan and Kardas, Marcin and Kerkez, Viktor and Khabsa, Madian and Kloumann, Isabel and Korenev, Artem and Koura, Punit Singh and Lachaux, Marie-Anne and Lavril, Thibaut and Lee, Jenya and Liskovich, Diana and Lu, Yinghai and Mao, Yuning and Martinet, Xavier and Mihaylov, Todor and Mishra, Pushkar and Molybog, Igor and Nie, Yixin and Poulton, Andrew and Reizenstein, Jeremy and Rungta, Rashi and Saladi, Kalyan and Schelten, Alan and Silva, Ruan and Smith, Eric Michael and Subramanian, Ranjan and Tan, Xiaoqing Ellen and Tang, Binh and Taylor, Ross and Williams, Adina and Kuan, Jian Xiang and Xu, Puxin and Yan, Zheng and Zarov, Iliyan and Zhang, Yuchen and Fan, Angela and Kambadur, Melanie and Narang, Sharan and Rodriguez, Aurelien and Stojnic, Robert and Edunov, Sergey and Scialom, Thomas},
	title = {{Llama 2: Open Foundation and Fine-Tuned Chat Models}},
	journal = {arXiv},
	year = {2023},
	url = {http://arxiv.org/abs/2307.09288}
}

@inproceedings{Ouyang.et.al.2022.NeurIPS,
	author    = {Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll L and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and Schulman, John and Hilton, Jacob and Kelton, Fraser and Miller, Luke and Simens, Maddie and Askell, Amanda and Welinder, Peter and Christiano, Paul and Leike, Jan and Lowe, Ryan},
	title     = {Training language models to follow instructions with human feedback},
	booktitle = {Advances in Neural Information Processing Systems},
	volume    = 35,
	pages     = {27730--27744},
	publisher = {Curran Associates, Inc.},
	year      = 2022
}

@inproceedings{Min.et.al.2022.EMNLP,
	author = {Min, Sewon and Lyu, Xinxi and Holtzman, Ari and Artetxe, Mikel and Lewis, Mike and Hajishirzi, Hannaneh and Zettlemoyer, Luke},
	editor = {Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue},
	title = {Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?},
	booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
	pages = {11048--11064},
	publisher = {Association for Computational Linguistics},
	address = {Abu Dhabi, United Arab Emirates},
	year = {2022},
	month = dec,
	doi = {10.18653/v1/2022.emnlp-main.759},
	url = {https://aclanthology.org/2022.emnlp-main.759}
}

@inproceedings{Dai.et.al.2023.ACLFindings,
	author = {Dai, Damai and Sun, Yutao and Dong, Li and Hao, Yaru and Ma, Shuming and Sui, Zhifang and Wei, Furu},
	editor = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
	title = {Why Can {GPT} Learn In-Context? Language Models Secretly Perform Gradient Descent as Meta-Optimizers},
	booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
	pages = {4005--4019},
	publisher = {Association for Computational Linguistics},
	address = {Toronto, Canada},
	year = {2023},
	doi = {10.18653/v1/2023.findings-acl.247}
}


@inproceedings{Reimers.Gurevych.2019.EMNLP,
	author    = {Reimers, Nils and Gurevych, Iryna},
	title     = {{Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks}},
	booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
	pages     = {3980--3990},
	publisher = {Association for Computational Linguistics},
	location  = {Hong Kong, China},
	date      = {2019},
	doi       = {10.18653/v1/D19-1410}
}