@inproceedings{ribeiro-etal-2020-beyond,
    title = "Beyond Accuracy: Behavioral Testing of {NLP} Models with {C}heck{L}ist",
    author = "Ribeiro, Marco Tulio and
      Wu, Tongshuang and
      Guestrin, Carlos and
      Singh, Sameer",
    editor = "Jurafsky, Dan and
      Chai, Joyce and
      Schluter, Natalie and
      Tetreault, Joel",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.acl-main.442",
    doi = "10.18653/v1/2020.acl-main.442",
    pages = "4902--4912",
    abstract = "Although measuring held-out accuracy has been the primary approach to evaluate generalization, it often overestimates the performance of NLP models, while alternative approaches for evaluating models either focus on individual tasks or on specific behaviors. Inspired by principles of behavioral testing in software engineering, we introduce CheckList, a task-agnostic methodology for testing NLP models. CheckList includes a matrix of general linguistic capabilities and test types that facilitate comprehensive test ideation, as well as a software tool to generate a large and diverse number of test cases quickly. We illustrate the utility of CheckList with tests for three tasks, identifying critical failures in both commercial and state-of-art models. In a user study, a team responsible for a commercial sentiment analysis model found new and actionable bugs in an extensively tested model. In another user study, NLP practitioners with CheckList created twice as many tests, and found almost three times as many bugs as users without it.",
}

@inproceedings{wei-zou-2019-eda,
    title = "{EDA}: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks",
    author = "Wei, Jason and
      Zou, Kai",
    editor = "Inui, Kentaro and
      Jiang, Jing and
      Ng, Vincent and
      Wan, Xiaojun",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D19-1670",
    doi = "10.18653/v1/D19-1670",
    pages = "6382--6388",
    abstract = "We present EDA: easy data augmentation techniques for boosting performance on text classification tasks. EDA consists of four simple but powerful operations: synonym replacement, random insertion, random swap, and random deletion. On five text classification tasks, we show that EDA improves performance for both convolutional and recurrent neural networks. EDA demonstrates particularly strong results for smaller datasets; on average, across five datasets, training with EDA while using only 50{\%} of the available training set achieved the same accuracy as normal training with all available data. We also performed extensive ablation studies and suggest parameters for practical use.",
}

@software{spacy,
    title = {{spaCy}: Industrial-strength Natural Language Processing in Python},
    rights = {{MIT}},
    url = {/~https://github.com/explosion/spaCy/blob/abb0ab109d33d2deaa6155a61fad649a25472f9c/CITATION.cff},
    shorttitle = {{spaCy}},
    abstract = {Industrial-strength Natural Language Processing ({NLP}) in Python},
    author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
    urldate = {2023-01-02},
    date = {2020},
    doi = {10.5281/zenodo.1212303},
    note = {original-date: 2014-07-03T15:15:40Z},
}

@inproceedings{goel-etal-2021-robustness,
    title = "Robustness Gym: Unifying the {NLP} Evaluation Landscape",
    author = "Goel, Karan and
      Rajani, Nazneen Fatema and
      Vig, Jesse and
      Taschdjian, Zachary and
      Bansal, Mohit and
      R{\'e}, Christopher",
    editor = "Sil, Avi and
      Lin, Xi Victoria",
    booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations",
    month = jun,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.naacl-demos.6",
    doi = "10.18653/v1/2021.naacl-demos.6",
    pages = "42--55",
    abstract = "Despite impressive performance on standard benchmarks, natural language processing (NLP) models are often brittle when deployed in real-world systems. In this work, we identify challenges with evaluating NLP systems and propose a solution in the form of Robustness Gym (RG), a simple and extensible evaluation toolkit that unifies 4 standard evaluation paradigms: subpopulations, transformations, evaluation sets, and adversarial attacks. By providing a common platform for evaluation, RG enables practitioners to compare results from disparate evaluation paradigms with a single click, and to easily develop and share novel evaluation methods using a built-in set of abstractions. RG is under active development and we welcome feedback {\&} contributions from the community.",
}
@article{wang2017effectiveness,
    title = {The effectiveness of data augmentation in image classification using deep learning},
    author = {Wang, Jason and Perez, Luis},
    journal = {arXiv preprint arXiv:1712.04621},
    year = {2017},
}
@inproceedings{Park2019SpecAugmentAS,
    title = {SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
    author = {Park, Daniel S. and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin Dogus and Le, Quoc V.},
    booktitle = {Interspeech},
    year = {2019},
    url = {https://api.semanticscholar.org/CorpusID:121321299},
}

@inproceedings{lassen-etal-2023-detecting,
    title = "Detecting intersectionality in {NER} models: A data-driven approach",
    author = "Lassen, Ida Marie S. and
      Almasi, Mina and
      Enevoldsen, Kenneth and
      Kristensen-McLachlan, Ross Deans",
    editor = "Degaetano-Ortlieb, Stefania and
      Kazantseva, Anna and
      Reiter, Nils and
      Szpakowicz, Stan",
    booktitle = "Proceedings of the 7th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.latechclfl-1.13",
    doi = "10.18653/v1/2023.latechclfl-1.13",
    pages = "116--127",
    abstract = "The presence of bias is a pressing concern for both engineers and users of language technology. What is less clear is how exactly bias can be measured, so as to rank models relative to the biases they display. Using an innovative experimental method involving data augmentation, we measure the effect of intersectional biases in Danish models used for Named Entity Recognition (NER). We quantify differences in representational biases, understood as a systematic difference in error or what is called error disparity. Our analysis includes both gender and ethnicity to illustrate the effect of multiple dimensions of bias, as well as experiments which look to move beyond a narrowly binary analysis of gender. We show that all contemporary Danish NER models perform systematically worse on non-binary and minority ethnic names, while not showing significant differences for typically Danish names. Our data augmentation technique can be applied on other languages to test for biases which might be relevant for researchers applying NER models to the study of cultural heritage data.",
}
@inproceedings{nielsen-2023-scandeval,
    title = "{S}cand{E}val: A Benchmark for {S}candinavian Natural Language Processing",
    author = "Nielsen, Dan Saattrup",
    editor = {Alum{\"a}e, Tanel and
      Fishel, Mark},
    booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
    month = may,
    year = "2023",
    address = "T{\'o}rshavn, Faroe Islands",
    publisher = "University of Tartu Library",
    url = "https://aclanthology.org/2023.nodalida-1.20",
    pages = "185--201",
    abstract = "This paper introduces a Scandinavian benchmarking platform, ScandEval, which can benchmark any pretrained model on four different tasks in the Scandinavian languages. The datasets used in two of the tasks, linguistic acceptability and question answering, are new. We develop and release a Python package and command-line interface, scandeval, which can benchmark any model that has been uploaded to the Hugging Face Hub, with reproducible results. Using this package, we benchmark more than 80 Scandinavian or multilingual models and present the results of these in an interactive online leaderboard, as well as provide an analysis of the results. The analysis shows that there is substantial cross-lingual transfer among the Mainland Scandinavian languages (Danish, Swedish and Norwegian), with limited cross-lingual transfer between the group of Mainland Scandinavian languages and the group of Insular Scandinavian languages (Icelandic and Faroese). The benchmarking results also show that the investment in language technology in Norway and Sweden has led to language models that outperform massively multilingual models such as XLM-RoBERTa and mDeBERTaV3. We release the source code for both the package and leaderboard.",
}
@software{pandya_hetpandyatextgenie_2023,
    title = {hetpandya/textgenie},
    copyright = {Apache-2.0},
    url = {/~https://github.com/hetpandya/textgenie},
    abstract = {A python package to augment text data using NLP.},
    urldate = {2023-12-08},
    author = {Pandya, Het},
    month = nov,
    year = {2023},
    note = {original-date: 2021-06-21T14:14:38Z},
}

@inproceedings{marivate2020improving,
    title = {Improving short text classification through global augmentation methods},
    author = {Marivate, Vukosi and Sefara, Tshephisho},
    booktitle = {International Cross-Domain Conference for Machine Learning and Knowledge Extraction},
    pages = {385--399},
    year = {2020},
    organization = {Springer},
}

@inproceedings{morris2020textattack,
    title = {TextAttack: A Framework for Adversarial Attacks, Data Augmentation, and Adversarial Training in NLP},
    author = {Morris, John and Lifland, Eli and Yoo, Jin Yong and Grigsby, Jake and Jin, Di and Qi, Yanjun},
    booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
    pages = {119--126},
    year = {2020},
}
@book{bird2009natural,
    title = {Natural Language Processing with Python},
    author = {Bird, Steven and Klein, Ewan and Loper, Edward},
    publisher = {O'Reilly Media, Inc.},
    address = {Sebastopol, CA},
    year = {2009},
}
@inproceedings{miller-1994-wordnet,
    title = "{W}ord{N}et: A Lexical Database for {E}nglish",
    author = "Miller, George A.",
    booktitle = "{H}uman {L}anguage {T}echnology: Proceedings of a Workshop held at {P}lainsboro, {N}ew {J}ersey, {M}arch 8-11, 1994",
    year = "1994",
    url = "https://aclanthology.org/H94-1111",
}
@inproceedings{pennington-etal-2014-glove,
    title = "{G}lo{V}e: Global Vectors for Word Representation",
    author = "Pennington, Jeffrey and
      Socher, Richard and
      Manning, Christopher",
    editor = "Moschitti, Alessandro and
      Pang, Bo and
      Daelemans, Walter",
    booktitle = "Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})",
    month = oct,
    year = "2014",
    address = "Doha, Qatar",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D14-1162",
    doi = "10.3115/v1/D14-1162",
    pages = "1532--1543",
}
@software{sloth_dadebiasgenda-lens_2023,
    title = {{DaDebias}/genda-lens},
    author = {Sloth, Thea Rolskov and Rybner, Astrid Sletten},
    copyright = {MIT},
    url = {/~https://github.com/DaDebias/genda-lens},
    abstract = {GenDa Lens: Python package for quantifying gender bias in Danish language models.},
    urldate = {2023-12-08},
    publisher = {DaDebias},
    month = jun,
    year = {2023},
    note = {original-date: 2023-05-16T07:25:28Z},
}
@article{Enevoldsen_DaCy_A_Unified_2021,
    author = {Enevoldsen, Kenneth and Hansen, Lasse and Nielbo, Kristoffer L.},
    title = {{DaCy: A Unified Framework for Danish NLP}},
    url = {https://ceur-ws.org/Vol-2989/short_paper24.pdf},
    year = {2021},
}