From 73c814c4edd5c9d924152e9db40117f94eeeb452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Tue, 14 Mar 2023 10:54:33 +0100 Subject: [PATCH] Update version specifications of spacy and scispacy + revise documentation --- flair/splitter.py | 2 +- flair/tokenization.py | 10 +++++----- resources/docs/HUNFLAIR.md | 18 +++++++++++------- resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md | 4 ++-- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/flair/splitter.py b/flair/splitter.py index 981ff65c5b..f75ece3112 100644 --- a/flair/splitter.py +++ b/flair/splitter.py @@ -107,7 +107,7 @@ def __init__(self, model: Union[Any, str], tokenizer: Tokenizer = None): from spacy.language import Language except ImportError: raise ImportError( - "Please install spacy v2.3.2 or higher before using the SpacySentenceSplitter, " + "Please install spacy v3.4.4 or higher before using the SpacySentenceSplitter, " "otherwise you can use SegtokSentenceSplitter as alternative implementation." ) diff --git a/flair/tokenization.py b/flair/tokenization.py index 543286d8dd..df0c3b3dd6 100644 --- a/flair/tokenization.py +++ b/flair/tokenization.py @@ -42,7 +42,7 @@ def __init__(self, model): from spacy.language import Language except ImportError: raise ImportError( - "Please install Spacy v2.0 or better before using the Spacy tokenizer, " + "Please install Spacy v3.4.4 or better before using the Spacy tokenizer, " "otherwise you can use SegtokTokenizer as advanced tokenizer." 
) @@ -219,12 +219,12 @@ def __init__(self): from spacy.lang import char_classes except ImportError: raise ImportError( - " Please install scispacy version 0.2.5 (recommended) or higher before using the SciSpacy tokenizer, " + " Please install scispacy version 0.5.1 (recommended) or higher before using the SciSpacy tokenizer, " "otherwise you can use SegtokTokenizer as alternative implementation.\n" - " You can install scispacy (version 0.2.5) by running:\n\n" - " pip install scispacy==0.2.5\n\n" + " You can install scispacy (version 0.5.1) by running:\n\n" + " pip install scispacy==0.5.1\n\n" " By default HunFlair uses the `en_core_sci_sm` model. You can install the model by running:\n\n" - " pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz\n\n" + " pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz\n\n" " Note that the scispacy version and the version of the model must match to work properly!" ) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 77122400be..a85b88c8bc 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -58,8 +58,8 @@ Span[6:7]: "Mouse" → Species (0.9979) Scientific texts are difficult to tokenize. 
For this reason, we recommend to install [SciSpaCy](https://allenai.github.io/scispacy/) for improved pre-processing and tokenization of scientific / biomedical texts: ``` -pip install scispacy==0.2.5 -pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz +pip install scispacy==0.5.1 +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz ``` Use this code to apply scientific tokenization: @@ -121,10 +121,14 @@ We provide a set of quick tutorials to get you started with *HunFlair*: ## Citing HunFlair Please cite the following paper when using *HunFlair*: ~~~ -@article{weber2020hunflair, - title={HunFlair: An Easy-to-Use Tool for State-of-the-Art Biomedical Named Entity Recognition}, - author={Weber, Leon and S{\"a}nger, Mario and M{\"u}nchmeyer, Jannes and Habibi, Maryam and Leser, Ulf and Akbik, Alan}, - journal={arXiv preprint arXiv:2008.07347}, - year={2020} +@article{weber2021hunflair, + title={HunFlair: an easy-to-use tool for state-of-the-art biomedical named entity recognition}, + author={Weber, Leon and S{\"a}nger, Mario and M{\"u}nchmeyer, Jannes and Habibi, Maryam and Leser, Ulf and Akbik, Alan}, + journal={Bioinformatics}, + volume={37}, + number={17}, + pages={2792--2794}, + year={2021}, + publisher={Oxford University Press} } ~~~ diff --git a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md index e46ac18d21..7bad25081b 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md @@ -90,8 +90,8 @@ This can be unfavourable if applied to biomedical texts. *HunFlair* integrates [SciSpaCy](https://allenai.github.io/scispacy/), a library specially designed to work with scientific text. 
To use the library we first have to install it and download one of its models: ~~~ -pip install scispacy==0.2.5 -pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz +pip install scispacy==0.5.1 +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz ~~~ To use the tokenizer we just have to pass it as a parameter when instantiating a sentence: