From fbd12eb4a4c75f193f2badbeebf8967ac52350ef Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 8 May 2023 09:10:35 +0200 Subject: [PATCH 01/12] Set version to v3.6.0.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 640e9e93b..c6b09039e 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.5.0" +__version__ = "3.6.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 46ce66021a1f6c6f18914546051199b478e63040 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 8 May 2023 09:17:33 +0200 Subject: [PATCH 02/12] Temporarily skip download CLI related tests in CI --- .github/workflows/tests.yml | 54 ++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 21b660989..619570090 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -107,22 +107,22 @@ jobs: - name: Test import run: python -W error -c "import spacy" - - name: "Test download CLI" - run: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md - python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - if: matrix.python_version == '3.9' - - - name: "Test download_url in info CLI" - run: | - python -W error -m spacy info ca_core_news_sm | grep -q download_url - if: matrix.python_version == '3.9' - - - name: "Test no warnings on load (#11713)" - run: | - python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" - if: matrix.python_version == '3.9' +# - name: "Test download CLI" +# run: | +# python -m spacy download ca_core_news_sm +# python -m spacy download ca_core_news_md +# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" +# if: matrix.python_version == '3.9' +# +# - name: "Test download_url in info CLI" +# run: | +# python -W error -m spacy info ca_core_news_sm | grep -q download_url +# if: matrix.python_version == '3.9' +# +# - name: "Test no warnings on load (#11713)" +# run: | +# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" +# if: matrix.python_version == '3.9' - name: "Test convert CLI" run: | @@ -146,17 +146,17 @@ jobs: python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 if: matrix.python_version == '3.9' - - name: "Test assemble CLI" - run: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - if: matrix.python_version == '3.9' - - - name: "Test assemble CLI vectors warning" - run: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" - python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - if: matrix.python_version == '3.9' +# - name: "Test assemble CLI" +# run: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); 
config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" +# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir +# if: matrix.python_version == '3.9' +# +# - name: "Test assemble CLI vectors warning" +# run: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" +# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 +# if: matrix.python_version == '3.9' - name: "Install test requirements" run: | From 6f314f99c42c3503b89391798a58befbbc23bee4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 8 May 2023 12:33:56 +0200 Subject: [PATCH 03/12] Use Latin normalization for Serbian attrs (#12608) * Use Latin normalization for Serbian attrs Use Latin normalization for Serbian `NORM`, `PREFIX`, and `SUFFIX`. * Update NORMs in tokenizer exceptions and related tests * Add tests for all custom lex attrs * Remove unused imports --- spacy/lang/sr/lex_attrs.py | 55 +++++++++++++++++++++++++- spacy/lang/sr/tokenizer_exceptions.py | 3 ++ spacy/tests/lang/sr/test_exceptions.py | 12 +++--- spacy/tests/lang/sr/test_lex_attrs.py | 17 ++++++++ 4 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 spacy/tests/lang/sr/test_lex_attrs.py diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py index dc48909bc..a356a6a7a 100644 --- a/spacy/lang/sr/lex_attrs.py +++ b/spacy/lang/sr/lex_attrs.py @@ -1,4 +1,4 @@ -from ...attrs import LIKE_NUM +from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX _num_words = [ @@ -63,4 +63,55 @@ def like_num(text): return False -LEX_ATTRS = {LIKE_NUM: like_num} +def _cyr_to_latin_norm(text): + # fmt: off + # source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py + SR_CYR_TO_LAT_DICT = { + u'А': u'A', u'а': u'a', + u'Б': u'B', u'б': u'b', + u'В': u'V', u'в': u'v', + u'Г': u'G', u'г': u'g', + u'Д': u'D', u'д': u'd', + u'Ђ': u'Đ', u'ђ': u'đ', + u'Е': u'E', u'е': u'e', + u'Ж': u'Ž', u'ж': u'ž', + u'З': u'Z', u'з': u'z', + u'И': u'I', u'и': u'i', + u'Ј': u'J', u'ј': u'j', + u'К': u'K', u'к': u'k', + u'Л': u'L', u'л': u'l', + u'Љ': u'Lj', u'љ': u'lj', + u'М': u'M', u'м': u'm', + u'Н': u'N', u'н': u'n', + u'Њ': u'Nj', u'њ': u'nj', + u'О': u'O', u'о': u'o', + u'П': u'P', u'п': u'p', + u'Р': u'R', u'р': u'r', + u'С': u'S', u'с': u's', + u'Т': u'T', u'т': u't', + u'Ћ': u'Ć', u'ћ': u'ć', + u'У': u'U', u'у': u'u', + u'Ф': u'F', u'ф': u'f', + u'Х': u'H', u'х': u'h', + u'Ц': u'C', u'ц': u'c', + u'Ч': u'Č', u'ч': u'č', + u'Џ': u'Dž', u'џ': u'dž', + u'Ш': u'Š', u'ш': u'š', + } + # fmt: on + return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text) + + +def norm(text): + return _cyr_to_latin_norm(text).lower() + + +def prefix(text): + return _cyr_to_latin_norm(text)[0] + + +def suffix(text): + return _cyr_to_latin_norm(text)[-3:] + + +LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix} diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index dcaa3e239..053306088 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,3 +1,4 @@ +from .lex_attrs import _cyr_to_latin_norm from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc @@ -89,5 +90,7 @@ _slang_exc = [ for slang_desc in _slang_exc: _exc[slang_desc[ORTH]] = [slang_desc] +for 
_exc_key in _exc: + _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM]) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py index fa92e5e2d..e8819e628 100644 --- a/spacy/tests/lang/sr/test_exceptions.py +++ b/spacy/tests/lang/sr/test_exceptions.py @@ -2,15 +2,15 @@ import pytest @pytest.mark.parametrize( - "text,norms,lemmas", + "text,norms", [ - ("о.г.", ["ове године"], ["ова година"]), - ("чет.", ["четвртак"], ["четвртак"]), - ("гђа", ["госпођа"], ["госпођа"]), - ("ил'", ["или"], ["или"]), + ("о.г.", ["ove godine"]), + ("чет.", ["četvrtak"]), + ("гђа", ["gospođa"]), + ("ил'", ["ili"]), ], ) -def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): +def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms): tokens = sr_tokenizer(text) assert len(tokens) == 1 assert [token.norm_ for token in tokens] == norms diff --git a/spacy/tests/lang/sr/test_lex_attrs.py b/spacy/tests/lang/sr/test_lex_attrs.py new file mode 100644 index 000000000..4a8039df5 --- /dev/null +++ b/spacy/tests/lang/sr/test_lex_attrs.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.mark.parametrize( + "text,like_num,norm,prefix,suffix", + [ + ("нула", True, "nula", "n", "ula"), + ("Казна", False, "kazna", "K", "zna"), + ], +) +def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix): + tokens = sr_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == like_num + assert tokens[0].norm_ == norm + assert tokens[0].prefix_ == prefix + assert tokens[0].suffix_ == suffix From 1279b464bb2868b027b3690a9329091e153e9f23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 8 May 2023 16:51:58 +0200 Subject: [PATCH 04/12] In initialize only calculate current vectors hash if needed (#12607) --- spacy/training/initialize.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index e90617852..9cf759c55 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -133,10 +133,11 @@ def init_vocab( logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) - vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) - for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): - if vectors_hash != sourced_vectors_hash: - warnings.warn(Warnings.W113.format(name=sourced_component)) + if len(sourced_vectors_hashes) > 0: + vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) + for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): + if vectors_hash != sourced_vectors_hash: + warnings.warn(Warnings.W113.format(name=sourced_component)) logger.info("Finished initializing nlp object") From eb3960a15abcf1a77a034bc5f22bea153f428de6 Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Burns" Date: Tue, 9 May 2023 06:02:45 -0400 Subject: [PATCH 05/12] Add LatinCy models to universe.json (#12597) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add LatinCy models to universe.json * Update website/meta/universe.json Add install code for LatinCy models to 'code_example' Co-authored-by: Adriane Boyd * Update LatinCy ‘code_example’ in website/meta/universe.json Co-authored-by: Adriane Boyd --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 4067c4d1e..05877cfc6 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,32 @@ { "resources": [ + { + "id": "latincy", + "title": "LatinCy", + "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png", + "slogan": "Synthetic trained spaCy pipelines for Latin NLP", + "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.", + "url": "https://huggingface.co/latincy", + "code_example": [ + "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl", + "import spacy", + "nlp = spacy.load('la_core_web_lg')", + "doc = nlp('Haec narranatur a poetis de Perseo')", + "", + "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')", + "", + "# > Haec, haec, hic, DET" + ], + "code_language": "python", + "author": "Patrick J. Burns", + "author_links": { + "twitter": "@diyclassics", + "github": "diyclassics", + "website": "https://diyclassics.github.io/" + }, + "category": ["pipeline", "research"], + "tags": ["latin"] + }, { "id": "spacy-wasm", "title": "spacy-wasm", From 15f16db6ca3fd10a9667358d2f7b7e6eaf967e0a Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Burns" Date: Tue, 9 May 2023 09:52:34 -0400 Subject: [PATCH 06/12] Fix typo (#12615) --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 05877cfc6..b39ebb528 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -11,7 +11,7 @@ "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl", "import spacy", "nlp = spacy.load('la_core_web_lg')", - "doc = nlp('Haec narranatur a poetis de Perseo')", + "doc = nlp('Haec narrantur a poetis de Perseo')", "", "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')", "", From d11b549195ce669caa0480804a22019150513be0 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Wed, 10 May 2023 13:16:16 +0200 Subject: [PATCH 07/12] chore: added adept-augmentations to the spacy universe (#12609) * chore: added adept-augmentations to the spacy universe * Apply suggestions from code review Co-authored-by: Basile Dura * Update universe.json --------- Co-authored-by: Basile Dura --- website/meta/universe.json | 50 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index b39ebb528..e36ba5676 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2837,6 +2837,56 @@ "tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"], "spacy_version": 3 }, + { + "id": "adeptaugmentations", + "title": "Adept Augmentations", + "slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.", + "description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".", + "github": "davidberenstein1957/adept-augmentations", + "pip": "adept-augmentations", + "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png", + "code_example": [ + "import spacy", + "from spacy.tokens import DocBin", + "", + "from adept_augmentations import EntitySwapAugmenter", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "", + "TRAIN_DATA = [", + " \"Apple is looking at buying U.K. startup for $1 billion\",", + " \"Microsoft acquires GitHub for $7.5 billion\"", + "]", + "docs = nlp.pipe(TRAIN_DATA)", + "", + "# Create a new DocBin", + "doc_bin = DocBin(docs=docs)", + "", + "# Augment Data", + "doc_bin = EntitySwapAugmenter(doc_bin).augment(4)", + "for doc in doc_bin.get_docs(nlp.vocab):", + " print(doc.text)", + "", + "# Output", + "#", + "# GitHub is looking at buying U.K. startup for $ 7.5 billion", + "# Microsoft is looking at buying U.K. startup for $ 1 billion", + "# Microsoft is looking at buying U.K. startup for $ 7.5 billion", + "# GitHub is looking at buying U.K. 
startup for $ 1 billion", + "# Microsoft acquires Apple for $ 7.5 billion", + "# Apple acquires Microsoft for $ 1 billion", + "# Microsoft acquires Microsoft for $ 7.5 billion", + "# GitHub acquires GitHub for $ 1 billion" + ], + "author": "David Berenstein", + "author_links": { + "github": "davidberenstein1957", + "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" + }, + "category": ["standalone"], + "tags": ["ner", "few-shot", "augmentation", "datasets", "training"], + "spacy_version": 3 + }, { "id": "blackstone", "title": "Blackstone", From a56ab98e3c00c6ffd63e6a8359c129ec535b2cc9 Mon Sep 17 00:00:00 2001 From: royashcenazi <37100955+royashcenazi@users.noreply.github.com> Date: Wed, 10 May 2023 14:19:28 +0300 Subject: [PATCH 08/12] parsigs universe (#12616) * parsigs universe * added model installation explanation in the description * Update website/meta/universe.json Co-authored-by: Basile Dura * added model installement instruction in the code example --------- Co-authored-by: Basile Dura --- website/meta/universe.json | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index e36ba5676..f2b199275 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,28 @@ { "resources": [ + { + "id": "parsigs", + "title": "parsigs", + "slogan": "Structuring prescriptions text made simple using spaCy", + "description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`", + "github": "royashcenazi/parsigs", + "pip": "parsigs", + "code_language": "python", + "author": "Roy Ashcenazi", + "code_example": [ + "# You'll need to install the trained model, see instructions in the description section", + "from parsigs.parse_sig_api import StructuredSig, SigParser", + "sig_parser = SigParser()", + "", + "sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'", + "parsed_sig = sig_parser.parse(sig)" + ], + "author_links": { + "github": "royashcenazi" + }, + "category": ["model", "research"], + "tags": ["sigs", "prescription","pharma"] + }, { "id": "latincy", "title": "LatinCy", @@ -26,7 +49,7 @@ }, "category": ["pipeline", "research"], "tags": ["latin"] - }, + }, { "id": "spacy-wasm", "title": "spacy-wasm", From 3252f6b13fd357e0ce3f1ba41b9acff11f0d3839 Mon Sep 17 00:00:00 2001 From: royashcenazi <37100955+royashcenazi@users.noreply.github.com> Date: Wed, 10 May 2023 14:49:51 +0300 Subject: [PATCH 09/12] Parsigs universe 3 (#12617) * parsigs universe * added model installation explanation in the description * Update website/meta/universe.json Co-authored-by: Basile Dura * added model installement instruction in the code example * added biomedical category --------- Co-authored-by: Basile Dura --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index f2b199275..33185ca30 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -20,7 +20,7 @@ "author_links": { "github": "royashcenazi" }, - "category": ["model", "research"], + "category": ["model", "research", "biomedical"], "tags": ["sigs", "prescription","pharma"] }, { From 
b5af0fe836945ce4b1b63dbd10606d2fe2ad9789 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 May 2023 11:54:16 +0200 Subject: [PATCH 10/12] Revert "Use Latin normalization for Serbian attrs (#12608)" (#12621) This reverts commit 6f314f99c42c3503b89391798a58befbbc23bee4. We are reverting this until we can support this normalization more consistently across vectors, training corpora, and lemmatizer data. --- spacy/lang/sr/lex_attrs.py | 55 +------------------------- spacy/lang/sr/tokenizer_exceptions.py | 3 -- spacy/tests/lang/sr/test_exceptions.py | 12 +++--- spacy/tests/lang/sr/test_lex_attrs.py | 17 -------- 4 files changed, 8 insertions(+), 79 deletions(-) delete mode 100644 spacy/tests/lang/sr/test_lex_attrs.py diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py index a356a6a7a..dc48909bc 100644 --- a/spacy/lang/sr/lex_attrs.py +++ b/spacy/lang/sr/lex_attrs.py @@ -1,4 +1,4 @@ -from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX +from ...attrs import LIKE_NUM _num_words = [ @@ -63,55 +63,4 @@ def like_num(text): return False -def _cyr_to_latin_norm(text): - # fmt: off - # source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py - SR_CYR_TO_LAT_DICT = { - u'А': u'A', u'а': u'a', - u'Б': u'B', u'б': u'b', - u'В': u'V', u'в': u'v', - u'Г': u'G', u'г': u'g', - u'Д': u'D', u'д': u'd', - u'Ђ': u'Đ', u'ђ': u'đ', - u'Е': u'E', u'е': u'e', - u'Ж': u'Ž', u'ж': u'ž', - u'З': u'Z', u'з': u'z', - u'И': u'I', u'и': u'i', - u'Ј': u'J', u'ј': u'j', - u'К': u'K', u'к': u'k', - u'Л': u'L', u'л': u'l', - u'Љ': u'Lj', u'љ': u'lj', - u'М': u'M', u'м': u'm', - u'Н': u'N', u'н': u'n', - u'Њ': u'Nj', u'њ': u'nj', - u'О': u'O', u'о': u'o', - u'П': u'P', u'п': u'p', - u'Р': u'R', u'р': u'r', - u'С': u'S', u'с': u's', - u'Т': u'T', u'т': u't', - u'Ћ': u'Ć', u'ћ': u'ć', - u'У': u'U', u'у': u'u', - u'Ф': u'F', u'ф': u'f', - u'Х': u'H', u'х': u'h', - u'Ц': u'C', u'ц': u'c', - u'Ч': u'Č', u'ч': u'č', - u'Џ': u'Dž', u'џ': u'dž', - u'Ш': u'Š', u'ш': u'š', - } - # fmt: on - return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text) - - -def norm(text): - return _cyr_to_latin_norm(text).lower() - - -def prefix(text): - return _cyr_to_latin_norm(text)[0] - - -def suffix(text): - return _cyr_to_latin_norm(text)[-3:] - - -LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix} +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index 053306088..dcaa3e239 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,4 +1,3 @@ -from .lex_attrs import _cyr_to_latin_norm from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc @@ -90,7 +89,5 @@ _slang_exc = [ for slang_desc in _slang_exc: _exc[slang_desc[ORTH]] = [slang_desc] -for _exc_key in _exc: - _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM]) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py index e8819e628..fa92e5e2d 100644 --- a/spacy/tests/lang/sr/test_exceptions.py +++ b/spacy/tests/lang/sr/test_exceptions.py @@ -2,15 +2,15 @@ import pytest @pytest.mark.parametrize( - "text,norms", + "text,norms,lemmas", [ - ("о.г.", ["ove godine"]), - ("чет.", ["četvrtak"]), - ("гђа", ["gospođa"]), - ("ил'", ["ili"]), + ("о.г.", ["ове године"], ["ова година"]), + ("чет.", ["четвртак"], ["четвртак"]), + ("гђа", 
["госпођа"], ["госпођа"]), + ("ил'", ["или"], ["или"]), ], ) -def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms): +def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): tokens = sr_tokenizer(text) assert len(tokens) == 1 assert [token.norm_ for token in tokens] == norms diff --git a/spacy/tests/lang/sr/test_lex_attrs.py b/spacy/tests/lang/sr/test_lex_attrs.py deleted file mode 100644 index 4a8039df5..000000000 --- a/spacy/tests/lang/sr/test_lex_attrs.py +++ /dev/null @@ -1,17 +0,0 @@ -import pytest - - -@pytest.mark.parametrize( - "text,like_num,norm,prefix,suffix", - [ - ("нула", True, "nula", "n", "ula"), - ("Казна", False, "kazna", "K", "zna"), - ], -) -def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix): - tokens = sr_tokenizer(text) - assert len(tokens) == 1 - assert tokens[0].like_num == like_num - assert tokens[0].norm_ == norm - assert tokens[0].prefix_ == prefix - assert tokens[0].suffix_ == suffix From 88680a6eed1c0a3b76975cc236ca820a53773c0e Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 12 May 2023 00:40:28 -0700 Subject: [PATCH 11/12] docs: remove invalid huggingface-hub push argument (#12624) --- website/docs/api/cli.mdx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 323ea2a92..05328b7eb 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1640,7 +1640,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details, see the spaCy project [integration](/usage/projects#huggingface_hub). ```bash -$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose] +$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose] ``` > #### Example @@ -1654,6 +1654,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] | `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | | `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | | `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | -| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ | | `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | | **UPLOADS** | The pipeline to the hub. | From 3637148c4d7b3677fff9ba7c2800b78a2b3dd694 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 May 2023 15:36:54 +0200 Subject: [PATCH 12/12] Add scorer option to return per-component scores (#12540) * Add scorer option to return per-component scores Add `per_component` option to `Language.evaluate` and `Scorer.score` to return scores keyed by `tokenizer` (hard-coded) or by component name. Add option to `evaluate` CLI to score by component. Per-component scores can only be saved to JSON. 
* Update help text and messages --- spacy/cli/evaluate.py | 84 +++++++++++++++++++--------------- spacy/language.py | 5 +- spacy/scorer.py | 16 +++++-- spacy/tests/test_scorer.py | 15 ++++++ website/docs/api/cli.mdx | 25 +++++----- website/docs/api/language.mdx | 19 ++++---- website/docs/api/scorer.mdx | 12 +++-- website/docs/api/top-level.mdx | 2 +- 8 files changed, 111 insertions(+), 67 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 363c02cd3..9fcdd18be 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -27,6 +27,7 @@ def evaluate_cli( gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), + per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), # fmt: on ): """ @@ -50,6 +51,7 @@ def evaluate_cli( gold_preproc=gold_preproc, displacy_path=displacy_path, displacy_limit=displacy_limit, + per_component=per_component, silent=False, ) @@ -64,6 +66,7 @@ def evaluate( displacy_limit: int = 25, silent: bool = True, spans_key: str = "sc", + per_component: bool = False, ) -> Dict[str, Any]: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() @@ -78,44 +81,53 @@ def evaluate( corpus = Corpus(data_path, gold_preproc=gold_preproc) nlp = util.load_model(model) dev_dataset = list(corpus(nlp)) - scores = nlp.evaluate(dev_dataset) - metrics = { - "TOK": "token_acc", - "TAG": "tag_acc", - "POS": "pos_acc", - "MORPH": "morph_acc", - "LEMMA": "lemma_acc", - "UAS": "dep_uas", - "LAS": "dep_las", - "NER P": "ents_p", - "NER R": "ents_r", - "NER F": "ents_f", - "TEXTCAT": "cats_score", - "SENT P": "sents_p", - "SENT R": "sents_r", - "SENT F": "sents_f", - "SPAN P": f"spans_{spans_key}_p", - "SPAN R": f"spans_{spans_key}_r", - "SPAN F": f"spans_{spans_key}_f", - "SPEED": "speed", - } - results = {} - data = {} - for metric, key in metrics.items(): - if key in scores: - if key == "cats_score": - metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - if isinstance(scores[key], (int, float)): - if key == "speed": - results[metric] = f"{scores[key]:.0f}" + scores = nlp.evaluate(dev_dataset, per_component=per_component) + if per_component: + data = scores + if output is None: + msg.warn( + "The per-component option is enabled but there is no output JSON file provided to save the scores to." 
+ ) + else: + msg.info("Per-component scores will be saved to output JSON file.") + else: + metrics = { + "TOK": "token_acc", + "TAG": "tag_acc", + "POS": "pos_acc", + "MORPH": "morph_acc", + "LEMMA": "lemma_acc", + "UAS": "dep_uas", + "LAS": "dep_las", + "NER P": "ents_p", + "NER R": "ents_r", + "NER F": "ents_f", + "TEXTCAT": "cats_score", + "SENT P": "sents_p", + "SENT R": "sents_r", + "SENT F": "sents_f", + "SPAN P": f"spans_{spans_key}_p", + "SPAN R": f"spans_{spans_key}_r", + "SPAN F": f"spans_{spans_key}_f", + "SPEED": "speed", + } + results = {} + data = {} + for metric, key in metrics.items(): + if key in scores: + if key == "cats_score": + metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" + if isinstance(scores[key], (int, float)): + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" else: - results[metric] = f"{scores[key]*100:.2f}" - else: - results[metric] = "-" - data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] + results[metric] = "-" + data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] - msg.table(results, title="Results") - data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) + msg.table(results, title="Results") + data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] diff --git a/spacy/language.py b/spacy/language.py index 9fdcf6328..289e6dd2c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1372,6 +1372,7 @@ class Language: scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, scorer_cfg: Optional[Dict[str, Any]] = None, + per_component: bool = False, ) -> Dict[str, Any]: """Evaluate a model's pipeline components. @@ -1383,6 +1384,8 @@ class Language: arguments for specific components. scorer_cfg (dict): An optional dictionary with extra keyword arguments for the scorer. + per_component (bool): Whether to return the scores keyed by component + name. Defaults to False. RETURNS (Scorer): The scorer containing the evaluation results. @@ -1415,7 +1418,7 @@ class Language: for eg, doc in zip(examples, docs): eg.predicted = doc end_time = timer() - results = scorer.score(examples) + results = scorer.score(examples, per_component=per_component) n_words = sum(len(eg.predicted) for eg in examples) results["speed"] = n_words / (end_time - start_time) return results diff --git a/spacy/scorer.py b/spacy/scorer.py index de4f52be6..86cd00a50 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -121,20 +121,30 @@ class Scorer: nlp.add_pipe(pipe) self.nlp = nlp - def score(self, examples: Iterable[Example]) -> Dict[str, Any]: + def score( + self, examples: Iterable[Example], *, per_component: bool = False + ) -> Dict[str, Any]: """Evaluate a list of Examples. examples (Iterable[Example]): The predicted annotations + correct annotations. + per_component (bool): Whether to return the scores keyed by component + name. Defaults to False. RETURNS (Dict): A dictionary of scores. 
DOCS: https://spacy.io/api/scorer#score """ scores = {} if hasattr(self.nlp.tokenizer, "score"): - scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore + if per_component: + scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg) + else: + scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore for name, component in self.nlp.pipeline: if hasattr(component, "score"): - scores.update(component.score(examples, **self.cfg)) + if per_component: + scores[name] = component.score(examples, **self.cfg) + else: + scores.update(component.score(examples, **self.cfg)) return scores @staticmethod diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index dbb47b423..4b2d22986 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -115,6 +115,14 @@ def test_tokenization(sented_doc): assert scores["token_r"] == approx(0.33333333) assert scores["token_f"] == 0.4 + # per-component scoring + scorer = Scorer() + scores = scorer.score([example], per_component=True) + assert scores["tokenizer"]["token_acc"] == 0.5 + assert scores["tokenizer"]["token_p"] == 0.5 + assert scores["tokenizer"]["token_r"] == approx(0.33333333) + assert scores["tokenizer"]["token_f"] == 0.4 + def test_sents(sented_doc): scorer = Scorer() @@ -278,6 +286,13 @@ def test_tag_score(tagged_doc): assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) + # per-component scoring + scorer = Scorer() + results = scorer.score([example], per_component=True) + assert results["tagger"]["tag_acc"] == 0.9 + assert results["morphologizer"]["pos_acc"] == 0.9 + assert results["morphologizer"]["morph_acc"] == approx(0.8) + def test_partial_annotation(en_tokenizer): pred_doc = en_tokenizer("a b c d e") diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 05328b7eb..2c90ec6c0 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Name | Description | -| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | -| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. 
~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Training results and optional metrics and visualizations. | +| Name | Description | +| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | ### speed {id="benchmark-speed", version="3.5", tag="command"} diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index 93ddd79a2..de23156b9 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects. > print(scores) > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `batch_size` | The batch size to use. ~~Optional[int]~~ | -| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ | -| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | -| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ | -| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `batch_size` | The batch size to use. ~~Optional[int]~~ | +| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. 
~~Optional[Scorer]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ | +| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Language.use_params {id="use_params",tag="contextmanager, method"} diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx index 6f0c95f6f..9bdd0a8f4 100644 --- a/website/docs/api/scorer.mdx +++ b/website/docs/api/scorer.mdx @@ -33,7 +33,7 @@ Create a new `Scorer`. | `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | | _keyword-only_ | | -| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | +| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | ## Scorer.score {id="score",tag="method"} @@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or > scores = scorer.score(examples) > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"} diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 6de1acdf0..64ec342cd 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -469,7 +469,7 @@ factories. | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | -| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. | +| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. 
| | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | ### spacy-transformers registry {id="registry-transformers"}
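---

Editor's note: a minimal usage sketch for the per-component scoring introduced in PATCH 12/12. Everything here beyond what the patch itself adds is an assumption: it presumes spaCy is installed from this branch (v3.6.0.dev0, where `Language.evaluate` and `Scorer.score` accept `per_component`) and that the `en_core_web_sm` pipeline is available; the text and entity annotation are placeholders. The same behaviour is exposed on the CLI as `spacy benchmark accuracy ... --per-component`, where the per-component scores are only written to the `--output` JSON file.

```python
# Sketch only: assumes spaCy from this branch (v3.6.0.dev0) and an installed
# en_core_web_sm pipeline; the example text and annotation are placeholders.
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")

# Build one gold-annotated Example to evaluate against.
doc = nlp.make_doc("Apple is looking at buying U.K. startup for $1 billion")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})

# With per_component=True the results are keyed by component name (plus a
# hard-coded "tokenizer" key) instead of being merged into one flat dict.
scores = nlp.evaluate([example], per_component=True)
print(scores.get("tokenizer"))  # e.g. token_acc, token_p, token_r, token_f
print(scores.get("ner"))        # e.g. ents_p, ents_r, ents_f
```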