From ecbb9c4b9f89120ba04642852780d592c024b6ef Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 11:50:42 +0100 Subject: [PATCH 01/41] load Underscore state when multiprocessing --- spacy/language.py | 11 ++++++++--- spacy/tokens/underscore.py | 8 ++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 5544b6341..71180a65d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -15,6 +15,7 @@ import multiprocessing as mp from itertools import chain, cycle from .tokenizer import Tokenizer +from .tokens.underscore import Underscore from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups @@ -852,7 +853,10 @@ class Language(object): sender.send() procs = [ - mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch)) + mp.Process( + target=_apply_pipes, + args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] for proc in procs: @@ -1107,7 +1111,7 @@ def _pipe(docs, proc, kwargs): yield doc -def _apply_pipes(make_doc, pipes, reciever, sender): +def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): """Worker for Language.pipe receiver (multiprocessing.Connection): Pipe to receive text. Usually @@ -1115,8 +1119,9 @@ def _apply_pipes(make_doc, pipes, reciever, sender): sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()` """ + Underscore.load_state(underscore_state) while True: - texts = reciever.get() + texts = receiver.get() docs = (make_doc(text) for text in texts) for pipe in pipes: docs = pipe(docs) diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index b36fe9294..8dac8526e 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -79,6 +79,14 @@ class Underscore(object): def _get_key(self, name): return ("._.", name, self._start, self._end) + @classmethod + def get_state(cls): + return cls.token_extensions, cls.span_extensions, cls.doc_extensions + + @classmethod + def load_state(cls, state): + cls.token_extensions, cls.span_extensions, cls.doc_extensions = state + def get_ext_args(**kwargs): """Validate and convert arguments. 
Reused in Doc, Token and Span.""" From 05dedaa2cf2e57469ac860fbd0af638c27c02148 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 12:00:13 +0100 Subject: [PATCH 02/41] add unit test --- spacy/tests/regression/test_issue4903.py | 40 ++++++++++++++++++++++++ spacy/tests/regression/test_issue4924.py | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue4903.py diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py new file mode 100644 index 000000000..97293aec7 --- /dev/null +++ b/spacy/tests/regression/test_issue4903.py @@ -0,0 +1,40 @@ +# coding: utf8 +from __future__ import unicode_literals + +import spacy +from spacy.tokens import Span, Doc + + +class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +def test_issue4903(): + # ensures that this runs correctly and doesn't hang or crash on Windows / macOS + nlp = spacy.load("en_core_web_sm") + custom_component = CustomPipe() + nlp.add_pipe(custom_component, after="parser") + + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + # works without 'n_process' + for doc in nlp.pipe(text, n_process=2): + print(doc) diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index 8aea2c3d5..0e45291a9 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -11,6 +11,6 @@ def nlp(): return spacy.blank("en") -def test_evaluate(nlp): +def test_issue4924(nlp): docs_golds = [("", {})] nlp.evaluate(docs_golds) From 65f5b48b5db0e8e11e73e505469ccdb38e8f07af Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 12:06:27 +0100 Subject: [PATCH 03/41] add comment --- spacy/language.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language.py b/spacy/language.py index 71180a65d..737e0bf3c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1118,6 +1118,7 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): created by `multiprocessing.Pipe()` sender (multiprocessing.Connection): Pipe to send doc. 
Usually created by `multiprocessing.Pipe()`
+    underscore_state (tuple): The data in the Underscore class of the parent
     """
     Underscore.load_state(underscore_state)
     while True:

From 51d37033c8b2f280cfc0ddf2b1ecf0537f347532 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:10:05 +0100
Subject: [PATCH 04/41] remove old comment

---
 spacy/tests/regression/test_issue4903.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 97293aec7..d09b32849 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -35,6 +35,5 @@ def test_issue4903():
     nlp.add_pipe(custom_component, after="parser")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    # works without 'n_process'
     for doc in nlp.pipe(text, n_process=2):
         print(doc)

From 46628d88903edaa2c3614339a0d464b9fcdcc690 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:12:52 +0100
Subject: [PATCH 05/41] add some asserts

---
 spacy/tests/regression/test_issue4903.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index d09b32849..0a255d9a8 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -35,5 +35,7 @@ def test_issue4903():
     nlp.add_pipe(custom_component, after="parser")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    for doc in nlp.pipe(text, n_process=2):
-        print(doc)
+    docs = list(nlp.pipe(text, n_process=2))
+    assert docs[0].text == "I like bananas."
+    assert docs[1].text == "Do you like them?"
+    assert docs[2].text == "No, I prefer wasabi."

From 7939c6388656e1abb932b2deb1af90928c297aa2 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:26:27 +0100
Subject: [PATCH 06/41] use English instead of model

---
 spacy/tests/regression/test_issue4903.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 0a255d9a8..82e21b79f 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import spacy
+from spacy.lang.en import English
 from spacy.tokens import Span, Doc
 
 
@@ -30,9 +31,10 @@ class CustomPipe:
 
 def test_issue4903():
     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
-    nlp = spacy.load("en_core_web_sm")
+    nlp = English()
     custom_component = CustomPipe()
-    nlp.add_pipe(custom_component, after="parser")
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    nlp.add_pipe(custom_component, after="sentencizer")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]

From 6e717c62ed2d0407b37ae0e19c033964425419cc Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 13:21:31 +0100
Subject: [PATCH 07/41] avoid the tests interacting with each other through the global Underscore variable

---
 spacy/tests/regression/test_issue4849.py | 6 ++++++
 spacy/tests/regression/test_issue4903.py | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py
index 834219773..7e58243bc 100644
--- a/spacy/tests/regression/test_issue4849.py
+++ b/spacy/tests/regression/test_issue4849.py
@@ -3,11 +3,17 @@ from __future__ import 
unicode_literals from spacy.lang.en import English from spacy.pipeline import EntityRuler +from spacy.tokens.underscore import Underscore def test_issue4849(): nlp = English() + # reset the Underscore object because test_underscore has a lambda function that can't be pickled + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + ruler = EntityRuler( nlp, patterns=[ {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 82e21b79f..156845558 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import spacy from spacy.lang.en import English from spacy.tokens import Span, Doc +from spacy.tokens.underscore import Underscore class CustomPipe: @@ -31,6 +32,12 @@ class CustomPipe: def test_issue4903(): # ensures that this runs correctly and doesn't hang or crash on Windows / macOS + + # reset the Underscore object because test_underscore has a lambda function that can't be pickled + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + nlp = English() custom_component = CustomPipe() nlp.add_pipe(nlp.create_pipe("sentencizer")) From d1f0b397b5a8cba5e59dd5448a831932055c7f45 Mon Sep 17 00:00:00 2001 From: questoph Date: Thu, 13 Feb 2020 22:18:51 +0100 Subject: [PATCH 08/41] Update punctuation.py --- spacy/lang/lb/punctuation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py index 1571e13d7..2a4587856 100644 --- a/spacy/lang/lb/punctuation.py +++ b/spacy/lang/lb/punctuation.py @@ -5,11 +5,13 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_ ELISION = " ' ’ ".strip().replace(" ", "") +abbrev = ("d", "D") + _infixes = ( LIST_ELLIPSES + LIST_ICONS + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), + r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION), r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), From 5352fc8fc3f06e99a2d5159da7d4c226be1b82c1 Mon Sep 17 00:00:00 2001 From: questoph Date: Fri, 14 Feb 2020 12:02:15 +0100 Subject: [PATCH 09/41] Update tokenizer_exceptions.py --- spacy/lang/lb/tokenizer_exceptions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index b32daa58c..1c9b2dde3 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -10,6 +10,8 @@ _exc = {} # translate / delete what is not necessary for exc_data in [ + {ORTH: "’t", LEMMA: "et", NORM: "et"}, + {ORTH: "’T", LEMMA: "et", NORM: "et"}, {ORTH: "'t", LEMMA: "et", NORM: "et"}, {ORTH: "'T", LEMMA: "et", NORM: "et"}, {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"}, From 3853d385faad420f94f39223da148265113149e1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Feb 2020 13:41:24 +0100 Subject: [PATCH 10/41] Fix formatting in Token API --- website/docs/api/token.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 68402d1b4..c30c01c20 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -437,8 +437,8 @@ The L2 norm of the token's vector 
representation. | `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | | `lower` | int | Lowercase form of the token. | | `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | -| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | | `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | | `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. 
| From b49a3afd0cde67debd2128b2cf2c816322c6d0d7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 23 Feb 2020 15:49:20 +0100 Subject: [PATCH 11/41] use clean_underscore fixture --- spacy/tests/doc/test_underscore.py | 9 +++++++++ spacy/tests/matcher/test_matcher_api.py | 2 ++ spacy/tests/regression/test_issue4849.py | 5 ----- spacy/tests/regression/test_issue4903.py | 5 ----- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 2877bfeea..c1eff2c20 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -7,6 +7,15 @@ from spacy.tokens import Doc, Span, Token from spacy.tokens.underscore import Underscore +@pytest.fixture(scope="function", autouse=True) +def clean_underscore(): + # reset the Underscore object after the test, to avoid having state copied across tests + yield + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + + def test_create_doc_underscore(): doc = Mock() doc.doc = doc diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e4584d03a..a826a0a0e 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,6 +6,7 @@ import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token +from ..doc.test_underscore import clean_underscore @pytest.fixture @@ -200,6 +201,7 @@ def test_matcher_any_token_operator(en_vocab): assert matches[2] == "test hello world" +@pytest.mark.usefixtures("clean_underscore") def test_matcher_extension_attribute(en_vocab): matcher = Matcher(en_vocab) get_is_fruit = lambda token: token.text in ("apple", "banana") diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 7e58243bc..85d03fe9a 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -9,11 +9,6 @@ from spacy.tokens.underscore import Underscore def test_issue4849(): nlp = English() - # reset the Underscore object because test_underscore has a lambda function that can't be pickled - Underscore.doc_extensions = {} - Underscore.span_extensions = {} - Underscore.token_extensions = {} - ruler = EntityRuler( nlp, patterns=[ {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 156845558..9a3c10d61 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -33,11 +33,6 @@ class CustomPipe: def test_issue4903(): # ensures that this runs correctly and doesn't hang or crash on Windows / macOS - # reset the Underscore object because test_underscore has a lambda function that can't be pickled - Underscore.doc_extensions = {} - Underscore.span_extensions = {} - Underscore.token_extensions = {} - nlp = English() custom_component = CustomPipe() nlp.add_pipe(nlp.create_pipe("sentencizer")) From 54d8665ff74239c42a0fb6f457c26a50bc269079 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Mon, 24 Feb 2020 16:15:28 -0500 Subject: [PATCH 12/41] Add missing comma in a dependency specification Conda is complaining that it can't parse that line otherwise. 
--- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 55396e011..12d7a2e63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,7 +59,7 @@ install_requires = [options.extras_require] lookups = - spacy_lookups_data>=0.0.5<0.2.0 + spacy_lookups_data>=0.0.5,<0.2.0 cuda = cupy>=5.0.0b4 cuda80 = From d848a68340ad3e57212384e4c25e45da02b31990 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Feb 2020 12:07:42 +0100 Subject: [PATCH 13/41] thinc 7.4.0.dev2 --- requirements.txt | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4f0579313..4ceb3a838 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0.dev0 +thinc==7.4.0.dev2 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 55396e011..78d5be7f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0.dev0 + thinc==7.4.0.dev2 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0.dev0 + thinc==7.4.0.dev2 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.1,<1.1.0 From dc36ec98a4f57b6f0e9e7d508b3152cb53e67da7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Feb 2020 16:46:14 +0100 Subject: [PATCH 14/41] Update pyproject.toml --- pyproject.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fed528d4a..8a6ababf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,11 @@ [build-system] -requires = ["setuptools"] +requires = [ + "setuptools", + "wheel", + "cython>=0.25", + "cymem>=2.0.2,<2.1.0", + "preshed>=3.0.2,<3.1.0", + "murmurhash>=0.28.0,<1.1.0", + "thinc==7.4.0.dev0", +] build-backend = "setuptools.build_meta" From 62406a951374ab18753153d9cea0d0faf9e070d9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Feb 2020 10:30:35 +0100 Subject: [PATCH 15/41] update from thinc 7.4.0.dev2 to 7.4.0 --- requirements.txt | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4ceb3a838..e908e25f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0.dev2 +thinc==7.4.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 78d5be7f5..ac19f7bac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0.dev2 + thinc==7.4.0 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0.dev2 + thinc==7.4.0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.1,<1.1.0 From 18ff97589d3f1adccd9aa451959dbfe97f67e29a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Feb 2020 10:50:05 +0100 Subject: [PATCH 16/41] update spacy to 2.2.4.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index a1880fb54..365c2adbb 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.3" +__version__ = "2.2.4.dev0" __release__ = True __download_url__ = 
"https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From ff184b7a9c64d954a7c7445e00c7505ed1d930f0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 26 Feb 2020 12:10:38 +0100 Subject: [PATCH 17/41] Add tag_map argument to CLI debug-data and train (#4750) (#5038) Add an argument for a path to a JSON-formatted tag map, which is used to update and extend the default language tag map. --- spacy/cli/debug_data.py | 10 +++++++++- spacy/cli/train.py | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4b12052c3..0e12a594c 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000 lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), pipeline=( "Comma-separated names of pipeline components to train", @@ -41,6 +42,7 @@ def debug_data( lang, train_path, dev_path, + tag_map_path=None, base_model=None, pipeline="tagger,parser,ner", ignore_warnings=False, @@ -60,6 +62,10 @@ def debug_data( if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: @@ -67,6 +73,8 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) msg.divider("Data format validation") @@ -344,7 +352,7 @@ def debug_data( if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] - tag_map = nlp.Defaults.tag_map + tag_map = nlp.vocab.morphology.tag_map msg.info( "{} {} in data ({} {} in tag map)".format( len(labels), diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 5af93a8f3..968a009f6 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -57,6 +57,7 @@ from .. import about textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), textcat_arch=("Textcat model architecture", "option", "ta", str), textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), # fmt: on @@ -95,6 +96,7 @@ def train( textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, + tag_map_path=None, verbose=False, debug=False, ): @@ -132,6 +134,9 @@ def train( output_path.mkdir() msg.good("Created output directory: {}".format(output_path)) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. 
# Batch size starts at 1 and grows, so that we make updates quickly @@ -238,6 +243,9 @@ def train( pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) From 54da6a2a0717bcbba737e67a9f7ca201f62c6ef3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 26 Feb 2020 12:51:53 +0100 Subject: [PATCH 18/41] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8a6ababf3..827e2a797 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.4.0.dev0", + "thinc==7.4.0", ] build-backend = "setuptools.build_meta" From d1f703d78d1fa20078787d8655addd4a31c7c6a4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Feb 2020 13:06:52 +0100 Subject: [PATCH 19/41] Improve German tokenization Improve German tokenization with respect to Tiger. --- spacy/lang/de/__init__.py | 3 +++ spacy/lang/de/punctuation.py | 27 ++++++++++++++++++++++++++- spacy/lang/de/tokenizer_exceptions.py | 11 +++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1412f033a..dee1841c8 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS @@ -22,6 +23,8 @@ class GermanDefaults(Language.Defaults): Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index 7dfa61bd4..c376ce597 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -1,10 +1,32 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES +from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import _prefixes, _suffixes +_prefixes = ["``",] + list(_prefixes) + +_suffixes = ( + ["''", "/"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + _quotes = CONCAT_QUOTES.replace("'", "") _infixes = ( @@ -15,6 +37,7 @@ _infixes = ( r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[0-9{a}])\/(?=[0-9{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), 
r"(?<=[0-9])-(?=[0-9])", @@ -22,4 +45,6 @@ _infixes = ( ) +TOKENIZER_PREFIXES = _prefixes +TOKENIZER_SUFFIXES = _suffixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 5b09a0b89..ebbbfba8c 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -160,6 +160,8 @@ for exc_data in [ for orth in [ + "``", + "''", "A.C.", "a.D.", "A.D.", @@ -175,10 +177,13 @@ for orth in [ "biol.", "Biol.", "ca.", + "CDU/CSU", "Chr.", "Cie.", + "c/o", "co.", "Co.", + "d'", "D.C.", "Dipl.-Ing.", "Dipl.", @@ -203,12 +208,18 @@ for orth in [ "i.G.", "i.Tr.", "i.V.", + "I.", + "II.", + "III.", + "IV.", + "Inc.", "Ing.", "jr.", "Jr.", "jun.", "jur.", "K.O.", + "L'", "L.A.", "lat.", "M.A.", From b4e0d2bf50fe6c654886eccb0395e47ccfbc3bef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Feb 2020 20:59:10 +0100 Subject: [PATCH 20/41] Improve Makefile (#5067) * Improve pex making * Update gitignore --- .gitignore | 2 ++ Makefile | 42 +++++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index c4ad59fc7..828258603 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ __pycache__/ .~env/ .venv venv/ +env3.*/ .dev .denv .pypyenv @@ -56,6 +57,7 @@ lib64/ parts/ sdist/ var/ +wheelhouse/ *.egg-info/ pip-wheel-metadata/ Pipfile.lock diff --git a/Makefile b/Makefile index 5d15bccec..1be1c9794 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,36 @@ SHELL := /bin/bash -sha = $(shell "git" "rev-parse" "--short" "HEAD") +WHEELHOUSE := ./wheelhouse +PYVER := 3.6 +VENV := ./env$(PYVER) + version = $(shell "bin/get-version.sh") -wheel = spacy-$(version)-cp36-cp36m-linux_x86_64.whl -dist/spacy.pex : dist/spacy-$(sha).pex - cp dist/spacy-$(sha).pex dist/spacy.pex - chmod a+rx dist/spacy.pex +dist/spacy-$(version).pex : wheelhouse/spacy-$(version)-*.whl + pex -f ./wheelhouse --no-index --disable-cache -m spacy -o dist/spacy-$(version).pex spacy==$(version) jsonschema + chmod a+rx dist/spacy-$(version).pex -dist/spacy-$(sha).pex : dist/$(wheel) - env3.6/bin/python -m pip install pex==1.5.3 - env3.6/bin/pex pytest dist/$(wheel) spacy_lookups_data -e spacy -o dist/spacy-$(sha).pex +dist/pytest.pex : wheelhouse/pytest-*.whl + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o dist/pytest.pex pytest pytest-timeout mock + chmod a+rx dist/pytest.pex -dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py* - python3.6 -m venv env3.6 - source env3.6/bin/activate - env3.6/bin/pip install wheel - env3.6/bin/pip install -r requirements.txt --no-cache-dir - env3.6/bin/python setup.py build_ext --inplace - env3.6/bin/python setup.py sdist - env3.6/bin/python setup.py bdist_wheel +wheelhouse/spacy-$(version)-%.whl : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* + $(VENV)/bin/pip wheel . 
-w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse -.PHONY : clean +wheelhouse/pytest-%.whl : $(VENV)/bin/pex + $(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse + +$(VENV) : + python$(PYVER) -m venv $(VENV) + $(VENV)/bin/python -m pip install pex wheel + +.PHONY : clean test + +test : dist/spacy-$(version).pex dist/pytest.pex + PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x clean : setup.py source env3.6/bin/activate rm -rf dist/* + rm -rf ./wheelhouse python setup.py clean --all From 65d7bab10f540d3acd09da9c1cece5a166670a21 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 27 Feb 2020 18:43:00 +0100 Subject: [PATCH 21/41] Initialize all values in a2b/b2a in new align (#5063) --- spacy/gold.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 3884e1cba..07fd3bdd0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -151,6 +151,8 @@ def align(tokens_a, tokens_b): cost = 0 a2b = numpy.empty(len(tokens_a), dtype="i") b2a = numpy.empty(len(tokens_b), dtype="i") + a2b.fill(-1) + b2a.fill(-1) a2b_multi = {} b2a_multi = {} i = 0 @@ -160,7 +162,6 @@ def align(tokens_a, tokens_b): while i < len(tokens_a) and j < len(tokens_b): a = tokens_a[i][offset_a:] b = tokens_b[j][offset_b:] - a2b[i] = b2a[j] = -1 if a == b: if offset_a == offset_b == 0: a2b[i] = j From c6b12ab02adcdfe760bc10e249924553cb826410 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 2 Mar 2020 11:49:28 +0100 Subject: [PATCH 22/41] Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method --- spacy/errors.py | 4 ++ spacy/pipeline/pipes.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 9 +-- spacy/tests/doc/test_token_api.py | 2 +- spacy/tests/parser/test_parse_navigate.py | 32 +++++----- spacy/tests/regression/test_issue2001-2500.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue4590.py | 2 +- spacy/tests/regression/test_issue5048.py | 35 +++++++++++ spacy/tests/test_displacy.py | 10 ++-- spacy/tests/util.py | 58 ++++++++++++++----- spacy/tokens/doc.pyx | 4 +- 12 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 spacy/tests/regression/test_issue5048.py diff --git a/spacy/errors.py b/spacy/errors.py index 2f0a8a2ad..5957c5ecd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -107,6 +107,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") + W028 = ("Doc.from_array was called with a vector of type '{type}', " + "but is expecting one of type 'uint64' instead. This may result " + "in problems with the vocab further on in the pipeline.") @@ -541,6 +544,7 @@ class Errors(object): E188 = ("Could not match the gold entity links to entities in the doc - " "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") + E189 = ("Each argument to `get_doc` should be of equal length.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3b190debe..a20c9b6df 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -367,7 +367,7 @@ class Tensorizer(Pipe): return sgd -@component("tagger", assigns=["token.tag", "token.pos"]) +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 52f856d3e..19d908529 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer): # Example that caused run-time error while parsing Reddit # fmt: off text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school" - deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", - "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep", - "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", - "ROOT", "amod", "dobj"] + deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det", + "amod", "pobj", "acl", "prep", "prep", "pobj", + "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) @@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + # fmt: off deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + # fmt: on doc = Doc(en_vocab, words=words) for i, (dep, head) in enumerate(zip(deps, heads)): doc[i].dep_ = dep diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..b7522bb98 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab): def test_token_api_conjuncts_simple(en_vocab): words = "They came and went .".split() heads = [1, 0, -1, -2, -1] - deps = ["nsubj", "ROOT", "cc", "conj"] + deps = ["nsubj", "ROOT", "cc", "conj", "dep"] doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..41524d45e 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
@pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15, - -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6, - 9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1, - 2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2, - 1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, + return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, + -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, + -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, + 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, + 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, + 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, + 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, + 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, + -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, + -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, + -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, + 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, + 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, + -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1, - -1, -8, -9, -1] + 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, + -1, 0, -1, -1] # fmt: on diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..01f0f905c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -48,7 +48,7 @@ def test_issue2203(en_vocab): tag_ids = [en_vocab.strings.add(tag) for tag in tags] lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] doc = Doc(en_vocab, words=words) - # Work around lemma corrpution problem and set lemmas after tags + # Work around lemma corruption problem and set lemmas after tags doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) assert [t.tag_ for t in doc] == tags diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..1f5e44499 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -124,7 +124,7 @@ def test_issue2772(en_vocab): words = "When we write or communicate virtually , we can hide our true feelings .".split() # A tree with a non-projective (i.e. crossing) arc # The arcs (0, 4) and (2, 9) cross. 
- heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1] + heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[1].is_sent_start is None diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 8ec9a0bd1..3d01cd487 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -27,7 +27,7 @@ def test_issue4590(en_vocab): text = "The quick brown fox jumped over the lazy fox" heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py new file mode 100644 index 000000000..228322493 --- /dev/null +++ b/spacy/tests/regression/test_issue5048.py @@ -0,0 +1,35 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from spacy.tokens import Doc +from spacy.attrs import DEP, POS, TAG + +from ..util import get_doc + + +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + + strings = en_vocab.strings + + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + + doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index d04c0506f..539714e0c 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab): deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ - {"lemma": None, "text": "This", "tag": "DET"}, - {"lemma": None, "text": "is", "tag": "AUX"}, - {"lemma": None, "text": "a", "tag": "DET"}, - {"lemma": None, "text": "sentence", "tag": "NOUN"}, + {"lemma": None, "text": words[0], "tag": pos[0]}, + {"lemma": None, "text": words[1], "tag": pos[1]}, + {"lemma": None, "text": words[2], "tag": pos[2]}, + {"lemma": None, "text": words[3], "tag": pos[3]}, ] assert deps["arcs"] == [ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, @@ -75,7 +75,7 @@ def test_displacy_rtl(): deps = ["foo", "bar", "foo", "baz"] heads = [1, 0, 1, -2] nlp = Persian() - doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps) + doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps) doc.ents = [Span(doc, 1, 3, label="TEST")] html = displacy.render(doc, page=True, style="dep") assert "direction: rtl" in html diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 9ee5b89f8..52768dd41 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -7,8 +7,10 @@ import shutil import contextlib import srsly from pathlib import Path + +from spacy import Errors from spacy.tokens import Doc, Span -from 
spacy.attrs import POS, HEAD, DEP +from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA from spacy.compat import path2str @@ -26,30 +28,54 @@ def make_tempdir(): shutil.rmtree(path2str(d)) -def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): +def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None): """Create Doc object from given vocab, words and annotations.""" - pos = pos or [""] * len(words) - tags = tags or [""] * len(words) - heads = heads or [0] * len(words) - deps = deps or [""] * len(words) - for value in deps + tags + pos: + if deps and not heads: + heads = [0] * len(deps) + headings = [] + values = [] + annotations = [pos, heads, deps, lemmas, tags] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG] + for a, annot in enumerate(annotations): + if annot is not None: + if len(annot) != len(words): + raise ValueError(Errors.E189) + headings.append(possible_headings[a]) + if annot is not heads: + values.extend(annot) + for value in values: vocab.strings.add(value) doc = Doc(vocab, words=words) - attrs = doc.to_array([POS, HEAD, DEP]) - for i, (p, head, dep) in enumerate(zip(pos, heads, deps)): - attrs[i, 0] = doc.vocab.strings[p] - attrs[i, 1] = head - attrs[i, 2] = doc.vocab.strings[dep] - doc.from_array([POS, HEAD, DEP], attrs) + + # if there are any other annotations, set them + if headings: + attrs = doc.to_array(headings) + + j = 0 + for annot in annotations: + if annot: + if annot is heads: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = heads[i] + else: + attrs[i,j] = heads[i] + else: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = doc.vocab.strings[annot[i]] + else: + attrs[i, j] = doc.vocab.strings[annot[i]] + j += 1 + doc.from_array(headings, attrs) + + # finally, set the entities if ents: doc.ents = [ Span(doc, start, end, label=doc.vocab.strings[label]) for start, end, label in ents ] - if tags: - for token in doc: - token.tag_ = tags[token.i] return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 63495ec86..11f1ddf5f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -785,6 +785,8 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] + if array.dtype != numpy.uint64: + user_warning(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -872,7 +874,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID] # TODO: ENT_KB_ID ? + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM] # TODO: ENT_KB_ID ? 
if self.is_tagged: array_head.extend([TAG, POS]) # If doc parsed add head and dep attribute From 2281c4708cc3dfa68ffcdff5554c18d8fae0c9de Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 2 Mar 2020 11:55:02 +0100 Subject: [PATCH 23/41] Restore empty tokenizer properties (#5026) * Restore empty tokenizer properties * Check for types in tokenizer.from_bytes() * Add test for setting empty tokenizer rules --- spacy/tests/serialize/test_serialize_tokenizer.py | 11 +++++++++-- spacy/tokenizer.pyx | 14 +++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 9a273980c..0e0816a55 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -15,12 +15,19 @@ def load_tokenizer(b): def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): - """Test that custom tokenizer with not all functions defined can be - serialized and deserialized correctly (see #2494).""" + """Test that custom tokenizer with not all functions defined or empty + properties can be serialized and deserialized correctly (see #2494, + #4991).""" tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search) tokenizer_bytes = tokenizer.to_bytes() Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]}) + tokenizer.rules = {} + tokenizer_bytes = tokenizer.to_bytes() + tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + assert tokenizer_reloaded.rules == {} + @pytest.mark.skip(reason="Currently unreliable across platforms") @pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"]) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 230f41921..12c7b73af 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -14,7 +14,7 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string -from .compat import unescape_unicode +from .compat import unescape_unicode, basestring_ from .attrs import intify_attrs from .symbols import ORTH @@ -568,22 +568,22 @@ cdef class Tokenizer: for key in ["prefix_search", "suffix_search", "infix_finditer"]: if key in data: data[key] = unescape_unicode(data[key]) - if data.get("prefix_search"): + if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): self.prefix_search = re.compile(data["prefix_search"]).search - if data.get("suffix_search"): + if "suffix_search" in data and isinstance(data["suffix_search"], basestring_): self.suffix_search = re.compile(data["suffix_search"]).search - if data.get("infix_finditer"): + if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_): self.infix_finditer = re.compile(data["infix_finditer"]).finditer - if data.get("token_match"): + if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if data.get("rules"): + if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} self._reset_cache([key for key in self._cache]) self._reset_specials() self._cache = PreshMap() self._specials = PreshMap() - self._load_special_tokenization(data.get("rules", {})) + self._load_special_tokenization(data["rules"]) return self From 697bec764de41e39582caadc14608607c2af8d09 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 12:22:39 +0100 
Subject: [PATCH 24/41] Normalize IS_SENT_START to SENT_START for Matcher (#5080) --- spacy/matcher/_schemas.py | 4 ++++ spacy/matcher/matcher.pyx | 2 ++ spacy/tests/matcher/test_pattern_validation.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 1b10f0dd5..4ef7ae49a 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -170,6 +170,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Token is the first in a sentence", "$ref": "#/definitions/boolean_value", }, + "SENT_START": { + "title": "Token is the first in a sentence", + "$ref": "#/definitions/boolean_value", + }, "LIKE_NUM": { "title": "Token resembles a number", "$ref": "#/definitions/boolean_value", diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 30ef3dd36..11461afb8 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -670,6 +670,8 @@ def _get_attr_values(spec, string_store): continue if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) attr = IDS.get(attr) diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 2db2f9eb3..c536698d0 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -34,6 +34,8 @@ TEST_PATTERNS = [ ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), + ([{"IS_SENT_START": True}], 0, 0), + ([{"SENT_START": True}], 0, 0), ] XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] From d078b47c81acdce5ece828f2f7d6e193bb3840ce Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 12:29:05 +0100 Subject: [PATCH 25/41] Break out of infinite loop as intended (#5077) --- spacy/tokens/doc.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 11f1ddf5f..5997be804 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1175,6 +1175,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) if loop_count > 10: user_warning(Warnings.W026) + break loop_count += 1 # Set sentence starts for i in range(length): From d307e9ca58c84dc24e6717fccafe7b55c604ee7c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 Mar 2020 13:58:22 +0100 Subject: [PATCH 26/41] take care of global vectors in multiprocessing (#5081) * restore load_nlp.VECTORS in the child process * add unit test * fix test * remove unnecessary import * add utf8 encoding * import unicode_literals --- spacy/_ml.py | 3 +-- spacy/language.py | 9 ++++++-- spacy/tests/regression/test_issue4725.py | 26 ++++++++++++++++++++++++ spacy/tests/regression/test_issue4849.py | 1 - spacy/tests/regression/test_issue4903.py | 2 -- 5 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 spacy/tests/regression/test_issue4725.py diff --git a/spacy/_ml.py b/spacy/_ml.py index 8695a88cc..fb7d39255 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -296,8 +296,7 @@ def link_vectors_to_models(vocab): key = (ops.device, vectors.name) if key in thinc.extra.load_nlp.VECTORS: if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - # This is a hack to avoid the problem in #3853. 
Maybe we should - # print a warning as well? + # This is a hack to avoid the problem in #3853. old_name = vectors.name new_name = vectors.name + "_%d" % data.shape[0] user_warning(Warnings.W019.format(old=old_name, new=new_name)) diff --git a/spacy/language.py b/spacy/language.py index 16aa4967e..28fddfebb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,6 +3,9 @@ from __future__ import absolute_import, unicode_literals import random import itertools + +from thinc.extra import load_nlp + from spacy.util import minibatch import weakref import functools @@ -856,7 +859,7 @@ class Language(object): procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] @@ -1112,7 +1115,7 @@ def _pipe(docs, proc, kwargs): yield doc -def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): +def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors): """Worker for Language.pipe receiver (multiprocessing.Connection): Pipe to receive text. Usually @@ -1120,8 +1123,10 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()` underscore_state (tuple): The data in the Underscore class of the parent + vectors (dict): The global vectors data, copied from the parent """ Underscore.load_state(underscore_state) + load_nlp.VECTORS = vectors while True: texts = receiver.get() docs = (make_doc(text) for text in texts) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py new file mode 100644 index 000000000..f80f19852 --- /dev/null +++ b/spacy/tests/regression/test_issue4725.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy + +from spacy.lang.en import English +from spacy.vocab import Vocab + + +def test_issue4725(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 85d03fe9a..834219773 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from spacy.lang.en import English from spacy.pipeline import EntityRuler -from spacy.tokens.underscore import Underscore def test_issue4849(): diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 9a3c10d61..d467b1cd6 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -1,10 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -import spacy from spacy.lang.en import English from spacy.tokens import Span, Doc -from spacy.tokens.underscore import Underscore class CustomPipe: From a0998868ffe6d0d8d1a610374f537a4f41eda83e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 Mar 2020 13:58:56 +0100 Subject: 
[PATCH 27/41] prevent updating cfg if the Model was already defined (#5078) --- spacy/syntax/nn_parser.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 153ca67cd..d5c6bf2a8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -606,7 +606,6 @@ cdef class Parser: if not hasattr(get_gold_tuples, '__call__'): gold_tuples = get_gold_tuples get_gold_tuples = lambda: gold_tuples - cfg.setdefault('min_action_freq', 30) actions = self.moves.get_actions(gold_parses=get_gold_tuples(), min_freq=cfg.get('min_action_freq', 30), learn_tokens=self.cfg.get("learn_tokens", False)) @@ -616,8 +615,9 @@ cdef class Parser: if label not in actions[action]: actions[action][label] = freq self.moves.initialize_actions(actions) - cfg.setdefault('token_vector_width', 96) if self.model is True: + cfg.setdefault('min_action_freq', 30) + cfg.setdefault('token_vector_width', 96) self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() @@ -633,11 +633,11 @@ cdef class Parser: if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) + self.cfg.update(cfg) else: if sgd is None: sgd = self.create_optimizer() self.model.begin_training([]) - self.cfg.update(cfg) return sgd def to_disk(self, path, exclude=tuple(), **kwargs): From 8c20dae6f7b1d5ac056402e0057269ce80dba0fa Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 21:43:25 +0100 Subject: [PATCH 28/41] Fix model-final/model-best meta from train CLI (#5093) * Fix model-final/model-best meta * include speed and accuracy from final iteration * combine with speeds from base model if necessary * Include token_acc metric for all components --- spacy/cli/train.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 968a009f6..59b0f2225 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -554,7 +554,30 @@ def train( with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) - final_meta = srsly.read_json(output_path / "model-final" / "meta.json") + meta_loc = output_path / "model-final" / "meta.json" + final_meta = srsly.read_json(meta_loc) + final_meta.setdefault("accuracy", {}) + final_meta["accuracy"].update(meta.get("accuracy", {})) + final_meta.setdefault("speed", {}) + final_meta["speed"].setdefault("cpu", None) + final_meta["speed"].setdefault("gpu", None) + # combine cpu and gpu speeds with the base model speeds + if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: + speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) + final_meta["speed"]["cpu"] = speed + if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: + speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) + final_meta["speed"]["gpu"] = speed + # if there were no speeds to update, overwrite with meta + if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None: + final_meta["speed"].update(meta["speed"]) + # note: beam speeds are not combined with the base model + if has_beam_widths: + final_meta.setdefault("beam_accuracy", {}) + final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {})) + final_meta.setdefault("beam_speed", {}) + final_meta["beam_speed"].update(meta.get("beam_speed", {})) + srsly.write_json(meta_loc, final_meta) 
msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(final_meta, output_path, best_pipes) @@ -649,11 +672,11 @@ def _get_metrics(component): if component == "parser": return ("las", "uas", "las_per_type", "token_acc") elif component == "tagger": - return ("tags_acc",) + return ("tags_acc", "token_acc") elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "ents_per_type") + return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") elif component == "textcat": - return ("textcat_score",) + return ("textcat_score", "token_acc") return ("token_acc",) @@ -709,3 +732,12 @@ def _get_progress( if beam_width is not None: result.insert(1, beam_width) return result + + +def _get_total_speed(speeds): + seconds_per_word = 0.0 + for words_per_second in speeds: + if words_per_second is None: + return None + seconds_per_word += 1.0 / words_per_second + return 1.0 / seconds_per_word From 9be90dbca3a75ebbaa85ec14dd02fe3ab87291be Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 21:44:51 +0100 Subject: [PATCH 29/41] Improve token head verification (#5079) * Improve token head verification Improve the verification for valid token heads when heads are set: * in `Token.head`: heads come from the same document * in `Doc.from_array()`: head indices are within the bounds of the document * Improve error message --- spacy/errors.py | 7 +++++++ spacy/tests/doc/test_array.py | 27 +++++++++++++++++++++++++++ spacy/tests/doc/test_token_api.py | 5 +++++ spacy/tokens/doc.pyx | 10 +++++++++- spacy/tokens/token.pyx | 3 +++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5957c5ecd..b43b8487f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -545,6 +545,13 @@ class Errors(object): "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") E189 = ("Each argument to `get_doc` should be of equal length.") + E190 = ("Token head out of range in `Doc.from_array()` for token index " + "'{index}' with value '{value}' (equivalent to relative head " + "index: '{rel_head_index}'). 
The head indices should be relative " + "to the current token index rather than absolute indices in the " + "array.") + E191 = ("Invalid head: the head token must be from the same doc as the " + "token itself.") @add_codes diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index aa0d37eca..1c0c79f6e 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -77,3 +77,30 @@ def test_doc_array_idx(en_vocab): assert offsets[0] == 0 assert offsets[1] == 3 assert offsets[2] == 11 + + +def test_doc_from_array_heads_in_bounds(en_vocab): + """Test that Doc.from_array doesn't set heads that are out of bounds.""" + words = ["This", "is", "a", "sentence", "."] + doc = Doc(en_vocab, words=words) + for token in doc: + token.head = doc[0] + + # correct + arr = doc.to_array(["HEAD"]) + doc_from_array = Doc(en_vocab, words=words) + doc_from_array.from_array(["HEAD"], arr) + + # head before start + arr = doc.to_array(["HEAD"]) + arr[0] = -1 + doc_from_array = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + doc_from_array.from_array(["HEAD"], arr) + + # head after end + arr = doc.to_array(["HEAD"]) + arr[0] = 5 + doc_from_array = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index b7522bb98..8c749b26d 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -167,6 +167,11 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 + # head token must be from the same document + doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + with pytest.raises(ValueError): + doc[0].head = doc2[0] + def test_is_sent_start(en_tokenizer): doc = en_tokenizer("This is a sentence. This is another.") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5997be804..0c90929c3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -790,7 +790,7 @@ cdef class Doc: if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) - cdef int i, col + cdef int i, col, abs_head_index cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) @@ -804,6 +804,14 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + # Check that all heads are within the document bounds + if HEAD in attrs: + col = attrs.index(HEAD) + for i in range(length): + # cast index to signed int + abs_head_index = numpy.int32(array[i, col]) + i + if abs_head_index < 0 or abs_head_index >= length: + raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col]))) # Do TAG first. 
This lets subsequent loop override stuff like POS, LEMMA if TAG in attrs: col = attrs.index(TAG) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8b15a4223..8019e3b4f 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -623,6 +623,9 @@ cdef class Token: # This function sets the head of self to new_head and updates the # counters for left/right dependents and left/right corner for the # new and the old head + # Check that token is from the same document + if self.doc != new_head.doc: + raise ValueError(Errors.E191) # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return From 03376c9d9bea0dd850bd2612521843f6c8f580ba Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Wed, 4 Mar 2020 11:58:56 +0500 Subject: [PATCH 30/41] Basque language added and tested. --- spacy/lang/eu/__init__.py | 30 +++++++++ spacy/lang/eu/examples.py | 16 +++++ spacy/lang/eu/lex_attrs.py | 80 +++++++++++++++++++++++ spacy/lang/eu/punctuation.py | 7 ++ spacy/lang/eu/stop_words.py | 108 +++++++++++++++++++++++++++++++ spacy/lang/eu/tag_map.py | 71 ++++++++++++++++++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/eu/test_text.py | 16 +++++ 8 files changed, 333 insertions(+) create mode 100644 spacy/lang/eu/__init__.py create mode 100644 spacy/lang/eu/examples.py create mode 100644 spacy/lang/eu/lex_attrs.py create mode 100644 spacy/lang/eu/punctuation.py create mode 100644 spacy/lang/eu/stop_words.py create mode 100644 spacy/lang/eu/tag_map.py create mode 100644 spacy/tests/lang/eu/test_text.py diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py new file mode 100644 index 000000000..4f3338c1d --- /dev/null +++ b/spacy/lang/eu/__init__.py @@ -0,0 +1,30 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_SUFFIXES +from .tag_map import TAG_MAP + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG + + +class BasqueDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "eu" + + tokenizer_exceptions = BASE_EXCEPTIONS + tag_map = TAG_MAP + stop_words = STOP_WORDS + suffixes = TOKENIZER_SUFFIXES + + +class Basque(Language): + lang = "eu" + Defaults = BasqueDefaults + + +__all__ = ["Basque"] diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py new file mode 100644 index 000000000..ec9f0dd06 --- /dev/null +++ b/spacy/lang/eu/examples.py @@ -0,0 +1,16 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
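
A minimal usage sketch for the Basque class added above (a sketch only; the
example sentence and its token count mirror spacy/tests/lang/eu/test_text.py):

>>> import spacy
>>> nlp = spacy.blank("eu")
>>> [t.text for t in nlp("ta nere guitarra estrenatu ondoren")]
['ta', 'nere', 'guitarra', 'estrenatu', 'ondoren']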
+ +>>> from spacy.lang.eu.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "", + "" +] diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py new file mode 100644 index 000000000..c11e913db --- /dev/null +++ b/spacy/lang/eu/lex_attrs.py @@ -0,0 +1,80 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +# Source http://mylanguages.org/basque_numbers.php + + +_num_words = """ +bat +bi +hiru +lau +bost +sei +zazpi +zortzi +bederatzi +hamar +hamaika +hamabi +hamahiru +hamalau +hamabost +hamasei +hamazazpi +Hemezortzi +hemeretzi +hogei +ehun +mila +milioi +""".split() + +# source https://www.google.com/intl/ur/inputtools/try/ + +_ordinal_words = """ +lehen +bigarren +hirugarren +laugarren +bosgarren +seigarren +zazpigarren +zortzigarren +bederatzigarren +hamargarren +hamaikagarren +hamabigarren +hamahirugarren +hamalaugarren +hamabosgarren +hamaseigarren +hamazazpigarren +hamazortzigarren +hemeretzigarren +hogeigarren +behin +""".split() + + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py new file mode 100644 index 000000000..b8b1a1c83 --- /dev/null +++ b/spacy/lang/eu/punctuation.py @@ -0,0 +1,7 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_SUFFIXES + + +_suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py new file mode 100644 index 000000000..208238961 --- /dev/null +++ b/spacy/lang/eu/stop_words.py @@ -0,0 +1,108 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# Source: https://github.com/stopwords-iso/stopwords-eu +# https://www.ranks.nl/stopwords/basque +# https://www.mustgo.com/worldlanguages/basque/ +STOP_WORDS = set( +""" +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten +""".split() +) diff --git a/spacy/lang/eu/tag_map.py b/spacy/lang/eu/tag_map.py new file mode 100644 index 000000000..2499d7e3e --- /dev/null +++ b/spacy/lang/eu/tag_map.py @@ -0,0 +1,71 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON + +TAG_MAP = { + ".": {POS: PUNCT, "PunctType": "peri"}, + ",": {POS: PUNCT, "PunctType": "comm"}, + "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, + "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, + "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": 
"ini"}, + '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + ":": {POS: PUNCT}, + "$": {POS: SYM, "Other": {"SymType": "currency"}}, + "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "AFX": {POS: ADJ, "Hyph": "yes"}, + "CC": {POS: CCONJ, "ConjType": "coor"}, + "CD": {POS: NUM, "NumType": "card"}, + "DT": {POS: DET}, + "EX": {POS: ADV, "AdvType": "ex"}, + "FW": {POS: X, "Foreign": "yes"}, + "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "IN": {POS: ADP}, + "JJ": {POS: ADJ, "Degree": "pos"}, + "JJR": {POS: ADJ, "Degree": "comp"}, + "JJS": {POS: ADJ, "Degree": "sup"}, + "LS": {POS: PUNCT, "NumType": "ord"}, + "MD": {POS: VERB, "VerbType": "mod"}, + "NIL": {POS: ""}, + "NN": {POS: NOUN, "Number": "sing"}, + "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, + "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, + "NNS": {POS: NOUN, "Number": "plur"}, + "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, + "POS": {POS: PART, "Poss": "yes"}, + "PRP": {POS: PRON, "PronType": "prs"}, + "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, + "RB": {POS: ADV, "Degree": "pos"}, + "RBR": {POS: ADV, "Degree": "comp"}, + "RBS": {POS: ADV, "Degree": "sup"}, + "RP": {POS: PART}, + "SP": {POS: SPACE}, + "SYM": {POS: SYM}, + "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "UH": {POS: INTJ}, + "VB": {POS: VERB, "VerbForm": "inf"}, + "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, + "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, + "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, + "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, + "VBZ": { + POS: VERB, + "VerbForm": "fin", + "Tense": "pres", + "Number": "sing", + "Person": 3, + }, + "WDT": {POS: ADJ, "PronType": "int|rel"}, + "WP": {POS: NOUN, "PronType": "int|rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, + "WRB": {POS: ADV, "PronType": "int|rel"}, + "ADD": {POS: X}, + "NFP": {POS: PUNCT}, + "GW": {POS: X}, + "XX": {POS: X}, + "BES": {POS: VERB}, + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 816970e61..fc89c2658 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -83,6 +83,11 @@ def es_tokenizer(): return get_lang_class("es").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def eu_tokenizer(): + return get_lang_class("eu").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def fi_tokenizer(): return get_lang_class("fi").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py new file mode 100644 index 000000000..e73917ffa --- /dev/null +++ b/spacy/tests/lang/eu/test_text.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_eu_tokenizer_handles_long_text(eu_tokenizer): + text = """ta nere guitarra estrenatu ondoren""" + tokens = eu_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)]) +def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): + tokens = eu_tokenizer(text) + assert len(tokens) == length From 224a7f8e94721a7af10e366773ce2c012a5b8f62 Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Wed, 4 Mar 2020 15:49:06 +0500 Subject: [PATCH 31/41] examples --- spacy/lang/eu/examples.py | 6 ++---- 1 file 
changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py index ec9f0dd06..f2d325d78 100644 --- a/spacy/lang/eu/examples.py +++ b/spacy/lang/eu/examples.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. @@ -9,8 +8,7 @@ Example sentences to test spaCy and its language models. >>> docs = nlp.pipe(sentences) """ - sentences = [ - "", - "" + "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", + "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira" ] From 4d655b1d45577ceeb0113616f6cc7590568e5a2b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Mar 2020 13:50:37 +0100 Subject: [PATCH 32/41] Require srsly >=1.0.2 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e908e25f8..ec30efc16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ thinc==7.4.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 -srsly>=1.0.1,<1.1.0 +srsly>=1.0.2,<1.1.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index 1429c77ac..e44e32bb2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = thinc==7.4.0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 - srsly>=1.0.1,<1.1.0 + srsly>=1.0.2,<1.1.0 catalogue>=0.0.7,<1.1.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 From 3440a72ecb188850bf4b08244c2041ac0d8109a7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Mar 2020 19:28:16 +0100 Subject: [PATCH 33/41] Update Makefile (#5099) --- Makefile | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 1be1c9794..13c9026b7 100644 --- a/Makefile +++ b/Makefile @@ -1,36 +1,37 @@ SHELL := /bin/bash -WHEELHOUSE := ./wheelhouse PYVER := 3.6 VENV := ./env$(PYVER) -version = $(shell "bin/get-version.sh") +version := $(shell "bin/get-version.sh") -dist/spacy-$(version).pex : wheelhouse/spacy-$(version)-*.whl - pex -f ./wheelhouse --no-index --disable-cache -m spacy -o dist/spacy-$(version).pex spacy==$(version) jsonschema - chmod a+rx dist/spacy-$(version).pex +dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema + chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o dist/pytest.pex pytest pytest-timeout mock - chmod a+rx dist/pytest.pex + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock + chmod a+rx $@ -wheelhouse/spacy-$(version)-%.whl : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* +wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex $(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse -$(VENV) : +$(VENV)/bin/pex : python$(PYVER) -m venv $(VENV) $(VENV)/bin/python -m pip install pex wheel .PHONY : clean test test : dist/spacy-$(version).pex dist/pytest.pex - PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x + ( . 
$(VENV)/bin/activate ; \ + PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x ; ) clean : setup.py - source env3.6/bin/activate rm -rf dist/* rm -rf ./wheelhouse + rm -rf $(VENV) python setup.py clean --all From 80004930ed098ec5b6bf9ecd081b96b1e7e7080f Mon Sep 17 00:00:00 2001 From: David Pollack Date: Thu, 5 Mar 2020 15:48:41 +0100 Subject: [PATCH 34/41] fix typo in svg file --- .github/contributors/dhpollack.md | 106 +++++++++++++++++++++++++++ website/src/images/logos/allenai.svg | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/dhpollack.md diff --git a/.github/contributors/dhpollack.md b/.github/contributors/dhpollack.md new file mode 100644 index 000000000..444d97d42 --- /dev/null +++ b/.github/contributors/dhpollack.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | David Pollack | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | Mar 5. 
2020 | +| GitHub username | dhpollack | +| Website (optional) | | diff --git a/website/src/images/logos/allenai.svg b/website/src/images/logos/allenai.svg index 2879bef60..c00569bf8 100644 --- a/website/src/images/logos/allenai.svg +++ b/website/src/images/logos/allenai.svg @@ -1,6 +1,6 @@ - + From 1a2b8fc264efdc384c5497b97ee4b1f55675a3ec Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 6 Mar 2020 14:45:28 +0100 Subject: [PATCH 35/41] set vector of merged entity (#5085) * merge_entities sets the vector in the vocab for the merged token * add unit test * import unicode_literals * move code to _merge function * only set vector if vocab has non-zero vectors --- spacy/tests/regression/test_issue5082.py | 46 ++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 4 +++ 2 files changed, 50 insertions(+) create mode 100644 spacy/tests/regression/test_issue5082.py diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py new file mode 100644 index 000000000..efa5d39f2 --- /dev/null +++ b/spacy/tests/regression/test_issue5082.py @@ -0,0 +1,46 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy as np +from spacy.lang.en import English +from spacy.pipeline import EntityRuler + + +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32) + array2 = np.asarray([-0.2, -0.6, -0.9], dtype=np.float32) + array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32) + array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32) + array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32) + + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + + text = "I like David Bowie" + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_1) == 4 + np.testing.assert_array_equal(parsed_vectors_1[0], array1) + np.testing.assert_array_equal(parsed_vectors_1[1], array2) + np.testing.assert_array_equal(parsed_vectors_1[2], array3) + np.testing.assert_array_equal(parsed_vectors_1[3], array4) + + merge_ents = nlp.create_pipe("merge_entities") + nlp.add_pipe(merge_ents) + + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + np.testing.assert_array_equal(parsed_vectors_2[0], array1) + np.testing.assert_array_equal(parsed_vectors_2[1], array2) + np.testing.assert_array_equal(parsed_vectors_2[2], array34) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a5d06491a..512ad73bc 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -213,6 +213,10 @@ def _merge(Doc doc, merges): new_orth = ''.join([t.text_with_ws for t in spans[token_index]]) if spans[token_index][-1].whitespace_: new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)] + # add the vector of the (merged) entity to the vocab + if not doc.vocab.get_vector(new_orth).any(): + if doc.vocab.vectors_length > 0: + doc.vocab.set_vector(new_orth, span.vector) token = tokens[token_index] lex = doc.vocab.get(doc.mem, new_orth) token.lex = lex From 993758c58fba9d4611223f5dd6dcdb203cf67bba Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 8 Mar 2020 13:22:25 +0100 Subject: [PATCH 
36/41] Remove unnecessary iterator in Language.pipe (#5101)

Remove iterator over `raw_texts` with `itertools.tee()` in `Language.pipe`
that is never consumed and consumes memory unnecessarily.
---
 spacy/language.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 28fddfebb..f0928b1f9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -757,8 +757,6 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#pipe
         """
-        # raw_texts will be used later to stop iterator.
-        texts, raw_texts = itertools.tee(texts)
         if is_python2 and n_process != 1:
             user_warning(Warnings.W023)
             n_process = 1

From 9dd98a4b2759f5231fcc3b2a09d16f27b79ab13b Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sun, 8 Mar 2020 13:24:19 +0100
Subject: [PATCH 37/41] Improve Makefile (#5105)

* Explicitly upgrade pip

* Include spacy-lookups-data in pex
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 13c9026b7..cf96d6294 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")
 
 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
 	chmod a+rx $@
 
 dist/pytest.pex : wheelhouse/pytest-*.whl
@@ -22,7 +22,7 @@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex
 
 $(VENV)/bin/pex :
 	python$(PYVER) -m venv $(VENV)
-	$(VENV)/bin/python -m pip install pex wheel
+	$(VENV)/bin/pip install -U pip setuptools pex wheel
 
 .PHONY : clean test

From 31755630a7b33bc9c621c1e82cc0c09da84720d4 Mon Sep 17 00:00:00 2001
From: Yohei Tamura
Date: Sun, 8 Mar 2020 21:24:38 +0900
Subject: [PATCH 38/41] fix typo (#5106)

---
 bin/wiki_entity_linking/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md
index 56d0c1415..4e4af5c21 100644
--- a/bin/wiki_entity_linking/README.md
+++ b/bin/wiki_entity_linking/README.md
@@ -2,7 +2,7 @@
 
 ### Step 1: Create a Knowledge Base (KB) and training data
 
-Run `wikipedia_pretrain_kb.py`
+Run `wikidata_pretrain_kb.py`
 * This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
 * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
 * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)

From 0345135167c882575e006bf434c9f8d8d81f9e12 Mon Sep 17 00:00:00 2001
From: Mark Abraham
Date: Sun, 8 Mar 2020 13:25:56 +0100
Subject: [PATCH 39/41] Tokenizer to_disk and from_disk now ensure paths (#5116)

* Tokenizer to_disk and from_disk now ensure strings are converted to paths

Fixes #5115

* Sign contributor agreement
---
 .github/contributors/mabraham.md | 89 ++++++++++++++++++++++++++++
 spacy/tokenizer.pyx | 2 +
 2 files changed, 91 insertions(+)
 create mode 100644 .github/contributors/mabraham.md

diff --git a/.github/contributors/mabraham.md b/.github/contributors/mabraham.md
new file mode 100644
index 000000000..c91c950a3
--- /dev/null
+++ b/.github/contributors/mabraham.md
@@ -0,0 +1,89 @@
+
+
+## Contributor Agreement
+
+1.
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | | +| GitHub username | | +| Website (optional) | | \ No newline at end of file diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 12c7b73af..4da081259 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -508,6 +508,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#to_disk """ + path = util.ensure_path(path) with path.open("wb") as file_: file_.write(self.to_bytes(**kwargs)) @@ -521,6 +522,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#from_disk """ + path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() self.from_bytes(bytes_data, **kwargs) From 5f680042647ef7d0c71a5041f33558bf81e656d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:05:00 +0100 Subject: [PATCH 40/41] Port over gitignore changes from develop Prevents stale files when switching branches --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 828258603..edcbba4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ corpora/ keys/ *.json.gz +# Tests +spacy/tests/package/setup.cfg +spacy/tests/package/pyproject.toml +spacy/tests/package/requirements.txt + # Website website/.cache/ website/public/ From 1d6aec805d5c03ad8a039466e98ed3a619e650c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:17:20 +0100 Subject: [PATCH 41/41] Fix formatting and update docs for v2.2.4 --- spacy/cli/debug_data.py | 25 ++++++++++++++++--------- website/docs/api/cli.md | 30 ++++++++++++++++++++---------- website/docs/api/doc.md | 22 ++++++++++++---------- website/docs/api/span.md | 30 ++++++++++++++++++++++++++---- website/docs/api/top-level.md | 32 ++++++++++++++++---------------- website/meta/languages.json | 2 ++ 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 0e12a594c..c5e1ff6cf 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000 @plac.annotations( + # fmt: off lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), + pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), + # fmt: on ) def debug_data( lang, @@ -235,13 +232,17 @@ def debug_data( if gold_train_data["ws_ents"]: msg.fail( - "{} invalid whitespace entity 
span(s)".format(gold_train_data["ws_ents"]) + "{} invalid whitespace entity span(s)".format( + gold_train_data["ws_ents"] + ) ) has_ws_ents_error = True if gold_train_data["punct_ents"]: msg.warn( - "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) + "{} entity span(s) with punctuation".format( + gold_train_data["punct_ents"] + ) ) has_punct_ents_warning = True @@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ + ".", + "'", + "!", + "?", + ",", + ]: # punctuation entity: could be replaced by whitespace when training with noise, # so add a warning to alert the user to this unexpected side effect. data["punct_ents"] += 1 diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 2f7346491..e47695efb 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -184,16 +184,17 @@ low data labels and more. $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | @@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. 
| | `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--replace-components`, `-R` | flag | Replace components from the base model. | | `--vectors`, `-v` | option | Model to load vectors from. | | `--n-iter`, `-n` | option | Number of iterations (default: `30`). | | `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | @@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | | `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | +| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | +| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | +| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | +| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | +| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | +| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | +| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | @@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). | | `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | | `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 4f948e425..87b854a8c 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to -compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs. -The Python-level `Token` and [`Span`](/api/span) objects are views of this -array, i.e. they don't own the data themselves. +compressed binary strings. The `Doc` object holds an array of +[`TokenC`](/api/cython-structs#tokenc) structs. 
The Python-level `Token` and +[`Span`](/api/span) objects are views of this array, i.e. they don't own the +data themselves. ## Doc.\_\_init\_\_ {#init tag="method"} @@ -197,13 +198,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| `start` | int | The index of the first character of the span. | -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Type | Description | +| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 64b77b89d..3833bbca9 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -172,6 +172,28 @@ Remove a previously registered extension. | `name` | unicode | Name of the extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +## Span.char_span {#char_span tag="method" new="2.2.4"} + +Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if +the character indices don't map to a valid span. + +> #### Example +> +> ```python +> doc = nlp("I like New York") +> span = doc[1:4].char_span(5, 13, label="GPE") +> assert span.text == "New York" +> ``` + +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | + ## Span.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Type | Description | -| ----------------- | ----- | ---------------------------------------------------- | -| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | -| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. 
| +| Name | Type | Description | +| ---------------- | ----- | ---------------------------------------------------- | +| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | +| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | ## Span.root {#root tag="property" model="parser"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 266df87f0..217c51794 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | unicode | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. 
| `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} diff --git a/website/meta/languages.json b/website/meta/languages.json index c22ddad69..8834aaddc 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -95,6 +95,8 @@ "has_examples": true }, { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
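
Taken together, the v2.2.4 docs changes above describe two small user-facing
additions. A minimal usage sketch (a sketch only; it assumes an installed
`en_core_web_sm` model, and any pipeline with a parser works for the displacy
part):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like New York")

# Span.char_span (new in v2.2.4): character offsets are relative to the
# span's own text, mirroring Doc.char_span ("like New York"[5:13]).
span = doc[1:4].char_span(5, 13, label="GPE")
assert span.text == "New York"

# displacy's add_lemma option (new in v2.2.4) prints the lemmas in a
# separate row below the token texts in the dependency visualizer.
html = displacy.render(doc, style="dep", options={"add_lemma": True})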