From 39aabf50ab23f4cadef5d5b459436a988f9fe677 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 11:54:48 +0200 Subject: [PATCH 01/29] Also rename to include_static_vectors in CharEmbed --- spacy/ml/models/tok2vec.py | 6 +++--- spacy/pipeline/morphologizer.pyx | 2 +- spacy/tests/pipeline/test_tok2vec.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 23cfe883b..6ef7b2325 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -177,7 +177,7 @@ def CharacterEmbed( rows: int, nM: int, nC: int, - also_use_static_vectors: bool, + include_static_vectors: bool, feature: Union[int, str] = "LOWER", ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedded representation based on character embeddings, using @@ -204,13 +204,13 @@ def CharacterEmbed( nC (int): The number of UTF-8 bytes to embed per word. Recommended values are between 3 and 8, although it may depend on the length of words in the language. - also_use_static_vectors (bool): Whether to also use static word vectors. + include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ feature = intify_attr(feature) if feature is None: raise ValueError(Errors.E911(feat=feature)) - if also_use_static_vectors: + if include_static_vectors: model = chain( concatenate( chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a456b7a0f..00188a762 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -32,7 +32,7 @@ width = 128 rows = 7000 nM = 64 nC = 8 -also_use_static_vectors = false +include_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 90882ae3f..ec4ed17dd 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): [ (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on From 18dfb279850adb00c3b3efa18bbb6d58c17bc453 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:05:33 +0200 Subject: [PATCH 02/29] Add custom error when evaluation throws a KeyError --- spacy/errors.py | 3 +++ spacy/training/loop.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 2bc2f3e20..06653edcf 100644 --- a/spacy/errors.py +++ 
b/spacy/errors.py @@ -456,6 +456,9 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " + "frozen components, make sure they were already trained and initialized. " + "You can also consider moving them to the 'disabled' list instead.") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. To " diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 242113cc6..8e688a27d 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -249,7 +249,10 @@ def create_evaluation_callback( def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) - scores = nlp.evaluate(dev_examples) + try: + scores = nlp.evaluate(dev_examples) + except KeyError as e: + raise KeyError(Errors.E900) from e # Calculate a weighted sum based on score_weights for the main score. # We can only consider scores that are ints/floats, not dicts like # entity scores per type etc. From 8316bc7d4a6dbd989d53f97a8c7a06758c8d356c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:06:20 +0200 Subject: [PATCH 03/29] bugfix DisabledPipes --- spacy/language.py | 3 +++ spacy/tests/pipeline/test_pipe_methods.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 1fb559657..24e593043 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1034,6 +1034,9 @@ class Language: ) ) disable = to_disable + # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude + # those pipes that were already disabled. 
+ disable = [d for d in disable if d not in self._disabled] return DisabledPipes(self, disable) def make_doc(self, text: str) -> Doc: diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index c693a7487..cd18b0159 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name): @pytest.mark.parametrize("name", ["my_component"]) def test_disable_pipes_context(nlp, name): + """Test that an enabled component stays enabled after running the context manager.""" nlp.add_pipe("new_pipe", name=name) assert nlp.has_pipe(name) with nlp.select_pipes(disable=name): @@ -136,6 +137,19 @@ def test_disable_pipes_context(nlp, name): assert nlp.has_pipe(name) +@pytest.mark.parametrize("name", ["my_component"]) +def test_disable_pipes_context_restore(nlp, name): + """Test that a disabled component stays disabled after running the context manager.""" + nlp.add_pipe("new_pipe", name=name) + assert nlp.has_pipe(name) + nlp.disable_pipes(name) + assert not nlp.has_pipe(name) + with nlp.select_pipes(disable=name): + assert not nlp.has_pipe(name) + assert not nlp.has_pipe(name) + + + def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe("new_pipe", name=name) From 2cafba5f50d83a93582bddea6bd1f569f98207f7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:17:35 +0200 Subject: [PATCH 04/29] shorten error message for clarity --- spacy/errors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 06653edcf..3ab9661e0 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -457,8 +457,7 @@ class Errors: # TODO: fix numbering after merging develop into master E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " - "frozen components, make sure they were already trained and initialized. " - "You can also consider moving them to the 'disabled' list instead.") + "frozen components, make sure they were already trained and initialized. ") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. To " From 06b9d213fd91397896a24dcf5fa4f90950570e9d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:19:47 +0200 Subject: [PATCH 05/29] formatting --- spacy/errors.py | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 3ab9661e0..0932ba0fd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -457,7 +457,7 @@ class Errors: # TODO: fix numbering after merging develop into master E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " - "frozen components, make sure they were already trained and initialized. ") + "frozen components, make sure they were already initialized and trained. ") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. 
To " diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index cd18b0159..b744aed98 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -149,7 +149,6 @@ def test_disable_pipes_context_restore(nlp, name): assert not nlp.has_pipe(name) - def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe("new_pipe", name=name) From 853edace37af044e21b0631d8d35ede18d16a482 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 14:11:06 +0200 Subject: [PATCH 06/29] fix MultiHashEmbed example in documentation --- spacy/ml/models/tok2vec.py | 2 +- website/docs/usage/embeddings-transformers.md | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 23cfe883b..1a78cf75e 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -110,7 +110,7 @@ def MultiHashEmbed( The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into - account some subword information, without construction a fully character-based + account some subword information, without constructing a fully character-based representation. If pretrained vectors are available, they can be included in the representation as well, with the vectors table will be kept static (i.e. it's not updated). diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 73540b3d3..856685dad 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -516,16 +516,14 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is -also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) -layer, which builds the default token-to-vector encoding architecture. +word vector tables using the `include_static_vectors` flag. ```ini [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [7000, 3500, 3500, 3500] also_use_static_vectors = true ``` From 2dd79454af73cb07d07ac1b9ad12644736e96bd5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:07 +0200 Subject: [PATCH 07/29] Update docs --- website/docs/usage/embeddings-transformers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 549c3bcc4..942fc4e7b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -514,7 +514,7 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is +word vector tables using the `include_static_vectors` flag. 
This setting is also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) layer, which builds the default token-to-vector encoding architecture. @@ -522,9 +522,9 @@ layer, which builds the default token-to-vector encoding architecture. [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true +attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = true ``` From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:51 +0200 Subject: [PATCH 08/29] Remove Span._recalculate_indices Remove `Span._recalculate_indices`, which is a remnant from the deprecated `Span.merge`. --- spacy/tests/doc/test_doc_api.py | 9 +++------ spacy/tests/doc/test_retokenize_merge.py | 1 + spacy/tokens/span.pxd | 1 - spacy/tokens/span.pyx | 17 ----------------- 4 files changed, 4 insertions(+), 24 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ea832c136..db8a6d1c4 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -608,14 +608,11 @@ def test_doc_init_iob(): doc = Doc(Vocab(), words=words, ents=ents) -@pytest.mark.xfail -def test_doc_set_ents_spans(en_tokenizer): +def test_doc_set_ents_invalid_spans(en_tokenizer): doc = en_tokenizer("Some text about Colombia and the Czech Republic") spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")] with doc.retokenize() as retokenizer: for span in spans: retokenizer.merge(span) - # If this line is uncommented, it works: - # print(spans) - doc.ents = spans - assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"] + with pytest.raises(IndexError): + doc.ents = spans diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index cb886545a..b483255c8 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): attrs = {"lemma": "none", "ent_type": "none"} retokenizer.merge(doc[0:2], attrs=attrs) retokenizer.merge(doc[-2:], attrs=attrs) + sent1, sent2 = list(doc.sents) assert len(sent1) == init_len - 1 assert len(sent2) == init_len2 - 1 diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index f6f88a23e..cc6b908bb 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -16,5 +16,4 @@ cdef class Span: cdef public _vector cdef public _vector_norm - cpdef int _recalculate_indices(self) except -1 cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 64c3c7df0..491ba0266 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -150,7 +150,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#len """ - self._recalculate_indices() if self.end < self.start: return 0 return self.end - self.start @@ -167,7 +166,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#getitem """ - self._recalculate_indices() if isinstance(i, slice): start, end = normalize_slice(len(self), i.start, i.stop, i.step) return Span(self.doc, start + self.start, end + self.start) @@ -188,7 +186,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#iter """ - self._recalculate_indices() for i in range(self.start, self.end): yield self.doc[i] @@ -339,19 +336,6 @@ cdef class Span: output[i-self.start, j] = 
get_token_attr(&self.doc.c[i], feature) return output - cpdef int _recalculate_indices(self) except -1: - if self.end > self.doc.length \ - or self.doc.c[self.start].idx != self.start_char \ - or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: - start = token_by_start(self.doc.c, self.doc.length, self.start_char) - if self.start == -1: - raise IndexError(Errors.E036.format(start=self.start_char)) - end = token_by_end(self.doc.c, self.doc.length, self.end_char) - if end == -1: - raise IndexError(Errors.E037.format(end=self.end_char)) - self.start = start - self.end = end + 1 - @property def vocab(self): """RETURNS (Vocab): The Span's Doc's vocab.""" @@ -520,7 +504,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#root """ - self._recalculate_indices() if "root" in self.doc.user_span_hooks: return self.doc.user_span_hooks["root"](self) # This should probably be called 'head', and the other one called From 040c7c054125d32da2af9c73f604b811e6ae0d97 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 15:40:58 +0200 Subject: [PATCH 09/29] fix get_dim calls in build_simple_cnn_text_classifier --- spacy/ml/models/textcat.py | 4 ++-- spacy/util.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 1117b4fde..ec8998e2d 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier( """ with Model.define_operators({">>": chain}): if exclusive_classes: - output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO")) + output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer model.set_ref("output_layer", output_layer) else: - linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) + linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) model = ( tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() ) diff --git a/spacy/util.py b/spacy/util.py index 3d567a425..47fbcce1c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: if not path.parent.exists(): raise IOError(Errors.E052.format(path=path.parent)) if not path.exists() or not path.is_file(): - raise IOError(Errors.E053.format(path=path, name="meta.json")) + raise IOError(Errors.E053.format(path=path.parent, name="meta.json")) meta = srsly.read_json(path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: From e972ecba727a35d59080dc0e217faa02044abb4e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 16:03:14 +0200 Subject: [PATCH 10/29] add utf8 encoding for opening file --- spacy/cli/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e4559929e..8413c639b 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path): if converter == "auto": converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": - with input_path.open() as file_: + with input_path.open(encoding="utf8") as file_: input_data = file_.read() converter_autodetect = autodetect_ner_format(input_data) if converter_autodetect == "ner": From 97ff090e495208a5944561e210c76ef77e93eab3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 16:03:57 +0200 Subject: [PATCH 11/29] Fix docs example [ci skip] --- 
website/docs/usage/processing-pipelines.md | 54 +++++++++-------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index fdae6d3e5..83134962b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1403,9 +1403,9 @@ especially useful if you want to pass in a string instead of calling
 
 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.
 
 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)
 
     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + spans return doc # don't forget to return the Doc! - def has_country(self, tokens): - """Getter for Doc and Span attributes. Since the getter is only called - when we access the attribute, we can refer to the Token's 'is_country' + def has_country(self, doc): + """Getter for Doc attributes. Since the getter is only called + when we access the attribute, we can refer to the Span's 'is_country' attribute here, which is already set in the processing step.""" - return any([t._.get("is_country") for t in tokens]) + return any([entity._.get("is_country") for entity in doc.ents]) nlp = English() nlp.add_pipe("rest_countries", config={"label": "GPE"}) doc = nlp("Some text about Colombia and the Czech Republic") print("Pipeline", nlp.pipe_names) # pipeline contains component name print("Doc has countries", doc._.has_country) # Doc contains countries -for token in doc: - if token._.is_country: - print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag) -print("Entities", [(e.text, e.label_) for e in doc.ents]) +for ent in doc.ents: + if ent._.is_country: + print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag) ``` In this case, all data can be fetched on initialization in one request. However, From 8ac5f222531dcb602d08118693618598bc0c045d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 18:00:16 +0200 Subject: [PATCH 12/29] Adjust error message --- spacy/errors.py | 5 +++-- spacy/training/loop.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0932ba0fd..be327a784 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,8 +456,9 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master - E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " - "frozen components, make sure they were already initialized and trained. ") + E900 = ("Could not run the full pipeline for evaluation. If you specified " + "frozen components, make sure they were already initialized and " + "trained. Full pipeline: {pipeline}") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. To " diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 8e688a27d..c3fa83b39 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -252,7 +252,7 @@ def create_evaluation_callback( try: scores = nlp.evaluate(dev_examples) except KeyError as e: - raise KeyError(Errors.E900) from e + raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e # Calculate a weighted sum based on score_weights for the main score. # We can only consider scores that are ints/floats, not dicts like # entity scores per type etc. 
From 525f7988416f9d944f5993a793f999f91e8685f8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 18:00:21 +0200 Subject: [PATCH 13/29] Fix typo in test --- spacy/tests/pipeline/test_pipe_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index b744aed98..6a21ddfaa 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -142,7 +142,7 @@ def test_disable_pipes_context_restore(nlp, name): """Test that a disabled component stays disabled after running the context manager.""" nlp.add_pipe("new_pipe", name=name) assert nlp.has_pipe(name) - nlp.disable_pipes(name) + nlp.disable_pipe(name) assert not nlp.has_pipe(name) with nlp.select_pipes(disable=name): assert not nlp.has_pipe(name) From 796f8b9424737b51da81d35fe33b8383f1d5bdf7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 18:00:27 +0200 Subject: [PATCH 14/29] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 095d726a0..763faa3eb 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a36" +__version__ = "3.0.0a37" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From bfa3931c9dc9f1ab960d81c985ddaf4bb4a4d023 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 10 Oct 2020 18:55:07 +0200 Subject: [PATCH 15/29] Revert added_strings change (#6236) --- spacy/about.py | 2 +- spacy/errors.py | 8 ++++ spacy/kb.pxd | 1 - spacy/kb.pyx | 15 ++---- spacy/pipeline/attributeruler.py | 17 +++---- spacy/pipeline/entity_linker.py | 3 +- spacy/pipeline/lemmatizer.py | 4 ++ spacy/pipeline/morphologizer.pyx | 5 -- spacy/pipeline/senter.pyx | 1 - spacy/pipeline/tagger.pyx | 3 +- spacy/pipeline/textcat.py | 3 +- spacy/pipeline/tok2vec.py | 1 - spacy/pipeline/trainable_pipe.pxd | 1 - spacy/pipeline/trainable_pipe.pyx | 36 +++++++++------ spacy/pipeline/transition_parser.pyx | 27 ++++++----- spacy/tests/pipeline/test_entity_linker.py | 19 ++------ spacy/tests/pipeline/test_morphologizer.py | 1 - spacy/tests/pipeline/test_senter.py | 1 - spacy/tests/pipeline/test_tagger.py | 2 - spacy/tests/pipeline/test_textcat.py | 2 - spacy/tests/regression/test_issue5230.py | 4 +- .../serialize/test_serialize_pipeline.py | 46 ++++++++++++++++--- spacy/util.py | 2 +- 23 files changed, 110 insertions(+), 94 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 763faa3eb..efdfd26c0 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a37" +__version__ = "3.0.0a38" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/errors.py b/spacy/errors.py index be327a784..5fab0bab1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,6 +456,14 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " + "is not set or 
None. If you've implemented a custom component, make " + "sure to store the component model as `self.model` in your " + "component's __init__ method.") + E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute " + "is not set or None. If you've implemented a custom component, make " + "sure to store the current `nlp` object's vocab as `self.vocab` in " + "your component's __init__ method.") E900 = ("Could not run the full pipeline for evaluation. If you specified " "frozen components, make sure they were already initialized and " "trained. Full pipeline: {pipeline}") diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d61bd43fa..4a71b26a2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -30,7 +30,6 @@ cdef class KnowledgeBase: cdef Pool mem cpdef readonly Vocab vocab cdef int64_t entity_vector_length - cdef public set _added_strings # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _KBEntryC struct in the _entries vector). diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 478579d71..10aa377eb 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -92,7 +92,6 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.vocab = vocab self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) - self._added_strings = set() @property def entity_vector_length(self): @@ -114,16 +113,12 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_string(self, string: str): - self._added_strings.add(string) - return self.vocab.strings.add(string) - def add_entity(self, unicode entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. 
""" - cdef hash_t entity_hash = self.add_string(entity) + cdef hash_t entity_hash = self.vocab.strings.add(entity) # Return if this entity was added before if entity_hash in self._entry_index: @@ -157,7 +152,7 @@ cdef class KnowledgeBase: cdef hash_t entity_hash while i < len(entity_list): # only process this entity if its unique ID hadn't been added before - entity_hash = self.add_string(entity_list[i]) + entity_hash = self.vocab.strings.add(entity_list[i]) if entity_hash in self._entry_index: warnings.warn(Warnings.W018.format(entity=entity_list[i])) @@ -203,7 +198,7 @@ cdef class KnowledgeBase: if prob_sum > 1.00001: raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) - cdef hash_t alias_hash = self.add_string(alias) + cdef hash_t alias_hash = self.vocab.strings.add(alias) # Check whether this alias was added before if alias_hash in self._alias_index: @@ -332,7 +327,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E928.format(loc=path)) serialize = {} serialize["contents"] = lambda p: self.write_contents(p) - serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings) + serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): @@ -343,7 +338,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E928.format(loc=path)) deserialize = {} deserialize["contents"] = lambda p: self.read_contents(p) - deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)] + deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p) util.from_disk(path, deserialize, exclude) def write_contents(self, file_path): diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 7a6a1de5b..e17d3be98 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator +from typing import List, Dict, Union, Iterable, Any, Optional, Callable from typing import Tuple import srsly from pathlib import Path @@ -57,7 +57,6 @@ class AttributeRuler(Pipe): self.attrs = [] self._attrs_unnormed = [] # store for reference self.indices = [] - self._added_strings = set() def clear(self) -> None: """Reset all patterns.""" @@ -187,16 +186,12 @@ class AttributeRuler(Pipe): # We need to make a string here, because otherwise the ID we pass back # will be interpreted as the hash of a string, rather than an ordinal. key = str(len(self.attrs)) - self.matcher.add(self.add_string(key), patterns) + self.matcher.add(self.vocab.strings.add(key), patterns) self._attrs_unnormed.append(attrs) attrs = normalize_token_attrs(self.vocab, attrs) self.attrs.append(attrs) self.indices.append(index) - def add_string(self, string: str): - self._added_strings.add(string) - return self.vocab.strings.add(string) - def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None: """Add patterns from a list of pattern dicts with the keys as the arguments to AttributeRuler.add. 
@@ -256,8 +251,8 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes """ serialize = {} + serialize["vocab"] = self.vocab.to_bytes serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns) - serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings)) return util.to_bytes(serialize, exclude) def from_bytes( @@ -276,7 +271,7 @@ class AttributeRuler(Pipe): self.add_patterns(srsly.msgpack_loads(b)) deserialize = { - "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)], + "vocab": lambda b: self.vocab.from_bytes(b), "patterns": load_patterns, } util.from_bytes(bytes_data, deserialize, exclude) @@ -293,7 +288,7 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#to_disk """ serialize = { - "strings.json": lambda p: srsly.write_json(p, self._added_strings), + "vocab": lambda p: self.vocab.to_disk(p), "patterns": lambda p: srsly.write_msgpack(p, self.patterns), } util.to_disk(path, serialize, exclude) @@ -314,7 +309,7 @@ class AttributeRuler(Pipe): self.add_patterns(srsly.read_msgpack(p)) deserialize = { - "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)], + "vocab": lambda p: self.vocab.from_disk(p), "patterns": load_patterns, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 881e98785..3bb449b4d 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -453,6 +453,7 @@ class EntityLinker(TrainablePipe): DOCS: https://nightly.spacy.io/api/entitylinker#to_disk """ serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) @@ -481,8 +482,6 @@ class EntityLinker(TrainablePipe): deserialize["kb"] = lambda p: self.kb.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) - for s in self.kb._added_strings: - self.vocab.strings.add(s) return self def rehearse(self, examples, *, sgd=None, losses=None, **config): diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 7f5370753..9be596868 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -281,6 +281,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk """ serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["lookups"] = lambda p: self.lookups.to_disk(p) util.to_disk(path, serialize, exclude) @@ -296,6 +297,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk """ deserialize = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["lookups"] = lambda p: self.lookups.from_disk(p) util.from_disk(path, deserialize, exclude) self._validate_tables() @@ -310,6 +312,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes """ serialize = {} + serialize["vocab"] = self.vocab.to_bytes serialize["lookups"] = self.lookups.to_bytes return util.to_bytes(serialize, exclude) @@ -325,6 +328,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes """ deserialize = {} + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) util.from_bytes(bytes_data, deserialize, exclude) self._validate_tables() diff --git a/spacy/pipeline/morphologizer.pyx 
b/spacy/pipeline/morphologizer.pyx index 00188a762..ac111f28b 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -95,7 +95,6 @@ class Morphologizer(Tagger): # add mappings for empty morph self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] - self._added_strings = set() @property def labels(self): @@ -129,7 +128,6 @@ class Morphologizer(Tagger): label_dict.pop(self.POS_FEAT) # normalize morph string and add to morphology table norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)] - self.add_string(norm_morph) # add label mappings if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = norm_morph @@ -161,7 +159,6 @@ class Morphologizer(Tagger): if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] - self.add_string(norm_label) # add label->morph and label->POS mappings if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = morph @@ -179,7 +176,6 @@ class Morphologizer(Tagger): if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] - self.add_string(norm_label) gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels]) doc_sample.append(example.x) label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) @@ -238,7 +234,6 @@ class Morphologizer(Tagger): if pos: label_dict[self.POS_FEAT] = pos label = self.vocab.strings[self.vocab.morphology.add(label_dict)] - self.add_string(label) eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 8ea4ed1b3..15a21902a 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -61,7 +61,6 @@ class SentenceRecognizer(Tagger): self.name = name self._rehearsal_model = None self.cfg = {} - self._added_strings = set() @property def labels(self): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 535b71270..1b0f79cea 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -78,7 +78,6 @@ class Tagger(TrainablePipe): self._rehearsal_model = None cfg = {"labels": labels or []} self.cfg = dict(sorted(cfg.items())) - self._added_strings = set() @property def labels(self): @@ -313,7 +312,7 @@ class Tagger(TrainablePipe): return 0 self._allow_extra_label() self.cfg["labels"].append(label) - self.add_string(label) + self.vocab.strings.add(label) return 1 def score(self, examples, **kwargs): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e57954184..5ebe0e104 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -110,7 +110,6 @@ class TextCategorizer(TrainablePipe): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) - self._added_strings = set() @property def labels(self) -> Tuple[str]: @@ -301,7 +300,7 @@ class TextCategorizer(TrainablePipe): return 0 self._allow_extra_label() self.cfg["labels"].append(label) - self.add_string(label) + self.vocab.strings.add(label) return 1 def initialize( diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index b4625291b..0ad875035 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -64,7 +64,6 @@ class Tok2Vec(TrainablePipe): self.name = name self.listeners = [] self.cfg = {} - 
self._added_strings = set()
 
     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index 8df5cb775..d5cdbb511 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -5,4 +5,3 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
-    cdef public set _added_strings
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 07a308953..88e50e7c6 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -13,6 +13,7 @@ from ..vocab import Vocab
 from ..language import Language
 from ..training import Example
 
+
 cdef class TrainablePipe(Pipe):
     """This class is a base class and not instantiated directly. Trainable
     pipeline components like the EntityRecognizer or TextCategorizer inherit
@@ -35,7 +36,6 @@ cdef class TrainablePipe(Pipe):
         self.model = model
         self.name = name
         self.cfg = dict(cfg)
-        self._added_strings = set()
 
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
@@ -198,10 +198,6 @@ cdef class TrainablePipe(Pipe):
         """
        raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     @property
     def is_trainable(self) -> bool:
         return True
@@ -244,6 +240,16 @@ cdef class TrainablePipe(Pipe):
         """
         self.model.finish_update(sgd)
 
+    def _validate_serialization_attrs(self):
+        """Check that the pipe implements the required attributes. If a subclass
+        implements a custom __init__ method but doesn't set these attributes,
+        they currently default to None, so we need to perform additional checks.
+        """
+        if not hasattr(self, "vocab") or self.vocab is None:
+            raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
+        if not hasattr(self, "model") or self.model is None:
+            raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
+
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.
@@ -252,11 +258,12 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#to_bytes """ + self._validate_serialization_attrs() serialize = {} - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = self.vocab.to_bytes serialize["model"] = self.model.to_bytes - serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings)) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, *, exclude=tuple()): @@ -267,6 +274,7 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#from_bytes """ + self._validate_serialization_attrs() def load_model(b): try: @@ -275,9 +283,9 @@ cdef class TrainablePipe(Pipe): raise ValueError(Errors.E149) from None deserialize = {} - deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)] - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["model"] = load_model util.from_bytes(bytes_data, deserialize, exclude) return self @@ -290,10 +298,11 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#to_disk """ + self._validate_serialization_attrs() serialize = {} - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) @@ -306,6 +315,7 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#from_disk """ + self._validate_serialization_attrs() def load_model(p): try: @@ -314,9 +324,9 @@ cdef class TrainablePipe(Pipe): raise ValueError(Errors.E149) from None deserialize = {} - deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)] - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 3743e1018..63a8595cc 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -76,7 +76,6 @@ cdef class Parser(TrainablePipe): self.add_multitask_objective(multitask) self._rehearsal_model = None - self._added_strings = set() def __getnewargs_ex__(self): """This allows pickling the Parser and its keyword-only init arguments""" @@ -120,7 +119,7 @@ cdef class Parser(TrainablePipe): resized = True if resized: self._resize() - self.add_string(label) + self.vocab.strings.add(label) return 1 return 0 @@ -456,24 +455,24 @@ cdef class Parser(TrainablePipe): def to_disk(self, path, exclude=tuple()): serializers = { - 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), - 'strings.json': lambda p: srsly.write_json(p, self._added_strings), - 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]), - 'cfg': lambda p: srsly.write_json(p, self.cfg) + "model": lambda p: (self.model.to_disk(p) if self.model is not True else True), + "vocab": lambda p: 
self.vocab.to_disk(p), + "moves": lambda p: self.moves.to_disk(p, exclude=["strings"]), + "cfg": lambda p: srsly.write_json(p, self.cfg) } util.to_disk(path, serializers, exclude) def from_disk(self, path, exclude=tuple()): deserializers = { - 'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)], - 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), - 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), - 'model': lambda p: None, + "vocab": lambda p: self.vocab.from_disk(p), + "moves": lambda p: self.moves.from_disk(p, exclude=["strings"]), + "cfg": lambda p: self.cfg.update(srsly.read_json(p)), + "model": lambda p: None, } util.from_disk(path, deserializers, exclude) - if 'model' not in exclude: + if "model" not in exclude: path = util.ensure_path(path) - with (path / 'model').open('rb') as file_: + with (path / "model").open("rb") as file_: bytes_data = file_.read() try: self._resize() @@ -485,7 +484,7 @@ cdef class Parser(TrainablePipe): def to_bytes(self, exclude=tuple()): serializers = { "model": lambda: (self.model.to_bytes()), - "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)), + "vocab": lambda: self.vocab.to_bytes(), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) } @@ -493,7 +492,7 @@ cdef class Parser(TrainablePipe): def from_bytes(self, bytes_data, exclude=tuple()): deserializers = { - "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)], + "vocab": lambda b: self.vocab.from_bytes(b), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: None, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 71496327b..ff2e33fc7 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -121,9 +121,7 @@ def test_kb_default(nlp): def test_kb_custom_length(nlp): """Test that the default (empty) KB can be configured with a custom entity length""" - entity_linker = nlp.add_pipe( - "entity_linker", config={"entity_vector_length": 35} - ) + entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35}) assert len(entity_linker.kb) == 0 assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 @@ -213,16 +211,11 @@ def test_el_pipe_configuration(nlp): kb = KnowledgeBase(vocab, entity_vector_length=1) kb.add_entity(entity="Q2", freq=12, entity_vector=[2]) kb.add_entity(entity="Q3", freq=5, entity_vector=[3]) - kb.add_alias( - alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1] - ) + kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) return kb # run an EL pipe without a trained context encoder, to check the candidate generation step only - entity_linker = nlp.add_pipe( - "entity_linker", - config={"incl_context": False}, - ) + entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},) entity_linker.set_kb(create_kb) # With the default get_candidates function, matching is case-sensitive text = "Douglas and douglas are not the same." 
@@ -453,14 +446,10 @@ def test_overfitting_IO(): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe( - "entity_linker", - last=True, - ) + entity_linker = nlp.add_pipe("entity_linker", last=True,) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings - assert "Q2146908" in entity_linker.kb._added_strings # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index ce9c0fa54..fd7aa05be 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -101,4 +101,3 @@ def test_overfitting_IO(): doc2 = nlp2(test_text) assert [str(t.morph) for t in doc2] == gold_morphs assert [t.pos_ for t in doc2] == gold_pos_tags - assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 472216512..c9722e5de 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -80,4 +80,3 @@ def test_overfitting_IO(): nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts - assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 590c22233..b9db76cdf 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -98,7 +98,6 @@ def test_overfitting_IO(): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["tagger"] < 0.00001 - assert tagger._added_strings == {"J", "N", "V"} # test the trained model test_text = "I like blue eggs" @@ -117,7 +116,6 @@ def test_overfitting_IO(): assert doc2[1].tag_ is "V" assert doc2[2].tag_ is "J" assert doc2[3].tag_ is "N" - assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"} def test_tagger_requires_labels(): diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 7eb7ff658..dd2f1070b 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -146,7 +146,6 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 - assert textcat._added_strings == {"NEGATIVE", "POSITIVE"} for i in range(50): losses = {} @@ -168,7 +167,6 @@ def test_overfitting_IO(): cats2 = doc2.cats assert cats2["POSITIVE"] > 0.9 assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001) - assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"} # Test scoring scores = nlp.evaluate(train_examples) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 02d0c70dd..a00b2a688 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -7,6 +7,7 @@ from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.vocab import Vocab from ..util import make_tempdir @@ -50,8 +51,9 @@ def custom_pipe(): else: self.cfg = None self.model = 
SerializableDummy() + self.vocab = vocab - return MyPipe(None) + return MyPipe(Vocab()) def tagger(): diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index dfd7f6bd4..951dd3035 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,13 +1,13 @@ import pytest -import srsly from spacy import registry, Vocab from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import TextCategorizer, SentenceRecognizer +from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.lang.en import English +from thinc.api import Linear import spacy from ..util import make_tempdir @@ -89,7 +89,6 @@ def test_serialize_parser_strings(Parser): assert label not in vocab2.strings parser2 = Parser(vocab2, model, **config) parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"])) - assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"} assert label in parser2.vocab.strings @@ -166,17 +165,13 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers): # check that custom labels are serialized as part of the component's strings.jsonl tagger.add_label(label) assert label in tagger.vocab.strings - assert tagger._added_strings == {label} file_path = d / "tagger1" tagger.to_disk(file_path) - strings = srsly.read_json(file_path / "strings.json") - assert strings == ["SomeWeirdLabel"] # ensure that the custom strings are loaded back in when using the tagger in another pipeline cfg = {"model": DEFAULT_TAGGER_MODEL} model = registry.resolve(cfg, validate=True)["model"] tagger2 = Tagger(de_vocab, model).from_disk(file_path) assert label in tagger2.vocab.strings - assert tagger2._added_strings == {label} def test_serialize_textcat_empty(en_vocab): @@ -253,3 +248,40 @@ def test_serialize_pipeline_disable_enable(): assert nlp5.pipe_names == ["ner"] assert nlp5.component_names == ["ner"] assert nlp5.disabled == [] + + +def test_serialize_custom_trainable_pipe(): + class BadCustomPipe1(TrainablePipe): + def __init__(self, vocab): + pass + + class BadCustomPipe2(TrainablePipe): + def __init__(self, vocab): + self.vocab = vocab + self.model = None + + class CustomPipe(TrainablePipe): + def __init__(self, vocab, model): + self.vocab = vocab + self.model = model + + pipe = BadCustomPipe1(Vocab()) + with pytest.raises(ValueError): + pipe.to_bytes() + with make_tempdir() as d: + with pytest.raises(ValueError): + pipe.to_disk(d) + pipe = BadCustomPipe2(Vocab()) + with pytest.raises(ValueError): + pipe.to_bytes() + with make_tempdir() as d: + with pytest.raises(ValueError): + pipe.to_disk(d) + pipe = CustomPipe(Vocab(), Linear()) + pipe_bytes = pipe.to_bytes() + new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(pipe_bytes) + assert new_pipe.to_bytes() == pipe_bytes + with make_tempdir() as d: + pipe.to_disk(d) + new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d) + assert new_pipe.to_bytes() == pipe_bytes diff --git a/spacy/util.py b/spacy/util.py index 47fbcce1c..58f951f86 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -821,7 +821,7 @@ def get_object_name(obj: Any) -> str: obj (Any): The Python object, typically a function or class. RETURNS (str): A human-readable name. 
""" - if hasattr(obj, "name"): + if hasattr(obj, "name") and obj.name is not None: return obj.name if hasattr(obj, "__name__"): return obj.__name__ From 74972744e589969af8d0ebc83259d92c9e9a5f2f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 10 Oct 2020 19:08:57 +0200 Subject: [PATCH 16/29] Update Thinc --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d48886e0c..c175ded66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a43,<8.0.0a50", + "thinc>=8.0.0a44,<8.0.0a50", "blis>=0.4.0,<0.8.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 3f3886a60..d6b6267a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a43,<8.0.0a50 +thinc>=8.0.0a44,<8.0.0a50 blis>=0.4.0,<0.8.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 69d4e6347..d9414a4f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a43,<8.0.0a50 + thinc>=8.0.0a44,<8.0.0a50 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a43,<8.0.0a50 + thinc>=8.0.0a44,<8.0.0a50 blis>=0.4.0,<0.8.0 wasabi>=0.8.0,<1.1.0 srsly>=2.3.0,<3.0.0 From 539b0c10daef8bb5d6f7e4f230a02452c6569996 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 10 Oct 2020 19:14:48 +0200 Subject: [PATCH 17/29] Tidy up and auto-format --- spacy/lang/tr/lex_attrs.py | 5 +++-- spacy/lang/tr/syntax_iterators.py | 7 +++---- spacy/language.py | 6 ++++-- spacy/tests/conftest.py | 2 ++ spacy/tests/lang/tr/test_parser.py | 19 ++++++++++++------- spacy/tests/lang/tr/test_text.py | 5 ++--- spacy/tests/pipeline/test_entity_linker.py | 2 +- spacy/tests/regression/test_issue6207.py | 4 ++-- spacy/tests/test_models.py | 13 +++---------- 9 files changed, 32 insertions(+), 31 deletions(-) diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py index 3615f4b4c..d9e12c4aa 100644 --- a/spacy/lang/tr/lex_attrs.py +++ b/spacy/lang/tr/lex_attrs.py @@ -62,6 +62,7 @@ _ordinal_words = [ _ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü") + def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] @@ -75,11 +76,11 @@ def like_num(text): text_lower = text.lower() - #Check cardinal number + # Check cardinal number if text_lower in _num_words: return True - #Check ordinal number + # Check ordinal number if text_lower in _ordinal_words: return True if text_lower.endswith(_ordinal_endings): diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py index 665ccb590..d9b342949 100644 --- a/spacy/lang/tr/syntax_iterators.py +++ b/spacy/lang/tr/syntax_iterators.py @@ -49,11 +49,10 @@ def noun_chunks(doclike): prev_end = word.left_edge.i yield word.left_edge.i, extend_right(word), np_label elif word.dep == conj: - cc_token = word.left_edge + cc_token = word.left_edge prev_end = cc_token.i - yield cc_token.right_edge.i + 1, extend_right(word), np_label # Shave off cc tokens from the NP - - + # Shave off cc tokens from the NP + yield cc_token.right_edge.i + 1, extend_right(word), np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/language.py 
b/spacy/language.py index 24e593043..dd790e85f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,5 +1,5 @@ from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern -from typing import Tuple, Iterator +from typing import Tuple from dataclasses import dataclass import random import itertools @@ -1197,7 +1197,9 @@ class Language: doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(method="Language.initialize", obj=type(get_examples)) + err = Errors.E930.format( + method="Language.initialize", obj=type(get_examples) + ) raise TypeError(err) # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7f8ab6768..3b0de899b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -239,10 +239,12 @@ def th_tokenizer(): def tr_tokenizer(): return get_lang_class("tr")().tokenizer + @pytest.fixture(scope="session") def tr_vocab(): return get_lang_class("tr").Defaults.create_vocab() + @pytest.fixture(scope="session") def tt_tokenizer(): return get_lang_class("tt")().tokenizer diff --git a/spacy/tests/lang/tr/test_parser.py b/spacy/tests/lang/tr/test_parser.py index ff71ac3d4..b23d0869c 100644 --- a/spacy/tests/lang/tr/test_parser.py +++ b/spacy/tests/lang/tr/test_parser.py @@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer): assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı " -def test_tr_noun_chunks_acl_nmod(tr_tokenizer): +def test_tr_noun_chunks_acl_nmod2(tr_tokenizer): text = "bildiğim bir turizm şirketi" heads = [3, 3, 3, 3] deps = ["acl", "det", "nmod", "ROOT"] @@ -308,7 +308,7 @@ def test_tr_noun_chunks_np_recursive_four_nouns(tr_tokenizer): assert len(chunks) == 1 assert chunks[0].text_with_ws == "kızına piyano dersi verdiğim hanım " - + def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer): text = "içine birkaç çiçek konmuş olan bir vazo" heads = [3, 2, 3, 6, 3, 6, 6] @@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer): def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer): text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo" heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9] - deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"] + deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"] pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"] tokens = tr_tokenizer(text) doc = Doc( @@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer): ) chunks = list(doc.noun_chunks) assert len(chunks) == 1 - assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo " + assert ( + chunks[0].text_with_ws + == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo " + ) def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer): @@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer): assert len(chunks) == 1 assert chunks[0].text_with_ws == "kız ve erkek çocuklar " -def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer): + +def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer): text = "tatlı ve gürbüz çocuklar" heads = [3, 2, 0, 3] deps = ["amod", "cc", "conj", "ROOT"] @@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer): assert 
chunks[0].text_with_ws == "ben " assert chunks[1].text_with_ws == "Sen " + def test_tr_noun_chunks_conj_three(tr_tokenizer): text = "sen, ben ve ondan" heads = [0, 2, 0, 4, 0] @@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer): assert chunks[2].text_with_ws == "sen " -def test_tr_noun_chunks_conj_three(tr_tokenizer): +def test_tr_noun_chunks_conj_three2(tr_tokenizer): text = "ben ya da sen ya da onlar" heads = [0, 3, 1, 0, 6, 4, 3] deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"] @@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer): assert chunks[0].text_with_ws == "Gazi Mustafa Kemal " -def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer): +def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer): text = "Ahmet Vefik Paşa" heads = [2, 0, 2] deps = ["nmod", "flat", "ROOT"] diff --git a/spacy/tests/lang/tr/test_text.py b/spacy/tests/lang/tr/test_text.py index 01e279d76..ed7dbb805 100644 --- a/spacy/tests/lang/tr/test_text.py +++ b/spacy/tests/lang/tr/test_text.py @@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num "üçüncü", "beşinci", "100üncü", - "8inci" - ] + "8inci", + ], ) def test_tr_lex_attrs_like_number_cardinal_ordinal(word): assert like_num(word) @@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word): def test_tr_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) - diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index ff2e33fc7..e0c63d09e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -446,7 +446,7 @@ def test_overfitting_IO(): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True,) + entity_linker = nlp.add_pipe("entity_linker", last=True) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings diff --git a/spacy/tests/regression/test_issue6207.py b/spacy/tests/regression/test_issue6207.py index 47e3803e9..9d8b047bf 100644 --- a/spacy/tests/regression/test_issue6207.py +++ b/spacy/tests/regression/test_issue6207.py @@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer): # Make spans s1 = doc[:4] - s2 = doc[3:6] # overlaps with s1 - s3 = doc[5:7] # overlaps with s2, not s1 + s2 = doc[3:6] # overlaps with s1 + s3 = doc[5:7] # overlaps with s2, not s1 result = filter_spans((s1, s2, s3)) assert s1 in result diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 8ca7f8b66..e8884e6b2 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -1,10 +1,8 @@ from typing import List - import pytest from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy - from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.ml.staticvectors import StaticVectors @@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): assert_array_equal(get_all_params(model1), get_all_params(model2)) -@pytest.mark.parametrize( - "model_func,kwargs", - [ - (StaticVectors, {"nO": 128, "nM": 300}), - ] -) +@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})]) def test_empty_docs(model_func, kwargs): nlp = English() model = 
model_func(**kwargs).initialize()
@@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
     for n_docs in range(3):
         docs = [nlp("") for _ in range(n_docs)]
         # Test predict
-        _ = model.predict(docs)
+        model.predict(docs)
         # Test backprop
         output, backprop = model.begin_update(docs)
-        _ = backprop(output)
+        backprop(output)

From 68d79796c65d83b934e785bb3d8ffbea16fe832f Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sat, 10 Oct 2020 20:59:48 +0200
Subject: [PATCH 18/29] add test for vocab after serializing KB

---
 spacy/pipeline/trainable_pipe.pyx          |  2 +-
 spacy/tests/pipeline/test_entity_linker.py | 28 +++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 88e50e7c6..07cb01059 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -243,7 +243,7 @@ cdef class TrainablePipe(Pipe):
     def _validate_serialization_attrs(self):
         """Check that the pipe implements the required attributes. If a subclass
         implements a custom __init__ method but doesn't set these attributes,
-        the currently default to None, so we need to perform additonal checks.
+        they currently default to None, so we need to perform additional checks.
         """
         if not hasattr(self, "vocab") or self.vocab is None:
             raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e0c63d09e..673a354dd 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -5,6 +5,7 @@ from spacy.kb import KnowledgeBase, get_candidates, Candidate
 from spacy.vocab import Vocab

 from spacy import util, registry
+from spacy.ml import load_kb
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
@@ -215,7 +216,7 @@ def test_el_pipe_configuration(nlp):
         return kb

     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},)
+    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False})
     entity_linker.set_kb(create_kb)

     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
@@ -496,6 +497,31 @@ def test_overfitting_IO(): assert predictions == GOLD_entities +def test_kb_serialization(): + # Test that the KB can be used in a pipeline with a different vocab + vector_length = 3 + with make_tempdir() as tmp_dir: + kb_dir = tmp_dir / "kb" + nlp1 = English() + assert "Q2146908" not in nlp1.vocab.strings + mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + assert "Q2146908" in nlp1.vocab.strings + mykb.to_disk(kb_dir) + + nlp2 = English() + nlp2.vocab.strings.add("RandomWord") + assert "RandomWord" in nlp2.vocab.strings + assert "Q2146908" not in nlp2.vocab.strings + + # Create the Entity Linker component with the KB from file, and check the final vocab + entity_linker = nlp2.add_pipe("entity_linker", last=True) + entity_linker.set_kb(load_kb(kb_dir)) + assert "Q2146908" in nlp2.vocab.strings + assert "RandomWord" in nlp2.vocab.strings + + def test_scorer_links(): train_examples = [] nlp = English() From 3a505e7e14acf70e82910ca285b762259f20d5d4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 10 Oct 2020 21:05:28 +0200 Subject: [PATCH 19/29] small edit to ensure the new word was indeed new --- spacy/tests/pipeline/test_entity_linker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 673a354dd..f2e6defcb 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -511,6 +511,7 @@ def test_kb_serialization(): mykb.to_disk(kb_dir) nlp2 = English() + assert "RandomWord" not in nlp2.vocab.strings nlp2.vocab.strings.add("RandomWord") assert "RandomWord" in nlp2.vocab.strings assert "Q2146908" not in nlp2.vocab.strings From 99606e46fe90a8cb813a10d62d2d234ebdf4540f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 11 Oct 2020 12:30:57 +0200 Subject: [PATCH 20/29] Relax meta.json schema [ci skip] --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index 07d17d193..f3664acff 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel): sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") - performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers") + performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on From ab890a35f9b54c625d423930cf81e75a27bfa69d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 11 Oct 2020 12:55:46 +0200 Subject: [PATCH 21/29] Make console logger table more compact --- spacy/training/loggers.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index b431ecf06..79459a89b 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -11,11 +11,25 @@ if TYPE_CHECKING: from ..language import Language # noqa: F401 +def setup_table( + *, cols: List[str], widths: List[int], max_width: int = 13 +) -> Tuple[List[str], List[int], 
List[str]]: + final_cols = [] + final_widths = [] + for col, width in zip(cols, widths): + if len(col) > max_width: + col = col[: max_width - 3] + "..." # shorten column if too long + final_cols.append(col.upper()) + final_widths.append(max(len(col), width)) + return final_cols, final_widths, ["r" for _ in final_widths] + + @registry.loggers("spacy.ConsoleLogger.v1") def console_logger(progress_bar: bool = False): def setup_printer( nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: + write = lambda text: stdout.write(f"{text}\n") msg = Printer(no_print=True) # ensure that only trainable components are logged logged_pipes = [ @@ -26,15 +40,14 @@ def console_logger(progress_bar: bool = False): eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] - score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] - loss_widths = [max(len(col), 8) for col in loss_cols] - table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] - table_header = [col.upper() for col in table_header] - table_widths = [3, 6] + loss_widths + score_widths + [6] - table_aligns = ["r" for _ in table_widths] - stdout.write(msg.row(table_header, widths=table_widths) + "\n") - stdout.write(msg.row(["-" * width for width in table_widths]) + "\n") + spacing = 2 + table_header, table_widths, table_aligns = setup_table( + cols=["E", "#"] + loss_cols + score_cols + ["Score"], + widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6], + ) + write(msg.row(table_header, widths=table_widths, spacing=spacing)) + write(msg.row(["-" * width for width in table_widths], spacing=spacing)) progress = None def log_step(info: Optional[Dict[str, Any]]) -> None: @@ -70,7 +83,9 @@ def console_logger(progress_bar: bool = False): ) if progress is not None: progress.close() - stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n") + write( + msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing) + ) if progress_bar: # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( From 4fa967ea843c2b1db0147a2b4d303266e5563f73 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 11 Oct 2020 13:10:58 +0200 Subject: [PATCH 22/29] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index efdfd26c0..38efce3e9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a38" +__version__ = "3.0.0a39" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 40276fd3be231be6969f8c51889c13e77a726fa8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 12 Oct 2020 11:41:27 +0200 Subject: [PATCH 23/29] update NEL docs after latest refactor --- spacy/ml/models/entity_linker.py | 3 +- website/docs/api/architectures.md | 19 +++---- website/docs/api/entitylinker.md | 84 ++++++++++++++++++++----------- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index d945e5fba..f37203b1b 100644 --- 
a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Optional, Callable, Iterable from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear @@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: @registry.misc.register("spacy.KBFromFile.v1") -def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]: +def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) kb.from_disk(kb_path) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5246a3ed6..3157c261a 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -637,13 +637,6 @@ into the "real world". This requires 3 main components: > window_size = 1 > maxout_pieces = 3 > subword_features = true -> -> [kb_loader] -> @misc = "spacy.EmptyKB.v1" -> entity_vector_length = 64 -> -> [get_candidates] -> @misc = "spacy.CandidateGenerator.v1" > ``` The `EntityLinker` model architecture is a Thinc `Model` with a @@ -657,13 +650,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a ### spacy.EmptyKB.v1 {#EmptyKB} -A function that creates a default, empty `KnowledgeBase` from a -[`Vocab`](/api/vocab) instance. +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.KBFromFile.v1 {#KBFromFile} + +A function that reads an existing `KnowledgeBase` from file. + +| Name | Description | +| --------- | -------------------------------------------------------- | +| `kb_path` | The location of the KB that was stored to file. ~~Path~~ | + ### spacy.CandidateGenerator.v1 {#CandidateGenerator} A function that takes as input a [`KnowledgeBase`](/api/kb) and a diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 169a175e2..0904bbf72 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -34,20 +34,20 @@ architectures and their arguments and hyperparameters. > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, -> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64}, +> "entity_vector_length": 64, > "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'}, > } > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. 
Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| Setting | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -65,10 +65,6 @@ architectures and their arguments and hyperparameters. > config = {"model": {"@architectures": "my_el.v1"}} > entity_linker = nlp.add_pipe("entity_linker", config=config) > -> # Construction via add_pipe with custom KB and candidate generation -> config = {"kb": {"@misc": "my_kb.v1"}} -> entity_linker = nlp.add_pipe("entity_linker", config=config) -> > # Construction from class > from spacy.pipeline import EntityLinker > entity_linker = EntityLinker(nlp.vocab, model) @@ -76,21 +72,25 @@ architectures and their arguments and hyperparameters. Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal -`KnowledgeBase` as well as the Candidate generator can be customized by -providing custom registered functions. +[`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. 
~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~  |
+| `labels_discard`       | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                    |
+| `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                   |
+| `incl_context`         | Whether or not to include the local context in the model. ~~bool~~                                                                 |

 ## EntityLinker.\_\_call\_\_ {#call tag="method"}

@@ -139,6 +139,28 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~                     |

+## EntityLinker.set_kb {#set_kb tag="method" new="3"}
+
+Define the component's `KnowledgeBase`. The `kb_loader` should be a function
+that takes a `Vocab` instance and creates the `KnowledgeBase`, ensuring that
+the strings of the knowledge base are synced with the current vocab.
+
+> #### Example
+>
+> ```python
+> def create_kb(vocab):
+>     kb = KnowledgeBase(vocab, entity_vector_length=128)
+>     kb.add_entity(...)
+>     kb.add_alias(...)
+>     return kb
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker.set_kb(create_kb)
+> ```
+
+| Name        | Description                                                                                                       |
+| ----------- | ----------------------------------------------------------------------------------------------------------------- |
+| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
+
 ## EntityLinker.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
@@ -150,6 +172,11 @@ network, setting up the label scheme based on the data. This method is
 typically called by [`Language.initialize`](/api/language#initialize).

+Optionally, a `kb_loader` argument may be specified to change the internal
+knowledge base. This argument should be a function that takes a `Vocab` instance
+and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
+are synced with the current vocab.
+
 This method was previously called `begin_training`.

 > #### Example
> > ```python > entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.initialize(lambda: [], nlp=nlp) +> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb) > ``` | Name | Description | @@ -168,6 +195,7 @@ This method was previously called `begin_training`. | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | ## EntityLinker.predict {#predict tag="method"} From 1f465bea185d6aff3f4320b84f6a006b72b71917 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 09:27:19 +0200 Subject: [PATCH 24/29] if-else --- spacy/util.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 58f951f86..8335a4fcc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1361,11 +1361,12 @@ def check_bool_env_var(env_var: str) -> bool: def _pipe(docs, proc, kwargs): if hasattr(proc, "pipe"): yield from proc.pipe(docs, **kwargs) - # We added some args for pipe that __call__ doesn't expect. - kwargs = dict(kwargs) - for arg in ["batch_size"]: - if arg in kwargs: - kwargs.pop(arg) - for doc in docs: - doc = proc(doc, **kwargs) - yield doc + else: + # We added some args for pipe that __call__ doesn't expect. + kwargs = dict(kwargs) + for arg in ["batch_size"]: + if arg in kwargs: + kwargs.pop(arg) + for doc in docs: + doc = proc(doc, **kwargs) + yield doc From a0e12c136b5864e7c0390a70902b2b158118d9b8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 10:00:53 +0200 Subject: [PATCH 25/29] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 38efce3e9..2aeef3c8d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a39" +__version__ = "3.0.0a40" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 4d99d2b94a73d7d950f92526efc5a5f6f9b98121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 11:38:52 +0200 Subject: [PATCH 26/29] Update docs [ci skip] --- website/docs/api/entitylinker.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 0904bbf72..683927b1c 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -46,7 +46,7 @@ architectures and their arguments and hyperparameters. | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. 
~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python From 86d648740fc4f1fea9ac5c779c2d578c2431cafe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 11:39:03 +0200 Subject: [PATCH 27/29] Fix morph representation in Doc.to_json --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4a57e4c83..abc82030d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1419,7 +1419,7 @@ cdef class Doc: if include_annotation["POS"]: token_data["pos"] = token.pos_ if include_annotation["MORPH"]: - token_data["morph"] = token.morph + token_data["morph"] = token.morph.to_json() if include_annotation["LEMMA"]: token_data["lemma"] = token.lemma_ if include_annotation["DEP"]: From f8a1c1afd6fff111b4434e6d19a2b1aec5b55501 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 13 Oct 2020 14:39:59 +0200 Subject: [PATCH 28/29] avoid dropout at runtime (#6247) --- spacy/about.py | 2 +- spacy/ml/staticvectors.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 2aeef3c8d..9c5dd0b4f 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a40" +__version__ = "3.0.0a41" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index da731dadb..f0213a9b8 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -39,7 +39,6 @@ def forward( key_attr = model.attrs["key_attr"] W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) V = cast(Floats2d, docs[0].vocab.vectors.data) - mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) rows = model.ops.flatten( [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] ) @@ -47,8 +46,11 @@ def forward( model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True), model.ops.asarray([len(doc) for doc in docs], dtype="i"), ) - if mask is not None: - output.data *= mask + mask = None + if is_train: + mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) + if mask is not None: + output.data *= mask def backprop(d_output: Ragged) -> List[Doc]: if mask is not None: From 1f4930086209128876e2804ae070ded54471e6f2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 15:41:17 +0200 Subject: [PATCH 29/29] Update transformer recommendations [ci skip] --- .../quickstart_training_recommendations.yml | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index 206e69954..54aec2e31 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -32,10 +32,10 @@ es: word_vectors: null transformer: efficiency: - name: mrm8488/RuPERTa-base + name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 accuracy: - name: 
mrm8488/RuPERTa-base + name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 sv: word_vectors: null @@ -101,3 +101,21 @@ pl: accuracy: name: dkleczek/bert-base-polish-cased-v1 size_factor: 3 +nl: + word_vectors: null + transformer: + efficiency: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 + accuracy: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 +pt: + word_vectors: null + transformer: + efficiency: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3 + accuracy: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3