From 4c56dd5fb85f43457f4355c009ea0502f2007e70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 10 Jun 2022 18:12:28 +0200 Subject: [PATCH 01/19] precomputable_biaffine: avoid concatenation (#10911) The `forward` of `precomputable_biaffine` performs matrix multiplication and then `vstack`s the result with padding. This creates a temporary array used for the output of matrix concatenation. This change avoids the temporary by pre-allocating an array that is large enough for the output of matrix multiplication plus padding and fills the array in-place. This gave me a small speedup (a bit over 100 WPS) on de_core_news_lg on M1 Max (after changing thinc-apple-ops to support in-place gemm as BLIS does). --- spacy/ml/_precomputable_affine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index b99de2d2b..049b7a455 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -22,9 +22,11 @@ def forward(model, X, is_train): nP = model.get_dim("nP") nI = model.get_dim("nI") W = model.get_param("W") - Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True) + # Preallocate array for layer output, including padding. 
+ Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP) + model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:]) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) - Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) + Yf[0] = model.get_param("pad") def backward(dY_ids): # This backprop is particularly tricky, because we get back a different From 9b9b743e8b19aae74cf8a405b53aedf4016bdf90 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Jun 2022 19:41:55 +0100 Subject: [PATCH 02/19] Auto-format code with black (#10977) Co-authored-by: explosion-bot --- spacy/ml/_precomputable_affine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 049b7a455..49269ccd6 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -23,7 +23,7 @@ def forward(model, X, is_train): nI = model.get_dim("nI") W = model.get_param("W") # Preallocate array for layer output, including padding. - Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP) + Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP) model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:]) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) Yf[0] = model.get_param("pad") From c900f8573d9b20cc2a2bb567abb950f7236e1115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 26 Jul 2022 10:52:01 +0200 Subject: [PATCH 03/19] Fix compatibility with CuPy 9.x (#11194) After the precomputable affine table of shape [nB, nF, nO, nP] is computed, padding with shape [1, nF, nO, nP] is assigned to the first row of the precomputed affine table. However, when we are indexing the precomputed table, we get a row of shape [nF, nO, nP]. CuPy versions before 10.0 cannot paper over this shape difference. 
This change fixes compatibility with CuPy < 10.0 by squeezing the first dimension of the padding before assignment. --- spacy/ml/_precomputable_affine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 49269ccd6..88c415754 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -26,7 +26,11 @@ def forward(model, X, is_train): Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP) model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:]) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) - Yf[0] = model.get_param("pad") + + # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot + # change its shape to (nF, nO, nP) without breaking existing models. So + # we'll squeeze the first dimension here. + Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0) def backward(dY_ids): # This backprop is particularly tricky, because we get back a different From 7d122621450c0986c239e98db72f0f50dedb37cd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 22 Aug 2022 12:04:30 +0200 Subject: [PATCH 04/19] Clean up warnings in the test suite (#11331) --- spacy/tests/doc/test_doc_api.py | 5 +++-- spacy/tests/lang/ru/test_lemmatizer.py | 3 +++ spacy/tests/lang/uk/test_lemmatizer.py | 4 ++++ spacy/tests/matcher/test_phrase_matcher.py | 9 +++++---- spacy/tests/pipeline/test_entity_linker.py | 4 ++++ spacy/training/initialize.py | 2 ++ 6 files changed, 21 insertions(+), 6 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index dd4942989..a64ab2ba8 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -3,6 +3,7 @@ import weakref import numpy from numpy.testing import assert_array_equal import pytest +import warnings from thinc.api import NumpyOps, get_current_ops from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS @@ -529,9 +530,9 @@ 
def test_doc_from_array_sent_starts(en_vocab): # no warning using default attrs attrs = doc._get_array_attrs() arr = doc.to_array(attrs) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") new_doc.from_array(attrs, arr) - assert len(record) == 0 # only SENT_START uses SENT_START attrs = [SENT_START] arr = doc.to_array(attrs) diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 3810323bf..9ca7f441b 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -2,6 +2,9 @@ import pytest from spacy.tokens import Doc +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + def test_ru_doc_lemmatization(ru_lemmatizer): words = ["мама", "мыла", "раму"] pos = ["NOUN", "VERB", "NOUN"] diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py index 4a787b2a6..57dd4198a 100644 --- a/spacy/tests/lang/uk/test_lemmatizer.py +++ b/spacy/tests/lang/uk/test_lemmatizer.py @@ -1,6 +1,10 @@ +import pytest from spacy.tokens import Doc +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + def test_uk_lemmatizer(uk_lemmatizer): """Check that the default uk lemmatizer runs.""" doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 3b24f3ba8..8a8d9eb84 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,4 +1,5 @@ import pytest +import warnings import srsly from mock import Mock @@ -344,13 +345,13 @@ def test_phrase_matcher_validation(en_vocab): matcher.add("TEST1", [doc1]) with pytest.warns(UserWarning): matcher.add("TEST2", [doc2]) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") matcher.add("TEST3", [doc3]) - assert not record.list matcher = PhraseMatcher(en_vocab, 
attr="POS", validate=True) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") matcher.add("TEST4", [doc2]) - assert not record.list def test_attr_validation(en_vocab): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index a6cfead77..a45679b63 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1048,6 +1048,10 @@ def test_no_gold_ents(patterns): for eg in train_examples: eg.predicted = ruler(eg.predicted) + # Entity ruler is no longer needed (initialization below wipes out the + # patterns and causes warnings) + nlp.remove_pipe("entity_ruler") + def create_kb(vocab): # create artificial KB mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 48ff7b589..6304e4a84 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -337,3 +337,5 @@ def ensure_shape(vectors_loc): # store all the results in a list in memory lines2 = open_file(vectors_loc) yield from lines2 + lines2.close() + lines.close() From 7fefb39e58eba639062870a49836c07944a726dd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 9 Aug 2022 10:59:36 +0200 Subject: [PATCH 05/19] Fix regex invalid escape sequences (#11276) --- spacy/lang/ko/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py index 7f7b40c5b..f5f1c51da 100644 --- a/spacy/lang/ko/punctuation.py +++ b/spacy/lang/ko/punctuation.py @@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES _infixes = ( - ["·", "ㆍ", "\(", "\)"] + ["·", "ㆍ", r"\(", r"\)"] + [r"(?<=[0-9])~(?=[0-9-])"] + LIST_QUOTES + BASE_TOKENIZER_INFIXES From 76449e07a0ab510e01d953308212b2ac06c5fdb4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 14:53:18 +0200 Subject: [PATCH 
06/19] Rename test helper method with non-test_ name (#11701) --- spacy/tests/test_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 2306cabb7..d91ed1201 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -23,7 +23,7 @@ def get_textcat_bow_kwargs(): def get_textcat_cnn_kwargs(): - return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} + return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -65,7 +65,7 @@ def get_tok2vec_kwargs(): } -def test_tok2vec(): +def make_test_tok2vec(): return build_Tok2Vec_model(**get_tok2vec_kwargs()) From 07026337d2e539f6300f3d0580f48452a17a0eb1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 25 Nov 2022 13:00:57 +0100 Subject: [PATCH 07/19] Add smart_open requirement, update deprecated options (#11864) * Switch from deprecated `ignore_ext` to `compression` * Add upload/download test for local files --- requirements.txt | 1 + setup.cfg | 3 ++- spacy/cli/_util.py | 2 +- spacy/tests/test_cli.py | 16 ++++++++++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 69cefa3f6..783e6f0f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.5.0 pathy>=0.3.5 +smart-open>=5.2.1,<7.0.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index f2c0c6958..97e6efc21 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,9 +51,10 @@ install_requires = wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 + # Third-party dependencies typer>=0.3.0,<0.5.0 pathy>=0.3.5 - # Third-party dependencies + smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index df98e711f..2f8e492a7 100644 --- 
a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) if dest.exists() and not force: return None src = str(src) - with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: + with smart_open.open(src, mode="rb", compression="disable") as input_file: with dest.open(mode="wb") as output_file: shutil.copyfileobj(input_file, output_file) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..b04c49f47 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -15,6 +15,7 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands +from spacy.cli._util import upload_file, download_file from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -855,3 +856,18 @@ def test_span_length_freq_dist_output_must_be_correct(): span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) assert sum(span_freqs.values()) >= threshold assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] + + +def test_upload_download_local_file(): + with make_tempdir() as d1, make_tempdir() as d2: + filename = "f.txt" + content = "content" + local_file = d1 / filename + remote_file = d2 / filename + with local_file.open(mode="w") as file_: + file_.write(content) + upload_file(local_file, remote_file) + local_file.unlink() + download_file(remote_file, local_file) + with local_file.open(mode="r") as file_: + assert file_.read() == content From 2201459603c6620bb5a5d2e963c130e8ce521f3a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 28 Nov 2022 18:01:09 +0900 Subject: [PATCH 08/19] Don't throw an error if using displacy on 
an unset span key (#11845) * Don't throw an error if using displacy on an unset span key * List available keys in W117 --- spacy/displacy/__init__.py | 5 +++-- spacy/errors.py | 2 +- spacy/tests/test_displacy.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 5d49b6eb7..8bf84981b 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -227,12 +227,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: "kb_id": span.kb_id_ if span.kb_id_ else "", "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#", } - for span in doc.spans[spans_key] + for span in doc.spans.get(spans_key, []) ] tokens = [token.text for token in doc] if not spans: - warnings.warn(Warnings.W117.format(spans_key=spans_key)) + keys = list(doc.spans.keys()) + warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys)) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None settings = get_doc_settings(doc) return { diff --git a/spacy/errors.py b/spacy/errors.py index 60985fbd7..147af4eec 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -195,7 +195,7 @@ class Warnings(metaclass=ErrorsWithCodes): W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is " "surprising to you, make sure the Doc was processed using a model " "that supports span categorization, and check the `doc.spans[spans_key]` " - "property manually if necessary.") + "property manually if necessary.\n\nAvailable keys: {keys}") W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation " "for the corpora used to train the language. 
Please check " "`nlp.meta[\"sources\"]` for any relevant links.") diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index ccc145b44..f298b38e0 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -203,6 +203,16 @@ def test_displacy_parse_spans_different_spans_key(en_vocab): ] +def test_displacy_parse_empty_spans_key(en_vocab): + """Test that having an unset spans key doesn't raise an error""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["custom"] = [Span(doc, 3, 6, "BANK")] + with pytest.warns(UserWarning, match="W117"): + spans = displacy.parse_spans(doc) + + assert isinstance(spans, dict) + + def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) From 1e701c8beefe7a61d37bea51bf2fda90e51eb2ba Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 2 Dec 2022 09:33:52 +0100 Subject: [PATCH 09/19] Fix spancat for zero suggestions (#11860) * Add test for spancat predict with zero suggestions * Fix spancat for zero suggestions * Undo changes to extract_spans * Use .sum() as in update --- spacy/pipeline/spancat.py | 5 +++- spacy/tests/pipeline/test_spancat.py | 43 ++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 0a6138fbc..e3fea5946 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -269,7 +269,10 @@ class SpanCategorizer(TrainablePipe): DOCS: https://spacy.io/api/spancategorizer#predict """ indices = self.suggester(docs, ops=self.model.ops) - scores = self.model.predict((docs, indices)) # type: ignore + if indices.lengths.sum() == 0: + scores = self.model.ops.alloc2f(0, 0) + else: + scores = self.model.predict((docs, indices)) # type: ignore return indices, scores def set_candidates( diff --git 
a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 15256a763..e9db983d3 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping(): def test_zero_suggestions(): - # Test with a suggester that returns 0 suggestions + # Test with a suggester that can return 0 suggestions - @registry.misc("test_zero_suggester") - def make_zero_suggester(): - def zero_suggester(docs, *, ops=None): + @registry.misc("test_mixed_zero_suggester") + def make_mixed_zero_suggester(): + def mixed_zero_suggester(docs, *, ops=None): if ops is None: ops = get_current_ops() - return Ragged( - ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") - ) + spans = [] + lengths = [] + for doc in docs: + if len(doc) > 0 and len(doc) % 2 == 0: + spans.append((0, 1)) + lengths.append(1) + else: + lengths.append(0) + spans = ops.asarray2i(spans) + lengths_array = ops.asarray1i(lengths) + if len(spans) > 0: + output = Ragged(ops.xp.vstack(spans), lengths_array) + else: + output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array) + return output - return zero_suggester + return mixed_zero_suggester fix_random_seed(0) nlp = English() spancat = nlp.add_pipe( "spancat", - config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, + config={ + "suggester": {"@misc": "test_mixed_zero_suggester"}, + "spans_key": SPAN_KEY, + }, ) train_examples = make_examples(nlp) optimizer = nlp.initialize(get_examples=lambda: train_examples) @@ -397,6 +412,16 @@ def test_zero_suggestions(): assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) + # empty doc + nlp("") + # single doc with zero suggestions + nlp("one") + # single doc with one suggestion + nlp("two two") + # batch with mixed zero/one suggestions + list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"])) + # batch with no suggestions + 
list(nlp.pipe(["", "one", "three three three"])) def test_set_candidates(): From 062bd27f22e8e40027207ca24105438ce8c918ac Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 5 Dec 2022 17:43:23 +0900 Subject: [PATCH 10/19] Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928) * Switch ubuntu-latest to ubuntu-20.04 in main tests * Only use 20.04 for 3.6 --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4624b2eb2..005fb3609 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -41,7 +41,7 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-latest" + imageName: "ubuntu-20.04" python.version: "3.6" # Python36Windows: # imageName: "windows-latest" @@ -50,7 +50,7 @@ jobs: # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-latest" + # imageName: "ubuntu-20.04" # python.version: "3.7" Python37Windows: imageName: "windows-latest" From 809887a925fbd6aec50225e4449fe5ca22892e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 7 Dec 2022 05:53:41 +0100 Subject: [PATCH 11/19] EditTreeLemmatizer: correctly add strings when initializing from labels (#11934) Strings in replacement nodes were not added to the `StringStore` when `EditTreeLemmatizer` was initialized from a set of labels. The corresponding test did not capture this because it added the strings through the examples that were passed to the initialization. This change fixes both this bug in the initialization and the 'shadowing' of the bug in the test. 
--- spacy/pipeline/edit_tree_lemmatizer.py | 4 +- .../pipeline/test_edit_tree_lemmatizer.py | 37 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 54a7030dc..7507eb35b 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -331,9 +331,9 @@ class EditTreeLemmatizer(TrainablePipe): tree = dict(tree) if "orig" in tree: - tree["orig"] = self.vocab.strings[tree["orig"]] + tree["orig"] = self.vocab.strings.add(tree["orig"]) if "orig" in tree: - tree["subst"] = self.vocab.strings[tree["subst"]] + tree["subst"] = self.vocab.strings.add(tree["subst"]) trees.append(tree) diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index cf541e301..b12ca5dd4 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -60,10 +60,45 @@ def test_initialize_from_labels(): nlp2 = Language() lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer") lemmatizer2.initialize( - get_examples=lambda: train_examples, + # We want to check that the strings in replacement nodes are + # added to the string store. Avoid that they get added through + # the examples. 
+ get_examples=lambda: train_examples[:1], labels=lemmatizer.label_data, ) assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3} + assert lemmatizer2.label_data == { + "trees": [ + {"orig": "S", "subst": "s"}, + { + "prefix_len": 1, + "suffix_len": 0, + "prefix_tree": 0, + "suffix_tree": 4294967295, + }, + {"orig": "s", "subst": ""}, + { + "prefix_len": 0, + "suffix_len": 1, + "prefix_tree": 4294967295, + "suffix_tree": 2, + }, + { + "prefix_len": 0, + "suffix_len": 0, + "prefix_tree": 4294967295, + "suffix_tree": 4294967295, + }, + {"orig": "E", "subst": "e"}, + { + "prefix_len": 1, + "suffix_len": 0, + "prefix_tree": 5, + "suffix_tree": 4294967295, + }, + ], + "labels": (1, 3, 4, 6), + } def test_no_data(): From cc0c7dab29008fec7f966d907ff57719b34ca943 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 7 Dec 2022 23:52:35 +0900 Subject: [PATCH 12/19] Add in errors used in the beam code that were removed at some point (#11935) I don't think there's any way to use the beam code at the moment, but as long as it's around the errors it refers to should also be present. --- spacy/errors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 147af4eec..1f1c73f5f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -335,6 +335,11 @@ class Errors(metaclass=ErrorsWithCodes): "clear the existing vectors and resize the table.") E074 = ("Error interpreting compiled match pattern: patterns are expected " "to end with the attribute {attr}. 
Got: {bad_attr}.") + E079 = ("Error computing states in beam: number of predicted beams " + "({pbeams}) does not equal number of gold beams ({gbeams}).") + E080 = ("Duplicate state found in beam: {key}.") + E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " + "does not equal number of losses ({losses}).") E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " "match.") From 289ed4fa43808f257bb4fca8d9a89301bd6f0c58 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Dec 2022 14:38:04 +0100 Subject: [PATCH 13/19] Modify similarity tests to avoid spurious warnings --- spacy/tests/vocab_vectors/test_similarity.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 47cd1f060..e1f95b28b 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab @pytest.fixture def vectors(): - return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])] + return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])] @pytest.fixture() @@ -71,19 +71,17 @@ def test_vectors_similarity_DD(vocab, vectors): def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert isinstance(doc.similarity(doc[0]), float) - assert isinstance(doc[0].similarity(doc), float) - assert doc.similarity(doc[0]) == doc[0].similarity(doc) + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) + assert doc.similarity(doc[0]) == doc[0].similarity(doc) def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert 
isinstance(doc[:2].similarity(doc[0]), float) - assert isinstance(doc[0].similarity(doc[-2]), float) - assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + assert isinstance(doc[:2].similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc[-2]), float) + assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) def test_vectors_similarity_DS(vocab, vectors): From 279749a9d915d99b789e91db95e283ca9a494795 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 08:45:35 +0100 Subject: [PATCH 14/19] Cast to uint64 for all array-based doc representations (#11933) * Convert all individual values explicitly to uint64 for array-based doc representations * Temporarily test with latest numpy v1.24.0rc * Remove unnecessary conversion from attr_t * Reduce number of individual casts * Convert specifically from int32 to uint64 * Revert "Temporarily test with latest numpy v1.24.0rc" This reverts commit eb0e3c5006515b9a7ff52bae59484c909b8a3f65. * Also use int32 in tests --- spacy/tests/doc/test_array.py | 4 ++-- spacy/tokens/doc.pyx | 2 ++ spacy/tokens/span.pyx | 4 ++-- spacy/training/example.pyx | 15 ++++++++------- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index c334cc6eb..1f2d7d999 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab): # head before start arr = doc.to_array(["HEAD"]) - arr[0] = -1 + arr[0] = numpy.int32(-1).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) # head after end arr = doc.to_array(["HEAD"]) - arr[0] = 5 + arr[0] = numpy.int32(5).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e38de02b4..9d3bd5b69 
100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -359,6 +359,7 @@ cdef class Doc: for annot in annotations: if annot: if annot is heads or annot is sent_starts or annot is ent_iobs: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): if attrs.ndim == 1: attrs[i] = annot[i] @@ -1557,6 +1558,7 @@ cdef class Doc: for j, (attr, annot) in enumerate(token_annotations.items()): if attr is HEAD: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): array[i, j] = annot[i] elif attr is MORPH: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index ab888ae95..fd96f03f0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -299,7 +299,7 @@ cdef class Span: for ancestor in ancestors: ancestor_i = ancestor.i - self.c.start if ancestor_i in range(length): - array[i, head_col] = ancestor_i - i + array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64) # if there is no appropriate ancestor, define a new artificial root value = array[i, head_col] @@ -307,7 +307,7 @@ cdef class Span: new_root = old_to_new_root.get(ancestor_i, None) if new_root is not None: # take the same artificial root as a previous token from the same sentence - array[i, head_col] = new_root - i + array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64) else: # set this token as the new artificial root array[i, head_col] = 0 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 3035388a6..ce4f746ac 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -353,26 +353,27 @@ def _annot2array(vocab, tok_annot, doc_annot): if key not in IDS: raise ValueError(Errors.E974.format(obj="token", key=key)) elif key in ["ORTH", "SPACY"]: - pass + continue elif key == "HEAD": attrs.append(key) - values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) + row = [h-i if h is not None else 0 for i, h in enumerate(value)] elif key 
== "DEP": attrs.append(key) - values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) + row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value] elif key == "SENT_START": attrs.append(key) - values.append([to_ternary_int(v) for v in value]) + row = [to_ternary_int(v) for v in value] elif key == "MORPH": attrs.append(key) - values.append([vocab.morphology.add(v) for v in value]) + row = [vocab.morphology.add(v) for v in value] else: attrs.append(key) if not all(isinstance(v, str) for v in value): types = set([type(v) for v in value]) raise TypeError(Errors.E969.format(field=key, types=types)) from None - values.append([vocab.strings.add(v) for v in value]) - array = numpy.asarray(values, dtype="uint64") + row = [vocab.strings.add(v) for v in value] + values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row]) + array = numpy.array(values, dtype=numpy.uint64) return attrs, array.T From 4e050330a8f4106c04fc94fad42cf675b43df899 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Nov 2022 09:29:46 +0100 Subject: [PATCH 15/19] Modernize and simplify CI steps (#11738) * Use `build` instead of `python setup.py sdist` * Remove in-place build with `setup.py` * Remove `gpu` parameter and GPU tests * Keep `architecture` and `num_build_jobs` in azure steps with CI defaults * Fix use of `num_build_jobs` parameters * Remove now-unused `prefix` parameter * Test imports and CLI before installing test requirements * Remove `*.egg-info` directory in addition to source directory for a warning-free `import spacy` * Switch `thinc-apple-ops` test to python 3.11 (as most recent python that is tested across platforms) --- .github/azure-steps.yml | 70 +++++++++++++++++++---------------------- azure-pipelines.yml | 17 ---------- 2 files changed, 32 insertions(+), 55 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 80c88b0b8..e8734d5bf 100644 --- 
a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -1,9 +1,6 @@ parameters: python_version: '' - architecture: '' - prefix: '' - gpu: false - num_build_jobs: 1 + architecture: 'x64' steps: - task: UsePythonVersion@0 @@ -16,16 +13,16 @@ steps: displayName: 'Set variables' - script: | - ${{ parameters.prefix }} python -m pip install -U pip setuptools - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt displayName: "Install dependencies" - script: | - ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }} - ${{ parameters.prefix }} python setup.py sdist --formats=gztar - displayName: "Compile and build sdist" + python -m build --sdist + displayName: "Build sdist" - - script: python -m mypy spacy + - script: | + python -m mypy spacy displayName: 'Run mypy' condition: ne(variables['python_version'], '3.10') @@ -34,35 +31,24 @@ steps: contents: "spacy" displayName: "Delete source directory" + - task: DeleteFiles@1 + inputs: + contents: "*.egg-info" + displayName: "Delete egg-info directory" + - script: | - ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt - ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt + python -m pip freeze > installed.txt + python -m pip uninstall -y -r installed.txt displayName: "Uninstall all packages" - bash: | - ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - ${{ parameters.prefix }} python -m pip install dist/$SDIST + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + python -m pip install dist/$SDIST displayName: "Install from sdist" - script: | - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt - displayName: "Install test requirements" - - - script: | - ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f 
https://github.com/cupy/cupy/releases/v9.0.0 - ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html - displayName: "Install GPU requirements" - condition: eq(${{ parameters.gpu }}, true) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy - displayName: "Run CPU tests" - condition: eq(${{ parameters.gpu }}, false) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu - displayName: "Run GPU tests" - condition: eq(${{ parameters.gpu }}, true) + python -W error -c "import spacy" + displayName: "Test import" - script: | python -m spacy download ca_core_news_sm @@ -105,13 +91,21 @@ steps: displayName: 'Test assemble CLI vectors warning' condition: eq(variables['python_version'], '3.8') + - script: | + python -m pip install -U -r requirements.txt + displayName: "Install test requirements" + + - script: | + python -m pytest --pyargs spacy -W error + displayName: "Run CPU tests" + + - script: | + python -m pip install --pre thinc-apple-ops + python -m pytest --pyargs spacy + displayName: "Run CPU tests with thinc-apple-ops" + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) + - script: | python .github/validate_universe_json.py website/meta/universe.json displayName: 'Test website/meta/universe.json' condition: eq(variables['python_version'], '3.8') - - - script: | - ${{ parameters.prefix }} python -m pip install thinc-apple-ops - ${{ parameters.prefix }} python -m pytest --pyargs spacy - displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 005fb3609..927ac0d48 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -92,20 +92,3 @@ jobs: - template: .github/azure-steps.yml parameters: python_version: '$(python.version)' - 
architecture: 'x64' - -# - job: "TestGPU" -# dependsOn: "Validate" -# strategy: -# matrix: -# Python38LinuxX64_GPU: -# python.version: '3.8' -# pool: -# name: "LinuxX64_GPU" -# steps: -# - template: .github/azure-steps.yml -# parameters: -# python_version: '$(python.version)' -# architecture: 'x64' -# gpu: true -# num_build_jobs: 24 From 9c794644ab95a9263128aae0cf476a6607e497df Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Dec 2022 16:56:52 +0100 Subject: [PATCH 16/19] CI: Test thinc-apple-ops for python 3.10 --- .github/azure-steps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index e8734d5bf..731989b40 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -103,7 +103,7 @@ steps: python -m pip install --pre thinc-apple-ops python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) - script: | python .github/validate_universe_json.py website/meta/universe.json From 4297340d8c48e880ab19c4274b079de7bbd4c3e5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Dec 2022 13:53:58 +0100 Subject: [PATCH 17/19] CI and precommit hooks: switch to flake8==5.0.4 --- .pre-commit-config.yaml | 2 +- azure-pipelines.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b959262e3..df59697b1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: language_version: python3.7 additional_dependencies: ['click==8.0.4'] - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 + rev: 5.0.4 hooks: - id: flake8 args: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 927ac0d48..a26d26974 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ 
-31,7 +31,7 @@ jobs: inputs: versionSpec: "3.7" - script: | - pip install flake8==3.9.2 + pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: "flake8" From b0fb316ca9ef0bb163ae6e50220796b3f92881e7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 10:13:10 +0100 Subject: [PATCH 18/19] CI: Install thinc-apple-ops through extra (#11963) --- .github/azure-steps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 731989b40..7e3f94df6 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -100,7 +100,7 @@ steps: displayName: "Run CPU tests" - script: | - python -m pip install --pre thinc-apple-ops + python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) From 5b3b18d626dc85f0bd3a39e4483e9ec8c0b29868 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 13 Dec 2022 13:19:48 +0100 Subject: [PATCH 19/19] Set version to v3.3.2 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 42f059d16..b4ef29260 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.3.1" +__version__ = "3.3.2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects"