From 704938777d091207b2f939919d401d6862d8878c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 25 Nov 2022 13:00:57 +0100 Subject: [PATCH 01/10] Add smart_open requirement, update deprecated options (#11864) * Switch from deprecated `ignore_ext` to `compression` * Add upload/download test for local files --- requirements.txt | 1 + setup.cfg | 1 + spacy/cli/_util.py | 2 +- spacy/tests/test_cli.py | 16 ++++++++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d91a3b3d4..58cc6a2bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.8.0 pathy>=0.3.5 +smart-open>=5.2.1,<7.0.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index 82d4d2758..330dc8205 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,6 +53,7 @@ install_requires = # Third-party dependencies typer>=0.3.0,<0.8.0 pathy>=0.3.5 + smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 897964a88..872f69c88 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) if dest.exists() and not force: return None src = str(src) - with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: + with smart_open.open(src, mode="rb", compression="disable") as input_file: with dest.open(mode="wb") as output_file: shutil.copyfileobj(input_file, output_file) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8225e14f1..44e7a88f6 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -16,6 +16,7 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import 
validate_project_commands +from spacy.cli._util import upload_file, download_file from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -896,3 +897,18 @@ def test_project_check_requirements(reqs, output): pkg_resources.require("spacyunknowndoesnotexist12345") except pkg_resources.DistributionNotFound: assert output == _check_requirements([req.strip() for req in reqs.split("\n")]) + + +def test_upload_download_local_file(): + with make_tempdir() as d1, make_tempdir() as d2: + filename = "f.txt" + content = "content" + local_file = d1 / filename + remote_file = d2 / filename + with local_file.open(mode="w") as file_: + file_.write(content) + upload_file(local_file, remote_file) + local_file.unlink() + download_file(remote_file, local_file) + with local_file.open(mode="r") as file_: + assert file_.read() == content From 4dbedbbc7f7f4f7f38014a41ddeaea3c1c7268ed Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 28 Nov 2022 18:01:09 +0900 Subject: [PATCH 02/10] Don't throw an error if using displacy on an unset span key (#11845) * Don't throw an error if using displacy on an unset span key * List available keys in W117 --- spacy/displacy/__init__.py | 5 +++-- spacy/errors.py | 2 +- spacy/tests/test_displacy.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 7bb300afa..bc32001d7 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -228,12 +228,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: "kb_id": span.kb_id_ if span.kb_id_ else "", "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#", } - for span in doc.spans[spans_key] + for span in doc.spans.get(spans_key, []) ] tokens = [token.text for token in doc] if not spans: - 
warnings.warn(Warnings.W117.format(spans_key=spans_key)) + keys = list(doc.spans.keys()) + warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys)) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None settings = get_doc_settings(doc) return { diff --git a/spacy/errors.py b/spacy/errors.py index 3cc9fd494..a6d15ba0a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -199,7 +199,7 @@ class Warnings(metaclass=ErrorsWithCodes): W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is " "surprising to you, make sure the Doc was processed using a model " "that supports span categorization, and check the `doc.spans[spans_key]` " - "property manually if necessary.") + "property manually if necessary.\n\nAvailable keys: {keys}") W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation " "for the corpora used to train the language. Please check " "`nlp.meta[\"sources\"]` for any relevant links.") diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index ccc145b44..f298b38e0 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -203,6 +203,16 @@ def test_displacy_parse_spans_different_spans_key(en_vocab): ] +def test_displacy_parse_empty_spans_key(en_vocab): + """Test that having an unset spans key doesn't raise an error""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["custom"] = [Span(doc, 3, 6, "BANK")] + with pytest.warns(UserWarning, match="W117"): + spans = displacy.parse_spans(doc) + + assert isinstance(spans, dict) + + def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) From 990deb04a7d34e8c96673c9c41377508e29eaa51 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 2 Dec 2022 09:33:52 +0100 Subject: [PATCH 03/10] Fix 
spancat for zero suggestions (#11860) * Add test for spancat predict with zero suggestions * Fix spancat for zero suggestions * Undo changes to extract_spans * Use .sum() as in update --- spacy/pipeline/spancat.py | 5 +++- spacy/tests/pipeline/test_spancat.py | 43 ++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index ca9f1dab0..f288e171e 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe): DOCS: https://spacy.io/api/spancategorizer#predict """ indices = self.suggester(docs, ops=self.model.ops) - scores = self.model.predict((docs, indices)) # type: ignore + if indices.lengths.sum() == 0: + scores = self.model.ops.alloc2f(0, 0) + else: + scores = self.model.predict((docs, indices)) # type: ignore return indices, scores def set_candidates( diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 15256a763..e9db983d3 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping(): def test_zero_suggestions(): - # Test with a suggester that returns 0 suggestions + # Test with a suggester that can return 0 suggestions - @registry.misc("test_zero_suggester") - def make_zero_suggester(): - def zero_suggester(docs, *, ops=None): + @registry.misc("test_mixed_zero_suggester") + def make_mixed_zero_suggester(): + def mixed_zero_suggester(docs, *, ops=None): if ops is None: ops = get_current_ops() - return Ragged( - ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") - ) + spans = [] + lengths = [] + for doc in docs: + if len(doc) > 0 and len(doc) % 2 == 0: + spans.append((0, 1)) + lengths.append(1) + else: + lengths.append(0) + spans = ops.asarray2i(spans) + lengths_array = ops.asarray1i(lengths) + if len(spans) > 0: + output = Ragged(ops.xp.vstack(spans), 
lengths_array) + else: + output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array) + return output - return zero_suggester + return mixed_zero_suggester fix_random_seed(0) nlp = English() spancat = nlp.add_pipe( "spancat", - config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, + config={ + "suggester": {"@misc": "test_mixed_zero_suggester"}, + "spans_key": SPAN_KEY, + }, ) train_examples = make_examples(nlp) optimizer = nlp.initialize(get_examples=lambda: train_examples) @@ -397,6 +412,16 @@ def test_zero_suggestions(): assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) + # empty doc + nlp("") + # single doc with zero suggestions + nlp("one") + # single doc with one suggestion + nlp("two two") + # batch with mixed zero/one suggestions + list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"])) + # batch with no suggestions + list(nlp.pipe(["", "one", "three three three"])) def test_set_candidates(): From 2a19b0a8bdd7a0f7614cd6fe671d1a73af534869 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 2 Dec 2022 18:17:11 +0900 Subject: [PATCH 04/10] Config generation fails for GPU without transformers (#11899) If you don't have spacy-transformers installed, but try to use `init config` with the GPU flag, you'll get an error. The issue is that the `use_transformers` flag in the config is conflated with the GPU flag, and then there's an attempt to access transformers config info that may not exist. There may be a better way to do this, but this stops the error. 
--- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 58864883a..b961ac892 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -1,7 +1,7 @@ {# This is a template for training configs used for the quickstart widget in the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. #} -{%- set use_transformer = hardware != "cpu" -%} +{%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} [paths] From 5a2db6866a5e93e08014ce9264c4982fc55788e9 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 5 Dec 2022 17:43:23 +0900 Subject: [PATCH 05/10] Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928) * Switch ubuntu-latest to ubuntu-20.04 in main tests * Only use 20.04 for 3.6 --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9c3b92f06..0f7ea91f9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -41,7 +41,7 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-latest" + imageName: "ubuntu-20.04" python.version: "3.6" # Python36Windows: # imageName: "windows-latest" @@ -50,7 +50,7 @@ jobs: # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-latest" + # imageName: "ubuntu-20.04" # python.version: "3.7" Python37Windows: imageName: "windows-latest" From 
c4e5bc5a21cf128da2357dc5e6d20e50756088da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 7 Dec 2022 05:53:41 +0100 Subject: [PATCH 06/10] EditTreeLemmatizer: correctly add strings when initializing from labels (#11934) Strings in replacement nodes were not added to the `StringStore` when `EditTreeLemmatizer` was initialized from a set of labels. The corresponding test did not capture this because it added the strings through the examples that were passed to the initialization. This change fixes both this bug in the initialization and the 'shadowing' of the bug in the test. --- spacy/pipeline/edit_tree_lemmatizer.py | 4 +- .../pipeline/test_edit_tree_lemmatizer.py | 37 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 12f9b73a3..a56c9975e 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe): tree = dict(tree) if "orig" in tree: - tree["orig"] = self.vocab.strings[tree["orig"]] + tree["orig"] = self.vocab.strings.add(tree["orig"]) if "orig" in tree: - tree["subst"] = self.vocab.strings[tree["subst"]] + tree["subst"] = self.vocab.strings.add(tree["subst"]) trees.append(tree) diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index cf541e301..b12ca5dd4 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -60,10 +60,45 @@ def test_initialize_from_labels(): nlp2 = Language() lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer") lemmatizer2.initialize( - get_examples=lambda: train_examples, + # We want to check that the strings in replacement nodes are + # added to the string store. Avoid that they get added through + # the examples. 
+ get_examples=lambda: train_examples[:1], labels=lemmatizer.label_data, ) assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3} + assert lemmatizer2.label_data == { + "trees": [ + {"orig": "S", "subst": "s"}, + { + "prefix_len": 1, + "suffix_len": 0, + "prefix_tree": 0, + "suffix_tree": 4294967295, + }, + {"orig": "s", "subst": ""}, + { + "prefix_len": 0, + "suffix_len": 1, + "prefix_tree": 4294967295, + "suffix_tree": 2, + }, + { + "prefix_len": 0, + "suffix_len": 0, + "prefix_tree": 4294967295, + "suffix_tree": 4294967295, + }, + {"orig": "E", "subst": "e"}, + { + "prefix_len": 1, + "suffix_len": 0, + "prefix_tree": 5, + "suffix_tree": 4294967295, + }, + ], + "labels": (1, 3, 4, 6), + } def test_no_data(): From b83abde77fc2190f3b78d2303f7f30bf9c2826c4 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 7 Dec 2022 23:52:35 +0900 Subject: [PATCH 07/10] Add in errors used in the beam code that were removed at some point (#11935) I don't think there's any way to use the beam code at the moment, but as long as it's around the errors it refers to should also be present. --- spacy/errors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index a6d15ba0a..4c51bcd56 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes): "clear the existing vectors and resize the table.") E074 = ("Error interpreting compiled match pattern: patterns are expected " "to end with the attribute {attr}. 
Got: {bad_attr}.") + E079 = ("Error computing states in beam: number of predicted beams " + "({pbeams}) does not equal number of gold beams ({gbeams}).") + E080 = ("Duplicate state found in beam: {key}.") + E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " + "does not equal number of losses ({losses}).") E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " "match.") From 4e043b543018cdbb0d7bfc93e4c5a55d54182b6a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 08:45:35 +0100 Subject: [PATCH 08/10] Cast to uint64 for all array-based doc representations (#11933) * Convert all individual values explicitly to uint64 for array-based doc representations * Temporarily test with latest numpy v1.24.0rc * Remove unnecessary conversion from attr_t * Reduce number of individual casts * Convert specifically from int32 to uint64 * Revert "Temporarily test with latest numpy v1.24.0rc" This reverts commit eb0e3c5006515b9a7ff52bae59484c909b8a3f65. 
* Also use int32 in tests --- spacy/tests/doc/test_array.py | 4 ++-- spacy/tokens/doc.pyx | 2 ++ spacy/tokens/span.pyx | 4 ++-- spacy/training/example.pyx | 15 ++++++++------- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index c334cc6eb..1f2d7d999 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab): # head before start arr = doc.to_array(["HEAD"]) - arr[0] = -1 + arr[0] = numpy.int32(-1).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) # head after end arr = doc.to_array(["HEAD"]) - arr[0] = 5 + arr[0] = numpy.int32(5).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f2621292c..075bc4d15 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -359,6 +359,7 @@ cdef class Doc: for annot in annotations: if annot: if annot is heads or annot is sent_starts or annot is ent_iobs: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): if attrs.ndim == 1: attrs[i] = annot[i] @@ -1558,6 +1559,7 @@ cdef class Doc: for j, (attr, annot) in enumerate(token_annotations.items()): if attr is HEAD: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): array[i, j] = annot[i] elif attr is MORPH: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index c3495f497..99a5f43bd 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -299,7 +299,7 @@ cdef class Span: for ancestor in ancestors: ancestor_i = ancestor.i - self.c.start if ancestor_i in range(length): - array[i, head_col] = ancestor_i - i + array[i, head_col] = numpy.int32(ancestor_i - 
i).astype(numpy.uint64) # if there is no appropriate ancestor, define a new artificial root value = array[i, head_col] @@ -307,7 +307,7 @@ cdef class Span: new_root = old_to_new_root.get(ancestor_i, None) if new_root is not None: # take the same artificial root as a previous token from the same sentence - array[i, head_col] = new_root - i + array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64) else: # set this token as the new artificial root array[i, head_col] = 0 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index dfd337b9e..95b0f0de9 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot): if key not in IDS: raise ValueError(Errors.E974.format(obj="token", key=key)) elif key in ["ORTH", "SPACY"]: - pass + continue elif key == "HEAD": attrs.append(key) - values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) + row = [h-i if h is not None else 0 for i, h in enumerate(value)] elif key == "DEP": attrs.append(key) - values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) + row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value] elif key == "SENT_START": attrs.append(key) - values.append([to_ternary_int(v) for v in value]) + row = [to_ternary_int(v) for v in value] elif key == "MORPH": attrs.append(key) - values.append([vocab.morphology.add(v) for v in value]) + row = [vocab.morphology.add(v) for v in value] else: attrs.append(key) if not all(isinstance(v, str) for v in value): types = set([type(v) for v in value]) raise TypeError(Errors.E969.format(field=key, types=types)) from None - values.append([vocab.strings.add(v) for v in value]) - array = numpy.asarray(values, dtype="uint64") + row = [vocab.strings.add(v) for v in value] + values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row]) + array = numpy.array(values, 
dtype=numpy.uint64) return attrs, array.T From 5c49e821375f4f106cef64bf2f7c22f7cfcf403a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 10:13:10 +0100 Subject: [PATCH 09/10] CI: Install thinc-apple-ops through extra (#11963) --- .github/azure-steps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index e8bd0d212..ed69f611b 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -107,7 +107,7 @@ steps: displayName: "Run CPU tests" - script: | - python -m pip install --pre thinc-apple-ops + python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) From 39ccd67ba160207f862a91bbc5852d93fd876adc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 13 Dec 2022 13:20:51 +0100 Subject: [PATCH 10/10] Set version to v3.4.4 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 9efbe7432..98a95b99d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.4.3" +__version__ = "3.4.4" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects"