From effae12cbd27391e335ef3f279725934b0510f91 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Sep 2021 20:04:02 +0200 Subject: [PATCH 01/29] Update slow readers test to use textcat_multilabel (#9300) --- spacy/tests/training/test_readers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index f53660818..1f262c011 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -82,15 +82,15 @@ def test_cat_readers(reader, additional_config): [nlp] lang = "en" - pipeline = ["tok2vec", "textcat"] + pipeline = ["tok2vec", "textcat_multilabel"] [components] [components.tok2vec] factory = "tok2vec" - [components.textcat] - factory = "textcat" + [components.textcat_multilabel] + factory = "textcat_multilabel" """ config = Config().from_str(nlp_config_string) config["corpora"]["@readers"] = reader From a361df00cd65dd829a07d1256476dfd9639be4a9 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 27 Sep 2021 20:43:03 +0200 Subject: [PATCH 02/29] Raise E983 early on in docbin init (#9247) * raise E983 early on in docbin init * catch situation before error is raised * add more info on the spacy debug command --- spacy/errors.py | 6 ++++-- spacy/tokens/_serialize.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 135aacf92..b6659a041 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -657,7 +657,9 @@ class Errors: "{nO} - cannot add any more labels.") E923 = ("It looks like there is no proper sample data to initialize the " "Model of component '{name}'. To check your input data paths and " - "annotation, run: python -m spacy debug data config.cfg") + "annotation, run: python -m spacy debug data config.cfg " + "and include the same config override values you would specify " + "for the 'spacy train' command.") E924 = ("The '{name}' component does not seem to be initialized properly. " "This is likely a bug in spaCy, so feel free to open an issue: " "https://github.com/explosion/spaCy/issues") @@ -792,7 +794,7 @@ class Errors: "to token boundaries.") E982 = ("The `Token.ent_iob` attribute should be an integer indexing " "into {values}, but found {value}.") - E983 = ("Invalid key for '{dict}': {key}. Available keys: " + E983 = ("Invalid key(s) for '{dict}': {key}. 
Available keys: " "{keys}") E984 = ("Invalid component config for '{name}': component block needs either " "a key `factory` specifying the registered function used to " diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 868eb3eab..5be66c801 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -8,7 +8,7 @@ from thinc.api import NumpyOps from .doc import Doc from ..vocab import Vocab from ..compat import copy_reg -from ..attrs import SPACY, ORTH, intify_attr +from ..attrs import SPACY, ORTH, intify_attr, IDS from ..errors import Errors from ..util import ensure_path, SimpleFrozenList @@ -64,7 +64,11 @@ class DocBin: DOCS: https://spacy.io/api/docbin#init """ - attrs = sorted([intify_attr(attr) for attr in attrs]) + int_attrs = [intify_attr(attr) for attr in attrs] + if None in int_attrs: + non_valid = [attr for attr in attrs if intify_attr(attr) is None] + raise KeyError(Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())) from None + attrs = sorted(int_attrs) self.version = "0.1" self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] From e750c1760c5ad4324e606c8dcc528d1b735d3941 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Sep 2021 20:44:14 +0200 Subject: [PATCH 03/29] Restore tokenization timing in Language.evaluate (#9305) Restore tokenization timing steps that were accidentally removed in #6765. --- spacy/language.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 08fb63d4c..6abbc6f56 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1369,6 +1369,9 @@ class Language: scorer = Scorer(**kwargs) # reset annotation in predicted docs and time tokenization start_time = timer() + # this is purely for timing + for eg in examples: + self.make_doc(eg.reference.text) # apply all pipeline components for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) From a14ab7e8821dcc7ef39e0f04cc3fdfabf865e29d Mon Sep 17 00:00:00 2001 From: Martin Vallone Date: Thu, 30 Sep 2021 01:46:53 -0300 Subject: [PATCH 04/29] Adding PhruzzMatcher to spaCy universe (#9321) * Adding PhruzzMatcher to spaCy universe * Fixes to make the package work properly --- website/meta/universe.json | 46 +++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 28fe058eb..ee536c262 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3476,7 +3476,51 @@ "github": "bbieniek" }, "category": ["apis"] - } + }, + { + "id": "phruzz_matcher", + "title": "phruzz-matcher", + "slogan": "Phrase matcher using RapidFuzz", + "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO "perfect matches" due to typos or abbreviations between a Spacy doc and a list of phrases.", + "github": "mjvallone/phruzz-matcher", + "pip": "phruzz_matcher", + "code_example": [ + "import spacy", + "from spacy.language import Language", + "from phruzz_matcher.phrase_matcher import PhruzzMatcher", + "", + "famous_people = [", + " \"Brad Pitt\",", + " \"Demi Moore\",", + " \"Bruce Willis\",", + " \"Jim Carrey\",", + "]", + "", + "@Language.factory(\"phrase_matcher\")", + "def phrase_matcher(nlp: Language, name: str):", + " return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)", + "", + "nlp = spacy.blank('es')", + 
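        "# the \"phrase_matcher\" factory registered above is added by name",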
"nlp.add_pipe(\"phrase_matcher\")", + "", + "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")", + "print(f\"doc.ents: {doc.ents}\")", + "", + "#OUTPUT", + "#doc.ents: (brad pit, Demi Moore)", + ], + "thumb": "https://avatars.githubusercontent.com/u/961296?v=4", + "image": "", + "code_language": "python", + "author": "Martin Vallone", + "author_links": { + "github": "mjvallone", + "twitter": "vallotin", + "website": "https://fiqus.coop/" + }, + "category": ["pipeline", "research", "standalone"], + "tags": ["spacy", "python", "nlp", "ner"] + } ], "categories": [ From 78a88f7de77257c20ae00e311ed8233ee7921bac Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 30 Sep 2021 15:23:55 +0900 Subject: [PATCH 05/29] Fix invalid json --- website/meta/universe.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index ee536c262..2b56f7507 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3481,7 +3481,7 @@ "id": "phruzz_matcher", "title": "phruzz-matcher", "slogan": "Phrase matcher using RapidFuzz", - "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO "perfect matches" due to typos or abbreviations between a Spacy doc and a list of phrases.", + "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.", "github": "mjvallone/phruzz-matcher", "pip": "phruzz_matcher", "code_example": [ @@ -3507,7 +3507,7 @@ "print(f\"doc.ents: {doc.ents}\")", "", "#OUTPUT", - "#doc.ents: (brad pit, Demi Moore)", + "#doc.ents: (brad pit, Demi Moore)" ], "thumb": "https://avatars.githubusercontent.com/u/961296?v=4", "image": "", From b3192ddea3aef2f55c47bfcaba1614e3fd79bd1d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 30 Sep 2021 19:02:10 +0200 Subject: [PATCH 06/29] Sync thinc install dep in setup, fix test packaging (#9336) * Sync thinc install dep in setup * Add __init__.py to include package tests in package * Include *.toml in package --- MANIFEST.in | 2 +- setup.cfg | 2 +- spacy/tests/package/__init__.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/package/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index 99fc174bd..d022223cd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include include *.h -recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja +recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml include LICENSE include README.md include pyproject.toml diff --git a/setup.cfg b/setup.cfg index ff12d511a..fe484f92e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,7 +44,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.9,<8.1.0 + thinc>=8.0.10,<8.1.0 blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 diff --git a/spacy/tests/package/__init__.py b/spacy/tests/package/__init__.py new file mode 100644 index 000000000..e69de29bb From 42a76c758fa9828bb305318953519d6349d23e78 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 1 Oct 2021 11:17:11 +0200 Subject: [PATCH 07/29] Auto-format code with black (#9346) Co-authored-by: explosion-bot --- 
spacy/tokens/_serialize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 5be66c801..e7799d230 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -67,7 +67,9 @@ class DocBin: int_attrs = [intify_attr(attr) for attr in attrs] if None in int_attrs: non_valid = [attr for attr in attrs if intify_attr(attr) is None] - raise KeyError(Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())) from None + raise KeyError( + Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys()) + ) from None attrs = sorted(int_attrs) self.version = "0.1" self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] From 6e833b617a44a0495f4c7496253ee4f88814d7ed Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 1 Oct 2021 10:28:22 +0000 Subject: [PATCH 08/29] Updating Troubleshooting Docs (#9329) * Add link to Discussions FAQ * Remove old FAQ entries I think these are no longer relevant. - no-cache-dir: affected pip versions are *very* old now - narrow unicode: not an issue from py3.3+ - utf-8 osx: upstream bug closed in 2019 Some of the other issues are also maybe not frequent. --- website/docs/usage/index.md | 60 ++----------------------------------- 1 file changed, 3 insertions(+), 57 deletions(-) diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 665d334f8..707dd3215 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -284,7 +284,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test ## Troubleshooting guide {#troubleshooting} This section collects some of the most common errors you may come across when -installing, loading and using spaCy, as well as their solutions. +installing, loading and using spaCy, as well as their solutions. Also see the +[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226), +which is updated more frequently and covers more transitory issues. > #### Help us improve this guide > @@ -311,62 +313,6 @@ language's `Language` class instead, for example - - -``` -no such option: --no-cache-dir -``` - -The `download` command uses pip to install the pipeline packages and sets the -`--no-cache-dir` flag to prevent it from requiring too much memory. -[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching) -requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest -version of pip. To see which version you have installed, run `pip --version`. - - - - - -``` -sre_constants.error: bad character range -``` - -In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions -for tokenization to make it up to 2-3 times faster. But this also means that -it's very important now that you run spaCy with a wide unicode build of Python. -This means that the build has 1114111 unicode characters available, instead of -only 65535 in a narrow unicode build. You can check this by running the -following command: - -```bash -$ python -c "import sys; print(sys.maxunicode)" -``` - -If you're running a narrow unicode build, reinstall Python and use a wide -unicode build instead. You can also rebuild Python and set the -`--enable-unicode=ucs4` flag. - - - - - -``` -ValueError: unknown locale: UTF-8 -``` - -This error can sometimes occur on OSX and is likely related to a still -unresolved [Python bug](https://bugs.python.org/issue18378). 
However, it's easy -to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then -run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both -lines** for `LC_ALL` and `LANG`. - -```bash -$ export LC_ALL=en_US.UTF-8 -$ export LANG=en_US.UTF-8 -``` - - - ``` From 4192e715994ac46f8ded67608115153899457892 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 4 Oct 2021 12:19:02 +0200 Subject: [PATCH 09/29] Sync vocab in vectors and components sourced in configs (#9335) Since a component may reference anything in the vocab, share the full vocab when loading source components and vectors (which will include `strings` as of #8909). When loading a source component from a config, save and restore the vocab state after loading source pipelines, in particular to preserve the original state without vectors, since `[initialize.vectors] = null` skips rather than resets the vectors. The vocab references are not synced for components loaded with `Language.add_pipe(source=)` because the pipelines are already loaded and not necessarily with the same vocab. A warning could be added in `Language.create_pipe_from_source` that it may be necessary to save and reload before training, but it's a rare enough case that this kind of warning may be too noisy overall. --- spacy/language.py | 29 +++++++++++++++++++++-------- spacy/training/initialize.py | 14 ++++++-------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6abbc6f56..81d740d74 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -707,8 +707,9 @@ class Language: source_config = source.config.interpolate() pipe_config = util.copy_config(source_config["components"][source_name]) self._pipe_configs[name] = pipe_config - for s in source.vocab.strings: - self.vocab.strings.add(s) + if self.vocab.strings != source.vocab.strings: + for s in source.vocab.strings: + self.vocab.strings.add(s) return pipe, pipe_config["factory"] def add_pipe( @@ -1700,6 +1701,7 @@ class Language: # them here so they're only loaded once source_nlps = {} source_nlp_vectors_hashes = {} + vocab_b = None for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) @@ -1722,14 +1724,22 @@ class Language: raw_config=raw_config, ) else: + # We need the sourced components to reference the same + # vocab without modifying the current vocab state **AND** + # we still want to load the source model vectors to perform + # the vectors check. Since the source vectors clobber the + # current ones, we save the original vocab state and + # restore after this loop. Existing strings are preserved + # during deserialization, so they do not need any + # additional handling. 
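+            # (A sketch of the round trip, assuming byte-level fidelity:
+            # save vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"]),
+            # load the sourced pipelines, then nlp.vocab.from_bytes(vocab_b).
+            # Strings stay additive; clobbered vectors are rolled back.)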
+ if vocab_b is None: + vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"]) model = pipe_cfg["source"] if model not in source_nlps: - # We only need the components here and we intentionally - # do not load the model with the same vocab because - # this would cause the vectors to be copied into the - # current nlp object (all the strings will be added in - # create_pipe_from_source) - source_nlps[model] = util.load_model(model) + # Load with the same vocab, adding any strings + source_nlps[model] = util.load_model( + model, vocab=nlp.vocab, exclude=["lookups"] + ) source_name = pipe_cfg.get("component", pipe_name) listeners_replaced = False if "replace_listeners" in pipe_cfg: @@ -1756,6 +1766,9 @@ class Language: # Delete from cache if listeners were replaced if listeners_replaced: del source_nlps[model] + # Restore the original vocab after sourcing if necessary + if vocab_b is not None: + nlp.vocab.from_bytes(vocab_b) disabled_pipes = [*config["nlp"]["disabled"], *disable] nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp.batch_size = config["nlp"]["batch_size"] diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index bd014f75f..4eb8ea276 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -144,7 +144,12 @@ def load_vectors_into_model( ) -> None: """Load word vectors from an installed model or path into a model instance.""" try: - vectors_nlp = load_model(name) + # Load with the same vocab, which automatically adds the vectors to + # the current nlp object. Exclude lookups so they are not modified. + exclude = ["lookups"] + if not add_strings: + exclude.append("strings") + vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude) except ConfigValidationError as e: title = f"Config validation error for vectors {name}" desc = ( @@ -158,15 +163,8 @@ def load_vectors_into_model( if len(vectors_nlp.vocab.vectors.keys()) == 0: logger.warning(Warnings.W112.format(name=name)) - nlp.vocab.vectors = vectors_nlp.vocab.vectors for lex in nlp.vocab: lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. - for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) def init_tok2vec( From f87ae3cb7dcfa9d955d66ecbfb03aae8c58bd42e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 6 Oct 2021 06:13:18 +0200 Subject: [PATCH 10/29] Doc fixes in convert API (#9350) * add more info on the spacy debug command * formatting --- website/docs/api/cli.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 10ab2083e..268ea0703 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -260,16 +260,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] | Name | Description | | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `input_file` | Input file. ~~Path (positional)~~ | +| `input_path` | Input file or directory. ~~Path (positional)~~ | | `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. 
~~Optional[Path] \(option)~~ | | `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ | | `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | | `--seg-sents`, `-s` 2.2 | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ | -| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | +| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | | `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ | +| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ | | `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ | | `--lang`, `-l` 2.1 | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | +| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | From 48ba4e60f443eae620740aec1169ecb5dc2f625b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 7 Oct 2021 15:47:39 +0000 Subject: [PATCH 11/29] Add new style citation file (#9388) --- CITATION | 8 -------- CITATION.cff | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) delete mode 100644 CITATION create mode 100644 CITATION.cff diff --git a/CITATION b/CITATION deleted file mode 100644 index bdaa90677..000000000 --- a/CITATION +++ /dev/null @@ -1,8 +0,0 @@ -@software{spacy, - author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane}, - title = {{spaCy: Industrial-strength Natural Language Processing in Python}}, - year = 2020, - publisher = {Zenodo}, - doi = {10.5281/zenodo.1212303}, - url = {https://doi.org/10.5281/zenodo.1212303} -} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..88c05b2a3 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,16 @@ +cff-version: 1.2.0 +preferred-citation: + type: article + message: "If you use spaCy, please cite it as below." + authors: + - family-names: "Honnibal" + given-names: "Matthew" + - family-names: "Montani" + given-names: "Ines" + - family-names: "Van Landeghem" + given-names: "Sofie" + - family-names: "Boyd" + given-names: "Adriane" + title: "spaCy: Industrial-strength Natural Language Processing in Python" + doi: "10.5281/zenodo.1212303" + year: 2020 From 5dbe4e8392a1ea5675426a3e4cab2a74d2a0de93 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 11 Oct 2021 15:41:32 +0900 Subject: [PATCH 12/29] Update new issue config with Python 3.10 info Also adds note that Install issues go to Discussions. 
--- .github/ISSUE_TEMPLATE/config.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 09de1cd05..fce1a1064 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,11 @@ blank_issues_enabled: false contact_links: + - name: ⚠️ Python 3.10 Support + url: https://github.com/explosion/spaCy/discussions/9418 + about: Python 3.10 wheels haven't been released yet, see the link for details. - name: 🗯 Discussions Forum url: https://github.com/explosion/spaCy/discussions - about: Usage questions, general discussion and anything else that isn't a bug report. + about: Install issues, usage questions, general discussion and anything else that isn't a bug report. - name: 📖 spaCy FAQ & Troubleshooting url: https://github.com/explosion/spaCy/discussions/8226 about: Before you post, check out the FAQ for answers to common community questions! From 2a7e3273105651120c49c06c54da1c776b1ae9fa Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 11 Oct 2021 08:26:13 +0000 Subject: [PATCH 13/29] Fix Dependency Matcher Ordering Issue (#9337) * Fix inconsistency This makes the failing test pass, so that behavior is consistent whether patterns are added in one call or two. The issue is that the hash for patterns depended on the index of the pattern in the list of current patterns, not the list of total patterns, so a second call would get identical match ids. * Add illustrative test case * Add failing test for remove case Patterns are not removed from the internal matcher on calls to remove, which causes spurious weird matches (or misses). * Fix removal issue Remove patterns from the internal matcher. * Check that the single add call also gets no matches --- spacy/matcher/dependencymatcher.pyx | 7 +- .../tests/matcher/test_dependency_matcher.py | 84 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 0cda37012..b667e6b2f 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -177,13 +177,14 @@ cdef class DependencyMatcher: # Add 'RIGHT_ATTRS' to self._patterns[key] _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns] + pattern_offset = len(self._patterns[key]) self._patterns[key].extend(_patterns) # Add each node pattern of all the input patterns individually to the # matcher. This enables only a single instance of Matcher to be used. # Multiple adds are required to track each node pattern. 
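        # (The internal Matcher keys come from the enumerate() index below,
        # so the count starts at pattern_offset to keep keys unique when
        # add() is called more than once under the same match id.)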
tokens_to_key_list = [] - for i, current_patterns in enumerate(_patterns): + for i, current_patterns in enumerate(_patterns, start=pattern_offset): # Preallocate list space tokens_to_key = [None] * len(current_patterns) @@ -263,7 +264,9 @@ cdef class DependencyMatcher: self._raw_patterns.pop(key) self._tree.pop(key) self._root.pop(key) - self._tokens_to_key.pop(key) + for mklist in self._tokens_to_key.pop(key): + for mkey in mklist: + self._matcher.remove(mkey) def _get_keys_to_position_maps(self, doc): """ diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 0e1eae588..61ae43c52 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer): assert doc_match[0] == span_match[0] for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]): assert doc_t_i == span_t_i + offset + + +def test_dependency_matcher_order_issue(en_tokenizer): + # issue from #9263 + doc = en_tokenizer("I like text") + doc[2].head = doc[1] + + # this matches on attrs but not rel op + pattern1 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "text"}, + "REL_OP": "<", + }, + ] + + # this matches on rel op but not attrs + pattern2 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "fish"}, + "REL_OP": ">", + }, + ] + + matcher = DependencyMatcher(en_tokenizer.vocab) + + # This should behave the same as the next pattern + matcher.add("check", [pattern1, pattern2]) + matches = matcher(doc) + + assert matches == [] + + # use a new matcher + matcher = DependencyMatcher(en_tokenizer.vocab) + # adding one at a time under same label gets a match + matcher.add("check", [pattern1]) + matcher.add("check", [pattern2]) + matches = matcher(doc) + + assert matches == [] + + +def test_dependency_matcher_remove(en_tokenizer): + # issue from #9263 + doc = en_tokenizer("The red book") + doc[1].head = doc[2] + + # this matches + pattern1 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "red"}, + "REL_OP": ">", + }, + ] + + # add and then remove it + matcher = DependencyMatcher(en_tokenizer.vocab) + matcher.add("check", [pattern1]) + matcher.remove("check") + + # this matches on rel op but not attrs + pattern2 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "blue"}, + "REL_OP": ">", + }, + ] + + # Adding this new pattern with the same label, which should not match + matcher.add("check", [pattern2]) + matches = matcher(doc) + + assert matches == [] From 5003a9c3c7299830bbdf73f77bed6e4076428a81 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Oct 2021 10:56:14 +0200 Subject: [PATCH 14/29] Move core training logic in CLI into standalone function (#9398) --- spacy/cli/train.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9fd87dbc1..664fc2aaf 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict, Any from pathlib import Path from wasabi import msg import typer @@ -7,7 +7,7 @@ import sys from ._util import app, Arg, Opt, parse_config_overrides, 
show_validation_error from ._util import import_code, setup_gpu -from ..training.loop import train +from ..training.loop import train as train_nlp from ..training.initialize import init_nlp from .. import util @@ -40,6 +40,18 @@ def train_cli( DOCS: https://spacy.io/api/cli#train """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) + + +def train( + config_path: Path, + output_path: Optional[Path] = None, + *, + use_gpu: int = -1, + overrides: Dict[str, Any] = util.SimpleFrozenDict(), +): # Make sure all files and paths exists if they are needed if not config_path or (str(config_path) != "-" and not config_path.exists()): msg.fail("Config file not found", config_path, exits=1) @@ -50,8 +62,6 @@ def train_cli( output_path.mkdir(parents=True) msg.good(f"Created output directory: {output_path}") msg.info(f"Saving to output directory: {output_path}") - overrides = parse_config_overrides(ctx.args) - import_code(code_path) setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides, interpolate=False) @@ -60,4 +70,4 @@ def train_cli( nlp = init_nlp(config, use_gpu=use_gpu) msg.good("Initialized pipeline") msg.divider("Training pipeline") - train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) + train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) From 3b144a3a515469f2cb17e79d9a54fcc580db0a62 Mon Sep 17 00:00:00 2001 From: Jette16 <33116335+Jette16@users.noreply.github.com> Date: Thu, 23 Sep 2021 14:31:42 +0200 Subject: [PATCH 15/29] Add universe test (#9278) * Added test for universe.json * Added contributor agreement * Ran black on test_universe_json.py --- .github/contributors/Jette16.md | 106 +++++++++++++++++++++ .gitignore | 1 + setup.py | 1 + spacy/tests/universe/test_universe_json.py | 17 ++++ 4 files changed, 125 insertions(+) create mode 100644 .github/contributors/Jette16.md create mode 100644 spacy/tests/universe/test_universe_json.py diff --git a/.github/contributors/Jette16.md b/.github/contributors/Jette16.md new file mode 100644 index 000000000..c064f1d4f --- /dev/null +++ b/.github/contributors/Jette16.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Henriette Behr | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 23.09.2021 | +| GitHub username | Jette16 | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index ac72f2bbf..60036a475 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt +spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/setup.py b/setup.py index fcc124a43..03a1e01dd 100755 --- a/setup.py +++ b/setup.py @@ -81,6 +81,7 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", + ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe", } diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py new file mode 100644 index 000000000..295889186 --- /dev/null +++ b/spacy/tests/universe/test_universe_json.py @@ -0,0 +1,17 @@ +import json +import re +from pathlib import Path + + +def test_universe_json(): + + root_dir = Path(__file__).parent + universe_file = root_dir / "universe.json" + + with universe_file.open() as f: + universe_data = json.load(f) + for entry in universe_data["resources"]: + if "github" in entry: + assert not re.match( + r"^(http:)|^(https:)", entry["github"] + ), "Github field should be user/repo, not a url" From a5231cb044ed65e75b83ec19685436d9e897ae7d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 11 Oct 2021 11:13:35 +0200 Subject: [PATCH 16/29] Remove traces of lexemes from vocab serialization (#9400) --- spacy/vocab.pyx | 1 - website/docs/api/vocab.md | 1 - 2 files changed, 2 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 13dd675af..7af780457 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -530,7 +530,6 @@ cdef class Vocab: setters = { "strings": lambda b: self.strings.from_bytes(b), - "lexemes": lambda b: self.lexemes_from_bytes(b), "vectors": lambda b: serialize_vectors(b), "lookups": lambda b: self.lookups.from_bytes(b), } diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 40a3c3b22..c37b27a0e 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | --------- | ----------------------------------------------------- | | `strings` | The strings in the [`StringStore`](/api/stringstore). | -| `lexemes` | The lexeme data. | | `vectors` | The word vectors, if available. | | `lookups` | The lookup tables, if available. 
| From fd7edbc6456a34c29bd68ea0981d83269e6be478 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 11 Oct 2021 11:17:18 +0200 Subject: [PATCH 17/29] Fix types descriptions of sm and sent models (#9401) --- website/src/templates/models.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 21ade5e36..554823ebf 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -34,6 +34,7 @@ const MODEL_META = { core_sm: 'Vocabulary, syntax, entities', dep: 'Vocabulary, syntax', ent: 'Named entities', + sent: 'Sentence boundaries', pytt: 'PyTorch Transformers', trf: 'Transformers', vectors: 'Word vectors', @@ -195,6 +196,7 @@ const Model = ({ const [isError, setIsError] = useState(true) const [meta, setMeta] = useState({}) const { type, genre, size } = getModelComponents(name) + const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [ name, compatibility, @@ -231,7 +233,7 @@ const Model = ({ const rows = [ { label: 'Language', tag: langId, content: langName }, - { label: 'Type', tag: type, content: MODEL_META[type] }, + { label: 'Type', tag: type, content: MODEL_META[display_type] }, { label: 'Genre', tag: genre, content: MODEL_META[genre] }, { label: 'Size', tag: size, content: meta.sizeFull }, { label: 'Components', content: components, help: MODEL_META.components }, From fd759a881b02a7bc3488b1d9c005d5849cfc05f9 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 11 Oct 2021 09:38:45 +0000 Subject: [PATCH 18/29] Fix inconsistent lemmas (#9405) * Add util function to unique lists and preserve order * Use unique function instead of list(set()) list(set()) has the issue that it's not consistent between runs of the Python interpreter, so order can vary. list(set()) calls were left in a few places where they were behind calls to sorted(). I think in this case the calls to list() can be removed, but this commit doesn't do that. * Use the existing pattern for this --- spacy/lang/ca/lemmatizer.py | 2 +- spacy/lang/fr/lemmatizer.py | 2 +- spacy/lang/nl/lemmatizer.py | 2 +- spacy/lang/ru/lemmatizer.py | 4 ++-- spacy/util.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py index 2518eb720..2fd012912 100644 --- a/spacy/lang/ca/lemmatizer.py +++ b/spacy/lang/ca/lemmatizer.py @@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index bb5a270ab..c6422cf96 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 6c025dcf6..4f6b2ef30 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer): return forms else: oov_forms.append(form) - forms = list(set(oov_forms)) + forms = list(dict.fromkeys(oov_forms)) # Back-off through remaining return value candidates. 
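        # (dict.fromkeys() preserves first-seen order, unlike set(), so the
        # returned lemma candidates are stable across interpreter runs.)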
if forms: for form in forms: diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 399cd174c..92bec4c8c 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -56,7 +56,7 @@ class RussianLemmatizer(Lemmatizer): if not len(filtered_analyses): return [string.lower()] if morphology is None or (len(morphology) == 1 and POS in morphology): - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])) if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): features_to_compare = ["Case", "Number", "Gender"] elif univ_pos == "NUM": @@ -87,7 +87,7 @@ class RussianLemmatizer(Lemmatizer): filtered_analyses.append(analysis) if not len(filtered_analyses): return [string.lower()] - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])) def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/util.py b/spacy/util.py index b49bd096f..0aa7c4c17 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1403,7 +1403,7 @@ def get_arg_names(func: Callable) -> List[str]: RETURNS (List[str]): The argument names. """ argspec = inspect.getfullargspec(func) - return list(set([*argspec.args, *argspec.kwonlyargs])) + return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs])) def combine_score_weights( From b53e39455e6d5121337fa64dd11dbc0175e1568b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 11 Oct 2021 09:51:19 +0000 Subject: [PATCH 19/29] Fix UD POS docs links (fix #9013) (#9407) * Fix UD POS docs links (fix #9013) The previous link seems to have been for UD v1. * Fix link --- website/docs/api/token.md | 4 ++-- website/docs/usage/101/_pos-deps.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 44c92d1ee..44a2ea9e8 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -474,8 +474,8 @@ The L2 norm of the token's vector representation. | `like_email` | Does the token resemble an email address? ~~bool~~ | | `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | | `is_stop` | Is the token part of a "stop list"? ~~bool~~ | -| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | -| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | +| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ | +| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ | | `tag` | Fine-grained part-of-speech. ~~int~~ | | `tag_` | Fine-grained part-of-speech. ~~str~~ | | `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index a531b245e..93ad0961a 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -25,7 +25,7 @@ for token in doc: > - **Text:** The original word text. > - **Lemma:** The base form of the word. -> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) +> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/) > part-of-speech tag. 
> - **Tag:** The detailed part-of-speech tag. > - **Dep:** Syntactic dependency, i.e. the relation between tokens. From 1fa7c4e73b950a2f8501eec53eaff883555f4db7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Oct 2021 13:56:24 +0200 Subject: [PATCH 20/29] Support issue marker via pytest --- setup.cfg | 3 ++- spacy/tests/conftest.py | 15 +++++++++++++++ spacy/tests/regression/test_issue8168.py | 2 ++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index fe484f92e..2e7be5e12 100644 --- a/setup.cfg +++ b/setup.cfg @@ -122,7 +122,8 @@ exclude = [tool:pytest] markers = - slow + slow: mark a test as slow + issue: reference specific issue [mypy] ignore_missing_imports = True diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index a5dedcc87..10982bac1 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -4,6 +4,7 @@ from spacy.util import get_lang_class def pytest_addoption(parser): parser.addoption("--slow", action="store_true", help="include slow tests") + parser.addoption("--issue", action="store", help="test specific issues") def pytest_runtest_setup(item): @@ -16,10 +17,24 @@ def pytest_runtest_setup(item): # options weren't given. return item.config.getoption(f"--{opt}", False) + # Integration of boolean flags for opt in ["slow"]: if opt in item.keywords and not getopt(opt): pytest.skip(f"need --{opt} option to run") + # Special integration to mark tests with issue numbers + issues = getopt("issue") + if isinstance(issues, str): + if "issue" in item.keywords: + # Convert issues provided on the CLI to list of ints + issue_nos = [int(issue.strip()) for issue in issues.split(",")] + # Get all issues specified by decorators and check if they're provided + issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")] + if not any([ref in issue_nos for ref in issue_refs]): + pytest.skip(f"not referencing specified issues: {issue_nos}") + else: + pytest.skip("not referencing any issues") + # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py index fbddf643c..e3f3b5cfa 100644 --- a/spacy/tests/regression/test_issue8168.py +++ b/spacy/tests/regression/test_issue8168.py @@ -1,6 +1,8 @@ +import pytest from spacy.lang.en import English +@pytest.mark.issue(8168) def test_issue8168(): nlp = English() ruler = nlp.add_pipe("entity_ruler") From efe5beefe07099b396cd270b5e5bd719b2ec1d84 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 11 Oct 2021 12:57:45 +0000 Subject: [PATCH 21/29] Add test for case where parser overwrite annotations (#9406) * Add test for case where parser overwrite annotations * Move test to its own file Also add note about how other tokens modify results. 
* Fix xfail decorator --- spacy/tests/regression/test_issue7716.py | 54 ++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 spacy/tests/regression/test_issue7716.py diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py new file mode 100644 index 000000000..811952792 --- /dev/null +++ b/spacy/tests/regression/test_issue7716.py @@ -0,0 +1,54 @@ +import pytest +from thinc.api import Adam +from spacy.attrs import NORM +from spacy.vocab import Vocab +from spacy import registry +from spacy.training import Example +from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.tokens import Doc +from spacy.pipeline import DependencyParser + + +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + +@pytest.fixture +def parser(vocab): + vocab.strings.add("ROOT") + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser = DependencyParser(vocab, model) + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") + parser.initialize(lambda: [_parser_example(parser)]) + sgd = Adam(0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=["a", "b", "c", "d"]) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) + return parser + + +@pytest.mark.xfail(reason="Not fixed yet") +def test_partial_annotation(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + doc[2].is_sent_start = False + # Note that if the following line is used, then doc[2].is_sent_start == False + # doc[3].is_sent_start = False + + doc = parser(doc) + assert doc[2].is_sent_start == False From f64e39fa4934fa98603e4e5d9ec1bbd86926e776 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Mon, 11 Oct 2021 15:43:27 +0200 Subject: [PATCH 22/29] Install explosionbot as a github action (#9420) --- .github/workflows/explosionbot.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/explosionbot.yml diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml new file mode 100644 index 000000000..2589c56c8 --- /dev/null +++ b/.github/workflows/explosionbot.yml @@ -0,0 +1,26 @@ +name: Explosion Bot + +on: + issue_comment: + types: + - created + - edited + +jobs: + explosion-bot: + runs-on: ubuntu-18.04 + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: echo "$GITHUB_CONTEXT" + - uses: actions/checkout@v1 + - uses: actions/setup-python@v1 + - name: Install and run explosion-bot + run: | + pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot + python -m explosionbot + env: + INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} + INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} + ENABLED_COMMANDS: "test_gpu" \ No newline at end of file From 2fb420ec23c9fa596bb00a01a1c0d2416eb0264d Mon Sep 17 00:00:00 2001 From: Ryn Daniels Date: Mon, 11 Oct 2021 18:20:48 +0200 Subject: [PATCH 23/29] Add allowed_teams to the explosion-bot config --- .github/workflows/explosionbot.yml | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 2589c56c8..7d9ee45e9 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -23,4 +23,5 @@ jobs: env: INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} - ENABLED_COMMANDS: "test_gpu" \ No newline at end of file + ENABLED_COMMANDS: "test_gpu" + ALLOWED_TEAMS: "spaCy" \ No newline at end of file From 6425b9a1c4cfc2ef6b712d426e55b6f9c6652394 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Tue, 12 Oct 2021 21:39:14 +0800 Subject: [PATCH 24/29] Include JsonlCorpus from the imports (#9431) --- spacy/training/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 055f30f42..34cde0ba9 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,4 +1,4 @@ -from .corpus import Corpus # noqa: F401 +from .corpus import Corpus, JsonlCorpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 From 5e8e8525f02656917c95cd74afa4aa61af818edf Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 12 Oct 2021 19:56:44 +0200 Subject: [PATCH 25/29] fix W108 filter (#9438) * remove text argument from W108 to enable 'once' filtering * include the option of partial POS annotation * fix typo * Update spacy/errors.py Co-authored-by: Adriane Boyd --- spacy/errors.py | 6 +++--- spacy/pipeline/lemmatizer.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b6659a041..fc44f6ba3 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -25,7 +25,7 @@ def setup_default_warnings(): filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) # warn once about lemmatizer without required POS - filter_warning("once", error_msg="[W108]") + filter_warning("once", error_msg=Warnings.W108) def filter_warning(action: str, error_msg: str): @@ -170,8 +170,8 @@ class Warnings: "call the {matcher} on each Doc object.") W107 = ("The property `Doc.{prop}` is deprecated. Use " "`Doc.has_annotation(\"{attr}\")` instead.") - W108 = ("The rule-based lemmatizer did not find POS annotation for the " - "token '{text}'. Check that your pipeline includes components that " + W108 = ("The rule-based lemmatizer did not find POS annotation for one or " + "more tokens. Check that your pipeline includes components that " "assign token.pos, typically 'tagger'+'attribute_ruler' or " "'morphologizer'.") W109 = ("Unable to save user hooks while serializing the doc. Re-add any " diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 2f436c57a..b2338724d 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -184,7 +184,7 @@ class Lemmatizer(Pipe): univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): if univ_pos == "": - warnings.warn(Warnings.W108.format(text=string)) + warnings.warn(Warnings.W108) return [string.lower()] # See Issue #435 for example of where this logic is requied. 
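        # (W108 is now filtered to fire only once per process in
        # setup_default_warnings(), which is why the per-token text argument
        # was dropped from the message.)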
if self.is_base_form(token): From 2e3d6b8b5a3de49972839308179dceec51b327ef Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 13 Oct 2021 10:47:56 +0200 Subject: [PATCH 26/29] Fix test for spancat (#9446) * fix test for spancat * increase tolerance for almost equal checks * Update spacy/tests/test_models.py * Update spacy/tests/test_models.py --- spacy/tests/pipeline/test_spancat.py | 2 +- spacy/tests/regression/test_issue5501-6000.py | 4 ++-- spacy/tests/test_models.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 7b759f8f6..d4d0617d7 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -114,7 +114,7 @@ def test_make_spangroup(max_positive, nr_results): doc = nlp.make_doc("Greater London") ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) indices = ngram_suggester([doc])[0].dataXd - assert_array_equal(indices, numpy.asarray([[0, 1], [1, 2], [0, 2]])) + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) labels = ["Thing", "City", "Person", "GreatCity"] scores = numpy.asarray( [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index a35de92fa..355ffffeb 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -49,8 +49,8 @@ def test_issue5551(textcat_config): # All results should be the same because of the fixed seed assert len(results) == 3 ops = get_current_ops() - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1])) - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2])) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) def test_issue5838(): diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 47540198a..2306cabb7 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): assert_array_almost_equal( model1.ops.to_numpy(get_all_params(model1)), model2.ops.to_numpy(get_all_params(model2)), + decimal=5, ) From d2645b2e03b7777a664b84196f893327b06bb9eb Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 13 Oct 2021 10:48:35 +0200 Subject: [PATCH 27/29] Fix test for spancat (#9446) * fix test for spancat * increase tolerance for almost equal checks * Update spacy/tests/test_models.py * Update spacy/tests/test_models.py From 78365452d3c1ee3b47c4d12dc43813b655064eac Mon Sep 17 00:00:00 2001 From: Jette16 <33116335+Jette16@users.noreply.github.com> Date: Wed, 13 Oct 2021 14:13:06 +0200 Subject: [PATCH 28/29] Moved test for universe into .github folder (#9447) * Moved universe-test into .github folder * Cleaned code * CHanged a file name --- .github/azure-steps.yml | 5 +++++ .../validate_universe_json.py | 12 +++++++----- .gitignore | 1 - setup.py | 1 - 4 files changed, 12 insertions(+), 7 deletions(-) rename spacy/tests/universe/test_universe_json.py => .github/validate_universe_json.py (72%) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 50e81799e..543804b9f 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -100,3 +100,8 @@ steps: python -m spacy assemble 
ner_source_md.cfg output_dir 2>&1 | grep -q W113 displayName: 'Test assemble CLI vectors warning' condition: eq(variables['python_version'], '3.8') + + - script: | + python .github/validate_universe_json.py website/meta/universe.json + displayName: 'Test website/meta/universe.json' + condition: eq(variables['python_version'], '3.8') diff --git a/spacy/tests/universe/test_universe_json.py b/.github/validate_universe_json.py similarity index 72% rename from spacy/tests/universe/test_universe_json.py rename to .github/validate_universe_json.py index 295889186..b96b7b347 100644 --- a/spacy/tests/universe/test_universe_json.py +++ b/.github/validate_universe_json.py @@ -1,13 +1,11 @@ import json import re +import sys from pathlib import Path -def test_universe_json(): - - root_dir = Path(__file__).parent - universe_file = root_dir / "universe.json" - +def validate_json(document): + universe_file = Path(document) with universe_file.open() as f: universe_data = json.load(f) for entry in universe_data["resources"]: @@ -15,3 +13,7 @@ def test_universe_json(): assert not re.match( r"^(http:)|^(https:)", entry["github"] ), "Github field should be user/repo, not a url" + + +if __name__ == "__main__": + validate_json(str(sys.argv[1])) diff --git a/.gitignore b/.gitignore index 60036a475..ac72f2bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt -spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/setup.py b/setup.py index 03a1e01dd..fcc124a43 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,6 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", - ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe", } From 72711dc2c9f02a2118625a03140ab19b574d62be Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Wed, 13 Oct 2021 16:29:19 +0200 Subject: [PATCH 29/29] Update universe example codes (#9422) * Update universe plugins * Adjust azure trigger * Add init to tests/universe * deliberatly trying to break the universe to see if the CI catches it * revert Co-authored-by: svlandeg --- azure-pipelines.yml | 2 ++ website/meta/universe.json | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ac80b8a10..844946845 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -16,6 +16,8 @@ pr: exclude: - "website/*" - "*.md" + include: + - "website/meta/universe.json" jobs: # Perform basic checks for most important errors (syntax etc.) 
Uses the config diff --git a/website/meta/universe.json b/website/meta/universe.json index 2b56f7507..7438a8932 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1363,20 +1363,19 @@ "url": "https://explosion.ai/demos/sense2vec", "code_example": [ "import spacy", - "from sense2vec import Sense2VecComponent", "", - "nlp = spacy.load('en')", - "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", - "nlp.add_pipe(s2v)", + "nlp = spacy.load(\"en_core_web_sm\")", + "s2v = nlp.add_pipe(\"sense2vec\")", + "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")", "", "doc = nlp(\"A sentence about natural language processing.\")", - "assert doc[3].text == 'natural language processing'", - "freq = doc[3]._.s2v_freq", - "vector = doc[3]._.s2v_vec", - "most_similar = doc[3]._.s2v_most_similar(3)", - "# [(('natural language processing', 'NOUN'), 1.0),", - "# (('machine learning', 'NOUN'), 0.8986966609954834),", - "# (('computer vision', 'NOUN'), 0.8636297583580017)]" + "assert doc[3:6].text == \"natural language processing\"", + "freq = doc[3:6]._.s2v_freq", + "vector = doc[3:6]._.s2v_vec", + "most_similar = doc[3:6]._.s2v_most_similar(3)", + "# [(('machine learning', 'NOUN'), 0.8986967),", + "# (('computer vision', 'NOUN'), 0.8636297),", + "# (('deep learning', 'NOUN'), 0.8573361)]" ], "category": ["pipeline", "standalone", "visualizers"], "tags": ["vectors"], @@ -2970,11 +2969,10 @@ "github": "thomasthiebaud/spacy-fastlang", "pip": "spacy_fastlang", "code_example": [ - "import spacy", - "from spacy_fastlang import LanguageDetector", + "import spacy_fastlang", "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(LanguageDetector())", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"language_detector\")", "doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')", "", "assert doc._.language == 'en'",
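        "# doc._.language_score holds the detector's confidence",
        "# (spacy itself must also be imported for spacy.load above)",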