From 88d35450dcedd89fa739640d8a8d3e62f3643b4a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 14:53:18 +0200 Subject: [PATCH 01/23] Rename test helper method with non-test_ name (#11701) --- spacy/tests/test_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 2306cabb7..d91ed1201 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -23,7 +23,7 @@ def get_textcat_bow_kwargs(): def get_textcat_cnn_kwargs(): - return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} + return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -65,7 +65,7 @@ def get_tok2vec_kwargs(): } -def test_tok2vec(): +def make_test_tok2vec(): return build_Tok2Vec_model(**get_tok2vec_kwargs()) From 8740e4341f03fe2720f50c64e2f94a339d6bd4be Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 14:54:54 +0200 Subject: [PATCH 02/23] Update languages and version in README and website (#11694) --- README.md | 6 +++--- website/meta/languages.json | 28 ++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d9ef83e01..abfc3da67 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ be used in real products. spaCy comes with [pretrained pipelines](https://spacy.io/models) and -currently supports tokenization and training for **60+ languages**. It features +currently supports tokenization and training for **70+ languages**. It features state-of-the-art speed and **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a @@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the MIT license. -💫 **Version 3.4.0 out now!** +💫 **Version 3.4 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -79,7 +79,7 @@ more people can benefit from it. ## Features -- Support for **60+ languages** +- Support for **70+ languages** - **Trained pipelines** for different languages and tasks - Multi-task learning with pretrained **transformers** like BERT - Support for pretrained **word vectors** and embeddings diff --git a/website/meta/languages.json b/website/meta/languages.json index 0028b4a5f..bd1535c90 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -4,12 +4,22 @@ "code": "af", "name": "Afrikaans" }, + { + "code": "am", + "name": "Amharic", + "has_examples": true + }, { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, + { + "code": "az", + "name": "Azerbaijani", + "has_examples": true + }, { "code": "bg", "name": "Bulgarian", @@ -65,7 +75,7 @@ { "code": "dsb", "name": "Lower Sorbian", - "has_examples": true + "has_examples": true }, { "code": "el", @@ -142,6 +152,11 @@ "code": "ga", "name": "Irish" }, + { + "code": "grc", + "name": "Ancient Greek", + "has_examples": true + }, { "code": "gu", "name": "Gujarati", @@ -172,7 +187,7 @@ { "code": "hsb", "name": "Upper Sorbian", - "has_examples": true + "has_examples": true }, { "code": "hu", @@ -260,6 +275,10 @@ "example": "Адамга эң кыйыны — күн сайын адам болуу", "has_examples": true }, + { + "code": "la", + "name": "Latin" + }, { "code": "lb", "name": "Luxembourgish", @@ -448,6 +467,11 @@ "example": "นี่คือประโยค", "has_examples": true }, + { + "code": "ti", + "name": "Tigrinya", + "has_examples": true + }, { "code": "tl", "name": "Tagalog" From 0a9859ba01c8a51842218e1817dff7ff784951df Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 19:38:23 +0200 Subject: [PATCH 03/23] Reduce python 3.10 in CI to one OS (#11703) --- azure-pipelines.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 357cce835..eea07cb7a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -76,15 +76,15 @@ jobs: # Python39Mac: # imageName: "macos-latest" # python.version: "3.9" - Python310Linux: - imageName: "ubuntu-latest" - python.version: "3.10" + # Python310Linux: + # imageName: "ubuntu-latest" + # python.version: "3.10" Python310Windows: imageName: "windows-latest" python.version: "3.10" - Python310Mac: - imageName: "macos-latest" - python.version: "3.10" + # Python310Mac: + # imageName: "macos-latest" + # python.version: "3.10" Python311Linux: imageName: 'ubuntu-latest' python.version: '3.11.0-rc.2' From a9139907a943f0cc91dac0338aa43caa38939778 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 26 Oct 2022 09:15:13 +0300 Subject: [PATCH 04/23] update github actions to deal with deprecations (#11702) --- .github/workflows/autoblack.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 8d0282650..3ad4cf408 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -12,10 +12,10 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: ref: ${{ github.head_ref }} - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v3 - run: pip install black - name: Auto-format code if needed run: black spacy @@ -23,10 +23,11 @@ jobs: # code and makes GitHub think the action failed - name: Check for modified files id: git-check - run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) + run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT + - name: Create Pull Request if: steps.git-check.outputs.modified == 'true' - uses: peter-evans/create-pull-request@v3 + uses: peter-evans/create-pull-request@v4 with: title: Auto-format code with black labels: meta From 865691d169c3be413007f0d7324e03a7aac3b3cb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Oct 2022 08:43:00 +0200 Subject: [PATCH 05/23] Adjust default attrs for textcat configs (#11698) --- spacy/pipeline/textcat.py | 4 ++-- spacy/pipeline/textcat_multilabel.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index c45f819fc..59549ad99 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -24,8 +24,8 @@ single_label_default_config = """ [model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v2" width = 64 -rows = [2000, 2000, 1000, 1000, 1000, 1000] -attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +rows = [2000, 2000, 500, 1000, 500] +attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"] include_static_vectors = false [model.tok2vec.encode] diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 493c440c3..eb83d9cb7 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -24,8 +24,8 @@ multi_label_default_config = """ [model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v2" width = 64 -rows = [2000, 2000, 1000, 1000, 1000, 1000] -attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +rows = [2000, 2000, 500, 1000, 500] +attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"] include_static_vectors = false [model.tok2vec.encode] From 6b78135b9e158e5bc02e39c1a73ef28bb360a44f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 27 Oct 2022 22:08:24 +0900 Subject: [PATCH 06/23] Add warning to install widget for M1 GPUs (#11666) * Add warning to install widget for M1 GPUs * Use Thinc tracking issue instead * Update website/src/widgets/quickstart-install.js Co-authored-by: Adriane Boyd * Underline URL in warning * Update website/src/widgets/quickstart-install.js Co-authored-by: Adriane Boyd * Don't install cupy on m1 gpus Co-authored-by: Adriane Boyd --- website/src/styles/quickstart.module.sass | 3 +++ website/src/widgets/quickstart-install.js | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass index 8ad106a78..d0f9db551 100644 --- a/website/src/styles/quickstart.module.sass +++ b/website/src/styles/quickstart.module.sass @@ -149,6 +149,9 @@ & > span display: block + a + text-decoration: underline + .small font-size: var(--font-size-code) line-height: 1.65 diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 0d2186acb..28dd14ecc 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => { setters={setters} showDropdown={showDropdown} > + + # Note M1 GPU support is experimental, see Thinc issue #792 + python -m venv .env @@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => { {nightly ? ' --pre' : ''} conda install -c conda-forge spacy - + + conda install -c conda-forge cupy + + + conda install -c conda-forge cupy + + conda install -c conda-forge cupy From d61e742960ef230b423dfa157449b291a03bd119 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 28 Oct 2022 17:25:34 +0900 Subject: [PATCH 07/23] Handle Docs with no entities in EntityLinker (#11640) * Handle docs with no entities If a whole batch contains no entities it won't make it to the model, but it's possible for individual Docs to have no entities. Before this commit, those Docs would cause an error when attempting to concatenate arrays because the dimensions didn't match. It turns out the process of preparing the Ragged at the end of the span maker forward was a little different from list2ragged, which just uses the flatten function directly. Letting list2ragged do the conversion avoids the dimension issue. This did not come up before because in NEL demo projects it's typical for data with no entities to be discarded before it reaches the NEL component. This includes a simple direct test that shows the issue and checks it's resolved. It doesn't check if there are any downstream changes, so a more complete test could be added. A full run was tested by adding an example with no entities to the Emerson sample project. * Add a blank instance to default training data in tests Rather than adding a specific test, since not failing on instances with no entities is basic functionality, it makes sense to add it to the default set. * Fix without modifying architecture If the architecture is modified this would have to be a new version, but this change isn't big enough to merit that. --- spacy/ml/models/entity_linker.py | 7 +++---- spacy/tests/pipeline/test_entity_linker.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 4d18d216a..299b6bb52 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab cands.append((start_token, end_token)) candidates.append(ops.asarray2i(cands)) - candlens = ops.asarray1i([len(cands) for cands in candidates]) - candidates = ops.xp.concatenate(candidates) - outputs = Ragged(candidates, candlens) + lengths = model.ops.asarray1i([len(cands) for cands in candidates]) + out = Ragged(model.ops.flatten(candidates), lengths) # because this is just rearranging docs, the backprop does nothing - return outputs, lambda x: [] + return out, lambda x: [] @registry.misc("spacy.KBFromFile.v1") diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4d683acc5..99f164f15 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -9,6 +9,7 @@ from spacy.compat import pickle from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb +from spacy.ml.models.entity_linker import build_span_maker from spacy.pipeline import EntityLinker from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL @@ -715,7 +716,11 @@ TRAIN_DATA = [ ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, "entities": [(0, 12, "PERSON"), (43, 51, "LOC")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}) + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}), + # having a blank instance shouldn't break things + ("The weather is nice today.", + {"links": {}, "entities": [], + "sent_starts": [1, -1, 0, 0, 0, 0]}) ] GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] # fmt: on @@ -1196,3 +1201,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): assert len(doc.ents) == 1 assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL + + +def test_span_maker_forward_with_empty(): + """The forward pass of the span maker may have a doc with no entities.""" + nlp = English() + doc1 = nlp("a b c") + ent = doc1[0:1] + ent.label_ = "X" + doc1.ents = [ent] + # no entities + doc2 = nlp("x y z") + + # just to get a model + span_maker = build_span_maker() + span_maker([doc1, doc2], False) From d25f09468c4eca20eb464d78d35e439474ed2dbc Mon Sep 17 00:00:00 2001 From: Aaron Zipp <15341396+aaronzipp@users.noreply.github.com> Date: Mon, 31 Oct 2022 05:27:12 +0100 Subject: [PATCH 08/23] Spelling mistake in rule-based-matching.md (#11717) Changed retokenize to retokenizer --- website/docs/usage/rule-based-matching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index f096890cb..64bbf8e7b 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and > [`Doc.retokenize`](/api/doc#retokenize) context manager: > > ```python -> with doc.retokenize() as retokenize: +> with doc.retokenize() as retokenizer: > for ent in doc.ents: > retokenizer.merge(ent) > ``` From f7edd84b44a37b78d87fe6815399a576f1980b8b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 2 Nov 2022 13:42:20 +0100 Subject: [PATCH 09/23] Switch CI to Python 3.11.0 (#11737) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index eea07cb7a..bf3672b8b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -87,13 +87,13 @@ jobs: # python.version: "3.10" Python311Linux: imageName: 'ubuntu-latest' - python.version: '3.11.0-rc.2' + python.version: '3.11.0' Python311Windows: imageName: 'windows-latest' - python.version: '3.11.0-rc.2' + python.version: '3.11.0' Python311Mac: imageName: 'macos-latest' - python.version: '3.11.0-rc.2' + python.version: '3.11.0' maxParallel: 4 pool: vmImage: $(imageName) From 420b1d854be86e899088bb136f1daf23fc61ed1d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 2 Nov 2022 15:35:04 +0100 Subject: [PATCH 10/23] Update textcat scorer threshold behavior (#11696) * Update textcat scorer threshold behavior For `textcat` (with exclusive classes) the scorer should always use a threshold of 0.0 because there should be one predicted label per doc and the numeric score for that particular label should not matter. * Rename to test_textcat_multilabel_threshold * Remove all uses of threshold for multi_label=False * Update Scorer.score_cats API docs * Add tests for score_cats with thresholds * Update textcat API docs * Fix types * Convert threshold back to float * Fix threshold type in docstring * Improve formatting in Scorer API docs --- spacy/pipeline/textcat.py | 7 +++-- spacy/scorer.py | 12 +++---- spacy/tests/pipeline/test_textcat.py | 6 ++-- spacy/tests/test_scorer.py | 47 ++++++++++++++++++++++++++++ website/docs/api/scorer.md | 21 +++++++------ website/docs/api/textcategorizer.md | 5 ++- 6 files changed, 73 insertions(+), 25 deletions(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 59549ad99..238a768ed 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -72,7 +72,7 @@ subword_features = true "textcat", assigns=["doc.cats"], default_config={ - "threshold": 0.5, + "threshold": 0.0, "model": DEFAULT_SINGLE_TEXTCAT_MODEL, "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, }, @@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - threshold (float): Cutoff to consider a prediction "positive". + threshold (float): Unused, not needed for single-label (exclusive + classes) classification. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_cats for the attribute "cats". @@ -154,7 +155,7 @@ class TextCategorizer(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": [], "threshold": threshold, "positive_label": None} + cfg: Dict[str, Any] = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) self.scorer = scorer diff --git a/spacy/scorer.py b/spacy/scorer.py index 8cd755ac4..16fc303a0 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -446,7 +446,7 @@ class Scorer: labels (Iterable[str]): The set of possible labels. Defaults to []. multi_label (bool): Whether the attribute allows multiple labels. Defaults to True. When set to False (exclusive labels), missing - gold labels are interpreted as 0.0. + gold labels are interpreted as 0.0 and the threshold is set to 0.0. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. threshold (float): Cutoff to consider a prediction "positive". Defaults @@ -471,6 +471,8 @@ class Scorer: """ if threshold is None: threshold = 0.5 if multi_label else 0.0 + if not multi_label: + threshold = 0.0 f_per_type = {label: PRFScore() for label in labels} auc_per_type = {label: ROCAUCScore() for label in labels} labels = set(labels) @@ -505,20 +507,18 @@ class Scorer: # Get the highest-scoring for each. pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1]) - if pred_label == gold_label and pred_score >= threshold: + if pred_label == gold_label: f_per_type[pred_label].tp += 1 else: f_per_type[gold_label].fn += 1 - if pred_score >= threshold: - f_per_type[pred_label].fp += 1 + f_per_type[pred_label].fp += 1 elif gold_cats: gold_label, gold_score = max(gold_cats, key=lambda it: it[1]) if gold_score > 0: f_per_type[gold_label].fn += 1 elif pred_cats: pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) - if pred_score >= threshold: - f_per_type[pred_label].fp += 1 + f_per_type[pred_label].fp += 1 micro_prf = PRFScore() for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 0bb036a33..d359b77db 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float): assert loss == expected_loss -def test_textcat_threshold(): +def test_textcat_multilabel_threshold(): # Ensure the scorer can be called with a different threshold nlp = English() - nlp.add_pipe("textcat") + nlp.add_pipe("textcat_multilabel") train_examples = [] for text, annotations in TRAIN_DATA_SINGLE_LABEL: @@ -849,7 +849,7 @@ def test_textcat_threshold(): ) pos_f = scores["cats_score"] assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 - assert pos_f > macro_f + assert pos_f >= macro_f def test_textcat_multi_threshold(): diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 6e15fa2de..b903f1669 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -474,3 +474,50 @@ def test_prf_score(): assert (a.precision, a.recall, a.fscore) == approx( (c.precision, c.recall, c.fscore) ) + + +def test_score_cats(en_tokenizer): + text = "some text" + gold_doc = en_tokenizer(text) + gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0} + pred_doc = en_tokenizer(text) + pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25} + example = Example(pred_doc, gold_doc) + # threshold is ignored for multi_label=False + scores1 = Scorer.score_cats( + [example], + "cats", + labels=list(gold_doc.cats.keys()), + multi_label=False, + positive_label="POSITIVE", + threshold=0.1, + ) + scores2 = Scorer.score_cats( + [example], + "cats", + labels=list(gold_doc.cats.keys()), + multi_label=False, + positive_label="POSITIVE", + threshold=0.9, + ) + assert scores1["cats_score"] == 1.0 + assert scores2["cats_score"] == 1.0 + assert scores1 == scores2 + # threshold is relevant for multi_label=True + scores = Scorer.score_cats( + [example], + "cats", + labels=list(gold_doc.cats.keys()), + multi_label=True, + threshold=0.9, + ) + assert scores["cats_macro_f"] == 0.0 + # threshold is relevant for multi_label=True + scores = Scorer.score_cats( + [example], + "cats", + labels=list(gold_doc.cats.keys()), + multi_label=True, + threshold=0.1, + ) + assert scores["cats_macro_f"] == 0.5 diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index ca3462aa9..9ef36e6fc 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties: > print(scores["cats_macro_auc"]) > ``` -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ | -| labels | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ | -| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ | -| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ | -| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ | +| labels | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ | +| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ | +| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ | +| `threshold` | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~ | +| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ | ## Scorer.score_links {#score_links tag="staticmethod" new="3"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 042b4ab76..f5f8706ec 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters. > ```python > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL > config = { -> "threshold": 0.5, > "model": DEFAULT_SINGLE_TEXTCAT_MODEL, > } > nlp.add_pipe("textcat", config=config) @@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ | | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | @@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} From 2fb7e4dc74bd491ecec43971b2b29b0d28efd492 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 2 Nov 2022 16:36:30 +0200 Subject: [PATCH 11/23] More version updates for github action deprecation warnings (#11705) * More version updates for github action deprecation warnings * fix the deprecated set-output commands * bump explosion-bot to run on ubuntu-latest --- .github/workflows/autoblack.yml | 2 +- .github/workflows/explosionbot.yml | 6 +++--- .github/workflows/slowtests.yml | 6 +++--- .github/workflows/spacy_universe_alert.yml | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 3ad4cf408..70882c3cc 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/checkout@v3 with: ref: ${{ github.head_ref }} - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 - run: pip install black - name: Auto-format code if needed run: black spacy diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index d585ecd9c..6b472cd12 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -8,14 +8,14 @@ on: jobs: explosion-bot: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest steps: - name: Dump GitHub context env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v1 - - uses: actions/setup-python@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 - name: Install and run explosion-bot run: | pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 38ceb18c6..f9fd3e817 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v3 with: ref: ${{ matrix.branch }} - name: Get commits from past 24 hours @@ -23,9 +23,9 @@ jobs: today=$(date '+%Y-%m-%d %H:%M:%S') yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') if git log --after="$yesterday" --before="$today" | grep commit ; then - echo "::set-output name=run_tests::true" + echo run_tests=true >> $GITHUB_OUTPUT else - echo "::set-output name=run_tests::false" + echo run_tests=false >> $GITHUB_OUTPUT fi - name: Trigger buildkite build diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index cbbf14c6e..f507e0594 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -17,8 +17,8 @@ jobs: run: | echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v1 - - uses: actions/setup-python@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 - name: Install Bernadette app dependency and send an alert env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From 1211552f0ec84aef0b55f834d76899ab07e2c5cc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Nov 2022 09:29:46 +0100 Subject: [PATCH 12/23] Modernize and simplify CI steps (#11738) * Use `build` instead of `python setup.py sdist` * Remove in-place build with `setup.py` * Remove `gpu` parameter and GPU tests * Keep `architecture` and `num_build_jobs` in azure steps with CI defaults * Fix use of `num_build_jobs` parameters * Remove now-unused `prefix` parameter * Test imports and CLI before installing test requirements * Remove `*.egg-info` directory in addition to source directory for an warning-free `import spacy` * Switch `thinc-apple-ops` test to python 3.11 (as most recent python that is tested across platforms) --- .github/azure-steps.yml | 70 +++++++++++++++++++---------------------- azure-pipelines.yml | 17 ---------- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index cc0247b3a..b2bc80dd6 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -1,9 +1,7 @@ parameters: python_version: '' - architecture: '' - prefix: '' - gpu: false - num_build_jobs: 1 + architecture: 'x64' + num_build_jobs: 2 steps: - task: UsePythonVersion@0 @@ -17,16 +15,16 @@ steps: displayName: 'Set variables' - script: | - ${{ parameters.prefix }} python -m pip install -U pip setuptools - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt displayName: "Install dependencies" - script: | - ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }} - ${{ parameters.prefix }} python setup.py sdist --formats=gztar - displayName: "Compile and build sdist" + python -m build --sdist + displayName: "Build sdist" - - script: python -m mypy spacy + - script: | + python -m mypy spacy displayName: 'Run mypy' condition: ne(variables['python_version'], '3.6') @@ -35,35 +33,24 @@ steps: contents: "spacy" displayName: "Delete source directory" + - task: DeleteFiles@1 + inputs: + contents: "*.egg-info" + displayName: "Delete egg-info directory" + - script: | - ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt - ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt + python -m pip freeze > installed.txt + python -m pip uninstall -y -r installed.txt displayName: "Uninstall all packages" - bash: | - ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST displayName: "Install from sdist" - script: | - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt - displayName: "Install test requirements" - - - script: | - ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0 - ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html - displayName: "Install GPU requirements" - condition: eq(${{ parameters.gpu }}, true) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error - displayName: "Run CPU tests" - condition: eq(${{ parameters.gpu }}, false) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu - displayName: "Run GPU tests" - condition: eq(${{ parameters.gpu }}, true) + python -W error -c "import spacy" + displayName: "Test import" - script: | python -m spacy download ca_core_news_sm @@ -106,13 +93,22 @@ steps: displayName: 'Test assemble CLI vectors warning' condition: eq(variables['python_version'], '3.8') + - script: | + python -m pip install -U -r requirements.txt + displayName: "Install test requirements" + + - script: | + python -m pytest --pyargs spacy -W error + displayName: "Run CPU tests" + + - script: | + python -m pip install --pre thinc-apple-ops + python -m pytest --pyargs spacy + displayName: "Run CPU tests with thinc-apple-ops" + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) + - script: | python .github/validate_universe_json.py website/meta/universe.json displayName: 'Test website/meta/universe.json' condition: eq(variables['python_version'], '3.8') - - script: | - ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops - ${{ parameters.prefix }} python -m pytest --pyargs spacy - displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bf3672b8b..3499042cb 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -101,20 +101,3 @@ jobs: - template: .github/azure-steps.yml parameters: python_version: '$(python.version)' - architecture: 'x64' - -# - job: "TestGPU" -# dependsOn: "Validate" -# strategy: -# matrix: -# Python38LinuxX64_GPU: -# python.version: '3.8' -# pool: -# name: "LinuxX64_GPU" -# steps: -# - template: .github/azure-steps.yml -# parameters: -# python_version: '$(python.version)' -# architecture: 'x64' -# gpu: true -# num_build_jobs: 24 From db56600536e2d615a766fc2fc973a6cc9e0f1a52 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 3 Nov 2022 18:52:59 +0900 Subject: [PATCH 13/23] Fix default parameters for load functions (fix #11706) (#11713) * Fix default parameters for load functions Some load functions used SimpleFrozenList() directly instead of the _DEFAULT_EMPTY_PIPES parameter. That mostly worked as intended, but the changes in #11459 check for equality using identity, not value, so a warning is incorrectly raised sometimes, as in #11706. This change just has all the load functions use the singleton value instead. * Add test that there are no warnings on module-based load This will succeed due to changes in this branch, but local tests with the latest release failed as intended. * Try reverting commit and see if CI changes There is an error in CI that is probably unrelated. Revert "Fix default parameters for load functions" This reverts commit dc46b35687e92e4793e64edb11997d44b88c6a8b. * Revert "Try reverting commit and see if CI changes" This reverts commit 2514ed07ef29851b5ac60015442a7ce44c69decc. Co-authored-by: Adriane Boyd --- .github/azure-steps.yml | 5 +++++ spacy/util.py | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index b2bc80dd6..e8bd0d212 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -59,6 +59,11 @@ steps: displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') + - script: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + displayName: 'Test no warnings on load (#11713)' + condition: eq(variables['python_version'], '3.8') + - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . displayName: 'Test convert CLI' diff --git a/spacy/util.py b/spacy/util.py index 3034808ba..76a1e0bfa 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -443,9 +443,9 @@ def load_model_from_package( name: str, *, vocab: Union["Vocab", bool] = True, - disable: Union[str, Iterable[str]] = SimpleFrozenList(), - enable: Union[str, Iterable[str]] = SimpleFrozenList(), - exclude: Union[str, Iterable[str]] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package. @@ -619,9 +619,9 @@ def load_model_from_init_py( init_file: Union[Path, str], *, vocab: Union["Vocab", bool] = True, - disable: Union[str, Iterable[str]] = SimpleFrozenList(), - enable: Union[str, Iterable[str]] = SimpleFrozenList(), - exclude: Union[str, Iterable[str]] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's From 40e1000db08858e8c928efacab8f710e027dde61 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Nov 2022 11:49:08 +0100 Subject: [PATCH 14/23] Restore Doc attr getter values in Doc.to_json (#11700) --- spacy/tests/doc/test_json_doc_conversion.py | 9 +++++++ spacy/tokens/doc.pyx | 27 ++++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py index 19698cfb2..11a1817e6 100644 --- a/spacy/tests/doc/test_json_doc_conversion.py +++ b/spacy/tests/doc/test_json_doc_conversion.py @@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc): doc_json.pop("tokens") with pytest.raises(ValueError): Doc(doc.vocab).from_json(doc_json, validate=True) + + +def test_to_json_underscore_doc_getters(doc): + def get_text_length(doc): + return len(doc.text) + + Doc.set_extension("text_length", getter=get_text_length) + doc_json = doc.to_json(underscore=["text_length"]) + assert doc_json["_"]["text_length"] == get_text_length(doc) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 295f91c28..f2621292c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1668,6 +1668,20 @@ cdef class Doc: if underscore: user_keys = set() + # Handle doc attributes with .get to include values from getters + # and not only values stored in user_data, for backwards + # compatibility + for attr in underscore: + if self.has_extension(attr): + if "_" not in data: + data["_"] = {} + value = self._.get(attr) + if not srsly.is_json_serializable(value): + raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) + data["_"][attr] = value + user_keys.add(attr) + # Token and span attributes only include values stored in user_data + # and not values generated by getters if self.user_data: for data_key, value in self.user_data.copy().items(): if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.": @@ -1678,20 +1692,15 @@ cdef class Doc: user_keys.add(attr) if not srsly.is_json_serializable(value): raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) - # Check if doc attribute - if start is None: - if "_" not in data: - data["_"] = {} - data["_"][attr] = value - # Check if token attribute - elif end is None: + # Token attribute + if start is not None and end is None: if "underscore_token" not in data: data["underscore_token"] = {} if attr not in data["underscore_token"]: data["underscore_token"][attr] = [] data["underscore_token"][attr].append({"start": start, "value": value}) - # Else span attribute - else: + # Span attribute + elif start is not None and end is not None: if "underscore_span" not in data: data["underscore_span"] = {} if attr not in data["underscore_span"]: From bbf64cfc4391cccba447346badaacca4d42e583d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 4 Nov 2022 11:17:43 +0100 Subject: [PATCH 15/23] Auto-format code with black (#11749) Co-authored-by: explosion-bot --- spacy/pipeline/textcat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 238a768ed..4023c4456 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -155,7 +155,11 @@ class TextCategorizer(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg: Dict[str, Any] = {"labels": [], "threshold": threshold, "positive_label": None} + cfg: Dict[str, Any] = { + "labels": [], + "threshold": threshold, + "positive_label": None, + } self.cfg = dict(cfg) self.scorer = scorer From ea326cf47d5324cff14bef983b0da122b9f0d1ed Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 08:11:13 +0100 Subject: [PATCH 16/23] Fix types for Span.id and Span.id_ (#11744) --- spacy/tokens/span.pyi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 617e3d19d..0a6f306a6 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -117,15 +117,13 @@ class Span: end_char: int label: int kb_id: int + id: int ent_id: int ent_id_: str @property - def id(self) -> int: ... - @property - def id_(self) -> str: ... - @property def orth_(self) -> str: ... @property def lemma_(self) -> str: ... label_: str kb_id_: str + id_: str From b76222e56adb49e33d7d0471674dfe2f207b2020 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 7 Nov 2022 16:11:55 +0900 Subject: [PATCH 17/23] Raise Typer limit (#11720) * Raise typer limit to <0.7.0 * Raise limit to <0.8.0 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9d6bbb2c4..d91a3b3d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer>=0.3.0,<0.5.0 +typer>=0.3.0,<0.8.0 pathy>=0.3.5 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index c2653feba..82d4d2758 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 # Third-party dependencies - typer>=0.3.0,<0.5.0 + typer>=0.3.0,<0.8.0 pathy>=0.3.5 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 From e91b47a22655c0384202f797e9d50d3660596d32 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 10:43:34 +0100 Subject: [PATCH 18/23] Check for unsafe paths in tarfile.extractall (CVE-2007-4559) (#11746) * Adding tarfile member sanitization to extractall() * Format * Simplify and add error message * Fix import * Add comment about CVE Co-authored-by: TrellixVulnTeam --- spacy/cli/project/remote_storage.py | 19 ++++++++++++++++++- spacy/errors.py | 2 ++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index 336a4bcb3..12e252b3c 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -10,6 +10,7 @@ from .._util import get_hash, get_checksum, download_file, ensure_pathy from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var from ...git_info import GIT_VERSION from ... import about +from ...errors import Errors if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -84,7 +85,23 @@ class RemoteStorage: with tarfile.open(tar_loc, mode=mode_string) as tar_file: # This requires that the path is added correctly, relative # to root. This is how we set things up in push() - tar_file.extractall(self.root) + + # Disallow paths outside the current directory for the tar + # file (CVE-2007-4559, directory traversal vulnerability) + def is_within_directory(directory, target): + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + prefix = os.path.commonprefix([abs_directory, abs_target]) + return prefix == abs_directory + + def safe_extract(tar, path): + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise ValueError(Errors.E852) + tar.extractall(path) + + safe_extract(tar_file, self.root) return url def find( diff --git a/spacy/errors.py b/spacy/errors.py index e0628819d..2f8a3996f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -544,6 +544,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x + E852 = ("The tar file pulled from the remote attempted an unsafe path " + "traversal.") E853 = ("Unsupported component factory name '{name}'. The character '.' is " "not permitted in factory names.") E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not " From 6105f20d8a10a18a0e5985d310664812198840a8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 13:25:40 +0100 Subject: [PATCH 19/23] Switch CI to python 3.11 (#11765) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3499042cb..9c3b92f06 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -87,13 +87,13 @@ jobs: # python.version: "3.10" Python311Linux: imageName: 'ubuntu-latest' - python.version: '3.11.0' + python.version: '3.11' Python311Windows: imageName: 'windows-latest' - python.version: '3.11.0' + python.version: '3.11' Python311Mac: imageName: 'macos-latest' - python.version: '3.11.0' + python.version: '3.11' maxParallel: 4 pool: vmImage: $(imageName) From e116395f890a70447c75109026e7b37f20c142c2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 14:46:08 +0100 Subject: [PATCH 20/23] Add fallback in requirements check, only check once (#11735) * Add fallback in requirements check, only check once * Rename to skip_requirements_check * Update spacy/cli/project/run.py Co-authored-by: Paul O'Leary McCann Co-authored-by: Paul O'Leary McCann --- spacy/cli/project/run.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index ebab7471e..638e7fab1 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -53,6 +53,7 @@ def project_run( force: bool = False, dry: bool = False, capture: bool = False, + skip_requirements_check: bool = False, ) -> None: """Run a named script defined in the project.yml. If the script is part of the default pipeline (defined in the "run" section), DVC is used to @@ -69,6 +70,7 @@ def project_run( sys.exit will be called with the return code. You should use capture=False when you want to turn over execution to the command, and capture=True when you want to run the command more like a function. + skip_requirements_check (bool): Whether to skip the requirements check. """ config = load_project_config(project_dir, overrides=overrides) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} @@ -76,9 +78,10 @@ def project_run( validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) req_path = project_dir / "requirements.txt" - if config.get("check_requirements", True) and os.path.exists(req_path): - with req_path.open() as requirements_file: - _check_requirements([req.replace("\n", "") for req in requirements_file]) + if not skip_requirements_check: + if config.get("check_requirements", True) and os.path.exists(req_path): + with req_path.open() as requirements_file: + _check_requirements([req.strip() for req in requirements_file]) if subcommand in workflows: msg.info(f"Running workflow '{subcommand}'") @@ -90,6 +93,7 @@ def project_run( force=force, dry=dry, capture=capture, + skip_requirements_check=True, ) else: cmd = commands[subcommand] @@ -338,6 +342,11 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: failed_pkgs_msgs.append(dnf.report()) except pkg_resources.VersionConflict as vc: conflicting_pkgs_msgs.append(vc.report()) + except Exception: + msg.warn(f"Unable to check requirement: {req} " + "Check that the requirement is formatted according to PEP " + "440, in particular that URLs are formatted as " + "'package_name @ URL'") if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): msg.warn( From 2e3cfd758ea414497802843970666a18ed4d123e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Nov 2022 04:46:19 +0100 Subject: [PATCH 21/23] Use python 3.10 for GHA universe alert (#11768) --- .github/workflows/spacy_universe_alert.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index f507e0594..837aaeb33 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -19,6 +19,8 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 + with: + python-version: '3.10' - name: Install Bernadette app dependency and send an alert env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From 20bbbe3e44f14d42a4861d1399ad98d6e1707d84 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 8 Nov 2022 14:58:10 +0100 Subject: [PATCH 22/23] Revert disable/disabled merging behavior (#11745) * Merge disable with disabled. Adjust warnings, errors and tests. * Replace any() with set operation. * Update spacy/tests/pipeline/test_pipe_methods.py Co-authored-by: Adriane Boyd * Update docs. * Remve reference to config entry nlp.enabled from docs. Co-authored-by: Adriane Boyd --- spacy/errors.py | 4 +- spacy/language.py | 45 ++++++++----------- spacy/tests/pipeline/test_pipe_methods.py | 18 ++++---- .../serialize/test_serialize_pipeline.py | 7 ++- website/docs/api/language.md | 24 +++++----- website/docs/api/top-level.md | 20 ++++----- website/docs/usage/processing-pipelines.md | 3 +- 7 files changed, 56 insertions(+), 65 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 2f8a3996f..278e5496a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes): W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'") W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class " "is a Cython extension type.") - W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be " - "aware that this might affect other components in your pipeline.") + W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " + "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/language.py b/spacy/language.py index d391f15ab..967af1e62 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1879,31 +1879,22 @@ class Language: if isinstance(exclude, str): exclude = [exclude] - def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]: - """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to - .load(). If both arguments and config specified values for this field, the passed arguments take precedence - and a warning is printed. - value (Iterable[str]): Passed value for `enable` or `disable`. - key (str): Key for field in config (either "enabled" or "disabled"). - RETURN (Iterable[str]): - """ - # We assume that no argument was passed if the value is the specified default value. - if id(value) == id(_DEFAULT_EMPTY_PIPES): - return config["nlp"].get(key, []) - else: - if len(config["nlp"].get(key, [])): - warnings.warn( - Warnings.W123.format( - arg=key[:-1], - arg_value=value, - config_value=config["nlp"][key], - ) + # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config + # specifies values for `enabled` not included in `enable`, emit warning. + if id(enable) != id(_DEFAULT_EMPTY_PIPES): + enabled = config["nlp"].get("enabled", []) + if len(enabled) and not set(enabled).issubset(enable): + warnings.warn( + Warnings.W123.format( + enable=enable, + enabled=enabled, ) - return value + ) + # Ensure sets of disabled/enabled pipe names are not contradictory. disabled_pipes = cls._resolve_component_status( - fetch_pipes_status(disable, "disabled"), - fetch_pipes_status(enable, "enabled"), + list({*disable, *config["nlp"].get("disabled", [])}), + enable, config["nlp"]["pipeline"], ) nlp._disabled = set(p for p in disabled_pipes if p not in exclude) @@ -2084,10 +2075,12 @@ class Language: if enable: if isinstance(enable, str): enable = [enable] - to_disable = [ - pipe_name for pipe_name in pipe_names if pipe_name not in enable - ] - if disable and disable != to_disable: + to_disable = { + *[pipe_name for pipe_name in pipe_names if pipe_name not in enable], + *disable, + } + # If any pipe to be enabled is in to_disable, the specification is inconsistent. + if len(set(enable) & to_disable): raise ValueError(Errors.E1042.format(enable=enable, disable=disable)) return tuple(to_disable) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 14a7a36e5..4dd7bae16 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config(): with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) - # Expected to fail, as config and arguments conflict. - with pytest.raises(ValueError): - spacy.load( - tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} - ) + # Expected to succeed, as config and arguments do not conflict. + assert spacy.load( + tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} + ).disabled == ["senter", "sentencizer"] # Expected to succeed without warning due to the lack of a conflicting config option. spacy.load(tmp_dir, enable=["tagger"]) - # Expected to succeed with a warning, as disable=[] should override the config setting. - with pytest.warns(UserWarning): + # Expected to fail due to conflict between enable and disabled. + with pytest.raises(ValueError): spacy.load( tmp_dir, - enable=["tagger"], - disable=[], - config={"nlp": {"disabled": ["senter"]}}, + enable=["senter"], + config={"nlp": {"disabled": ["senter", "tagger"]}}, ) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index b948bb76c..9fcf18e2d 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable(): assert nlp3.component_names == ["ner", "tagger"] with make_tempdir() as d: nlp3.to_disk(d) - with pytest.warns(UserWarning): - nlp4 = spacy.load(d, disable=["ner"]) - assert nlp4.pipe_names == ["tagger"] + nlp4 = spacy.load(d, disable=["ner"]) + assert nlp4.pipe_names == [] assert nlp4.component_names == ["ner", "tagger"] - assert nlp4.disabled == ["ner"] + assert nlp4.disabled == ["ner", "tagger"] with make_tempdir() as d: nlp.to_disk(d) nlp5 = spacy.load(d, exclude=["tagger"]) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 767a7450a..504640d57 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -63,18 +63,18 @@ spaCy loads a model under the hood based on its > nlp = Language.from_config(config) > ``` -| Name | Description | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | -| _keyword-only_ | | -| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | -| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | -| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | -| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | -| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | -| **RETURNS** | The initialized object. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | +| _keyword-only_ | | +| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | +| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | +| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The initialized object. ~~Language~~ | ## Language.component {#component tag="classmethod" new="3"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index bc53fc868..c798f2a8d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument. > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"]) > ``` -| Name | Description | -| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | -| _keyword-only_ | | -| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | -| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | -| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | -| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | +| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's [`config.cfg`](/api/data-formats#config), uses the language and pipeline diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index bd28810ae..0b63cdcb8 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -363,7 +363,8 @@ nlp.enable_pipe("tagger") ``` In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is -set, all components except for those in `enable` are disabled. +set, all components except for those in `enable` are disabled. If `enable` and +`disable` conflict (i.e. the same component is included in both), an error is raised. ```python # Load the complete pipeline, but disable all components except for tok2vec and tagger From 03eebe9d1c79d39a632876205e93f023fc096d85 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 9 Nov 2022 10:59:28 +0100 Subject: [PATCH 23/23] Update warning, add tests for project requirements check (#11777) * Update warning, add tests for project requirements check * Make warning more general for differences between PEP 508 and pip * Add tests for _check_requirements * Parameterize test --- spacy/cli/project/run.py | 5 ++--- spacy/tests/test_cli.py | 41 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 638e7fab1..5db9e14f4 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -344,9 +344,8 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: conflicting_pkgs_msgs.append(vc.report()) except Exception: msg.warn(f"Unable to check requirement: {req} " - "Check that the requirement is formatted according to PEP " - "440, in particular that URLs are formatted as " - "'package_name @ URL'") + "Checks are currently limited to requirement specifiers " + "(PEP 508)") if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): msg.warn( diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..8225e14f1 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,5 +1,6 @@ import os import math +import pkg_resources from random import sample from typing import Counter @@ -25,6 +26,7 @@ from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name +from spacy.cli.project.run import _check_requirements from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -855,3 +857,42 @@ def test_span_length_freq_dist_output_must_be_correct(): span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) assert sum(span_freqs.values()) >= threshold assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] + + +@pytest.mark.parametrize( + "reqs,output", + [ + [ + """ + spacy + + # comment + + thinc""", + (False, False), + ], + [ + """# comment + --some-flag + spacy""", + (False, False), + ], + [ + """# comment + --some-flag + spacy; python_version >= '3.6'""", + (False, False), + ], + [ + """# comment + spacyunknowndoesnotexist12345""", + (True, False), + ], + ], +) +def test_project_check_requirements(reqs, output): + # excessive guard against unlikely package name + try: + pkg_resources.require("spacyunknowndoesnotexist12345") + except pkg_resources.DistributionNotFound: + assert output == _check_requirements([req.strip() for req in reqs.split("\n")])