diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index cc0247b3a..b2bc80dd6 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -1,9 +1,7 @@
parameters:
python_version: ''
- architecture: ''
- prefix: ''
- gpu: false
- num_build_jobs: 1
+ architecture: 'x64'
+ num_build_jobs: 2
steps:
- task: UsePythonVersion@0
@@ -17,16 +15,16 @@ steps:
displayName: 'Set variables'
- script: |
- ${{ parameters.prefix }} python -m pip install -U pip setuptools
- ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+ python -m pip install -U build pip setuptools
+ python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
- ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
- ${{ parameters.prefix }} python setup.py sdist --formats=gztar
- displayName: "Compile and build sdist"
+ python -m build --sdist
+ displayName: "Build sdist"
- - script: python -m mypy spacy
+ - script: |
+ python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.6')
@@ -35,35 +33,24 @@ steps:
contents: "spacy"
displayName: "Delete source directory"
+ - task: DeleteFiles@1
+ inputs:
+ contents: "*.egg-info"
+ displayName: "Delete egg-info directory"
+
- script: |
- ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
- ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+ python -m pip freeze > installed.txt
+ python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
- ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
- ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+ SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+ SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
- ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
- displayName: "Install test requirements"
-
- - script: |
- ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
- ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
- displayName: "Install GPU requirements"
- condition: eq(${{ parameters.gpu }}, true)
-
- - script: |
- ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
- displayName: "Run CPU tests"
- condition: eq(${{ parameters.gpu }}, false)
-
- - script: |
- ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
- displayName: "Run GPU tests"
- condition: eq(${{ parameters.gpu }}, true)
+ python -W error -c "import spacy"
+ displayName: "Test import"
- script: |
python -m spacy download ca_core_news_sm
@@ -106,13 +93,22 @@ steps:
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -m pip install -U -r requirements.txt
+ displayName: "Install test requirements"
+
+ - script: |
+ python -m pytest --pyargs spacy -W error
+ displayName: "Run CPU tests"
+
+ - script: |
+ python -m pip install --pre thinc-apple-ops
+ python -m pytest --pyargs spacy
+ displayName: "Run CPU tests with thinc-apple-ops"
+ condition: and(startsWith(variables['imageName'], 'macos'), startsWith(variables['python.version'], '3.11'))
+
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')
- - script: |
- ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
- ${{ parameters.prefix }} python -m pytest --pyargs spacy
- displayName: "Run CPU tests with thinc-apple-ops"
- condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 8d0282650..70882c3cc 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -12,10 +12,10 @@ jobs:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- - uses: actions/setup-python@v2
+ - uses: actions/setup-python@v4
- run: pip install black
- name: Auto-format code if needed
run: black spacy
@@ -23,10 +23,11 @@ jobs:
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
- run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+ run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
- uses: peter-evans/create-pull-request@v3
+ uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml
index d585ecd9c..6b472cd12 100644
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@@ -8,14 +8,14 @@ on:
jobs:
explosion-bot:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- - uses: actions/checkout@v1
- - uses: actions/setup-python@v1
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
index 38ceb18c6..f9fd3e817 100644
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
- uses: actions/checkout@v1
+ uses: actions/checkout@v3
with:
ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours
@@ -23,9 +23,9 @@ jobs:
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
- echo "::set-output name=run_tests::true"
+ echo run_tests=true >> $GITHUB_OUTPUT
else
- echo "::set-output name=run_tests::false"
+ echo run_tests=false >> $GITHUB_OUTPUT
fi
- name: Trigger buildkite build
diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml
index cbbf14c6e..f507e0594 100644
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@@ -17,8 +17,8 @@ jobs:
run: |
echo "$GITHUB_CONTEXT"
- - uses: actions/checkout@v1
- - uses: actions/setup-python@v1
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/README.md b/README.md
index d9ef83e01..abfc3da67 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
## Features
-- Support for **60+ languages**
+- Support for **70+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 357cce835..3499042cb 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -76,24 +76,24 @@ jobs:
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
- Python310Linux:
- imageName: "ubuntu-latest"
- python.version: "3.10"
+ # Python310Linux:
+ # imageName: "ubuntu-latest"
+ # python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
- Python310Mac:
- imageName: "macos-latest"
- python.version: "3.10"
+ # Python310Mac:
+ # imageName: "macos-latest"
+ # python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
- python.version: '3.11.0-rc.2'
+ python.version: '3.11.0'
Python311Windows:
imageName: 'windows-latest'
- python.version: '3.11.0-rc.2'
+ python.version: '3.11.0'
Python311Mac:
imageName: 'macos-latest'
- python.version: '3.11.0-rc.2'
+ python.version: '3.11.0'
maxParallel: 4
pool:
vmImage: $(imageName)
@@ -101,20 +101,3 @@ jobs:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
- architecture: 'x64'
-
-# - job: "TestGPU"
-# dependsOn: "Validate"
-# strategy:
-# matrix:
-# Python38LinuxX64_GPU:
-# python.version: '3.8'
-# pool:
-# name: "LinuxX64_GPU"
-# steps:
-# - template: .github/azure-steps.yml
-# parameters:
-# python_version: '$(python.version)'
-# architecture: 'x64'
-# gpu: true
-# num_build_jobs: 24
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 4d18d216a..299b6bb52 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
cands.append((start_token, end_token))
candidates.append(ops.asarray2i(cands))
- candlens = ops.asarray1i([len(cands) for cands in candidates])
- candidates = ops.xp.concatenate(candidates)
- outputs = Ragged(candidates, candlens)
+ lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+ out = Ragged(model.ops.flatten(candidates), lengths)
# because this is just rearranging docs, the backprop does nothing
- return outputs, lambda x: []
+ return out, lambda x: []
@registry.misc("spacy.KBFromFile.v1")
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 506cdb61c..9cebb9aeb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -27,8 +27,8 @@ single_label_default_config = """
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
@@ -75,7 +75,7 @@ subword_features = true
"textcat",
assigns=["doc.cats"],
default_config={
- "threshold": 0.5,
+ "threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
"save_activations": False,
@@ -158,7 +158,8 @@ class TextCategorizer(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
- threshold (float): Cutoff to consider a prediction "positive".
+ threshold (float): Unused, not needed for single-label (exclusive
+ classes) classification.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".
@@ -168,7 +169,7 @@ class TextCategorizer(TrainablePipe):
self.model = model
self.name = name
self._rehearsal_model = None
- cfg = {"labels": [], "threshold": threshold, "positive_label": None}
+ cfg: Dict[str, Any] = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
self.scorer = scorer
self.save_activations = save_activations
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index a69937a0c..3ba80653e 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -24,8 +24,8 @@ multi_label_default_config = """
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 8cd755ac4..16fc303a0 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -446,7 +446,7 @@ class Scorer:
labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True. When set to False (exclusive labels), missing
- gold labels are interpreted as 0.0.
+ gold labels are interpreted as 0.0 and the threshold is set to 0.0.
positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -471,6 +471,8 @@ class Scorer:
"""
if threshold is None:
threshold = 0.5 if multi_label else 0.0
+ if not multi_label:
+ threshold = 0.0
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
@@ -505,20 +507,18 @@ class Scorer:
# Get the highest-scoring for each.
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
- if pred_label == gold_label and pred_score >= threshold:
+ if pred_label == gold_label:
f_per_type[pred_label].tp += 1
else:
f_per_type[gold_label].fn += 1
- if pred_score >= threshold:
- f_per_type[pred_label].fp += 1
+ f_per_type[pred_label].fp += 1
elif gold_cats:
- gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
+ gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score > 0:
f_per_type[gold_label].fn += 1
elif pred_cats:
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
- if pred_score >= threshold:
- f_per_type[pred_label].fp += 1
+ f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index a6baa1ff4..9a8ce6653 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -10,6 +10,7 @@ from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
from spacy.pipeline import EntityLinker, TrainablePipe
from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -716,7 +717,11 @@ TRAIN_DATA = [
("Russ Cochran was a member of University of Kentucky's golf team.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
- "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+ # having a blank instance shouldn't break things
+ ("The weather is nice today.",
+ {"links": {}, "entities": [],
+ "sent_starts": [1, -1, 0, 0, 0, 0]})
]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
@@ -1260,3 +1265,18 @@ def test_save_activations():
assert scores.data.shape == (2, 1)
assert scores.data.dtype == "float32"
assert scores.lengths.shape == (1,)
+
+
+def test_span_maker_forward_with_empty():
+ """Smoke test: the span maker forward pass accepts a doc with no entities."""
+ nlp = English()
+ doc1 = nlp("a b c")
+ ent = doc1[0:1]
+ ent.label_ = "X"
+ doc1.ents = [ent]
+ # doc2 deliberately has no entities set
+ doc2 = nlp("x y z")
+
+ # default span maker; the call below only has to complete without raising
+ span_maker = build_span_maker()
+ span_maker([doc1, doc2], False)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index c1f61a3c0..46a0b15a7 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -824,10 +824,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
assert loss == expected_loss
-def test_textcat_threshold():
+def test_textcat_multilabel_threshold():
# Ensure the scorer can be called with a different threshold
nlp = English()
- nlp.add_pipe("textcat")
+ nlp.add_pipe("textcat_multilabel")
train_examples = []
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -850,7 +850,7 @@ def test_textcat_threshold():
)
pos_f = scores["cats_score"]
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
- assert pos_f > macro_f
+ assert pos_f >= macro_f
def test_textcat_multi_threshold():
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 659274db9..e423d9a19 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():
def test_tok2vec_listener_overfitting():
- """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+ """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
train_examples = []
@@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():
def test_tok2vec_frozen_not_annotating():
- """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+ """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
train_examples = []
@@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():
for i in range(2):
losses = {}
- with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
- nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+ with pytest.raises(
+ ValueError, match=r"the tok2vec embedding layer is not updated"
+ ):
+ nlp.update(
+ train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+ )
def test_tok2vec_frozen_overfitting():
- """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+ """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
train_examples = []
@@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():
for i in range(100):
losses = {}
- nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+ nlp.update(
+ train_examples,
+ sgd=optimizer,
+ losses=losses,
+ exclude=["tok2vec"],
+ annotates=["tok2vec"],
+ )
assert losses["tagger"] < 0.0001
# test the trained model
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 2306cabb7..d91ed1201 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
def get_textcat_cnn_kwargs():
- return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+ return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
}
-def test_tok2vec():
+def make_test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 6e15fa2de..b903f1669 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -474,3 +474,50 @@ def test_prf_score():
assert (a.precision, a.recall, a.fscore) == approx(
(c.precision, c.recall, c.fscore)
)
+
+
+def test_score_cats(en_tokenizer):
+ text = "some text"
+ gold_doc = en_tokenizer(text)
+ gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
+ pred_doc = en_tokenizer(text)
+ pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
+ example = Example(pred_doc, gold_doc)
+ # with multi_label=False the threshold is ignored: 0.1 and 0.9 must score identically
+ scores1 = Scorer.score_cats(
+ [example],
+ "cats",
+ labels=list(gold_doc.cats.keys()),
+ multi_label=False,
+ positive_label="POSITIVE",
+ threshold=0.1,
+ )
+ scores2 = Scorer.score_cats(
+ [example],
+ "cats",
+ labels=list(gold_doc.cats.keys()),
+ multi_label=False,
+ positive_label="POSITIVE",
+ threshold=0.9,
+ )
+ assert scores1["cats_score"] == 1.0
+ assert scores2["cats_score"] == 1.0
+ assert scores1 == scores2
+ # with multi_label=True the threshold matters: at 0.9 the macro F-score is 0.0
+ scores = Scorer.score_cats(
+ [example],
+ "cats",
+ labels=list(gold_doc.cats.keys()),
+ multi_label=True,
+ threshold=0.9,
+ )
+ assert scores["cats_macro_f"] == 0.0
+ # ... while at 0.1 (same example, multi_label=True) the macro F-score is 0.5
+ scores = Scorer.score_cats(
+ [example],
+ "cats",
+ labels=list(gold_doc.cats.keys()),
+ multi_label=True,
+ threshold=0.1,
+ )
+ assert scores["cats_macro_f"] == 0.5
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index ca3462aa9..9ef36e6fc 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
> print(scores["cats_macro_auc"])
> ```
-| Name | Description |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr` | The attribute to score. ~~str~~ |
-| _keyword-only_ | |
-| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
-| labels | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
-| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
-| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
+| Name | Description |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr` | The attribute to score. ~~str~~ |
+| _keyword-only_ | |
+| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
+| `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
+| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
+| `threshold` | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~ |
+| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
## Scorer.score_links {#score_links tag="staticmethod" new="3"}
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index d8a609693..ed1205d8c 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
> ```python
> from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
> config = {
-> "threshold": 0.5,
> "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
> }
> nlp.add_pipe("textcat", config=config)
@@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
@@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
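-| `save_activations` 4.0 | Save activations in `Doc` when annotating. The supported activations is `"probabilities"`. ~~Union[bool, list[str]]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. The supported activation is `"probabilities"`. ~~Union[bool, list[str]]~~ |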
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index d9f551820..8e55d54d6 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1791,7 +1791,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
> [`Doc.retokenize`](/api/doc#retokenize) context manager:
>
> ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
> for ent in doc.ents:
> retokenizer.merge(ent)
> ```
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 0028b4a5f..bd1535c90 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -4,12 +4,22 @@
"code": "af",
"name": "Afrikaans"
},
+ {
+ "code": "am",
+ "name": "Amharic",
+ "has_examples": true
+ },
{
"code": "ar",
"name": "Arabic",
"example": "هذه جملة",
"has_examples": true
},
+ {
+ "code": "az",
+ "name": "Azerbaijani",
+ "has_examples": true
+ },
{
"code": "bg",
"name": "Bulgarian",
@@ -65,7 +75,7 @@
{
"code": "dsb",
"name": "Lower Sorbian",
- "has_examples": true
+ "has_examples": true
},
{
"code": "el",
@@ -142,6 +152,11 @@
"code": "ga",
"name": "Irish"
},
+ {
+ "code": "grc",
+ "name": "Ancient Greek",
+ "has_examples": true
+ },
{
"code": "gu",
"name": "Gujarati",
@@ -172,7 +187,7 @@
{
"code": "hsb",
"name": "Upper Sorbian",
- "has_examples": true
+ "has_examples": true
},
{
"code": "hu",
@@ -260,6 +275,10 @@
"example": "Адамга эң кыйыны — күн сайын адам болуу",
"has_examples": true
},
+ {
+ "code": "la",
+ "name": "Latin"
+ },
{
"code": "lb",
"name": "Luxembourgish",
@@ -448,6 +467,11 @@
"example": "นี่คือประโยค",
"has_examples": true
},
+ {
+ "code": "ti",
+ "name": "Tigrinya",
+ "has_examples": true
+ },
{
"code": "tl",
"name": "Tagalog"
diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index 8ad106a78..d0f9db551 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -149,6 +149,9 @@
& > span
display: block
+ a
+ text-decoration: underline
+
.small
font-size: var(--font-size-code)
line-height: 1.65
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 0d2186acb..28dd14ecc 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
setters={setters}
showDropdown={showDropdown}
>
+
+ # Note M1 GPU support is experimental, see Thinc issue #792
+
python -m venv .env
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
{nightly ? ' --pre' : ''}
conda install -c conda-forge spacy
-
+
+ conda install -c conda-forge cupy
+
+
+ conda install -c conda-forge cupy
+
+
conda install -c conda-forge cupy