Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master-3

2025-08-06 21:30:22 +03:00 · 2022-11-02 10:46:09 +01:00 · 2022-11-02 10:46:09 +01:00 · 79c11de0c4
commit 79c11de0c4
parent cae4589f5a d25f09468c
13 changed files with 100 additions and 34 deletions
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -12,10 +12,10 @@ jobs:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
            ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v3
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
@@ -23,10 +23,11 @@ jobs:
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
            title: Auto-format code with black
            labels: meta
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ be used in real products.

 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.

-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.

 ## Features

- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -76,15 +76,15 @@ jobs:
        #        Python39Mac:
        #          imageName: "macos-latest"
        #          python.version: "3.9"
-        Python310Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.10"
+        #        Python310Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
-        Python310Mac:
-          imageName: "macos-latest"
-          python.version: "3.10"
+        #        Python310Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.10"
        Python311Linux:
          imageName: 'ubuntu-latest'
          python.version: '3.11.0-rc.2'
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
            cands.append((start_token, end_token))

        candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
    # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []


@registry.misc("spacy.KBFromFile.v1")
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -27,8 +27,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -10,6 +10,7 @@ from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -716,7 +717,11 @@ TRAIN_DATA = [
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
-         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+    # having a blank instance shouldn't break things
+    ("The weather is nice today.",
+        {"links": {}, "entities": [],
+         "sent_starts": [1, -1, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -1260,3 +1265,18 @@ def test_save_activations():
    assert scores.data.shape == (2, 1)
    assert scores.data.dtype == "float32"
    assert scores.lengths.shape == (1,)
+
+
+def test_span_maker_forward_with_empty():
+    """The forward pass of the span maker may have a doc with no entities."""
+    nlp = English()
+    doc1 = nlp("a b c")
+    ent = doc1[0:1]
+    ent.label_ = "X"
+    doc1.ents = [ent]
+    # no entities
+    doc2 = nlp("x y z")
+
+    # just to get a model
+    span_maker = build_span_maker()
+    span_maker([doc1, doc2], False)
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():


 def test_tok2vec_listener_overfitting():
-    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
@@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():


 def test_tok2vec_frozen_not_annotating():
-    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
@@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():

    for i in range(2):
        losses = {}
-        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
-            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+        with pytest.raises(
+            ValueError, match=r"the tok2vec embedding layer is not updated"
+        ):
+            nlp.update(
+                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+            )


 def test_tok2vec_frozen_overfitting():
-    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
@@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():

    for i in range(100):
        losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+        nlp.update(
+            train_examples,
+            sgd=optimizer,
+            losses=losses,
+            exclude=["tok2vec"],
+            annotates=["tok2vec"],
+        )
    assert losses["tagger"] < 0.0001

    # test the trained model
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }


-def test_tok2vec():
+def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())


--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1791,7 +1791,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >   for ent in doc.ents:
 >       retokenizer.merge(ent)
 > ```
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -4,12 +4,22 @@
            "code": "af",
            "name": "Afrikaans"
        },
+        {
+            "code": "am",
+            "name": "Amharic",
+            "has_examples": true
+        },
        {
            "code": "ar",
            "name": "Arabic",
            "example": "هذه جملة",
            "has_examples": true
        },
+        {
+            "code": "az",
+            "name": "Azerbaijani",
+            "has_examples": true
+        },
        {
            "code": "bg",
            "name": "Bulgarian",
@@ -65,7 +75,7 @@
        {
            "code": "dsb",
            "name": "Lower Sorbian",
-	    "has_examples": true
+            "has_examples": true
        },
        {
            "code": "el",
@@ -142,6 +152,11 @@
            "code": "ga",
            "name": "Irish"
        },
+        {
+            "code": "grc",
+            "name": "Ancient Greek",
+            "has_examples": true
+        },
        {
            "code": "gu",
            "name": "Gujarati",
@@ -172,7 +187,7 @@
        {
            "code": "hsb",
            "name": "Upper Sorbian",
-	    "has_examples": true
+            "has_examples": true
        },
        {
            "code": "hu",
@@ -260,6 +275,10 @@
            "example": "Адамга эң кыйыны — күн сайын адам болуу",
            "has_examples": true
        },
+        {
+            "code": "la",
+            "name": "Latin"
+        },
        {
            "code": "lb",
            "name": "Luxembourgish",
@@ -448,6 +467,11 @@
            "example": "นี่คือประโยค",
            "has_examples": true
        },
+        {
+            "code": "ti",
+            "name": "Tigrinya",
+            "has_examples": true
+        },
        {
            "code": "tl",
            "name": "Tagalog"
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -149,6 +149,9 @@
    & > span
        display: block

+    a
+        text-decoration: underline
+
 .small
    font-size: var(--font-size-code)
    line-height: 1.65
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
                        setters={setters}
                        showDropdown={showDropdown}
                    >
+                        <QS os="mac" hardware="gpu" platform="arm">
+                            # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+                        </QS>
                        <QS package="pip" config="venv">
                            python -m venv .env
                        </QS>
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                            {nightly ? ' --pre' : ''}
                        </QS>
                        <QS package="conda">conda install -c conda-forge spacy</QS>
-                        <QS package="conda" hardware="gpu">
+                        <QS package="conda" hardware="gpu" os="windows">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="conda" hardware="gpu" os="linux">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="conda" hardware="gpu" os="mac" platform="x86">
                            conda install -c conda-forge cupy
                        </QS>
                        <QS package="conda" config="train">