Merge branch 'explosion:master' into feature/candidate-generation-by-docs

2025-07-10 16:22:29 +03:00 · 2022-11-04 12:25:25 +01:00 · 2022-11-04 12:25:25 +01:00 · 7a4ef51807
commit 7a4ef51807
parent 4f7b535ebb bbf64cfc43
16 changed files with 161 additions and 107 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -1,9 +1,7 @@
 parameters:
  python_version: ''
-  architecture: ''
+  architecture: 'x64'
-  prefix: ''
+  num_build_jobs: 2
  gpu: false
  num_build_jobs: 1
 steps:
  - task: UsePythonVersion@0
@ -17,16 +15,16 @@ steps:
    displayName: 'Set variables'
  - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
+      python -m pip install -U build pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U -r requirements.txt
    displayName: "Install dependencies"
  - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
+      python -m build --sdist
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
+    displayName: "Build sdist"
    displayName: "Compile and build sdist"
-  - script: python -m mypy spacy
+  - script: |
      python -m mypy spacy
    displayName: 'Run mypy'
    condition: ne(variables['python_version'], '3.6')
@ -35,35 +33,24 @@ steps:
      contents: "spacy"
    displayName: "Delete source directory"
  - task: DeleteFiles@1
    inputs:
      contents: "*.egg-info"
    displayName: "Delete egg-info directory"
  - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
+      python -m pip freeze > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"
  - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
    displayName: "Install from sdist"
  - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -W error -c "import spacy"
-    displayName: "Install test requirements"
+    displayName: "Test import"
  - script: |
      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
    displayName: "Install GPU requirements"
    condition: eq(${{ parameters.gpu }}, true)
  - script: |
      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
    displayName: "Run CPU tests"
    condition: eq(${{ parameters.gpu }}, false)
  - script: |
      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
    displayName: "Run GPU tests"
    condition: eq(${{ parameters.gpu }}, true)
  - script: |
      python -m spacy download ca_core_news_sm
@ -72,6 +59,11 @@ steps:
    displayName: 'Test download CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
    displayName: 'Test no warnings on load (#11713)'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
    displayName: 'Test convert CLI'
@ -106,13 +98,22 @@ steps:
    displayName: 'Test assemble CLI vectors warning'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -m pip install -U -r requirements.txt
    displayName: "Install test requirements"
  - script: |
      python -m pytest --pyargs spacy -W error
    displayName: "Run CPU tests"
  - script: |
      python -m pip install --pre thinc-apple-ops
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
    displayName: 'Test website/meta/universe.json'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
      ${{ parameters.prefix }} python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@ -15,7 +15,7 @@ jobs:
      - uses: actions/checkout@v3
        with:
            ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v3
+      - uses: actions/setup-python@v4
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@ -8,14 +8,14 @@ on:
 jobs:
  explosion-bot:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
    steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v1
+      - uses: actions/setup-python@v4
      - name: Install and run explosion-bot
        run: |
          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
@ -23,9 +23,9 @@ jobs:
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
-            echo "::set-output name=run_tests::true"
+            echo run_tests=true >> $GITHUB_OUTPUT
          else
-            echo "::set-output name=run_tests::false"
+            echo run_tests=false >> $GITHUB_OUTPUT
          fi
      - name: Trigger buildkite build
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@ -17,8 +17,8 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v1
+      - uses: actions/setup-python@v4
      - name: Install Bernadette app dependency and send an alert
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -87,13 +87,13 @@ jobs:
        #          python.version: "3.10"
        Python311Linux:
          imageName: 'ubuntu-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
        Python311Windows:
          imageName: 'windows-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
        Python311Mac:
          imageName: 'macos-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
      maxParallel: 4
    pool:
      vmImage: $(imageName)
@ -101,20 +101,3 @@ jobs:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
          architecture: 'x64'
 #  - job: "TestGPU"
 #    dependsOn: "Validate"
 #    strategy:
 #      matrix:
 #        Python38LinuxX64_GPU:
 #          python.version: '3.8'
 #    pool:
 #      name: "LinuxX64_GPU"
 #    steps:
 #      - template: .github/azure-steps.yml
 #        parameters:
 #          python_version: '$(python.version)'
 #          architecture: 'x64'
 #          gpu: true
 #          num_build_jobs: 24
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -72,7 +72,7 @@ subword_features = true
    "textcat",
    assigns=["doc.cats"],
    default_config={
-        "threshold": 0.5,
+        "threshold": 0.0,
        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
    },
@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
-        threshold (float): Cutoff to consider a prediction "positive".
+        threshold (float): Unused, not needed for single-label (exclusive
            classes) classification.
        scorer (Optional[Callable]): The scoring method. Defaults to
                Scorer.score_cats for the attribute "cats".
@ -154,7 +155,11 @@ class TextCategorizer(TrainablePipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
-        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
+        cfg: Dict[str, Any] = {
            "labels": [],
            "threshold": threshold,
            "positive_label": None,
        }
        self.cfg = dict(cfg)
        self.scorer = scorer
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -446,7 +446,7 @@ class Scorer:
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True. When set to False (exclusive labels), missing
-            gold labels are interpreted as 0.0.
+            gold labels are interpreted as 0.0 and the threshold is set to 0.0.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
@ -471,6 +471,8 @@ class Scorer:
        """
        if threshold is None:
            threshold = 0.5 if multi_label else 0.0
        if not multi_label:
            threshold = 0.0
        f_per_type = {label: PRFScore() for label in labels}
        auc_per_type = {label: ROCAUCScore() for label in labels}
        labels = set(labels)
@ -505,20 +507,18 @@ class Scorer:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-                if pred_label == gold_label and pred_score >= threshold:
+                if pred_label == gold_label:
                    f_per_type[pred_label].tp += 1
                else:
                    f_per_type[gold_label].fn += 1
-                    if pred_score >= threshold:
+                    f_per_type[pred_label].fp += 1
                        f_per_type[pred_label].fp += 1
            elif gold_cats:
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
                if gold_score > 0:
                    f_per_type[gold_label].fn += 1
            elif pred_cats:
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
-                if pred_score >= threshold:
+                f_per_type[pred_label].fp += 1
                    f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
--- a/spacy/tests/doc/test_json_doc_conversion.py
+++ b/spacy/tests/doc/test_json_doc_conversion.py
@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
    doc_json.pop("tokens")
    with pytest.raises(ValueError):
        Doc(doc.vocab).from_json(doc_json, validate=True)
 def test_to_json_underscore_doc_getters(doc):
    def get_text_length(doc):
        return len(doc.text)
    Doc.set_extension("text_length", getter=get_text_length)
    doc_json = doc.to_json(underscore=["text_length"])
    assert doc_json["_"]["text_length"] == get_text_length(doc)
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
    assert loss == expected_loss
-def test_textcat_threshold():
+def test_textcat_multilabel_threshold():
    # Ensure the scorer can be called with a different threshold
    nlp = English()
-    nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat_multilabel")
    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@ -849,7 +849,7 @@ def test_textcat_threshold():
    )
    pos_f = scores["cats_score"]
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-    assert pos_f > macro_f
+    assert pos_f >= macro_f
 def test_textcat_multi_threshold():
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@ -474,3 +474,50 @@ def test_prf_score():
    assert (a.precision, a.recall, a.fscore) == approx(
        (c.precision, c.recall, c.fscore)
    )
 def test_score_cats(en_tokenizer):
    text = "some text"
    gold_doc = en_tokenizer(text)
    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    pred_doc = en_tokenizer(text)
    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
    example = Example(pred_doc, gold_doc)
    # threshold is ignored for multi_label=False
    scores1 = Scorer.score_cats(
        [example],
        "cats",
        labels=list(gold_doc.cats.keys()),
        multi_label=False,
        positive_label="POSITIVE",
        threshold=0.1,
    )
    scores2 = Scorer.score_cats(
        [example],
        "cats",
        labels=list(gold_doc.cats.keys()),
        multi_label=False,
        positive_label="POSITIVE",
        threshold=0.9,
    )
    assert scores1["cats_score"] == 1.0
    assert scores2["cats_score"] == 1.0
    assert scores1 == scores2
    # threshold is relevant for multi_label=True
    scores = Scorer.score_cats(
        [example],
        "cats",
        labels=list(gold_doc.cats.keys()),
        multi_label=True,
        threshold=0.9,
    )
    assert scores["cats_macro_f"] == 0.0
    # threshold is relevant for multi_label=True
    scores = Scorer.score_cats(
        [example],
        "cats",
        labels=list(gold_doc.cats.keys()),
        multi_label=True,
        threshold=0.1,
    )
    assert scores["cats_macro_f"] == 0.5
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -1668,6 +1668,20 @@ cdef class Doc:
        if underscore:
            user_keys = set()
            # Handle doc attributes with .get to include values from getters
            # and not only values stored in user_data, for backwards
            # compatibility
            for attr in underscore:
                if self.has_extension(attr):
                    if "_" not in data:
                        data["_"] = {}
                    value = self._.get(attr)
                    if not srsly.is_json_serializable(value):
                        raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                    data["_"][attr] = value
                    user_keys.add(attr)
            # Token and span attributes only include values stored in user_data
            # and not values generated by getters
            if self.user_data:
                for data_key, value in self.user_data.copy().items():
                    if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
@ -1678,20 +1692,15 @@ cdef class Doc:
                            user_keys.add(attr)
                            if not srsly.is_json_serializable(value):
                                raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
-                            # Check if doc attribute
+                            # Token attribute
-                            if start is None:
+                            if start is not None and end is None:
                                if "_" not in data:
                                    data["_"] = {}
                                data["_"][attr] = value
                            # Check if token attribute
                            elif end is None:
                                if "underscore_token" not in data:
                                    data["underscore_token"] = {}
                                if attr not in data["underscore_token"]:
                                    data["underscore_token"][attr] = []
                                data["underscore_token"][attr].append({"start": start, "value": value})
-                            # Else span attribute
+                            # Span attribute
-                            else:
+                            elif start is not None and end is not None:
                                if "underscore_span" not in data:
                                    data["underscore_span"] = {}
                                if attr not in data["underscore_span"]:
--- a/spacy/util.py
+++ b/spacy/util.py
@ -443,9 +443,9 @@ def load_model_from_package(
    name: str,
    *,
    vocab: Union["Vocab", bool] = True,
-    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
-    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
-    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
    """Load a model from an installed package.
@ -619,9 +619,9 @@ def load_model_from_init_py(
    init_file: Union[Path, str],
    *,
    vocab: Union["Vocab", bool] = True,
-    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
-    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
-    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
    """Helper function to use in the `load()` method of a model package's
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
 > print(scores["cats_macro_auc"])
 > ```
-| Name             | Description                                                                                                                                        |
+| Name             | Description                                                                                                                                                                                        |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                                |
-| `attr`           | The attribute to score. ~~str~~                                                                                                                    |
+| `attr`           | The attribute to score. ~~str~~                                                                                                                                                                    |
-| _keyword-only_   |                                                                                                                                                    |
+| _keyword-only_   |                                                                                                                                                                                                    |
-| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
+| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~                                                 |
-| labels           | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~                                                                                    |
+| labels           | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                    |
-| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
+| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
+| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                                                                 |
-| **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
+| `threshold`      | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~                                                    |
 | **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                                                                             |
 ## Scorer.score_links {#score_links tag="staticmethod" new="3"}
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 > config = {
 >    "threshold": 0.5,
 >    "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 > }
 > nlp.add_pipe("textcat", config=config)
@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.
 | Setting     | Description                                                                                                                                                      |
 | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~                                        |
 | `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `scorer`    | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~                                 |
@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
 | `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
 | _keyword-only_ |                                                                                                                                  |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                   |
+| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~        |
 | `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
 ## TextCategorizer.\_\_call\_\_ {#call tag="method"}
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >   for ent in doc.ents:
 >       retokenizer.merge(ent)
 > ```