Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)

Merge pull request #12218 from adrianeboyd/chore/update-v4-from-master-7

Update v4 from master

Commit c47ec5b5c6
@@ -175,6 +175,18 @@ def test_modify_span_group(doc):
     assert group[0].label == doc.vocab.strings["TEST"]
 
 
+def test_char_span_attributes(doc):
+    label = "LABEL"
+    kb_id = "KB_ID"
+    span_id = "SPAN_ID"
+    span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
+    span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
+    assert span1.text == span2.text
+    assert span1.label_ == span2.label_ == label
+    assert span1.kb_id_ == span2.kb_id_ == kb_id
+    assert span1.id_ == span2.id_ == span_id
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
@@ -354,6 +366,14 @@ def test_spans_by_character(doc):
             span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
         )
 
+    # Span.char_span + alignment mode "contract"
+    span2 = doc[0:2].char_span(
+        span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
+    )
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
 
 def test_span_to_array(doc):
     span = doc[1:-2]
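The new `test_char_span_attributes` and the extended alignment-mode check above boil down to the following behaviour. A minimal sketch, assuming a blank English pipeline rather than the test fixtures; the example text and character offsets are illustrative:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("The quick brown fox jumps over the lazy dog")

# Doc.char_span accepts label, kb_id and span_id, all readable back via the
# underscored string attributes on the resulting Span.
span = doc.char_span(4, 19, label="LABEL", kb_id="KB_ID", span_id="SPAN_ID")
assert span.text == "quick brown fox"
assert (span.label_, span.kb_id_, span.id_) == ("LABEL", "KB_ID", "SPAN_ID")

# With alignment_mode="contract", offsets that cut into a token snap inward
# to the tokens that are completely covered by the character range.
span2 = doc.char_span(2, 19, alignment_mode="contract")
assert span2.text == "quick brown fox"
```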
@@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing():
 
 
 def test_cli_find_threshold(capsys):
-    thresholds = numpy.linspace(0, 1, 10)
-
     def make_examples(nlp: Language) -> List[Example]:
         docs: List[Example] = []
 
@@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys):
             scores_key="cats_macro_f",
             silent=True,
         )
-        assert best_threshold != thresholds[0]
-        assert thresholds[0] < best_threshold < thresholds[9]
         assert best_score == max(res.values())
         assert res[1.0] == 0.0
 
@@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys):
     nlp, _ = init_nlp((("spancat", {}),))
     with make_tempdir() as nlp_dir:
         nlp.to_disk(nlp_dir)
-        res = find_threshold(
+        best_threshold, best_score, res = find_threshold(
            model=nlp_dir,
            data_path=docs_dir / "docs.spacy",
            pipe_name="spancat",
@@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys):
             scores_key="spans_sc_f",
             silent=True,
         )
-        assert res[0] != thresholds[0]
-        assert thresholds[0] < res[0] < thresholds[8]
-        assert res[1] >= 0.6
-        assert res[2][1.0] == 0.0
+        assert best_score == max(res.values())
+        assert res[1.0] == 0.0
 
     # Having multiple textcat_multilabel components should work, since the name has to be specified.
     nlp, _ = init_nlp((("textcat_multilabel", {}),))
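The updated tests unpack `find_threshold`'s return value directly instead of indexing into it: it yields a `(best_threshold, best_score, scores)` tuple, where `scores` maps each trialled threshold to its score. A hedged sketch of the call shape with placeholder paths; the import path and the `threshold_key` argument are assumptions here, only the keyword arguments visible in the hunks above come from the diff:

```python
from spacy.cli.find_threshold import find_threshold  # import path assumed

best_threshold, best_score, scores = find_threshold(
    model="training/model-best",      # placeholder: path to a trained pipeline
    data_path="corpus/dev.spacy",     # placeholder: DocBin with gold annotations
    pipe_name="textcat_multilabel",
    threshold_key="threshold",        # assumed name of the config key being tuned
    scores_key="cats_macro_f",
    silent=True,
)
# `scores` maps threshold -> score, so the best score is simply its maximum,
# which is exactly what the rewritten assertions check.
assert best_score == max(scores.values())
```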
@@ -4,7 +4,7 @@ from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
 
 from spacy.cli._util import app
-from .util import make_tempdir
+from .util import make_tempdir, normalize_whitespace
 
 
 def test_convert_auto():
@@ -38,8 +38,8 @@ def test_benchmark_accuracy_alias():
     # Verify that the `evaluate` alias works correctly.
     result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
     result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
-    assert result_benchmark.stdout == result_evaluate.stdout.replace(
-        "spacy evaluate", "spacy benchmark accuracy"
+    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
+        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
     )
 
 
@@ -1,6 +1,7 @@
 import numpy
 import tempfile
 import contextlib
+import re
 import srsly
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
@@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
     for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
         assert k1 == k2
         assert v1 == v2
+
+
+def normalize_whitespace(s):
+    return re.sub(r"\s+", " ", s)
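The new helper exists so that the benchmark/evaluate alias test above can compare CLI help output without being sensitive to how the help text happens to be wrapped. A small usage sketch with made-up strings:

```python
import re

def normalize_whitespace(s):
    # Collapse every run of whitespace (spaces, newlines, tabs) to one space.
    return re.sub(r"\s+", " ", s)

# Help output that differs only in line wrapping compares equal once normalized.
wrapped = "Usage: spacy benchmark accuracy\n    [OPTIONS]"
flat = "Usage: spacy benchmark accuracy [OPTIONS]"
assert normalize_whitespace(wrapped) == normalize_whitespace(flat)
```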
@@ -108,6 +108,7 @@ class Doc:
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
+        span_id: Union[int, str] = ...,
     ) -> Span: ...
     def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
     @property
@@ -528,9 +528,9 @@ cdef class Doc:
         doc (Doc): The parent document.
         start_idx (int): The index of the first character of the span.
         end_idx (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
+        kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
             named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
@@ -539,6 +539,7 @@ cdef class Doc:
             with token boundaries), "contract" (span of all tokens completely
             within the character span), "expand" (span of all tokens at least
             partially covered by the character span). Defaults to "strict".
+        span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
 
         DOCS: https://spacy.io/api/doc#char_span
@@ -96,6 +96,9 @@ class Span:
         label: Union[int, str] = ...,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
+        id: Union[int, str] = ...,
+        alignment_mode: str = ...,
+        span_id: Union[int, str] = ...,
     ) -> Span: ...
     @property
     def conjuncts(self) -> Tuple[Token]: ...
@@ -656,22 +656,29 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
         start (int): The index of the first character of the span.
         end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
+        id (Union[int, str]): Unused.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
+        span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
         """
         cdef SpanC* span_c = self.span_c()
         start_idx += span_c.start_char
         end_idx += span_c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
 
     @property
     def conjuncts(self):
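With the change above, `Span.char_span` forwards `alignment_mode` and `span_id` on to `Doc.char_span`. A minimal sketch of what that enables, assuming a blank English pipeline; note that the character offsets are relative to the enclosing span's text, not the document:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn")
span = doc[1:5]  # "like New York in"

# Offsets are measured within span.text and shifted onto the Doc internally.
sub = span.char_span(5, 13, label="GPE", span_id="city")
assert sub.text == "New York"
assert sub.label_ == "GPE" and sub.id_ == "city"

# alignment_mode is now forwarded too, so offsets that cut into a token can
# snap outward ("expand") instead of the call returning None.
sub2 = span.char_span(6, 12, alignment_mode="expand")
assert sub2.text == "New York"
```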
@@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir]
 > $ python -m spacy project assets [--sparse]
 > ```
 
 | Name | Description |
-| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
-| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
-| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
+| `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
+| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
 
 ### project run {id="project-run",tag="command"}
 
@@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
 | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
 | _keyword-only_ | |
-| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
+| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
 | `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
@@ -209,15 +209,16 @@ alignment mode `"strict".
 > assert span.text == "New York"
 > ```
 
 | Name | Description |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `start` | The index of the first character of the span. ~~int~~ |
 | `end` | The index of the last character after the span. ~~int~~ |
 | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
 | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
 | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
-| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
+| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
+| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
 
 ## Doc.set_ents {id="set_ents",tag="method",version="3"}
 
@@ -186,14 +186,17 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```
 
 | Name | Description |
-| ----------- | ----------------------------------------------------------------------------------------- |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `start` | The index of the first character of the span. ~~int~~ |
 | `end` | The index of the last character after the span. ~~int~~ |
 | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
 | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
-| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
+| `id` | Unused. ~~Union[int, str]~~ |
+| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
+| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
 
 ## Span.similarity {id="similarity",tag="method",model="vectors"}
 
@@ -21,8 +21,8 @@ menu:
 ## Package naming conventions {id="conventions"}
 
 In general, spaCy expects all pipeline packages to follow the naming convention
-of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
-into three components:
+of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
+three components:
 
 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
    tagging, parsing, lemmatization and named entity recognition, or `dep` for
@@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a
 `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
 as explained in the [docs](/api/textcategorizer#assigned-attributes).
 
+### Using the default knowledge base
+
+As `KnowledgeBase` is now an abstract class, you should call the constructor of
+the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
+implementation:
+
+```diff
+- kb = KnowledgeBase()
++ kb = InMemoryLookupKB()
+```
+
+If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
+implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
+instead.
+
 ### Updated scorers for tokenization and textcat {id="scores"}
 
 We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
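For reference, constructing the default in-memory KB directly typically looks like the following. A hedged sketch of the migration described in the added docs; the vocab, vector length and the entity/alias values are illustrative:

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
# InMemoryLookupKB replaces direct use of the now-abstract KnowledgeBase.
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=64)

# The add_entity/add_alias API is unchanged from the old default KB.
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
assert "Q42" in kb.get_entity_strings()
```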