Merge pull request #12218 from adrianeboyd/chore/update-v4-from-master-7

Update v4 from master
2026-02-01 21:16:05 +03:00 · 2023-02-03 12:04:20 +01:00 · 2023-02-03 12:04:20 +01:00 · c47ec5b5c6
commit c47ec5b5c6
parent 89f974d4f5 cd95b29053
13 changed files with 96 additions and 45 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -175,6 +175,18 @@ def test_modify_span_group(doc):
    assert group[0].label == doc.vocab.strings["TEST"]


+def test_char_span_attributes(doc):
+    label = "LABEL"
+    kb_id = "KB_ID"
+    span_id = "SPAN_ID"
+    span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
+    span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
+    assert span1.text == span2.text
+    assert span1.label_ == span2.label_ == label
+    assert span1.kb_id_ == span2.kb_id_ == kb_id
+    assert span1.id_ == span2.id_ == span_id
+
+
 def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0
@ -354,6 +366,14 @@ def test_spans_by_character(doc):
            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
        )

+    # Span.char_span + alignment mode "contract"
+    span2 = doc[0:2].char_span(
+        span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
+    )
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+

 def test_span_to_array(doc):
    span = doc[1:-2]
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing():


 def test_cli_find_threshold(capsys):
-    thresholds = numpy.linspace(0, 1, 10)
-
    def make_examples(nlp: Language) -> List[Example]:
        docs: List[Example] = []

@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys):
                scores_key="cats_macro_f",
                silent=True,
            )
-            assert best_threshold != thresholds[0]
-            assert thresholds[0] < best_threshold < thresholds[9]
            assert best_score == max(res.values())
            assert res[1.0] == 0.0

@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys):
        nlp, _ = init_nlp((("spancat", {}),))
        with make_tempdir() as nlp_dir:
            nlp.to_disk(nlp_dir)
-            res = find_threshold(
+            best_threshold, best_score, res = find_threshold(
                model=nlp_dir,
                data_path=docs_dir / "docs.spacy",
                pipe_name="spancat",
@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys):
                scores_key="spans_sc_f",
                silent=True,
            )
-            assert res[0] != thresholds[0]
-            assert thresholds[0] < res[0] < thresholds[8]
-            assert res[1] >= 0.6
-            assert res[2][1.0] == 0.0
+            assert best_score == max(res.values())
+            assert res[1.0] == 0.0

        # Having multiple textcat_multilabel components should work, since the name has to be specified.
        nlp, _ = init_nlp((("textcat_multilabel", {}),))
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -4,7 +4,7 @@ from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc

 from spacy.cli._util import app
-from .util import make_tempdir
+from .util import make_tempdir, normalize_whitespace


 def test_convert_auto():
@ -38,8 +38,8 @@ def test_benchmark_accuracy_alias():
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
-    assert result_benchmark.stdout == result_evaluate.stdout.replace(
-        "spacy evaluate", "spacy benchmark accuracy"
+    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
+        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
    )


--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -1,6 +1,7 @@
 import numpy
 import tempfile
 import contextlib
+import re
 import srsly
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
        assert k1 == k2
        assert v1 == v2
+
+
+def normalize_whitespace(s):
+    return re.sub(r"\s+", " ", s)
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@ -108,6 +108,7 @@ class Doc:
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
+        span_id: Union[int, str] = ...,
    ) -> Span: ...
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
    @property
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -528,9 +528,9 @@ cdef class Doc:
        doc (Doc): The parent document.
        start_idx (int): The index of the first character of the span.
        end_idx (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
            named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
+        kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a
            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
@ -539,6 +539,7 @@ cdef class Doc:
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
+        span_id (Union[int, str]): An identifier to associate with the span.
        RETURNS (Span): The newly constructed object.

        DOCS: https://spacy.io/api/doc#char_span
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@ -96,6 +96,9 @@ class Span:
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
+        id: Union[int, str] = ...,
+        alignment_mode: str = ...,
+        span_id: Union[int, str] = ...,
    ) -> Span: ...
    @property
    def conjuncts(self) -> Tuple[Token]: ...
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -382,7 +382,7 @@ cdef class Span:
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()
-    
+
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
@ -656,22 +656,29 @@ cdef class Span:
        else:
            return self.doc[root]

-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
        """Create a `Span` object from the slice `span.text[start : end]`.

        start (int): The index of the first character of the span.
        end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
            named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
+        kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
+        id (Union[int, str]): Unused.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
+        span_id (Union[int, str]): An identifier to associate with the span.
        RETURNS (Span): The newly constructed object.
        """
        cdef SpanC* span_c = self.span_c()
        start_idx += span_c.start_char
        end_idx += span_c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)

    @property
    def conjuncts(self):
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir]
 > $ python -m spacy project assets [--sparse]
 > ```

-| Name             | Description                                                                                                                                               |
-| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `project_dir`    | Path to project directory. Defaults to current working directory. ~~Path (positional)~~                                                                   |
-| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
-| `--help`, `-h`   | Show help message and available arguments. ~~bool (flag)~~                                                                                                |
-| **CREATES**      | Downloaded or copied assets defined in the `project.yml`.                                                                                                 |
+| Name                                           | Description                                                                                                                                               |
+| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `project_dir`                                  | Path to project directory. Defaults to current working directory. ~~Path (positional)~~                                                                   |
+| `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~                                                                                         |
+| `--sparse`, `-S`                               | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
+| `--help`, `-h`                                 | Show help message and available arguments. ~~bool (flag)~~                                                                                                |
+| **CREATES**                                    | Downloaded or copied assets defined in the `project.yml`.                                                                                                 |

 ### project run {id="project-run",tag="command"}

--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                            |
 | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~            |
 | _keyword-only_                           |                                                                                                                                                                                                         |
-| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                      |
+| `user_data`                              | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                      |
 | `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 | `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 | `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                 |
@ -209,15 +209,16 @@ alignment mode `"strict".
 > assert span.text == "New York"
 > ```

-| Name             | Description                                                                                                                                                                                                                                                                  |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`          | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`            | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
-| `label`          | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
-| `kb_id`          | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
-| `vector`         | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
-| **RETURNS**      | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
+| Name                                     | Description                                                                                                                                                                                                                                                                  |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
+| `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
+| **RETURNS**                              | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

 ## Doc.set_ents {id="set_ents",tag="method",version="3"}

--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@ -186,14 +186,17 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```

-| Name        | Description                                                                               |
-| ----------- | ----------------------------------------------------------------------------------------- |
-| `start`     | The index of the first character of the span. ~~int~~                                     |
-| `end`       | The index of the last character after the span. ~~int~~                                   |
-| `label`     | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
-| `kb_id`     | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
-| `vector`    | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
-| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~                                |
+| Name                                            | Description                                                                                                                                                                                                                                                                  |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
+| `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `id`                                            | Unused. ~~Union[int, str]~~                                                                                                                                                                                                                                                  |
+| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| `span_id` <Tag variant="new">3.5.1</Tag>        | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
+| **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

 ## Span.similarity {id="similarity",tag="method",model="vectors"}

--- a/website/docs/models/index.mdx
+++ b/website/docs/models/index.mdx
@ -21,8 +21,8 @@ menu:
 ## Package naming conventions {id="conventions"}

 In general, spaCy expects all pipeline packages to follow the naming convention
-of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
-into three components:
+of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
+three components:

 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
   tagging, parsing, lemmatization and named entity recognition, or `dep` for
--- a/website/docs/usage/v3-5.mdx
+++ b/website/docs/usage/v3-5.mdx
@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a
 `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
 as explained in the [docs](/api/textcategorizer#assigned-attributes).

+### Using the default knowledge base
+
+As `KnowledgeBase` is now an abstract class, you should call the constructor of
+the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
+implementation:
+
+```diff
+- kb = KnowledgeBase()
++ kb = InMemoryLookupKB()
+```
+
+If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
+implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
+instead.
+
 ### Updated scorers for tokenization and textcat {id="scores"}

 We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported