Merge remote-tracking branch 'upstream/master' into v4-isort

2025-07-15 18:52:29 +03:00 · 2023-06-26 12:09:22 +02:00 · 2023-06-26 12:09:22 +02:00 · bf92ca4f10
commit bf92ca4f10
parent 2468742cb8 e1664217f5
15 changed files with 129 additions and 32 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -37,6 +37,10 @@ jobs:
        run: |
          python -m pip install black -c requirements.txt
          python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
      - name: flake8
        run: |
          python -m pip install flake8==5.0.4
--- a/requirements.txt
+++ b/requirements.txt
@ -36,3 +36,4 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+isort>=5.0,<6.0
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -230,7 +230,7 @@ def debug_data(
    else:
        msg.info("No word vectors present in the package")

-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
        model_labels_spancat = _get_labels_from_spancat(nlp)
        has_low_data_warning = False
        has_no_neg_warning = False
@ -848,7 +848,7 @@ def _compile_gold(
                    data["boundary_cross_ents"] += 1
                elif label == "-":
                    data["ner"]["-"] += 1
-        if "spancat" in factory_names:
+        if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
            for spans_key in list(eg.reference.spans.keys()):
                # Obtain the span frequency
                if spans_key not in data["spancat"]:
@ -1046,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
    pipe_names = [
        pipe_name
        for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
    ]
    labels: Dict[str, Set[str]] = {}
    for pipe_name in pipe_names:
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@ -11,7 +11,7 @@ cdef class Candidate:
 cdef class InMemoryCandidate(Candidate):
    cdef readonly hash_t _entity_hash
    cdef readonly hash_t _alias_hash
-    cpdef vector[float] _entity_vector
+    cdef vector[float] _entity_vector
    cdef float _prior_prob
    cdef readonly InMemoryLookupKB _kb
    cdef float _entity_freq
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -39,7 +39,11 @@ from .levenshtein import levenshtein_compare
 from ..strings cimport get_string_id

 from ..attrs import IDS
+from ..errors import Errors, MatchPatternError, Warnings
+from ..schemas import validate_token_pattern
+from ..strings import get_string_id
 from ..util import registry
+from .levenshtein import levenshtein_compare

 DEF PADDING = 5

--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@ -6,7 +6,7 @@ from .matcher import Matcher

 class PhraseMatcher:
    def __init__(
-        self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
+        self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
    ) -> None: ...
    def __reduce__(self) -> Any: ...
    def __len__(self) -> int: ...
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@ -12,6 +12,7 @@ def test_build_dependencies():
        "hypothesis",
        "pre-commit",
        "black",
+        "isort",
        "mypy",
        "types-dataclasses",
        "types-mock",
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -697,6 +697,7 @@ def test_string_to_list_intify(value):
    assert string_to_list(value, intify=True) == [1, 2, 3]


+@pytest.mark.skip(reason="Temporarily skip before models are published")
 def test_download_compatibility():
    spec = SpecifierSet("==" + about.__version__)
    spec.prereleases = False
@ -707,6 +708,7 @@ def test_download_compatibility():
        assert get_minor_version(about.__version__) == get_minor_version(version)


+@pytest.mark.skip(reason="Temporarily skip before models are published")
 def test_validate_compatibility_table():
    spec = SpecifierSet("==" + about.__version__)
    spec.prereleases = False
@ -858,7 +860,8 @@ def test_debug_data_compile_gold():
    assert data["boundary_cross_ents"] == 1


-def test_debug_data_compile_gold_for_spans():
+@pytest.mark.parametrize("component_name", ["spancat", "spancat_singlelabel"])
+def test_debug_data_compile_gold_for_spans(component_name):
    nlp = English()
    spans_key = "sc"

@ -868,7 +871,7 @@ def test_debug_data_compile_gold_for_spans():
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

-    data = _compile_gold([eg], ["spancat"], nlp, True)
+    data = _compile_gold([eg], [component_name], nlp, True)

    assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
    assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@ -25,6 +25,8 @@ from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore

+DOCBIN_ALL_ATTRS: Tuple[str, ...]
+
 class DocMethod(Protocol):
    def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]

--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -51,13 +51,19 @@ from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
 from ..util import get_words_and_spaces
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from .retokenizer import Retokenizer
 from .underscore import Underscore, get_ext_args

 DEF PADDING = 5


+# We store the docbin attrs here rather than in doc_bin to avoid
+# import cycles.
+
+# fmt: off
+DOCBIN_ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
+# fmt: on
+
 cdef int bounds_check(int i, int length, int padding) except -1:
    if (i + padding) < 0:
        raise IndexError(Errors.E026.format(i=i, length=length))
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@ -12,13 +12,10 @@ from ..compat import copy_reg
 from ..errors import Errors
 from ..util import SimpleFrozenList, ensure_path
 from ..vocab import Vocab
+from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
 from .doc import Doc
 from .span_groups import SpanGroups

-# fmt: off
-ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
-# fmt: on
-

 class DocBin:
    """Pack Doc objects for binary serialization.
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@ -9,7 +9,7 @@ from ..util import load_model, logger, registry
 def create_copy_from_base_model(
    tokenizer: Optional[str] = None,
    vocab: Optional[str] = None,
-) -> Callable[[Language], Language]:
+) -> Callable[["Language"], "Language"]:
    def copy_from_base_model(nlp):
        if tokenizer:
            logger.info("Copying tokenizer from: %s", tokenizer)
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Iterable as IterableInstance

 import numpy
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -2743,10 +2743,9 @@
            "description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).",
            "github": "davidberenstein1957/classy-classification",
            "pip": "classy-classification",
-            "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/classy-classification/master/logo.png",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/classy-classification/master/logo.png",
            "code_example": [
                "import spacy",
-                "import classy_classification",
                "",
                "data = {",
                "    \"furniture\": [\"This text is about chairs.\",",
@ -2791,14 +2790,13 @@
            "title": "Concise Concepts",
            "slogan": "Concise Concepts uses few-shot NER based on word embedding similarity to get you going with easy!",
            "description": "When wanting to apply NER to concise concepts, it is really easy to come up with examples, but it takes some effort to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with easy!",
-            "github": "pandora-intelligence/concise-concepts",
+            "github": "davidberenstein1957/concise-concepts",
            "pip": "concise-concepts",
-            "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/logo.png",
-            "image": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/logo.png",
+            "image": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/example.png",
            "code_example": [
                "import spacy",
                "from spacy import displacy",
-                "import concise_concepts",
                "",
                "data = {",
                "    \"fruit\": [\"apple\", \"pear\", \"orange\"],",
@ -2838,13 +2836,12 @@
            "title": "Crosslingual Coreference",
            "slogan": "One multi-lingual coreference model to rule them all!",
            "description": "Coreference is amazing but the data required for training a model is very scarce. In our case, the available training for non-English languages also data proved to be poorly annotated. Crosslingual Coreference therefore uses the assumption a trained model with English data and cross-lingual embeddings should work for other languages with similar sentence structure. Verified to work quite well for at least (EN, NL, DK, FR, DE).",
-            "github": "pandora-intelligence/crosslingual-coreference",
+            "github": "davidberenstein1957/crosslingual-coreference",
            "pip": "crosslingual-coreference",
-            "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
-            "image": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/logo.png",
+            "image": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/example_total.png",
            "code_example": [
                "import spacy",
-                "import crosslingual_coreference",
                "",
                "text = \"\"\"",
                "    Do not forget about Momofuku Ando!",
@ -2937,6 +2934,54 @@
            "tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
            "spacy_version": 3
        },
+        {
+            "id": "spacysetfit",
+            "title": "spaCy-SetFit",
+            "slogan": "An easy and intuitive approach to use SetFit in combination with spaCy.",
+            "description": "spaCy-SetFit is a Python library that extends spaCy's text categorization capabilities by incorporating SetFit for few-shot classification. It allows you to train a text categorizer using an intuitive dictionary. \n\nThe library integrates with spaCy's pipeline architecture, enabling easy integration and configuration of the text categorizer component. You can provide a training dataset containing inlier and outlier examples, and spaCy-SetFit will use the paraphrase-MiniLM-L3-v2 model for training the text categorizer with SetFit. Once trained, you can use the categorizer to classify new text and obtain category probabilities.",
+            "github": "davidberenstein1957/spacy-setfit",
+            "pip": "spacy-setfit",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/spacy-setfit/main/logo.png",
+            "code_example": [
+            "import spacy",
+            "",
+            "# Create some example data",
+            "train_dataset = {",
+            "    \"inlier\": [",
+            "        \"Text about furniture\",",
+            "        \"Couches, benches and televisions.\",",
+            "        \"I really need to get a new sofa.\"",
+            "    ],",
+            "    \"outlier\": [",
+            "        \"Text about kitchen equipment\",",
+            "        \"This text is about politics\",",
+            "        \"Comments about AI and stuff.\"",
+            "    ]",
+            "}",
+            "",
+            "# Load the spaCy language model:",
+            "nlp = spacy.load(\"en_core_web_sm\")",
+            "",
+            "# Add the \"text_categorizer\" pipeline component to the spaCy model, and configure it with SetFit parameters:",
+            "nlp.add_pipe(\"text_categorizer\", config={",
+            "    \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",",
+            "    \"setfit_trainer_args\": {",
+            "        \"train_dataset\": train_dataset",
+            "    }",
+            "})",
+            "doc = nlp(\"I really need to get a new sofa.\")",
+            "doc.cats",
+            "# {'inlier': 0.902350975129, 'outlier': 0.097649024871}"
+            ],
+            "author": "David Berenstein",
+            "author_links": {
+                "github": "davidberenstein1957",
+                "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
+            },
+            "category": ["pipeline"],
+            "tags": ["few-shot", "SetFit", "training"],
+            "spacy_version": 3
+        },
        {
            "id": "blackstone",
            "title": "Blackstone",
@ -4320,6 +4365,37 @@
            },
            "category": ["apis", "standalone"],
            "tags": ["apis", "deployment"]
+        },
+        {
+            "id": "span_marker",
+            "title": "SpanMarker",
+            "slogan": "Effortless state-of-the-art NER in spaCy",
+            "description": "The SpanMarker integration with spaCy allows you to seamlessly replace the default spaCy `\"ner\"` pipeline component with any [SpanMarker model available on the Hugging Face Hub](https://huggingface.co/models?library=span-marker). Through this, you can take advantage of the advanced Named Entity Recognition capabilities of SpanMarker within the familiar and powerful spaCy framework.\n\nBy default, the `span_marker` pipeline component uses a [SpanMarker model using RoBERTa-large trained on OntoNotes v5.0](https://huggingface.co/tomaarsen/span-marker-roberta-large-ontonotes5). This model reaches a competitive 91.54 F1, notably higher than the [85.5 and 89.8 F1](https://spacy.io/usage/facts-figures#section-benchmarks) from `en_core_web_lg` and `en_core_web_trf`, respectively. A short head-to-head between this SpanMarker model and the `trf` spaCy model has been posted [here](https://github.com/tomaarsen/SpanMarkerNER/pull/12).\n\nAdditionally, see [here](https://tomaarsen.github.io/SpanMarkerNER/notebooks/spacy_integration.html) for documentation on using SpanMarker with spaCy.",
+            "github": "tomaarsen/SpanMarkerNER",
+            "pip": "span_marker",
+            "code_example": [
+                "import spacy",
+                "",
+                "nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
+                "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
+                "",
+                "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
+                "Ptolemaic Kingdom of Egypt. She was born in 69 BCE and ruled Egypt from 51 BCE until her \\",
+                "death in 30 BCE.\"\"\"",
+                "doc = nlp(text)",
+                "print([(entity, entity.label_) for entity in doc.ents])",
+                "# [(Cleopatra VII, \"PERSON\"), (Cleopatra the Great, \"PERSON\"), (the Ptolemaic Kingdom of Egypt, \"GPE\"),",
+                "# (69 BCE, \"DATE\"), (Egypt, \"GPE\"), (51 BCE, \"DATE\"), (30 BCE, \"DATE\")]"
+            ],
+            "code_language": "python",
+            "url": "https://tomaarsen.github.io/SpanMarkerNER",
+            "author": "Tom Aarsen",
+            "author_links": {
+                "github": "tomaarsen",
+                "website": "https://www.linkedin.com/in/tomaarsen"
+            },
+            "category": ["pipeline", "standalone", "scientific"],
+            "tags": ["ner"]
        }
    ],

--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@ -215,7 +215,8 @@ const Quickstart = ({
                    }
                )}
                <pre className={classes['code']}>
-                    <code
+                    <code>
+                        <div
                            className={classNames(classes['results'], {
                                [classes['small']]: !!small,
                                [`language-${codeLang}`]: !!codeLang,
@ -224,6 +225,7 @@ const Quickstart = ({
                            ref={contentRef}
                        >
                            {Children.toArray(children).flat().filter(isRelevant)}
+                        </div>
                    </code>

                    <menu className={classes['menu']}>