diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 873158fb8..8822e0722 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -37,6 +37,10 @@ jobs:
         run: |
           python -m pip install black -c requirements.txt
           python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
       - name: flake8
         run: |
           python -m pip install flake8==5.0.4
diff --git a/requirements.txt b/requirements.txt
index c9b2a4b15..4342af047 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,3 +36,4 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+isort>=5.0,<6.0
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 261e65e78..4c44a8c0e 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -230,7 +230,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")
 
-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         model_labels_spancat = _get_labels_from_spancat(nlp)
         has_low_data_warning = False
         has_no_neg_warning = False
@@ -848,7 +848,7 @@ def _compile_gold(
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "spancat" in factory_names:
+        if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
             for spans_key in list(eg.reference.spans.keys()):
                 # Obtain the span frequency
                 if spans_key not in data["spancat"]:
@@ -1046,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
        for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
     ]
     labels: Dict[str, Set[str]] = {}
     for pipe_name in pipe_names:
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 4419ed476..e842e390f 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -11,7 +11,7 @@ cdef class Candidate:
 cdef class InMemoryCandidate(Candidate):
     cdef readonly hash_t _entity_hash
     cdef readonly hash_t _alias_hash
-    cpdef vector[float] _entity_vector
+    cdef vector[float] _entity_vector
     cdef float _prior_prob
     cdef readonly InMemoryLookupKB _kb
     cdef float _entity_freq
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index f4068b7d4..42b8a8f9a 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -39,7 +39,11 @@ from .levenshtein import levenshtein_compare
 from ..strings cimport get_string_id
 
 from ..attrs import IDS
+from ..errors import Errors, MatchPatternError, Warnings
+from ..schemas import validate_token_pattern
+from ..strings import get_string_id
 from ..util import registry
+from .levenshtein import levenshtein_compare
 
 
 DEF PADDING = 5
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 71406cf57..d3c679a65 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -6,7 +6,7 @@ from .matcher import Matcher
 
 class PhraseMatcher:
     def __init__(
-        self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
+        self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
     ) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
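The phrasematcher.pyi hunk only brings the stub in line with the runtime signature: in the Cython implementation `attr` already defaults to `"ORTH"`, so calls that omit it are valid and now also type-check. A minimal sketch of such a call (the pattern and text are arbitrary):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")

# `attr` can now be omitted in type-checked code; it defaults to "ORTH".
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])

doc = nlp("Barack Obama visited the White House.")
matches = matcher(doc)  # list of (match_id, start, end) tuples
print(matches)
```
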
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 99e2db03d..2576e5a8b 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -12,6 +12,7 @@ def test_build_dependencies():
         "hypothesis",
         "pre-commit",
         "black",
+        "isort",
         "mypy",
         "types-dataclasses",
         "types-mock",
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 88d3ffa45..9a2d7705f 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -697,6 +697,7 @@ def test_string_to_list_intify(value):
     assert string_to_list(value, intify=True) == [1, 2, 3]
 
 
+@pytest.mark.skip(reason="Temporarily skip before models are published")
 def test_download_compatibility():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False
@@ -707,6 +708,7 @@ def test_download_compatibility():
     assert get_minor_version(about.__version__) == get_minor_version(version)
 
 
+@pytest.mark.skip(reason="Temporarily skip before models are published")
 def test_validate_compatibility_table():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False
@@ -858,7 +860,8 @@ def test_debug_data_compile_gold():
     assert data["boundary_cross_ents"] == 1
 
 
-def test_debug_data_compile_gold_for_spans():
+@pytest.mark.parametrize("component_name", ["spancat", "spancat_singlelabel"])
+def test_debug_data_compile_gold_for_spans(component_name):
     nlp = English()
     spans_key = "sc"
 
@@ -868,7 +871,7 @@ def test_debug_data_compile_gold_for_spans():
     ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
     eg = Example(pred, ref)
 
-    data = _compile_gold([eg], ["spancat"], nlp, True)
+    data = _compile_gold([eg], [component_name], nlp, True)
 
     assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
     assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
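The parametrized test above exercises the `_compile_gold` branch for both span categorizer factories. The same check can be run standalone; a sketch mirroring the test (the sentence and span indices are illustrative):

```python
from spacy.cli.debug_data import _compile_gold
from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy.training import Example

nlp = English()
words = ["I", "like", "the", "Bank", "of", "China", "a", "lot"]
pred = Doc(nlp.vocab, words=words)
ref = Doc(nlp.vocab, words=words)
# Tokens 3-5 are "Bank of China" (ORG); token 5 is "China" (GPE).
ref.spans["sc"] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
eg = Example(pred, ref)

# "spancat_singlelabel" now takes the same branch as "spancat".
data = _compile_gold([eg], ["spancat_singlelabel"], nlp, True)
print(data["spancat"]["sc"])  # Counter({'ORG': 1, 'GPE': 1})
```
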
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index d29360c87..116533263 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -25,6 +25,8 @@
 from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore
 
+DOCBIN_ALL_ATTRS: Tuple[str, ...]
+
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 08c3181bf..541178aff 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -51,13 +51,19 @@ from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
 from ..util import get_words_and_spaces
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from .retokenizer import Retokenizer
 from .underscore import Underscore, get_ext_args
 
 DEF PADDING = 5
 
+# We store the docbin attrs here rather than in _serialize to avoid
+# import cycles.
+
+# fmt: off
+DOCBIN_ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
+# fmt: on
+
 
 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
         raise IndexError(Errors.E026.format(i=i, length=length))
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 4dda40a05..7f6a83040 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -12,13 +12,10 @@ from ..compat import copy_reg
 from ..errors import Errors
 from ..util import SimpleFrozenList, ensure_path
 from ..vocab import Vocab
+from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
 from .doc import Doc
 from .span_groups import SpanGroups
 
-# fmt: off
-ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
-# fmt: on
-
 
 class DocBin:
     """Pack Doc objects for binary serialization.
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 053227a11..c2f3b8b51 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -9,7 +9,7 @@ from ..util import load_model, logger, registry
 def create_copy_from_base_model(
     tokenizer: Optional[str] = None,
     vocab: Optional[str] = None,
-) -> Callable[[Language], Language]:
+) -> Callable[["Language"], "Language"]:
     def copy_from_base_model(nlp):
         if tokenizer:
             logger.info("Copying tokenizer from: %s", tokenizer)
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3a33a8693..1c3cd9939 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Iterable as IterableInstance
 
 import numpy
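The doc.pyx and doc_bin.py hunks above only relocate the attribute tuple to break an import cycle; DocBin's default serialized attribute set is unchanged. A quick round-trip sketch (the example text is arbitrary):

```python
import spacy
from spacy.tokens import DocBin
from spacy.tokens.doc import DOCBIN_ALL_ATTRS  # new home of the tuple

nlp = spacy.blank("en")
doc = nlp("Berlin is a city in Germany.")

# DocBin still defaults to the same attribute set, now defined in doc.pyx.
doc_bin = DocBin()
doc_bin.add(doc)
data = doc_bin.to_bytes()

restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
assert restored[0].text == doc.text
print(DOCBIN_ALL_ATTRS)
```
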
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 8e5e0ad62..967d9eb06 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2743,10 +2743,9 @@
             "description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).",
             "github": "davidberenstein1957/classy-classification",
             "pip": "classy-classification",
-            "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/classy-classification/master/logo.png",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/classy-classification/master/logo.png",
             "code_example": [
                 "import spacy",
-                "import classy_classification",
                 "",
                 "data = {",
                 "    \"furniture\": [\"This text is about chairs.\",",
@@ -2791,14 +2790,13 @@
             "title": "Concise Concepts",
             "slogan": "Concise Concepts uses few-shot NER based on word embedding similarity to get you going with easy!",
             "description": "When wanting to apply NER to concise concepts, it is really easy to come up with examples, but it takes some effort to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with easy!",
-            "github": "pandora-intelligence/concise-concepts",
+            "github": "davidberenstein1957/concise-concepts",
             "pip": "concise-concepts",
-            "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/logo.png",
-            "image": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/logo.png",
+            "image": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/example.png",
             "code_example": [
                 "import spacy",
                 "from spacy import displacy",
-                "import concise_concepts",
                 "",
                 "data = {",
                 "    \"fruit\": [\"apple\", \"pear\", \"orange\"],",
@@ -2838,13 +2836,12 @@
             "title": "Crosslingual Coreference",
             "slogan": "One multi-lingual coreference model to rule them all!",
             "description": "Coreference is amazing but the data required for training a model is very scarce. In our case, the available training for non-English languages also data proved to be poorly annotated. Crosslingual Coreference therefore uses the assumption a trained model with English data and cross-lingual embeddings should work for other languages with similar sentence structure. Verified to work quite well for at least (EN, NL, DK, FR, DE).",
-            "github": "pandora-intelligence/crosslingual-coreference",
+            "github": "davidberenstein1957/crosslingual-coreference",
             "pip": "crosslingual-coreference",
-            "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
-            "image": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/logo.png",
+            "image": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/example_total.png",
             "code_example": [
                 "import spacy",
-                "import crosslingual_coreference",
                 "",
                 "text = \"\"\"",
                 "    Do not forget about Momofuku Ando!",
@@ -2937,6 +2934,54 @@
             "tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
             "spacy_version": 3
         },
+        {
+            "id": "spacysetfit",
+            "title": "spaCy-SetFit",
+            "slogan": "An easy and intuitive approach to use SetFit in combination with spaCy.",
+            "description": "spaCy-SetFit is a Python library that extends spaCy's text categorization capabilities by incorporating SetFit for few-shot classification. It allows you to train a text categorizer using an intuitive dictionary.\n\nThe library integrates with spaCy's pipeline architecture, enabling easy integration and configuration of the text categorizer component. You can provide a training dataset containing inlier and outlier examples, and spaCy-SetFit will use the paraphrase-MiniLM-L3-v2 model for training the text categorizer with SetFit. Once trained, you can use the categorizer to classify new text and obtain category probabilities.",
+            "github": "davidberenstein1957/spacy-setfit",
+            "pip": "spacy-setfit",
+            "thumb": "https://raw.githubusercontent.com/davidberenstein1957/spacy-setfit/main/logo.png",
+            "code_example": [
+                "import spacy",
+                "",
+                "# Create some example data",
+                "train_dataset = {",
+                "    \"inlier\": [",
+                "        \"Text about furniture\",",
+                "        \"Couches, benches and televisions.\",",
+                "        \"I really need to get a new sofa.\"",
+                "    ],",
+                "    \"outlier\": [",
+                "        \"Text about kitchen equipment\",",
+                "        \"This text is about politics\",",
+                "        \"Comments about AI and stuff.\"",
+                "    ]",
+                "}",
+                "",
+                "# Load the spaCy language model:",
+                "nlp = spacy.load(\"en_core_web_sm\")",
+                "",
+                "# Add the \"text_categorizer\" pipeline component to the spaCy model, and configure it with SetFit parameters:",
+                "nlp.add_pipe(\"text_categorizer\", config={",
+                "    \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",",
+                "    \"setfit_trainer_args\": {",
+                "        \"train_dataset\": train_dataset",
+                "    }",
+                "})",
+                "doc = nlp(\"I really need to get a new sofa.\")",
+                "doc.cats",
+                "# {'inlier': 0.902350975129, 'outlier': 0.097649024871}"
+            ],
+            "author": "David Berenstein",
+            "author_links": {
+                "github": "davidberenstein1957",
+                "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
+            },
+            "category": ["pipeline"],
+            "tags": ["few-shot", "SetFit", "training"],
+            "spacy_version": 3
+        },
         {
             "id": "blackstone",
             "title": "Blackstone",
@@ -4320,6 +4365,37 @@
             },
             "category": ["apis", "standalone"],
             "tags": ["apis", "deployment"]
+        },
+        {
+            "id": "span_marker",
+            "title": "SpanMarker",
+            "slogan": "Effortless state-of-the-art NER in spaCy",
+            "description": "The SpanMarker integration with spaCy allows you to seamlessly replace the default spaCy `\"ner\"` pipeline component with any [SpanMarker model available on the Hugging Face Hub](https://huggingface.co/models?library=span-marker). Through this, you can take advantage of the advanced Named Entity Recognition capabilities of SpanMarker within the familiar and powerful spaCy framework.\n\nBy default, the `span_marker` pipeline component uses a [SpanMarker model using RoBERTa-large trained on OntoNotes v5.0](https://huggingface.co/tomaarsen/span-marker-roberta-large-ontonotes5). This model reaches a competitive 91.54 F1, notably higher than the [85.5 and 89.8 F1](https://spacy.io/usage/facts-figures#section-benchmarks) from `en_core_web_lg` and `en_core_web_trf`, respectively. A short head-to-head between this SpanMarker model and the `trf` spaCy model has been posted [here](https://github.com/tomaarsen/SpanMarkerNER/pull/12).\n\nAdditionally, see [here](https://tomaarsen.github.io/SpanMarkerNER/notebooks/spacy_integration.html) for documentation on using SpanMarker with spaCy.",
+            "github": "tomaarsen/SpanMarkerNER",
+            "pip": "span_marker",
+            "code_example": [
+                "import spacy",
+                "",
+                "nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
+                "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
+                "",
+                "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
+                "Ptolemaic Kingdom of Egypt. She was born in 69 BCE and ruled Egypt from 51 BCE until her \\",
+                "death in 30 BCE.\"\"\"",
+                "doc = nlp(text)",
+                "print([(entity, entity.label_) for entity in doc.ents])",
+                "# [(Cleopatra VII, \"PERSON\"), (Cleopatra the Great, \"PERSON\"), (the Ptolemaic Kingdom of Egypt, \"GPE\"),",
+                "# (69 BCE, \"DATE\"), (Egypt, \"GPE\"), (51 BCE, \"DATE\"), (30 BCE, \"DATE\")]"
+            ],
+            "code_language": "python",
+            "url": "https://tomaarsen.github.io/SpanMarkerNER",
+            "author": "Tom Aarsen",
+            "author_links": {
+                "github": "tomaarsen",
+                "website": "https://www.linkedin.com/in/tomaarsen"
+            },
+            "category": ["pipeline", "standalone", "scientific"],
+            "tags": ["ner"]
         }
     ],
diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js
index 160e5a778..2b5bfb5ba 100644
--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@@ -215,15 +215,17 @@ const Quickstart = ({
                 }
             )}
-                    
-                        {Children.toArray(children).flat().filter(isRelevant)}
+                    
+                        
+                        
+                            {Children.toArray(children).flat().filter(isRelevant)}
+                        