Merge remote-tracking branch 'upstream/master' into v4-isort

This commit is contained in:
Daniël de Kok 2023-06-26 12:09:22 +02:00
commit bf92ca4f10
15 changed files with 129 additions and 32 deletions

View File

@ -37,6 +37,10 @@ jobs:
run: |
python -m pip install black -c requirements.txt
python -m black spacy --check
- name: isort
run: |
python -m pip install isort -c requirements.txt
python -m isort spacy --check
- name: flake8
run: |
python -m pip install flake8==5.0.4

View File

@ -36,3 +36,4 @@ types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black==22.3.0
isort>=5.0,<6.0

View File

@ -230,7 +230,7 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
if "spancat" in factory_names:
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
@ -848,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "spancat" in factory_names:
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
@ -1046,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:

View File

@ -11,7 +11,7 @@ cdef class Candidate:
cdef class InMemoryCandidate(Candidate):
cdef readonly hash_t _entity_hash
cdef readonly hash_t _alias_hash
cpdef vector[float] _entity_vector
cdef vector[float] _entity_vector
cdef float _prior_prob
cdef readonly InMemoryLookupKB _kb
cdef float _entity_freq

View File

@ -39,7 +39,11 @@ from .levenshtein import levenshtein_compare
from ..strings cimport get_string_id
from ..attrs import IDS
from ..errors import Errors, MatchPatternError, Warnings
from ..schemas import validate_token_pattern
from ..strings import get_string_id
from ..util import registry
from .levenshtein import levenshtein_compare
DEF PADDING = 5

View File

@ -6,7 +6,7 @@ from .matcher import Matcher
class PhraseMatcher:
def __init__(
self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...

View File

@ -12,6 +12,7 @@ def test_build_dependencies():
"hypothesis",
"pre-commit",
"black",
"isort",
"mypy",
"types-dataclasses",
"types-mock",

View File

@ -697,6 +697,7 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip before models are published")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -707,6 +708,7 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip before models are published")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -858,7 +860,8 @@ def test_debug_data_compile_gold():
assert data["boundary_cross_ents"] == 1
def test_debug_data_compile_gold_for_spans():
@pytest.mark.parametrize("component_name", ["spancat", "spancat_singlelabel"])
def test_debug_data_compile_gold_for_spans(component_name):
nlp = English()
spans_key = "sc"
@ -868,7 +871,7 @@ def test_debug_data_compile_gold_for_spans():
ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
eg = Example(pred, ref)
data = _compile_gold([eg], ["spancat"], nlp, True)
data = _compile_gold([eg], [component_name], nlp, True)
assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}

View File

@ -25,6 +25,8 @@ from .span_groups import SpanGroups
from .token import Token
from .underscore import Underscore
DOCBIN_ALL_ATTRS: Tuple[str, ...]
class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]

View File

@ -51,13 +51,19 @@ from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
from ..morphology import Morphology
from ..util import get_words_and_spaces
from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
from .retokenizer import Retokenizer
from .underscore import Underscore, get_ext_args
DEF PADDING = 5
# We store the docbin attrs here rather than in _serialize to avoid
# import cycles.
# fmt: off
DOCBIN_ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
# fmt: on
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError(Errors.E026.format(i=i, length=length))

View File

@ -12,13 +12,10 @@ from ..compat import copy_reg
from ..errors import Errors
from ..util import SimpleFrozenList, ensure_path
from ..vocab import Vocab
from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
from .doc import Doc
from .span_groups import SpanGroups
# fmt: off
ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
# fmt: on
class DocBin:
"""Pack Doc objects for binary serialization.

View File

@ -9,7 +9,7 @@ from ..util import load_model, logger, registry
def create_copy_from_base_model(
tokenizer: Optional[str] = None,
vocab: Optional[str] = None,
) -> Callable[[Language], Language]:
) -> Callable[["Language"], "Language"]:
def copy_from_base_model(nlp):
if tokenizer:
logger.info("Copying tokenizer from: %s", tokenizer)

View File

@ -1,3 +1,4 @@
import warnings
from collections.abc import Iterable as IterableInstance
import numpy

View File

@ -2743,10 +2743,9 @@
"description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).",
"github": "davidberenstein1957/classy-classification",
"pip": "classy-classification",
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/classy-classification/master/logo.png",
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/classy-classification/master/logo.png",
"code_example": [
"import spacy",
"import classy_classification",
"",
"data = {",
" \"furniture\": [\"This text is about chairs.\",",
@ -2791,14 +2790,13 @@
"title": "Concise Concepts",
"slogan": "Concise Concepts uses few-shot NER based on word embedding similarity to get you going with ease!",
"description": "When wanting to apply NER to concise concepts, it is really easy to come up with examples, but it takes some effort to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with ease!",
"github": "pandora-intelligence/concise-concepts",
"github": "davidberenstein1957/concise-concepts",
"pip": "concise-concepts",
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/logo.png",
"image": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png",
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/logo.png",
"image": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/example.png",
"code_example": [
"import spacy",
"from spacy import displacy",
"import concise_concepts",
"",
"data = {",
" \"fruit\": [\"apple\", \"pear\", \"orange\"],",
@ -2838,13 +2836,12 @@
"title": "Crosslingual Coreference",
"slogan": "One multi-lingual coreference model to rule them all!",
"description": "Coreference is amazing but the data required for training a model is very scarce. In our case, the available training data for non-English languages also proved to be poorly annotated. Crosslingual Coreference therefore uses the assumption that a model trained with English data and cross-lingual embeddings should work for other languages with similar sentence structure. Verified to work quite well for at least (EN, NL, DK, FR, DE).",
"github": "pandora-intelligence/crosslingual-coreference",
"github": "davidberenstein1957/crosslingual-coreference",
"pip": "crosslingual-coreference",
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
"image": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png",
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/logo.png",
"image": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/example_total.png",
"code_example": [
"import spacy",
"import crosslingual_coreference",
"",
"text = \"\"\"",
" Do not forget about Momofuku Ando!",
@ -2937,6 +2934,54 @@
"tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
"spacy_version": 3
},
{
"id": "spacysetfit",
"title": "spaCy-SetFit",
"slogan": "An easy and intuitive approach to use SetFit in combination with spaCy.",
"description": "spaCy-SetFit is a Python library that extends spaCy's text categorization capabilities by incorporating SetFit for few-shot classification. It allows you to train a text categorizer using an intuitive dictionary. \n\nThe library integrates with spaCy's pipeline architecture, enabling easy integration and configuration of the text categorizer component. You can provide a training dataset containing inlier and outlier examples, and spaCy-SetFit will use the paraphrase-MiniLM-L3-v2 model for training the text categorizer with SetFit. Once trained, you can use the categorizer to classify new text and obtain category probabilities.",
"github": "davidberenstein1957/spacy-setfit",
"pip": "spacy-setfit",
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/spacy-setfit/main/logo.png",
"code_example": [
"import spacy",
"",
"# Create some example data",
"train_dataset = {",
" \"inlier\": [",
" \"Text about furniture\",",
" \"Couches, benches and televisions.\",",
" \"I really need to get a new sofa.\"",
" ],",
" \"outlier\": [",
" \"Text about kitchen equipment\",",
" \"This text is about politics\",",
" \"Comments about AI and stuff.\"",
" ]",
"}",
"",
"# Load the spaCy language model:",
"nlp = spacy.load(\"en_core_web_sm\")",
"",
"# Add the \"text_categorizer\" pipeline component to the spaCy model, and configure it with SetFit parameters:",
"nlp.add_pipe(\"text_categorizer\", config={",
" \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",",
" \"setfit_trainer_args\": {",
" \"train_dataset\": train_dataset",
" }",
"})",
"doc = nlp(\"I really need to get a new sofa.\")",
"doc.cats",
"# {'inlier': 0.902350975129, 'outlier': 0.097649024871}"
],
"author": "David Berenstein",
"author_links": {
"github": "davidberenstein1957",
"website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
},
"category": ["pipeline"],
"tags": ["few-shot", "SetFit", "training"],
"spacy_version": 3
},
{
"id": "blackstone",
"title": "Blackstone",
@ -4320,6 +4365,37 @@
},
"category": ["apis", "standalone"],
"tags": ["apis", "deployment"]
},
{
"id": "span_marker",
"title": "SpanMarker",
"slogan": "Effortless state-of-the-art NER in spaCy",
"description": "The SpanMarker integration with spaCy allows you to seamlessly replace the default spaCy `\"ner\"` pipeline component with any [SpanMarker model available on the Hugging Face Hub](https://huggingface.co/models?library=span-marker). Through this, you can take advantage of the advanced Named Entity Recognition capabilities of SpanMarker within the familiar and powerful spaCy framework.\n\nBy default, the `span_marker` pipeline component uses a [SpanMarker model using RoBERTa-large trained on OntoNotes v5.0](https://huggingface.co/tomaarsen/span-marker-roberta-large-ontonotes5). This model reaches a competitive 91.54 F1, notably higher than the [85.5 and 89.8 F1](https://spacy.io/usage/facts-figures#section-benchmarks) from `en_core_web_lg` and `en_core_web_trf`, respectively. A short head-to-head between this SpanMarker model and the `trf` spaCy model has been posted [here](https://github.com/tomaarsen/SpanMarkerNER/pull/12).\n\nAdditionally, see [here](https://tomaarsen.github.io/SpanMarkerNER/notebooks/spacy_integration.html) for documentation on using SpanMarker with spaCy.",
"github": "tomaarsen/SpanMarkerNER",
"pip": "span_marker",
"code_example": [
"import spacy",
"",
"nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
"",
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
"Ptolemaic Kingdom of Egypt. She was born in 69 BCE and ruled Egypt from 51 BCE until her \\",
"death in 30 BCE.\"\"\"",
"doc = nlp(text)",
"print([(entity, entity.label_) for entity in doc.ents])",
"# [(Cleopatra VII, \"PERSON\"), (Cleopatra the Great, \"PERSON\"), (the Ptolemaic Kingdom of Egypt, \"GPE\"),",
"# (69 BCE, \"DATE\"), (Egypt, \"GPE\"), (51 BCE, \"DATE\"), (30 BCE, \"DATE\")]"
],
"code_language": "python",
"url": "https://tomaarsen.github.io/SpanMarkerNER",
"author": "Tom Aarsen",
"author_links": {
"github": "tomaarsen",
"website": "https://www.linkedin.com/in/tomaarsen"
},
"category": ["pipeline", "standalone", "scientific"],
"tags": ["ner"]
}
],

View File

@ -215,15 +215,17 @@ const Quickstart = ({
}
)}
<pre className={classes['code']}>
<code
className={classNames(classes['results'], {
[classes['small']]: !!small,
[`language-${codeLang}`]: !!codeLang,
})}
data-quickstart-results=""
ref={contentRef}
>
{Children.toArray(children).flat().filter(isRelevant)}
<code>
<div
className={classNames(classes['results'], {
[classes['small']]: !!small,
[`language-${codeLang}`]: !!codeLang,
})}
data-quickstart-results=""
ref={contentRef}
>
{Children.toArray(children).flat().filter(isRelevant)}
</div>
</code>
<menu className={classes['menu']}>