mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge remote-tracking branch 'upstream/master' into v4-isort
This commit is contained in:
commit
bf92ca4f10
4
.github/workflows/tests.yml
vendored
4
.github/workflows/tests.yml
vendored
|
@ -37,6 +37,10 @@ jobs:
|
|||
run: |
|
||||
python -m pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
- name: isort
|
||||
run: |
|
||||
python -m pip install isort -c requirements.txt
|
||||
python -m isort spacy --check
|
||||
- name: flake8
|
||||
run: |
|
||||
python -m pip install flake8==5.0.4
|
||||
|
|
|
@ -36,3 +36,4 @@ types-setuptools>=57.0.0
|
|||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black==22.3.0
|
||||
isort>=5.0,<6.0
|
||||
|
|
|
@ -230,7 +230,7 @@ def debug_data(
|
|||
else:
|
||||
msg.info("No word vectors present in the package")
|
||||
|
||||
if "spancat" in factory_names:
|
||||
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||
model_labels_spancat = _get_labels_from_spancat(nlp)
|
||||
has_low_data_warning = False
|
||||
has_no_neg_warning = False
|
||||
|
@ -848,7 +848,7 @@ def _compile_gold(
|
|||
data["boundary_cross_ents"] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
if "spancat" in factory_names:
|
||||
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||
for spans_key in list(eg.reference.spans.keys()):
|
||||
# Obtain the span frequency
|
||||
if spans_key not in data["spancat"]:
|
||||
|
@ -1046,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|
|||
pipe_names = [
|
||||
pipe_name
|
||||
for pipe_name in nlp.pipe_names
|
||||
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
|
||||
if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
|
||||
]
|
||||
labels: Dict[str, Set[str]] = {}
|
||||
for pipe_name in pipe_names:
|
||||
|
|
|
@ -11,7 +11,7 @@ cdef class Candidate:
|
|||
cdef class InMemoryCandidate(Candidate):
|
||||
cdef readonly hash_t _entity_hash
|
||||
cdef readonly hash_t _alias_hash
|
||||
cpdef vector[float] _entity_vector
|
||||
cdef vector[float] _entity_vector
|
||||
cdef float _prior_prob
|
||||
cdef readonly InMemoryLookupKB _kb
|
||||
cdef float _entity_freq
|
||||
|
|
|
@ -39,7 +39,11 @@ from .levenshtein import levenshtein_compare
|
|||
from ..strings cimport get_string_id
|
||||
|
||||
from ..attrs import IDS
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
from ..schemas import validate_token_pattern
|
||||
from ..strings import get_string_id
|
||||
from ..util import registry
|
||||
from .levenshtein import levenshtein_compare
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from .matcher import Matcher
|
|||
|
||||
class PhraseMatcher:
|
||||
def __init__(
|
||||
self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
|
||||
self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
|
||||
) -> None: ...
|
||||
def __reduce__(self) -> Any: ...
|
||||
def __len__(self) -> int: ...
|
||||
|
|
|
@ -12,6 +12,7 @@ def test_build_dependencies():
|
|||
"hypothesis",
|
||||
"pre-commit",
|
||||
"black",
|
||||
"isort",
|
||||
"mypy",
|
||||
"types-dataclasses",
|
||||
"types-mock",
|
||||
|
|
|
@ -697,6 +697,7 @@ def test_string_to_list_intify(value):
|
|||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily skip before models are published")
|
||||
def test_download_compatibility():
|
||||
spec = SpecifierSet("==" + about.__version__)
|
||||
spec.prereleases = False
|
||||
|
@ -707,6 +708,7 @@ def test_download_compatibility():
|
|||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily skip before models are published")
|
||||
def test_validate_compatibility_table():
|
||||
spec = SpecifierSet("==" + about.__version__)
|
||||
spec.prereleases = False
|
||||
|
@ -858,7 +860,8 @@ def test_debug_data_compile_gold():
|
|||
assert data["boundary_cross_ents"] == 1
|
||||
|
||||
|
||||
def test_debug_data_compile_gold_for_spans():
|
||||
@pytest.mark.parametrize("component_name", ["spancat", "spancat_singlelabel"])
|
||||
def test_debug_data_compile_gold_for_spans(component_name):
|
||||
nlp = English()
|
||||
spans_key = "sc"
|
||||
|
||||
|
@ -868,7 +871,7 @@ def test_debug_data_compile_gold_for_spans():
|
|||
ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
|
||||
eg = Example(pred, ref)
|
||||
|
||||
data = _compile_gold([eg], ["spancat"], nlp, True)
|
||||
data = _compile_gold([eg], [component_name], nlp, True)
|
||||
|
||||
assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
|
||||
assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
|
||||
|
|
|
@ -25,6 +25,8 @@ from .span_groups import SpanGroups
|
|||
from .token import Token
|
||||
from .underscore import Underscore
|
||||
|
||||
DOCBIN_ALL_ATTRS: Tuple[str, ...]
|
||||
|
||||
class DocMethod(Protocol):
|
||||
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
|
||||
|
||||
|
|
|
@ -51,13 +51,19 @@ from ..compat import copy_reg, pickle
|
|||
from ..errors import Errors, Warnings
|
||||
from ..morphology import Morphology
|
||||
from ..util import get_words_and_spaces
|
||||
from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
|
||||
from .retokenizer import Retokenizer
|
||||
from .underscore import Underscore, get_ext_args
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
||||
# We store the docbin attrs here rather than in _serialize to avoid
|
||||
# import cycles.
|
||||
|
||||
# fmt: off
|
||||
DOCBIN_ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
|
||||
# fmt: on
|
||||
|
||||
cdef int bounds_check(int i, int length, int padding) except -1:
|
||||
if (i + padding) < 0:
|
||||
raise IndexError(Errors.E026.format(i=i, length=length))
|
||||
|
|
|
@ -12,13 +12,10 @@ from ..compat import copy_reg
|
|||
from ..errors import Errors
|
||||
from ..util import SimpleFrozenList, ensure_path
|
||||
from ..vocab import Vocab
|
||||
from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
|
||||
from .doc import Doc
|
||||
from .span_groups import SpanGroups
|
||||
|
||||
# fmt: off
|
||||
ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
|
||||
# fmt: on
|
||||
|
||||
|
||||
class DocBin:
|
||||
"""Pack Doc objects for binary serialization.
|
||||
|
|
|
@ -9,7 +9,7 @@ from ..util import load_model, logger, registry
|
|||
def create_copy_from_base_model(
|
||||
tokenizer: Optional[str] = None,
|
||||
vocab: Optional[str] = None,
|
||||
) -> Callable[[Language], Language]:
|
||||
) -> Callable[["Language"], "Language"]:
|
||||
def copy_from_base_model(nlp):
|
||||
if tokenizer:
|
||||
logger.info("Copying tokenizer from: %s", tokenizer)
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import warnings
|
||||
from collections.abc import Iterable as IterableInstance
|
||||
|
||||
import numpy
|
||||
|
|
|
@ -2743,10 +2743,9 @@
|
|||
"description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).",
|
||||
"github": "davidberenstein1957/classy-classification",
|
||||
"pip": "classy-classification",
|
||||
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/classy-classification/master/logo.png",
|
||||
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/classy-classification/master/logo.png",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"import classy_classification",
|
||||
"",
|
||||
"data = {",
|
||||
" \"furniture\": [\"This text is about chairs.\",",
|
||||
|
@ -2791,14 +2790,13 @@
|
|||
"title": "Concise Concepts",
|
||||
"slogan": "Concise Concepts uses few-shot NER based on word embedding similarity to get you going with ease!",
|
||||
"description": "When wanting to apply NER to concise concepts, it is really easy to come up with examples, but it takes some effort to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with ease!",
|
||||
"github": "pandora-intelligence/concise-concepts",
|
||||
"github": "davidberenstein1957/concise-concepts",
|
||||
"pip": "concise-concepts",
|
||||
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/logo.png",
|
||||
"image": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png",
|
||||
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/logo.png",
|
||||
"image": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/example.png",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"from spacy import displacy",
|
||||
"import concise_concepts",
|
||||
"",
|
||||
"data = {",
|
||||
" \"fruit\": [\"apple\", \"pear\", \"orange\"],",
|
||||
|
@ -2838,13 +2836,12 @@
|
|||
"title": "Crosslingual Coreference",
|
||||
"slogan": "One multi-lingual coreference model to rule them all!",
|
||||
"description": "Coreference is amazing but the data required for training a model is very scarce. In our case, the available training data for non-English languages also proved to be poorly annotated. Crosslingual Coreference therefore uses the assumption that a model trained with English data and cross-lingual embeddings should work for other languages with similar sentence structure. Verified to work quite well for at least (EN, NL, DK, FR, DE).",
|
||||
"github": "pandora-intelligence/crosslingual-coreference",
|
||||
"github": "davidberenstein1957/crosslingual-coreference",
|
||||
"pip": "crosslingual-coreference",
|
||||
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
|
||||
"image": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png",
|
||||
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/logo.png",
|
||||
"image": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/example_total.png",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"import crosslingual_coreference",
|
||||
"",
|
||||
"text = \"\"\"",
|
||||
" Do not forget about Momofuku Ando!",
|
||||
|
@ -2937,6 +2934,54 @@
|
|||
"tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
|
||||
"spacy_version": 3
|
||||
},
|
||||
{
|
||||
"id": "spacysetfit",
|
||||
"title": "spaCy-SetFit",
|
||||
"slogan": "An easy and intuitive approach to use SetFit in combination with spaCy.",
|
||||
"description": "spaCy-SetFit is a Python library that extends spaCy's text categorization capabilities by incorporating SetFit for few-shot classification. It allows you to train a text categorizer using an intuitive dictionary. \n\nThe library integrates with spaCy's pipeline architecture, enabling easy integration and configuration of the text categorizer component. You can provide a training dataset containing inlier and outlier examples, and spaCy-SetFit will use the paraphrase-MiniLM-L3-v2 model for training the text categorizer with SetFit. Once trained, you can use the categorizer to classify new text and obtain category probabilities.",
|
||||
"github": "davidberenstein1957/spacy-setfit",
|
||||
"pip": "spacy-setfit",
|
||||
"thumb": "https://raw.githubusercontent.com/davidberenstein1957/spacy-setfit/main/logo.png",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"",
|
||||
"# Create some example data",
|
||||
"train_dataset = {",
|
||||
" \"inlier\": [",
|
||||
" \"Text about furniture\",",
|
||||
" \"Couches, benches and televisions.\",",
|
||||
" \"I really need to get a new sofa.\"",
|
||||
" ],",
|
||||
" \"outlier\": [",
|
||||
" \"Text about kitchen equipment\",",
|
||||
" \"This text is about politics\",",
|
||||
" \"Comments about AI and stuff.\"",
|
||||
" ]",
|
||||
"}",
|
||||
"",
|
||||
"# Load the spaCy language model:",
|
||||
"nlp = spacy.load(\"en_core_web_sm\")",
|
||||
"",
|
||||
"# Add the \"text_categorizer\" pipeline component to the spaCy model, and configure it with SetFit parameters:",
|
||||
"nlp.add_pipe(\"text_categorizer\", config={",
|
||||
" \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",",
|
||||
" \"setfit_trainer_args\": {",
|
||||
" \"train_dataset\": train_dataset",
|
||||
" }",
|
||||
"})",
|
||||
"doc = nlp(\"I really need to get a new sofa.\")",
|
||||
"doc.cats",
|
||||
"# {'inlier': 0.902350975129, 'outlier': 0.097649024871}"
|
||||
],
|
||||
"author": "David Berenstein",
|
||||
"author_links": {
|
||||
"github": "davidberenstein1957",
|
||||
"website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
|
||||
},
|
||||
"category": ["pipeline"],
|
||||
"tags": ["few-shot", "SetFit", "training"],
|
||||
"spacy_version": 3
|
||||
},
|
||||
{
|
||||
"id": "blackstone",
|
||||
"title": "Blackstone",
|
||||
|
@ -4320,6 +4365,37 @@
|
|||
},
|
||||
"category": ["apis", "standalone"],
|
||||
"tags": ["apis", "deployment"]
|
||||
},
|
||||
{
|
||||
"id": "span_marker",
|
||||
"title": "SpanMarker",
|
||||
"slogan": "Effortless state-of-the-art NER in spaCy",
|
||||
"description": "The SpanMarker integration with spaCy allows you to seamlessly replace the default spaCy `\"ner\"` pipeline component with any [SpanMarker model available on the Hugging Face Hub](https://huggingface.co/models?library=span-marker). Through this, you can take advantage of the advanced Named Entity Recognition capabilities of SpanMarker within the familiar and powerful spaCy framework.\n\nBy default, the `span_marker` pipeline component uses a [SpanMarker model using RoBERTa-large trained on OntoNotes v5.0](https://huggingface.co/tomaarsen/span-marker-roberta-large-ontonotes5). This model reaches a competitive 91.54 F1, notably higher than the [85.5 and 89.8 F1](https://spacy.io/usage/facts-figures#section-benchmarks) from `en_core_web_lg` and `en_core_web_trf`, respectively. A short head-to-head between this SpanMarker model and the `trf` spaCy model has been posted [here](https://github.com/tomaarsen/SpanMarkerNER/pull/12).\n\nAdditionally, see [here](https://tomaarsen.github.io/SpanMarkerNER/notebooks/spacy_integration.html) for documentation on using SpanMarker with spaCy.",
|
||||
"github": "tomaarsen/SpanMarkerNER",
|
||||
"pip": "span_marker",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"",
|
||||
"nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
|
||||
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
|
||||
"",
|
||||
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
|
||||
"Ptolemaic Kingdom of Egypt. She was born in 69 BCE and ruled Egypt from 51 BCE until her \\",
|
||||
"death in 30 BCE.\"\"\"",
|
||||
"doc = nlp(text)",
|
||||
"print([(entity, entity.label_) for entity in doc.ents])",
|
||||
"# [(Cleopatra VII, \"PERSON\"), (Cleopatra the Great, \"PERSON\"), (the Ptolemaic Kingdom of Egypt, \"GPE\"),",
|
||||
"# (69 BCE, \"DATE\"), (Egypt, \"GPE\"), (51 BCE, \"DATE\"), (30 BCE, \"DATE\")]"
|
||||
],
|
||||
"code_language": "python",
|
||||
"url": "https://tomaarsen.github.io/SpanMarkerNER",
|
||||
"author": "Tom Aarsen",
|
||||
"author_links": {
|
||||
"github": "tomaarsen",
|
||||
"website": "https://www.linkedin.com/in/tomaarsen"
|
||||
},
|
||||
"category": ["pipeline", "standalone", "scientific"],
|
||||
"tags": ["ner"]
|
||||
}
|
||||
],
|
||||
|
||||
|
|
|
@ -215,15 +215,17 @@ const Quickstart = ({
|
|||
}
|
||||
)}
|
||||
<pre className={classes['code']}>
|
||||
<code
|
||||
className={classNames(classes['results'], {
|
||||
[classes['small']]: !!small,
|
||||
[`language-${codeLang}`]: !!codeLang,
|
||||
})}
|
||||
data-quickstart-results=""
|
||||
ref={contentRef}
|
||||
>
|
||||
{Children.toArray(children).flat().filter(isRelevant)}
|
||||
<code>
|
||||
<div
|
||||
className={classNames(classes['results'], {
|
||||
[classes['small']]: !!small,
|
||||
[`language-${codeLang}`]: !!codeLang,
|
||||
})}
|
||||
data-quickstart-results=""
|
||||
ref={contentRef}
|
||||
>
|
||||
{Children.toArray(children).flat().filter(isRelevant)}
|
||||
</div>
|
||||
</code>
|
||||
|
||||
<menu className={classes['menu']}>
|
||||
|
|
Loading…
Reference in New Issue
Block a user