Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-09-25 13:21:55 +02:00
commit f3aba49830
24 changed files with 651 additions and 152 deletions

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a24"
__version__ = "3.0.0a25"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
update_lockfile(project_dir, cmd)
# We remove the command from the list here, and break, so that
# we iterate over the loop again.
commands.remove(i)
commands.pop(i)
break
else:
# If we didn't break the for loop, break the while loop.

View File

@ -401,10 +401,6 @@ class Errors:
"Matcher or PhraseMatcher with the attribute {attr}. "
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
"instead of list(nlp.tokenizer.pipe()).")
E156 = ("The pipeline needs to include a parser in order to use "
"Matcher or PhraseMatcher with the attribute DEP. Try using "
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
"list(nlp.tokenizer.pipe()).")
E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: "
@ -517,8 +513,8 @@ class Errors:
"instead.")
# Raised when code tries to mutate a frozen list. The two sentences of the
# message are separated by a full stop so the hint reads as its own sentence.
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
        "property or default function argument?")
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
"provided argument {loc} is an existing directory.")
# Raised by KnowledgeBase.to_disk / from_disk when the given location is not
# a directory. {loc} is filled in with the offending path.
E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
        "but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "

View File

@ -10,6 +10,8 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
from spacy.strings import StringStore
from spacy import util
from .typedefs cimport hash_t
@ -83,6 +85,9 @@ cdef class KnowledgeBase:
DOCS: https://nightly.spacy.io/api/kb
"""
contents_loc = "contents"
strings_loc = "strings.json"
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
@ -319,15 +324,29 @@ cdef class KnowledgeBase:
return 0.0
def to_disk(self, path):
path = util.ensure_path(path)
if path.is_dir():
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
if not path.parent.exists():
path.parent.mkdir(parents=True)
self.write_contents(path / self.contents_loc)
self.vocab.strings.to_disk(path / self.strings_loc)
cdef Writer writer = Writer(path)
def from_disk(self, path):
    """Load the knowledge base from a directory on disk.

    path: Location of the directory; normalised with util.ensure_path.
    RAISES (ValueError): E929 if the location does not exist, E928 if it
        exists but is not a directory.
    """
    kb_dir = util.ensure_path(path)
    if not kb_dir.exists():
        raise ValueError(Errors.E929.format(loc=kb_dir))
    if not kb_dir.is_dir():
        raise ValueError(Errors.E928.format(loc=kb_dir))
    # The KB contents are read first, from the file named by contents_loc.
    self.read_contents(kb_dir / self.contents_loc)
    # The stored strings are loaded into a scratch StringStore and then
    # added one by one to this KB's vocab.
    stored_strings = StringStore()
    stored_strings.from_disk(kb_dir / self.strings_loc)
    for text in stored_strings:
        self.vocab.strings.add(text)
def write_contents(self, file_path):
cdef Writer writer = Writer(file_path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order
@ -366,13 +385,7 @@ cdef class KnowledgeBase:
writer.close()
def from_disk(self, path):
path = util.ensure_path(path)
if path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
def read_contents(self, file_path):
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
@ -382,7 +395,7 @@ cdef class KnowledgeBase:
cdef AliasC alias
cdef float vector_element
cdef Reader reader = Reader(path)
cdef Reader reader = Reader(file_path)
# STEP 0: load header and initialize KB
cdef int64_t nr_entities

View File

@ -17,6 +17,7 @@ from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..schemas import validate_token_pattern
@ -124,7 +125,7 @@ cdef class Matcher:
key = self._normalize_key(key)
for pattern in patterns:
try:
specs = _preprocess_pattern(pattern, self.vocab.strings,
specs = _preprocess_pattern(pattern, self.vocab,
self._extensions, self._extra_predicates)
self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs:
@ -195,7 +196,7 @@ cdef class Matcher:
else:
yield doc
def __call__(self, object doclike, *, as_spans=False):
def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
"""Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over.
@ -215,16 +216,19 @@ cdef class Matcher:
else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool()
if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
if POS in self._seen_attrs and not doc.has_annotation("POS"):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
raise ValueError(Errors.E156.format())
if not allow_missing:
for attr in (TAG, POS, MORPH, LEMMA, DEP):
if attr in self._seen_attrs and not doc.has_annotation(attr):
if attr == TAG:
pipe = "tagger"
elif attr in (POS, MORPH):
pipe = "morphologizer"
elif attr == LEMMA:
pipe = "lemmatizer"
elif attr == DEP:
pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
final_matches = []
@ -660,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
return id_attr.value
def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
"""This function interprets the pattern, converting the various bits of
syntactic sugar before we compile it into a struct with init_pattern.
@ -675,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
extra_predicates.
"""
tokens = []
string_store = vocab.strings
for spec in token_specs:
if not spec:
# Signifier for 'any token'
@ -685,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table)
predicates = _get_extra_predicates(spec, extra_predicates)
predicates = _get_extra_predicates(spec, extra_predicates, vocab)
for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates)))
return tokens
@ -729,7 +734,7 @@ def _get_attr_values(spec, string_store):
class _RegexPredicate:
operators = ("REGEX",)
def __init__(self, i, attr, value, predicate, is_extension=False):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
self.attr = attr
self.value = re.compile(value)
@ -747,13 +752,18 @@ class _RegexPredicate:
return bool(self.value.search(value))
class _SetMemberPredicate:
operators = ("IN", "NOT_IN")
class _SetPredicate:
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
def __init__(self, i, attr, value, predicate, is_extension=False):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
self.attr = attr
self.value = set(get_string_id(v) for v in value)
self.vocab = vocab
if self.attr == MORPH:
# normalize morph strings
self.value = set(self.vocab.morphology.add(v) for v in value)
else:
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@ -765,19 +775,32 @@ class _SetMemberPredicate:
value = get_string_id(token._.get(self.attr))
else:
value = get_token_attr_for_matcher(token.c, self.attr)
if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
if self.attr == MORPH:
# break up MORPH into individual Feat=Val values
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
else:
# IS_SUBSET for other attrs will be equivalent to "IN"
# IS_SUPERSET will only match for other attrs with 0 or 1 values
value = set([value])
if self.predicate == "IN":
return value in self.value
else:
elif self.predicate == "NOT_IN":
return value not in self.value
elif self.predicate == "IS_SUBSET":
return value <= self.value
elif self.predicate == "IS_SUPERSET":
return value >= self.value
def __repr__(self):
return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
class _ComparisonPredicate:
operators = ("==", "!=", ">=", "<=", ">", "<")
def __init__(self, i, attr, value, predicate, is_extension=False):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
self.attr = attr
self.value = value
@ -806,11 +829,13 @@ class _ComparisonPredicate:
return value < self.value
def _get_extra_predicates(spec, extra_predicates):
def _get_extra_predicates(spec, extra_predicates, vocab):
predicate_types = {
"REGEX": _RegexPredicate,
"IN": _SetMemberPredicate,
"NOT_IN": _SetMemberPredicate,
"IN": _SetPredicate,
"NOT_IN": _SetPredicate,
"IS_SUBSET": _SetPredicate,
"IS_SUPERSET": _SetPredicate,
"==": _ComparisonPredicate,
"!=": _ComparisonPredicate,
">=": _ComparisonPredicate,
@ -838,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
for type_, cls in predicate_types.items():
if type_ in value_with_upper_keys:
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
# Don't create redundant predicates.
# This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates:

View File

@ -186,16 +186,18 @@ cdef class PhraseMatcher:
if isinstance(doc, Doc):
attrs = (TAG, POS, MORPH, LEMMA, DEP)
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
if self.attr == TAG and not has_annotation[TAG]:
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
if self.attr == POS and not has_annotation[POS]:
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
if self.attr == MORPH and not has_annotation[MORPH]:
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
if self.attr == LEMMA and not has_annotation[LEMMA]:
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
if self.attr == DEP and not has_annotation[DEP]:
raise ValueError(Errors.E156.format())
for attr in attrs:
if self.attr == attr and not has_annotation[attr]:
if attr == TAG:
pipe = "tagger"
elif attr in (POS, MORPH):
pipe = "morphologizer"
elif attr == LEMMA:
pipe = "lemmatizer"
elif attr == DEP:
pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
if self._validate and any(has_annotation.values()) \
and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr]

View File

@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#call
"""
matches = sorted(self.matcher(doc))
matches = sorted(self.matcher(doc, allow_missing=True))
for match_id, start, end in matches:
span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
for tag, attrs in tag_map.items():
pattern = [{"TAG": tag}]
attrs, morph_attrs = _split_morph_attrs(attrs)
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
if "MORPH" not in attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
else:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)
def load_from_morph_rules(
@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
pattern = [{"ORTH": word, "TAG": tag}]
attrs = morph_rules[tag][word]
attrs, morph_attrs = _split_morph_attrs(attrs)
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
if "MORPH" in attrs:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
elif morph_attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)
def add(

View File

@ -16,6 +16,7 @@ from ..training import Example, validate_examples
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from .. import util
from ..scorer import Scorer
default_model_config = """
@ -47,6 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
},
default_score_weights={
"nel_micro_f": 1.0,
"nel_micro_r": None,
"nel_micro_p": None,
},
)
def make_entity_linker(
nlp: Language,
@ -209,12 +215,11 @@ class EntityLinker(Pipe):
# it does run the model twice :(
predictions = self.model.predict(docs)
for eg in examples:
sentences = [s for s in eg.predicted.sents]
sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.predicted.ents:
kb_id = kb_ids[
ent.start
] # KB ID of the first token is the same as the whole span
for ent in eg.reference.ents:
# KB ID of the first token is the same as the whole span
kb_id = kb_ids[ent.start]
if kb_id:
try:
# find the sentence in the list of sentences.
@ -253,7 +258,7 @@ class EntityLinker(Pipe):
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.predicted.ents:
for ent in eg.reference.ents:
kb_id = kb_ids[ent.start]
if kb_id:
entity_encoding = self.kb.get_vector(kb_id)
@ -415,6 +420,17 @@ class EntityLinker(Pipe):
for token in ent:
token.ent_kb_id_ = kb_id
def score(self, examples, **kwargs):
    """Compute entity linking scores for a batch of examples.

    examples (Iterable[Example]): The examples to score.
    RETURNS (Dict[str, Any]): The scores, as returned by Scorer.score_links.

    DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
    """
    validate_examples(examples, "EntityLinker.score")
    # self.NIL is passed as the only negative label; **kwargs is accepted
    # but not forwarded.
    nil_labels = [self.NIL]
    link_scores = Scorer.score_links(examples, negative_labels=nil_labels)
    return link_scores
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:

View File

@ -6,7 +6,7 @@ from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language
from ..scorer import Scorer
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://nightly.spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
return Scorer.score_spans(examples, "ents", **kwargs)
score_per_type = get_ner_prf(examples)
totals = PRFScore()
for prf in score_per_type.values():
totals += prf
return {
"ents_p": totals.precision,
"ents_r": totals.recall,
"ents_f": totals.fscore,
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}

View File

@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
class Config:
extra = "forbid"
@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
# Named IS_SUBSET / IS_SUPERSET to agree with the operator keys used by the
# matcher's predicate table and with the string-valued pattern schema.
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
lower: Optional[StringValue] = None
pos: Optional[StringValue] = None
tag: Optional[StringValue] = None
morph: Optional[StringValue] = None
dep: Optional[StringValue] = None
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None

View File

@ -1,5 +1,6 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np
from collections import defaultdict
from .training import Example
from .tokens import Token, Doc, Span
@ -23,6 +24,19 @@ class PRFScore:
self.fp = 0
self.fn = 0
def __iadd__(self, other):
    """Add the tp/fp/fn counts of `other` onto this score in place.

    RETURNS: This same object, so that `a += b` rebinds `a` to itself.
    """
    for count_name in ("tp", "fp", "fn"):
        combined = getattr(self, count_name) + getattr(other, count_name)
        setattr(self, count_name, combined)
    return self
def __add__(self, other):
    """Return a new PRFScore holding the summed counts of both operands.

    Neither operand is modified.
    """
    # Construct with no arguments and assign the counts afterwards, instead
    # of passing tp/fp/fn keywords to the constructor.
    # NOTE(review): the constructor is not visible here; every visible call
    # site uses PRFScore() without arguments - confirm whether keyword
    # arguments are accepted. This form works either way.
    total = PRFScore()
    total.tp = self.tp + other.tp
    total.fp = self.fp + other.fp
    total.fn = self.fn + other.fn
    return total
def score_set(self, cand: set, gold: set) -> None:
self.tp += len(cand.intersection(gold))
self.fp += len(cand - gold)
@ -295,12 +309,6 @@ class Scorer:
# Find all predicted labels, for all and per type
gold_spans = set()
pred_spans = set()
# Special case for ents:
# If we have missing values in the gold, we can't easily tell
# whether our NER predictions are true.
# It seems bad but it's what we've always done.
if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
continue
for span in getter(gold_doc, attr):
gold_span = (span.label_, span.start, span.end - 1)
gold_spans.add(gold_span)
@ -451,6 +459,74 @@ class Scorer:
results[f"{attr}_score_desc"] = "macro AUC"
return results
@staticmethod
def score_links(
    examples: Iterable[Example], *, negative_labels: Iterable[str]
) -> Dict[str, Any]:
    """Returns PRF for predicted links on the entity level.
    To disentangle the performance of the NEL from the NER,
    this method only evaluates NEL links for entities that overlap
    between the gold reference and the predictions.

    examples (Iterable[Example]): Examples to score
    negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
    RETURNS (Dict[str, Any]): A dictionary containing the scores.

    DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
    """
    # Materialise once: the labels are tested with `in` for every entity,
    # which would exhaust a one-shot iterable after the first test.
    negative_labels = tuple(negative_labels)
    f_per_type = {}
    for example in examples:
        # Index the gold entities by their character offsets.
        gold_ent_by_offset = {
            (gold_ent.start_char, gold_ent.end_char): gold_ent
            for gold_ent in example.reference.ents
        }
        for pred_ent in example.predicted.ents:
            gold_span = gold_ent_by_offset.get(
                (pred_ent.start_char, pred_ent.end_char)
            )
            if gold_span is None:
                # No gold entity with exactly these offsets: this is an NER
                # mismatch, which is deliberately not scored here.
                continue
            label = gold_span.label_
            if label not in f_per_type:
                f_per_type[label] = PRFScore()
            gold = gold_span.kb_id_
            # only evaluating entities that overlap between gold and pred,
            # to disentangle the performance of the NEL from the NER
            if gold is None:
                continue
            pred = pred_ent.kb_id_
            if gold in negative_labels and pred in negative_labels:
                # ignore true negatives
                pass
            elif gold == pred:
                f_per_type[label].tp += 1
            elif gold in negative_labels:
                f_per_type[label].fp += 1
            elif pred in negative_labels:
                f_per_type[label].fn += 1
            else:
                # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                f_per_type[label].fp += 1
                f_per_type[label].fn += 1
    # Micro average: pool the raw counts over all labels.
    micro_prf = PRFScore()
    for label_prf in f_per_type.values():
        micro_prf.tp += label_prf.tp
        micro_prf.fn += label_prf.fn
        micro_prf.fp += label_prf.fp
    # Macro average: mean of the per-label scores. The tiny epsilon avoids a
    # division by zero when no label was scored at all.
    n_labels = len(f_per_type) + 1e-100
    macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
    macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
    macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
    results = {
        "nel_score": micro_prf.fscore,
        "nel_score_desc": "micro F",
        "nel_micro_p": micro_prf.precision,
        "nel_micro_r": micro_prf.recall,
        "nel_micro_f": micro_prf.fscore,
        "nel_macro_p": macro_p,
        "nel_macro_r": macro_r,
        "nel_macro_f": macro_f,
        "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
    }
    return results
@staticmethod
def score_deps(
examples: Iterable[Example],
@ -545,6 +621,39 @@ class Scorer:
}
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
    """Compute one PRFScore per entity type over a sequence of examples.

    Examples whose reference doc has no ENT_IOB annotation are skipped.
    The returned PRFScore objects can be added together to obtain a
    micro-averaged total.

    examples (Iterable[Example]): The examples to score.
    RETURNS (Dict[str, PRFScore]): The scores, keyed by entity label.
    """
    prf_by_label = defaultdict(PRFScore)
    for example in examples:
        reference = example.y
        if not reference.has_annotation("ENT_IOB"):
            continue
        # Gold entities not yet matched by a prediction; whatever is left
        # after the loop below counts as a false negative.
        unmatched_gold = {
            (ent.label_, ent.start, ent.end) for ent in reference.ents
        }
        x2y = example.alignment.x2y
        for pred_ent in example.x.ents:
            pred_label = pred_ent.label_
            # Register the label even if this prediction ends up ignored.
            if pred_label not in prf_by_label:
                prf_by_label[pred_label] = PRFScore()
            aligned = x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
            if not len(aligned):
                continue
            gold_start = aligned[0]
            gold_end = aligned[-1] + 1
            gold_tokens = reference[gold_start:gold_end]
            # If annotation is missing anywhere on the aligned span, the
            # prediction is neither right nor wrong, so it is ignored.
            if any(token.ent_iob == 0 for token in gold_tokens):
                continue
            candidate = (pred_label, gold_start, gold_end)
            if candidate in unmatched_gold:
                prf_by_label[pred_label].tp += 1
                unmatched_gold.remove(candidate)
            else:
                prf_by_label[pred_label].fp += 1
        for gold_label, _, _ in unmatched_gold:
            prf_by_label[gold_label].fn += 1
    return prf_by_label
#############################################################################
#
# The following implementation of roc_auc_score() is adapted from

View File

@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
assert len(matches) == 1
def test_matcher_subset_value_operator(en_vocab):
    """Check the IS_SUBSET operator for MORPH and for a single-valued attr."""

    def fresh_doc():
        return Doc(en_vocab, words=["a", "b", "c"])

    morph_matcher = Matcher(en_vocab)
    morph_matcher.add(
        "M", [[{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]]
    )
    doc = fresh_doc()
    # With no morph set on any token, all three tokens are expected to match.
    assert len(morph_matcher(doc)) == 3
    # (morph assigned to the first token, expected number of matches)
    morph_cases = [
        ("Feat=Val", 3),
        ("Feat=Val|Feat2=Val2", 3),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 2),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 2),
    ]
    for morph, n_matches in morph_cases:
        doc[0].morph_ = morph
        assert len(morph_matcher(doc)) == n_matches

    # IS_SUBSET acts like "IN" for attrs other than MORPH, and
    # IS_SUBSET with an empty list matches nothing
    tag_cases = [(["A", "B"], 1), ([], 0)]
    for allowed_tags, n_matches in tag_cases:
        tag_matcher = Matcher(en_vocab)
        tag_matcher.add("M", [[{"TAG": {"IS_SUBSET": allowed_tags}}]])
        doc = fresh_doc()
        doc[0].tag_ = "A"
        assert len(tag_matcher(doc)) == n_matches
def test_matcher_superset_value_operator(en_vocab):
    """Check the IS_SUPERSET operator for MORPH and for a single-valued attr."""

    def fresh_doc():
        return Doc(en_vocab, words=["a", "b", "c"])

    required_feats = ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]
    morph_matcher = Matcher(en_vocab)
    morph_matcher.add("M", [[{"MORPH": {"IS_SUPERSET": required_feats}}]])
    doc = fresh_doc()
    assert len(morph_matcher(doc)) == 0
    # (morph assigned to the first token, expected number of matches)
    morph_cases = [
        ("Feat=Val|Feat2=Val2", 0),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 1),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 1),
    ]
    for morph, n_matches in morph_cases:
        doc[0].morph_ = morph
        assert len(morph_matcher(doc)) == n_matches

    tag_cases = [
        # IS_SUPERSET with more than one value only matches for MORPH
        (["A", "B"], 0),
        # IS_SUPERSET with one value is the same as ==
        (["A"], 1),
        # IS_SUPERSET with an empty value matches everything
        ([], 3),
    ]
    for required_tags, n_matches in tag_cases:
        tag_matcher = Matcher(en_vocab)
        tag_matcher.add("M", [[{"TAG": {"IS_SUPERSET": required_tags}}]])
        doc = fresh_doc()
        doc[0].tag_ = "A"
        assert len(tag_matcher(doc)) == n_matches
def test_matcher_morph_handling(en_vocab):
    """Check feature-order normalisation and multi-value splitting of MORPH."""

    def run_cases(patterns_by_key, cases):
        # Register one pattern per key, then check the match count on a
        # fresh doc before and after each morph assignment to token 0.
        matcher = Matcher(en_vocab)
        for key, pattern in patterns_by_key:
            matcher.add(key, [pattern])
        doc = Doc(en_vocab, words=["a", "b", "c"])
        assert len(matcher(doc)) == 0
        for morph, n_matches in cases:
            doc[0].morph_ = morph
            assert len(matcher(doc)) == n_matches

    # order of features in pattern doesn't matter
    run_cases(
        [
            ("M", [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]),
            ("N", [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]),
        ],
        [
            ("Feat2=Val2|Feat1=Val1", 2),
            ("Feat1=Val1|Feat2=Val2", 2),
        ],
    )

    # multiple values are split
    run_cases(
        [
            ("M", [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]),
            (
                "N",
                [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}],
            ),
        ],
        [
            ("Feat2=Val2,Val3|Feat1=Val1", 1),
            ("Feat1=Val1,Val3|Feat2=Val2", 2),
        ],
    )
def test_matcher_regex(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
@ -316,6 +416,9 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc2)
with pytest.raises(ValueError):
matcher(doc3)
# errors can be suppressed if desired
matcher(doc2, allow_missing=True)
matcher(doc3, allow_missing=True)
# TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab)

View File

@ -2,8 +2,10 @@ from typing import Callable, Iterable
import pytest
from spacy.kb import KnowledgeBase, get_candidates, Candidate
from spacy.vocab import Vocab
from spacy import util, registry
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
# normal read-write behaviour
mykb.to_disk(d / "kb")
mykb.from_disk(d / "kb")
mykb.to_disk(d / "kb.file")
mykb.from_disk(d / "kb.file")
mykb.to_disk(d / "new" / "kb")
mykb.from_disk(d / "new" / "kb")
# allow overwriting an existing file
mykb.to_disk(d / "kb.file")
with pytest.raises(ValueError):
# can not write to a directory
mykb.to_disk(d)
with pytest.raises(ValueError):
# can not read from a directory
mykb.from_disk(d)
mykb.to_disk(d / "kb")
with pytest.raises(ValueError):
# can not read from an unknown file
mykb.from_disk(d / "unknown" / "kb")
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2"
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities: (entity ID, frequency, entity vector)
    entity_hashes = {}
    for entity, freq, vector in [("Q1", 27, [1]), ("Q2", 12, [2]), ("Q3", 5, [3])]:
        entity_hashes[entity] = kb.add_entity(
            entity=entity, freq=freq, entity_vector=vector
        )
    q2_hash = entity_hashes["Q2"]

    # adding aliases
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    def check_adam(candidates):
        # "adam" must resolve to exactly one candidate, Q2, by hash and string.
        assert len(candidates) == 1
        only = candidates[0]
        assert only.entity == q2_hash
        assert only.entity_ == "Q2"
        assert only.alias == adam_hash
        assert only.alias_ == "adam"

    check_adam(kb.get_alias_candidates("adam"))

    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir / "kb")
        # Reload into a KB backed by a brand-new Vocab.
        reloaded_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        reloaded_kb.from_disk(tmp_dir / "kb")
        check_adam(reloaded_kb.get_alias_candidates("adam"))
def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
TRAIN_DATA = [
("Russ Cochran captured his first major title with his son as caddie.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON")]}),
"entities": [(0, 12, "PERSON")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran his reprints include EC Comics.",
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
"entities": [(0, 12, "PERSON")]}),
"entities": [(0, 12, "PERSON")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran has been publishing comic art.",
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
"entities": [(0, 12, "PERSON")]}),
"entities": [(0, 12, "PERSON")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran was a member of University of Kentucky's golf team.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
nlp.add_pipe("sentencizer")
vector_length = 3
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Convert the texts to docs to make sure we have doc.ents set for the training examples
train_examples = []
for text, annotation in TRAIN_DATA:
@ -446,6 +472,16 @@ def test_overfitting_IO():
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["entity_linker"] < 0.001
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
ruler.add_patterns(patterns)
# test the trained model
predictions = []
for text, annotation in TRAIN_DATA:
@ -465,3 +501,46 @@ def test_overfitting_IO():
for ent in doc2.ents:
predictions.append(ent.kb_id_)
assert predictions == GOLD_entities
def test_scorer_links():
    """Check per-type and micro-averaged NEL precision/recall from score_links."""
    nlp = English()

    def annotated(text, ents):
        # Build a doc and set its entities from (start, end, label, kb_id).
        doc = nlp(text)
        doc.ents = [
            Span(doc, start, end, label=label, kb_id=kb_id)
            for start, end, label, kb_id in ents
        ]
        return doc

    # (text, gold entities, predicted entities)
    cases = [
        (
            "Julia lives in London happily.",
            [(0, 1, "PERSON", "Q2"), (3, 4, "LOC", "Q3")],
            [(0, 1, "PERSON", "Q70"), (3, 4, "LOC", "Q3")],
        ),
        (
            "She loves London.",
            [(0, 1, "PERSON", "Q2"), (2, 3, "LOC", "Q13")],
            [(0, 1, "PERSON", "Q2"), (2, 3, "LOC", "NIL")],
        ),
        (
            "London is great.",
            [(0, 1, "LOC", "NIL")],
            [(0, 1, "LOC", "NIL")],
        ),
    ]
    train_examples = []
    for text, gold_ents, pred_ents in cases:
        reference = annotated(text, gold_ents)
        predicted = annotated(text, pred_ents)
        train_examples.append(Example(predicted, reference))

    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
    per_type = scores["nel_f_per_type"]
    assert per_type["PERSON"]["p"] == 1 / 2
    assert per_type["PERSON"]["r"] == 1 / 2
    assert per_type["LOC"]["p"] == 1 / 1
    assert per_type["LOC"]["r"] == 1 / 2

    assert scores["nel_micro_p"] == 2 / 3
    assert scores["nel_micro_r"] == 2 / 4

View File

@ -345,12 +345,13 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
],
)
def test_language_factories_combine_score_weights(weights, expected):
    """Check that combine_score_weights merges per-component weights as expected.

    weights: list of per-component score weight dicts (from the parametrization).
    expected: the combined weight dict the function should return.
    """
    result = combine_score_weights(weights)
    # The combined weights normally sum to 1.0 (0.99 is tolerated, presumably
    # because of rounding); 0.0 covers the case where every input weight is
    # zero. The narrower pre-change check against (0.99, 1.0) is dropped
    # because it contradicts the all-zero parametrization.
    assert sum(result.values()) in (0.99, 1.0, 0.0)
    assert result == expected

View File

@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError):
Example.from_dict(predicted, annots)
def test_Example_from_dict_sentences():
    """Check that "sent_starts" annotations set sentence boundaries on the reference doc."""
    vocab = Vocab()

    # Two sentence starts (tokens 0 and 3) give two sentences.
    two_sentence_doc = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
    ex = Example.from_dict(two_sentence_doc, {"sent_starts": [1, 0, 0, 1, 0]})
    sentences = list(ex.reference.sents)
    assert len(sentences) == 2

    # this currently throws an error - bug or feature?
    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
    # ex = Example.from_dict(predicted, annots)
    # assert len(list(ex.reference.sents)) == 1

    # A single start, with token 1 annotated as -1, gives one sentence.
    one_sentence_doc = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    ex = Example.from_dict(one_sentence_doc, {"sent_starts": [1, -1, 0, 0, 0]})
    sentences = list(ex.reference.sents)
    assert len(sentences) == 1

View File

@ -1,4 +1,5 @@
from ..tokens.doc cimport Doc
from libc.stdint cimport uint64_t
cdef class Example:
@ -7,3 +8,5 @@ cdef class Example:
cdef readonly object _cached_alignment
cdef readonly object _cached_words_x
cdef readonly object _cached_words_y
cdef readonly uint64_t _x_sig
cdef readonly uint64_t _y_sig

View File

@ -1,6 +1,7 @@
from collections import Iterable as IterableInstance
import warnings
import numpy
from murmurhash.mrmr cimport hash64
from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
@ -97,15 +98,36 @@ cdef class Example:
@property
def alignment(self):
    """Return the alignment between the token texts of self.x and self.y.

    The alignment is cached on the example. To make repeated access cheap
    (e.g. in loops), a hash of each doc's TokenC array is compared against
    the signatures stored with the cache first; the token texts are only
    collected and compared when a signature has changed.

    RETURNS: The cached alignment, rebuilt with Alignment.from_strings if
        the token texts of either doc differ from the cached ones.
    """
    x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
    y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
    if (
        self._cached_alignment is not None
        and self._x_sig == x_sig
        and self._y_sig == y_sig
    ):
        # Fast path: the TokenC arrays hash to the stored signatures, so the
        # cached alignment is reused without collecting the token texts.
        return self._cached_alignment
    # Either nothing is cached yet or a signature changed (e.g. through
    # retokenization). Compare the actual token texts, since the alignment
    # only depends on them.
    words_x = [token.text for token in self.x]
    words_y = [token.text for token in self.y]
    if (
        self._cached_alignment is None
        or words_x != self._cached_words_x
        or words_y != self._cached_words_y
    ):
        self._cached_alignment = Alignment.from_strings(words_x, words_y)
        self._cached_words_x = words_x
        self._cached_words_y = words_y
    # Remember the signatures so the next access can take the fast path.
    self._x_sig = x_sig
    self._y_sig = y_sig
    return self._cached_alignment
def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute."""
@ -288,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
def _add_entities_to_doc(doc, ner_data):
print(ner_data)
if ner_data is None:
return
elif ner_data == []:

View File

@ -1233,8 +1233,13 @@ def combine_score_weights(
# components.
total = sum(w_dict.values())
for key, value in w_dict.items():
weight = round(value / total / len(all_weights), 2)
result[key] = result.get(key, 0.0) + weight
if total == 0:
weight = 0.0
else:
weight = round(value / total / len(all_weights), 2)
prev_weight = result.get(key, 0.0)
prev_weight = 0.0 if prev_weight is None else prev_weight
result[key] = prev_weight + weight
return result

View File

@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityLinker.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = entity_linker.score(examples)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~ |
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.

View File

@ -242,10 +242,10 @@ Score a batch of examples.
> scores = ner.score(examples)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| Name | Description |
| ----------- | --------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}

View File

@ -30,20 +30,20 @@ pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
| Attribute |  Description |
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
| Attribute |  Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
Operators and quantifiers define **how often** a token pattern should be
matched:
@ -79,6 +79,8 @@ it compares to another value.
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
## Matcher.\_\_init\_\_ {#init tag="method"}

View File

@ -206,3 +206,26 @@ depends on the scorer settings:
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
## Scorer.score_links {#score_links tag="staticmethod" new="3"}
Returns precision, recall and F-scores (PRF) for predicted links on the entity
level. To disentangle the performance of the entity linker (NEL) from that of
the named entity recognizer (NER), this method only evaluates NEL links for
entities that overlap between the gold reference and the predictions.
> #### Example
>
> ```python
> scores = Scorer.score_links(
> examples,
> negative_labels=["NIL", ""]
> )
> print(scores["nel_micro_f"])
> ```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ |
| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |

View File

@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
| Attribute |  Description |
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
| Attribute |  Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -236,6 +236,8 @@ following rich comparison attributes are available:
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
#### Regular expressions {#regex new="2.1"}

View File

@ -11,12 +11,24 @@ import { Table, Tr, Td, Th } from '../components/table'
import Tag from '../components/tag'
import { H2, Label } from '../components/typography'
import Icon from '../components/icon'
import Link from '../components/link'
import Link, { OptionalLink } from '../components/link'
import Infobox from '../components/infobox'
import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
import { isString, isEmptyObj } from '../components/util'
// Maps pipeline component names to the path of their API documentation page.
// linkComponents() looks names up here; a name without an entry yields
// `undefined` as the link target.
const COMPONENT_LINKS = {
tok2vec: '/api/tok2vec',
transformer: '/api/transformer',
tagger: '/api/tagger',
parser: '/api/dependencyparser',
ner: '/api/entityrecognizer',
lemmatizer: '/api/lemmatizer',
attribute_ruler: '/api/attributeruler',
senter: '/api/sentencerecognizer',
morphologizer: '/api/morphologizer',
}
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
core_sm: 'Vocabulary, syntax, entities',
@ -78,10 +90,15 @@ function isStableVersion(v) {
return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
}
/**
 * Find the model version to display for a model.
 *
 * Walks the compatibility table in its own key order and, for the first stable
 * version key that lists the model, returns the first listed model version that
 * is either stable or, if `prereleases` is set, any version at all.
 *
 * @param {string} modelId - Model name used as key in the compatibility table.
 * @param {Object} compatibility - Maps versions to { modelId: [modelVersions] }.
 * @param {boolean} prereleases - Whether pre-release model versions are allowed.
 * @returns {string|undefined} The selected model version, if any was found.
 */
function getLatestVersion(modelId, compatibility, prereleases) {
    for (let [version, models] of Object.entries(compatibility)) {
        if (isStableVersion(version) && models[modelId]) {
            const modelVersions = models[modelId]
            // Don't blindly take the first entry: it may be a pre-release,
            // which is only acceptable if prereleases are enabled.
            for (let modelVersion of modelVersions) {
                if (isStableVersion(modelVersion) || prereleases) {
                    return modelVersion
                }
            }
        }
    }
}
@ -141,18 +158,44 @@ function formatSources(data = []) {
))
}
function linkComponents(components = []) {
return join(
components.map(c => (
<Fragment key={c}>
<OptionalLink to={COMPONENT_LINKS[c]} hideIcon>
<InlineCode>{c}</InlineCode>
</OptionalLink>
</Fragment>
))
)
}
const Help = ({ children }) => (
<span data-tooltip={children}>
<Icon name="help2" width={16} variant="subtle" inline />
</span>
)
const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => {
const Model = ({
name,
langId,
langName,
baseUrl,
repo,
compatibility,
hasExamples,
licenses,
prereleases,
}) => {
const [initialized, setInitialized] = useState(false)
const [isError, setIsError] = useState(true)
const [meta, setMeta] = useState({})
const { type, genre, size } = getModelComponents(name)
const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility])
const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
name,
compatibility,
prereleases,
])
useEffect(() => {
window.dispatchEvent(new Event('resize')) // scroll position for progress
@ -173,10 +216,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
const pipeline =
meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>))
const components =
meta.components && join(meta.components.map(p => <InlineCode key={p}>{p}</InlineCode>))
const pipeline = linkComponents(meta.pipeline)
const components = linkComponents(meta.components)
const sources = formatSources(meta.sources)
const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link>
const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
@ -332,7 +373,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({})
const { id, title, meta, hasExamples } = pageContext
const { id, title, meta } = pageContext
const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@ -381,6 +422,7 @@ const Models = ({ pageContext, repo, children }) => {
repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples}
prereleases={site.siteMetadata.nightly}
/>
))
}
@ -397,6 +439,7 @@ const query = graphql`
query ModelsQuery {
site {
siteMetadata {
nightly
licenses {
id
url