diff --git a/spacy/about.py b/spacy/about.py
index 56b05257a..ea9f9f33e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a24"
+__version__ = "3.0.0a25"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 3119d3a12..26676d5b3 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
update_lockfile(project_dir, cmd)
# We remove the command from the list here, and break, so that
# we iterate over the loop again.
- commands.remove(i)
+ commands.pop(i)
break
else:
# If we didn't break the for loop, break the while loop.
diff --git a/spacy/errors.py b/spacy/errors.py
index 6fdf8cb57..4216e3936 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -401,10 +401,6 @@ class Errors:
"Matcher or PhraseMatcher with the attribute {attr}. "
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
"instead of list(nlp.tokenizer.pipe()).")
- E156 = ("The pipeline needs to include a parser in order to use "
- "Matcher or PhraseMatcher with the attribute DEP. Try using "
- "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
- "list(nlp.tokenizer.pipe()).")
E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: "
@@ -517,8 +513,8 @@ class Errors:
"instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
"property or default function argument?")
- E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
- "provided argument {loc} is an existing directory.")
+    E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
+            "but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ff5382c24..bdf652766 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -10,6 +10,8 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
+from spacy.strings import StringStore
+
from spacy import util
from .typedefs cimport hash_t
@@ -83,6 +85,9 @@ cdef class KnowledgeBase:
DOCS: https://nightly.spacy.io/api/kb
"""
+ # Names of the two entries stored inside the KB directory: to_disk() writes
+ # the KB contents to `path / contents_loc` and the vocab's strings to
+ # `path / strings_loc`; from_disk() reads them back from the same locations.
+ contents_loc = "contents"
+ strings_loc = "strings.json"
+
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
@@ -319,15 +324,29 @@ cdef class KnowledgeBase:
return 0.0
-
def to_disk(self, path):
+ """Serialize the KB to the directory `path`.
+
+ The directory is created (including parents) if it does not exist. The
+ KB contents are written to `path / contents_loc` via write_contents()
+ and the vocab's strings to `path / strings_loc`.
+
+ path: Location of the target directory; normalized with util.ensure_path.
+ RAISES (ValueError): E928 if `path` exists but is not a directory.
+ """
path = util.ensure_path(path)
- if path.is_dir():
+ if not path.exists():
+ path.mkdir(parents=True)
+ if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
- if not path.parent.exists():
- path.parent.mkdir(parents=True)
+ self.write_contents(path / self.contents_loc)
+ self.vocab.strings.to_disk(path / self.strings_loc)
- cdef Writer writer = Writer(path)
+ def from_disk(self, path):
+ """Load the KB from the directory `path`.
+
+ Reads the KB contents from `path / contents_loc` via read_contents(),
+ then loads the strings stored at `path / strings_loc` and adds each of
+ them to this KB's vocab.
+
+ path: Location of the KB directory; normalized with util.ensure_path.
+ RAISES (ValueError): E929 if `path` does not exist, E928 if it exists
+ but is not a directory.
+ """
+ path = util.ensure_path(path)
+ if not path.exists():
+ raise ValueError(Errors.E929.format(loc=path))
+ if not path.is_dir():
+ raise ValueError(Errors.E928.format(loc=path))
+ self.read_contents(path / self.contents_loc)
+ # Load the stored strings into a temporary store and merge them into the
+ # existing vocab instead of replacing the vocab's StringStore.
+ kb_strings = StringStore()
+ kb_strings.from_disk(path / self.strings_loc)
+ for string in kb_strings:
+ self.vocab.strings.add(string)
+
+ def write_contents(self, file_path):
+ cdef Writer writer = Writer(file_path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order
@@ -366,13 +385,7 @@ cdef class KnowledgeBase:
writer.close()
- def from_disk(self, path):
- path = util.ensure_path(path)
- if path.is_dir():
- raise ValueError(Errors.E928.format(loc=path))
- if not path.exists():
- raise ValueError(Errors.E929.format(loc=path))
-
+ def read_contents(self, file_path):
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
@@ -382,7 +395,7 @@ cdef class KnowledgeBase:
cdef AliasC alias
cdef float vector_element
- cdef Reader reader = Reader(path)
+ cdef Reader reader = Reader(file_path)
# STEP 0: load header and initialize KB
cdef int64_t nr_entities
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index d83f58181..a4d20ec55 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -17,6 +17,7 @@ from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
+from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..schemas import validate_token_pattern
@@ -124,7 +125,7 @@ cdef class Matcher:
key = self._normalize_key(key)
for pattern in patterns:
try:
- specs = _preprocess_pattern(pattern, self.vocab.strings,
+ specs = _preprocess_pattern(pattern, self.vocab,
self._extensions, self._extra_predicates)
self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs:
@@ -195,7 +196,7 @@ cdef class Matcher:
else:
yield doc
- def __call__(self, object doclike, *, as_spans=False):
+ def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
"""Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over.
@@ -215,16 +216,19 @@ cdef class Matcher:
else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool()
- if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
- raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
- if POS in self._seen_attrs and not doc.has_annotation("POS"):
- raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
- if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
- raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
- if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
- raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
- if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
- raise ValueError(Errors.E156.format())
+ if not allow_missing:
+ for attr in (TAG, POS, MORPH, LEMMA, DEP):
+ if attr in self._seen_attrs and not doc.has_annotation(attr):
+ if attr == TAG:
+ pipe = "tagger"
+ elif attr in (POS, MORPH):
+ pipe = "morphologizer"
+ elif attr == LEMMA:
+ pipe = "lemmatizer"
+ elif attr == DEP:
+ pipe = "parser"
+ error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+ raise ValueError(error_msg)
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
final_matches = []
@@ -660,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
return id_attr.value
-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
"""This function interprets the pattern, converting the various bits of
syntactic sugar before we compile it into a struct with init_pattern.
@@ -675,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
extra_predicates.
"""
tokens = []
+ string_store = vocab.strings
for spec in token_specs:
if not spec:
# Signifier for 'any token'
@@ -685,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table)
- predicates = _get_extra_predicates(spec, extra_predicates)
+ predicates = _get_extra_predicates(spec, extra_predicates, vocab)
for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates)))
return tokens
@@ -729,7 +734,7 @@ def _get_attr_values(spec, string_store):
class _RegexPredicate:
operators = ("REGEX",)
- def __init__(self, i, attr, value, predicate, is_extension=False):
+ def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
self.attr = attr
self.value = re.compile(value)
@@ -747,13 +752,18 @@ class _RegexPredicate:
return bool(self.value.search(value))
-class _SetMemberPredicate:
- operators = ("IN", "NOT_IN")
+class _SetPredicate:
+ operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
- def __init__(self, i, attr, value, predicate, is_extension=False):
+ def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
self.attr = attr
- self.value = set(get_string_id(v) for v in value)
+ self.vocab = vocab
+ if self.attr == MORPH:
+ # normalize morph strings
+ self.value = set(self.vocab.morphology.add(v) for v in value)
+ else:
+ self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@@ -765,19 +775,32 @@ class _SetMemberPredicate:
value = get_string_id(token._.get(self.attr))
else:
value = get_token_attr_for_matcher(token.c, self.attr)
+
+ if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+ if self.attr == MORPH:
+ # break up MORPH into individual Feat=Val values
+ value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+ else:
+ # IS_SUBSET for other attrs will be equivalent to "IN"
+ # IS_SUPERSET will only match for other attrs with 0 or 1 values
+ value = set([value])
if self.predicate == "IN":
return value in self.value
- else:
+ elif self.predicate == "NOT_IN":
return value not in self.value
+ elif self.predicate == "IS_SUBSET":
+ return value <= self.value
+ elif self.predicate == "IS_SUPERSET":
+ return value >= self.value
def __repr__(self):
- return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+ return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
class _ComparisonPredicate:
operators = ("==", "!=", ">=", "<=", ">", "<")
- def __init__(self, i, attr, value, predicate, is_extension=False):
+ def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
self.attr = attr
self.value = value
@@ -806,11 +829,13 @@ class _ComparisonPredicate:
return value < self.value
-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
predicate_types = {
"REGEX": _RegexPredicate,
- "IN": _SetMemberPredicate,
- "NOT_IN": _SetMemberPredicate,
+ "IN": _SetPredicate,
+ "NOT_IN": _SetPredicate,
+ "IS_SUBSET": _SetPredicate,
+ "IS_SUPERSET": _SetPredicate,
"==": _ComparisonPredicate,
"!=": _ComparisonPredicate,
">=": _ComparisonPredicate,
@@ -838,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
for type_, cls in predicate_types.items():
if type_ in value_with_upper_keys:
- predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+ predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
# Don't create a redundant predicates.
# This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates:
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index b00ba157f..7e99859b5 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -186,16 +186,18 @@ cdef class PhraseMatcher:
if isinstance(doc, Doc):
attrs = (TAG, POS, MORPH, LEMMA, DEP)
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
- if self.attr == TAG and not has_annotation[TAG]:
- raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
- if self.attr == POS and not has_annotation[POS]:
- raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
- if self.attr == MORPH and not has_annotation[MORPH]:
- raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
- if self.attr == LEMMA and not has_annotation[LEMMA]:
- raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
- if self.attr == DEP and not has_annotation[DEP]:
- raise ValueError(Errors.E156.format())
+ for attr in attrs:
+ if self.attr == attr and not has_annotation[attr]:
+ if attr == TAG:
+ pipe = "tagger"
+ elif attr in (POS, MORPH):
+ pipe = "morphologizer"
+ elif attr == LEMMA:
+ pipe = "lemmatizer"
+ elif attr == DEP:
+ pipe = "parser"
+ error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+ raise ValueError(error_msg)
if self._validate and any(has_annotation.values()) \
and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr]
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index f64fcbc54..0d59a1ba0 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#call
"""
- matches = sorted(self.matcher(doc))
+ matches = sorted(self.matcher(doc, allow_missing=True))
for match_id, start, end in matches:
span = Span(doc, start, end, label=match_id)
@@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
for tag, attrs in tag_map.items():
pattern = [{"TAG": tag}]
attrs, morph_attrs = _split_morph_attrs(attrs)
- morph = self.vocab.morphology.add(morph_attrs)
- attrs["MORPH"] = self.vocab.strings[morph]
+ if "MORPH" not in attrs:
+ morph = self.vocab.morphology.add(morph_attrs)
+ attrs["MORPH"] = self.vocab.strings[morph]
+ else:
+ morph = self.vocab.morphology.add(attrs["MORPH"])
+ attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)
def load_from_morph_rules(
@@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
pattern = [{"ORTH": word, "TAG": tag}]
attrs = morph_rules[tag][word]
attrs, morph_attrs = _split_morph_attrs(attrs)
- morph = self.vocab.morphology.add(morph_attrs)
- attrs["MORPH"] = self.vocab.strings[morph]
+ if "MORPH" in attrs:
+ morph = self.vocab.morphology.add(attrs["MORPH"])
+ attrs["MORPH"] = self.vocab.strings[morph]
+ elif morph_attrs:
+ morph = self.vocab.morphology.add(morph_attrs)
+ attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)
def add(
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 1debadd82..039e2a891 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,6 +16,7 @@ from ..training import Example, validate_examples
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from .. import util
+from ..scorer import Scorer
default_model_config = """
@@ -47,6 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
},
+ default_score_weights={
+ "nel_micro_f": 1.0,
+ "nel_micro_r": None,
+ "nel_micro_p": None,
+ },
)
def make_entity_linker(
nlp: Language,
@@ -209,12 +215,11 @@ class EntityLinker(Pipe):
# it does run the model twice :(
predictions = self.model.predict(docs)
for eg in examples:
- sentences = [s for s in eg.predicted.sents]
+ sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
- for ent in eg.predicted.ents:
- kb_id = kb_ids[
- ent.start
- ] # KB ID of the first token is the same as the whole span
+ for ent in eg.reference.ents:
+ # KB ID of the first token is the same as the whole span
+ kb_id = kb_ids[ent.start]
if kb_id:
try:
# find the sentence in the list of sentences.
@@ -253,7 +258,7 @@ class EntityLinker(Pipe):
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
- for ent in eg.predicted.ents:
+ for ent in eg.reference.ents:
kb_id = kb_ids[ent.start]
if kb_id:
entity_encoding = self.kb.get_vector(kb_id)
@@ -415,6 +420,17 @@ class EntityLinker(Pipe):
for token in ent:
token.ent_kb_id_ = kb_id
+ def score(self, examples, **kwargs):
+ """Score a batch of examples.
+
+ examples (Iterable[Example]): The examples to score.
+ **kwargs: Accepted but not used by this method.
+ RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_links
+ with self.NIL passed as the only negative label.
+
+ DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
+ """
+ validate_examples(examples, "EntityLinker.score")
+ return Scorer.score_links(examples, negative_labels=[self.NIL])
+
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c9b0a5031..fc0dda40d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -6,7 +6,7 @@ from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language
-from ..scorer import Scorer
+from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
@@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
+ RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://nightly.spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
- return Scorer.score_spans(examples, "ents", **kwargs)
+ score_per_type = get_ner_prf(examples)
+ totals = PRFScore()
+ for prf in score_per_type.values():
+ totals += prf
+ return {
+ "ents_p": totals.precision,
+ "ents_r": totals.recall,
+ "ents_f": totals.fscore,
+ "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+ }
diff --git a/spacy/schemas.py b/spacy/schemas.py
index eea6639d3..0c85dfe57 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
+ IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
+ IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
class Config:
extra = "forbid"
@@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
+    # Field names and aliases must match the matcher's operator names
+    # ("IS_SUBSET" / "IS_SUPERSET"), as they do in TokenPatternString.
+    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
lower: Optional[StringValue] = None
pos: Optional[StringValue] = None
tag: Optional[StringValue] = None
+ morph: Optional[StringValue] = None
dep: Optional[StringValue] = None
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
diff --git a/spacy/scorer.py b/spacy/scorer.py
index c50de3d43..b2f97e163 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -1,5 +1,6 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np
+from collections import defaultdict
from .training import Example
from .tokens import Token, Doc, Span
@@ -23,6 +24,19 @@ class PRFScore:
self.fp = 0
self.fn = 0
+    def __iadd__(self, other):
+        """Add the tp/fp/fn counts of `other` to this score in place.
+
+        other (PRFScore): The score whose counts are accumulated.
+        RETURNS (PRFScore): This object, so that `a += b` rebinds to `a`.
+        """
+        self.tp += other.tp
+        self.fp += other.fp
+        self.fn += other.fn
+        return self
+
+    def __add__(self, other):
+        """Return a new score holding the summed counts of both operands.
+
+        other (PRFScore): The score to add.
+        RETURNS (PRFScore): A new object; neither operand is modified.
+        """
+        # Build the result from a default-constructed instance instead of
+        # passing tp/fp/fn to the constructor.
+        # NOTE(review): the constructor appears to take no count arguments
+        # (it is called as PRFScore() everywhere else) - confirm.
+        total = PRFScore()
+        total.tp = self.tp + other.tp
+        total.fp = self.fp + other.fp
+        total.fn = self.fn + other.fn
+        return total
+
def score_set(self, cand: set, gold: set) -> None:
self.tp += len(cand.intersection(gold))
self.fp += len(cand - gold)
@@ -295,12 +309,6 @@ class Scorer:
# Find all predidate labels, for all and per type
gold_spans = set()
pred_spans = set()
- # Special case for ents:
- # If we have missing values in the gold, we can't easily tell
- # whether our NER predictions are true.
- # It seems bad but it's what we've always done.
- if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
- continue
for span in getter(gold_doc, attr):
gold_span = (span.label_, span.start, span.end - 1)
gold_spans.add(gold_span)
@@ -451,6 +459,74 @@ class Scorer:
results[f"{attr}_score_desc"] = "macro AUC"
return results
+    @staticmethod
+    def score_links(
+        examples: Iterable[Example], *, negative_labels: Iterable[str]
+    ) -> Dict[str, Any]:
+        """Returns PRF for predicted links on the entity level.
+        To disentangle the performance of the NEL from the NER,
+        this method only evaluates NEL links for predicted entities that have
+        a gold entity with exactly the same character offsets. Predicted
+        entities without such a gold counterpart are skipped.
+
+        examples (Iterable[Example]): Examples to score
+        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
+        RETURNS (Dict[str, Any]): A dictionary containing the scores:
+            micro and macro averaged P/R/F under the "nel_*" keys, plus the
+            per-entity-label scores under "nel_f_per_type".
+
+        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        """
+        f_per_type = {}
+        for example in examples:
+            gold_ent_by_offset = {}
+            for gold_ent in example.reference.ents:
+                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
+
+            for pred_ent in example.predicted.ents:
+                gold_span = gold_ent_by_offset.get(
+                    (pred_ent.start_char, pred_ent.end_char), None
+                )
+                if gold_span is None:
+                    # No gold entity at these offsets: this is an NER mismatch,
+                    # not an NEL error, so the prediction is not scored.
+                    continue
+                label = gold_span.label_
+                if label not in f_per_type:
+                    f_per_type[label] = PRFScore()
+                gold = gold_span.kb_id_
+                # only evaluating entities that overlap between gold and pred,
+                # to disentangle the performance of the NEL from the NER
+                if gold is not None:
+                    pred = pred_ent.kb_id_
+                    if gold in negative_labels and pred in negative_labels:
+                        # ignore true negatives
+                        pass
+                    elif gold == pred:
+                        f_per_type[label].tp += 1
+                    elif gold in negative_labels:
+                        f_per_type[label].fp += 1
+                    elif pred in negative_labels:
+                        f_per_type[label].fn += 1
+                    else:
+                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
+                        f_per_type[label].fp += 1
+                        f_per_type[label].fn += 1
+        # Micro average: pool the raw counts over all entity labels.
+        micro_prf = PRFScore()
+        for label_prf in f_per_type.values():
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
+        # Macro average: mean of the per-label scores. The tiny epsilon avoids
+        # a ZeroDivisionError when no label was scored at all.
+        n_labels = len(f_per_type) + 1e-100
+        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
+        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
+        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
+        results = {
+            "nel_score": micro_prf.fscore,
+            "nel_score_desc": "micro F",
+            "nel_micro_p": micro_prf.precision,
+            "nel_micro_r": micro_prf.recall,
+            "nel_micro_f": micro_prf.fscore,
+            "nel_macro_p": macro_p,
+            "nel_macro_r": macro_r,
+            "nel_macro_f": macro_f,
+            "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+        }
+        return results
+
@staticmethod
def score_deps(
examples: Iterable[Example],
@@ -545,6 +621,39 @@ class Scorer:
}
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
+    """Compute per-entity PRFScore objects for a sequence of examples. The
+    results are returned as a dictionary keyed by the entity type. You can
+    add the PRFScore objects to get the micro-averaged total.
+
+    Examples whose reference doc has no ENT_IOB annotation are skipped, and
+    a predicted entity is ignored if the aligned reference span contains
+    tokens with missing entity annotation.
+
+    examples (Iterable[Example]): The examples to score.
+    RETURNS (Dict[str, PRFScore]): The scores per entity label.
+    """
+    scores = defaultdict(PRFScore)
+    for eg in examples:
+        if not eg.y.has_annotation("ENT_IOB"):
+            continue
+        # Gold entities not yet matched by a prediction; whatever is left
+        # after the loop over the predictions counts as false negatives.
+        golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
+        align_x2y = eg.alignment.x2y
+        for pred_ent in eg.x.ents:
+            # Indexing the defaultdict registers the label even if the
+            # prediction ends up being ignored below.
+            label_score = scores[pred_ent.label_]
+            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
+            if len(indices):
+                g_span = eg.y[indices[0] : indices[-1] + 1]
+                # Check we aren't missing annotation on this span. If so,
+                # our prediction is neither right nor wrong, we just
+                # ignore it.
+                if all(token.ent_iob != 0 for token in g_span):
+                    key = (pred_ent.label_, indices[0], indices[-1] + 1)
+                    if key in golds:
+                        label_score.tp += 1
+                        golds.remove(key)
+                    else:
+                        label_score.fp += 1
+        for label, start, end in golds:
+            scores[label].fn += 1
+    return scores
+
+
#############################################################################
#
# The following implementation of roc_auc_score() is adapted from
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 04f9585f1..627110cdd 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
assert len(matches) == 1
+def test_matcher_subset_value_operator(en_vocab):
+ """Check the IS_SUBSET operator: for MORPH a token matches as long as it
+ has no feature outside the pattern's list; for TAG it behaves like IN,
+ and an empty list matches nothing."""
+ matcher = Matcher(en_vocab)
+ pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ # no MORPH set on any token: all three tokens match
+ assert len(matcher(doc)) == 3
+ doc[0].morph_ = "Feat=Val"
+ assert len(matcher(doc)) == 3
+ doc[0].morph_ = "Feat=Val|Feat2=Val2"
+ assert len(matcher(doc)) == 3
+ # a feature outside the pattern's list stops the first token from matching
+ doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+ assert len(matcher(doc)) == 2
+ doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+ assert len(matcher(doc)) == 2
+
+ # IS_SUBSET acts like "IN" for attrs other than MORPH
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 1
+
+ # IS_SUBSET with an empty list matches nothing
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"IS_SUBSET": []}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 0
+
+
+def test_matcher_superset_value_operator(en_vocab):
+ """Check the IS_SUPERSET operator: for MORPH a token matches once it has
+ every feature in the pattern's list; for TAG only lists with zero or one
+ value can match."""
+ matcher = Matcher(en_vocab)
+ pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ assert len(matcher(doc)) == 0
+ doc[0].morph_ = "Feat=Val|Feat2=Val2"
+ assert len(matcher(doc)) == 0
+ # all three required features are present: the first token matches
+ doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+ assert len(matcher(doc)) == 1
+ # additional features on the token do not prevent the match
+ doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+ assert len(matcher(doc)) == 1
+
+ # IS_SUPERSET with more than one value only matches for MORPH
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 0
+
+ # IS_SUPERSET with one value is the same as ==
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 1
+
+ # IS_SUPERSET with an empty value matches everything
+ matcher = Matcher(en_vocab)
+ pattern = [{"TAG": {"IS_SUPERSET": []}}]
+ matcher.add("M", [pattern])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ doc[0].tag_ = "A"
+ assert len(matcher(doc)) == 3
+
+
+def test_matcher_morph_handling(en_vocab):
+ """Check that MORPH values in patterns are matched independently of the
+ order of their features, and that multi-value features (Feat=Val1,Val3)
+ are treated as separate Feat=Val entries by IS_SUPERSET."""
+ # order of features in pattern doesn't matter
+ matcher = Matcher(en_vocab)
+ pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
+ pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
+ matcher.add("M", [pattern1])
+ matcher.add("N", [pattern2])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ assert len(matcher(doc)) == 0
+
+ # both patterns match the first token, whichever order the token uses
+ doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+ assert len(matcher(doc)) == 2
+ doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+ assert len(matcher(doc)) == 2
+
+ # multiple values are split
+ matcher = Matcher(en_vocab)
+ pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
+ pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
+ matcher.add("M", [pattern1])
+ matcher.add("N", [pattern2])
+ doc = Doc(en_vocab, words=["a", "b", "c"])
+ assert len(matcher(doc)) == 0
+
+ # Val3 belongs to Feat2 here, so only pattern1 matches
+ doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+ assert len(matcher(doc)) == 1
+ # Val3 belongs to Feat1 here, so both patterns match
+ doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+ assert len(matcher(doc)) == 2
+
+
def test_matcher_regex(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
@@ -316,6 +416,9 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc2)
with pytest.raises(ValueError):
matcher(doc3)
+ # errors can be suppressed if desired
+ matcher(doc2, allow_missing=True)
+ matcher(doc3, allow_missing=True)
# TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 88e0646b3..878f41a28 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -2,8 +2,10 @@ from typing import Callable, Iterable
import pytest
from spacy.kb import KnowledgeBase, get_candidates, Candidate
+from spacy.vocab import Vocab
from spacy import util, registry
+from spacy.scorer import Scorer
from spacy.training import Example
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
@@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
# normal read-write behaviour
mykb.to_disk(d / "kb")
mykb.from_disk(d / "kb")
- mykb.to_disk(d / "kb.file")
- mykb.from_disk(d / "kb.file")
mykb.to_disk(d / "new" / "kb")
mykb.from_disk(d / "new" / "kb")
# allow overwriting an existing file
- mykb.to_disk(d / "kb.file")
- with pytest.raises(ValueError):
- # can not write to a directory
- mykb.to_disk(d)
- with pytest.raises(ValueError):
- # can not read from a directory
- mykb.from_disk(d)
+ mykb.to_disk(d / "kb")
with pytest.raises(ValueError):
# can not read from an unknown file
mykb.from_disk(d / "unknown" / "kb")
+
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2"
+def test_vocab_serialization(nlp):
+ """Test that string information is retained across storage"""
+ mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+ # adding entities
+ q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+ q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+ q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+ # adding aliases
+ douglas_hash = mykb.add_alias(
+ alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
+ )
+ adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+ # sanity check on the in-memory KB before it is written to disk
+ candidates = mykb.get_alias_candidates("adam")
+ assert len(candidates) == 1
+ assert candidates[0].entity == q2_hash
+ assert candidates[0].entity_ == "Q2"
+ assert candidates[0].alias == adam_hash
+ assert candidates[0].alias_ == "adam"
+
+ with make_tempdir() as d:
+ mykb.to_disk(d / "kb")
+ # load into a KB backed by a brand-new Vocab, so the string values
+ # checked below have to come from the serialized KB directory
+ kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+ kb_new_vocab.from_disk(d / "kb")
+
+ # the same hashes and strings must be available after the round trip
+ candidates = kb_new_vocab.get_alias_candidates("adam")
+ assert len(candidates) == 1
+ assert candidates[0].entity == q2_hash
+ assert candidates[0].entity_ == "Q2"
+ assert candidates[0].alias == adam_hash
+ assert candidates[0].alias_ == "adam"
+
+
def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
TRAIN_DATA = [
("Russ Cochran captured his first major title with his son as caddie.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
- "entities": [(0, 12, "PERSON")]}),
+ "entities": [(0, 12, "PERSON")],
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran his reprints include EC Comics.",
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
- "entities": [(0, 12, "PERSON")]}),
+ "entities": [(0, 12, "PERSON")],
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran has been publishing comic art.",
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
- "entities": [(0, 12, "PERSON")]}),
+ "entities": [(0, 12, "PERSON")],
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran was a member of University of Kentucky's golf team.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
- "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
+ "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
@@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
- nlp.add_pipe("sentencizer")
vector_length = 3
- # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
- patterns = [
- {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
- ]
- ruler = nlp.add_pipe("entity_ruler")
- ruler.add_patterns(patterns)
-
# Convert the texts to docs to make sure we have doc.ents set for the training examples
train_examples = []
for text, annotation in TRAIN_DATA:
@@ -446,6 +472,16 @@ def test_overfitting_IO():
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["entity_linker"] < 0.001
+ # adding additional components that are required for the entity_linker
+ nlp.add_pipe("sentencizer", first=True)
+
+ # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+ patterns = [
+ {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+ ]
+ ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+ ruler.add_patterns(patterns)
+
# test the trained model
predictions = []
for text, annotation in TRAIN_DATA:
@@ -465,3 +501,46 @@ def test_overfitting_IO():
for ent in doc2.ents:
predictions.append(ent.kb_id_)
assert predictions == GOLD_entities
+
+
+def test_scorer_links():
+ train_examples = []
+ nlp = English()
+ ref1 = nlp("Julia lives in London happily.")
+ ref1.ents = [
+ Span(ref1, 0, 1, label="PERSON", kb_id="Q2"),
+ Span(ref1, 3, 4, label="LOC", kb_id="Q3"),
+ ]
+ pred1 = nlp("Julia lives in London happily.")
+ pred1.ents = [
+ Span(pred1, 0, 1, label="PERSON", kb_id="Q70"),
+ Span(pred1, 3, 4, label="LOC", kb_id="Q3"),
+ ]
+ train_examples.append(Example(pred1, ref1))
+
+ ref2 = nlp("She loves London.")
+ ref2.ents = [
+ Span(ref2, 0, 1, label="PERSON", kb_id="Q2"),
+ Span(ref2, 2, 3, label="LOC", kb_id="Q13"),
+ ]
+ pred2 = nlp("She loves London.")
+ pred2.ents = [
+ Span(pred2, 0, 1, label="PERSON", kb_id="Q2"),
+ Span(pred2, 2, 3, label="LOC", kb_id="NIL"),
+ ]
+ train_examples.append(Example(pred2, ref2))
+
+ ref3 = nlp("London is great.")
+ ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")]
+ pred3 = nlp("London is great.")
+ pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")]
+ train_examples.append(Example(pred3, ref3))
+
+ scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
+ assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2
+ assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2
+ assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1
+ assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2
+
+ assert scores["nel_micro_p"] == 2 / 3
+ assert scores["nel_micro_r"] == 2 / 4
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 4c197005e..07648024c 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -345,12 +345,13 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
),
- ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
+ ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
+ ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
],
)
def test_language_factories_combine_score_weights(weights, expected):
result = combine_score_weights(weights)
- assert sum(result.values()) in (0.99, 1.0)
+ assert sum(result.values()) in (0.99, 1.0, 0.0)
assert result == expected
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/training/test_new_example.py
similarity index 91%
rename from spacy/tests/test_new_example.py
rename to spacy/tests/training/test_new_example.py
index 597809286..81207b640 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError):
Example.from_dict(predicted, annots)
+
+
+def test_Example_from_dict_sentences():
+ vocab = Vocab()
+ predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
+ annots = {"sent_starts": [1, 0, 0, 1, 0]}
+ ex = Example.from_dict(predicted, annots)
+ assert len(list(ex.reference.sents)) == 2
+
+ # this currently throws an error - bug or feature?
+ # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+ # annots = {"sent_starts": [1, 0, 0, 0, 0]}
+ # ex = Example.from_dict(predicted, annots)
+ # assert len(list(ex.reference.sents)) == 1
+
+ predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+ annots = {"sent_starts": [1, -1, 0, 0, 0]}
+ ex = Example.from_dict(predicted, annots)
+ assert len(list(ex.reference.sents)) == 1
diff --git a/spacy/training/example.pxd b/spacy/training/example.pxd
index e06e36287..49e239757 100644
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@@ -1,4 +1,5 @@
from ..tokens.doc cimport Doc
+from libc.stdint cimport uint64_t
cdef class Example:
@@ -7,3 +8,5 @@ cdef class Example:
cdef readonly object _cached_alignment
cdef readonly object _cached_words_x
cdef readonly object _cached_words_y
+ cdef readonly uint64_t _x_sig
+ cdef readonly uint64_t _y_sig
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1e7bea5df..f2c78203a 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,7 @@
from collections import Iterable as IterableInstance
import warnings
import numpy
+from murmurhash.mrmr cimport hash64
from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
@@ -97,15 +98,36 @@ cdef class Example:
@property
def alignment(self):
- words_x = [token.text for token in self.x]
- words_y = [token.text for token in self.y]
- if self._cached_alignment is None or \
- words_x != self._cached_words_x or \
- words_y != self._cached_words_y:
- self._cached_alignment = Alignment.from_strings(words_x, words_y)
+ x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
+ y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
+ if self._cached_alignment is None:
+ words_x = [token.text for token in self.x]
+ words_y = [token.text for token in self.y]
+ self._x_sig = x_sig
+ self._y_sig = y_sig
self._cached_words_x = words_x
self._cached_words_y = words_y
- return self._cached_alignment
+ self._cached_alignment = Alignment.from_strings(words_x, words_y)
+ return self._cached_alignment
+ elif self._x_sig == x_sig and self._y_sig == y_sig:
+ # If we have a cached alignment, check whether the cache is invalid
+ # due to retokenization. To make this check fast in loops, we first
+ # check a hash of the TokenC arrays.
+ return self._cached_alignment
+ else:
+ words_x = [token.text for token in self.x]
+ words_y = [token.text for token in self.y]
+ if words_x == self._cached_words_x and words_y == self._cached_words_y:
+ self._x_sig = x_sig
+ self._y_sig = y_sig
+ return self._cached_alignment
+ else:
+ self._cached_alignment = Alignment.from_strings(words_x, words_y)
+ self._cached_words_x = words_x
+ self._cached_words_y = words_y
+ self._x_sig = x_sig
+ self._y_sig = y_sig
+ return self._cached_alignment
def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute."""
@@ -288,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
def _add_entities_to_doc(doc, ner_data):
- print(ner_data)
if ner_data is None:
return
elif ner_data == []:
diff --git a/spacy/util.py b/spacy/util.py
index 709da8d29..378ec2823 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1233,8 +1233,13 @@ def combine_score_weights(
# components.
total = sum(w_dict.values())
for key, value in w_dict.items():
- weight = round(value / total / len(all_weights), 2)
- result[key] = result.get(key, 0.0) + weight
+ if total == 0:
+ weight = 0.0
+ else:
+ weight = round(value / total / len(all_weights), 2)
+ prev_weight = result.get(key, 0.0)
+ prev_weight = 0.0 if prev_weight is None else prev_weight
+ result[key] = prev_weight + weight
return result
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 9cb35b487..945a1568a 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+## EntityLinker.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = entity_linker.score(examples)
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------------------------------------------------------------------------- |
+| `examples` | The examples to score. ~~Iterable[Example]~~ |
+| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~ |
+
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 8af73f44b..6d710f425 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -242,10 +242,10 @@ Score a batch of examples.
> scores = ner.score(examples)
> ```
-| Name | Description |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| `examples` | The examples to score. ~~Iterable[Example]~~ |
+| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 1f1946be5..3f7076a1c 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -30,20 +30,20 @@ pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
-| Attribute | Description |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH` | The exact verbatim text of a token. ~~str~~ |
-| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ |
-| `LOWER` | The lowercase form of the token text. ~~str~~ |
-| `LENGTH` | The length of the token text. ~~int~~ |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
-| `ENT_TYPE` | The token's entity label. ~~str~~ |
-| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
+| Attribute | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH` | The exact verbatim text of a token. ~~str~~ |
+| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ |
+| `LOWER` | The lowercase form of the token text. ~~str~~ |
+| `LENGTH` | The length of the token text. ~~int~~ |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
+| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
+| `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
Operators and quantifiers define **how often** a token pattern should be
matched:
@@ -79,6 +79,8 @@ it compares to another value.
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
+| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
+| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
## Matcher.\_\_init\_\_ {#init tag="method"}
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 1c0895bcf..0dbc0de33 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -206,3 +206,26 @@ depends on the scorer settings:
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
+
+## Scorer.score_links {#score_links tag="staticmethod" new="3"}
+
+Returns PRF for predicted links on the entity level. To disentangle the
+performance of the NEL from the NER, this method only evaluates NEL links for
+entities that overlap between the gold reference and the predictions.
+
+> #### Example
+>
+> ```python
+> scores = Scorer.score_links(
+> examples,
+> negative_labels=["NIL", ""]
+> )
+> print(scores["nel_micro_f"])
+> ```
+
+| Name | Description |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ |
+| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 7e979b32e..256f4ccb4 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
-| Attribute | Description |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH` | The exact verbatim text of a token. ~~str~~ |
-| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ |
-| `LOWER` | The lowercase form of the token text. ~~str~~ |
-| `LENGTH` | The length of the token text. ~~int~~ |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
-| `ENT_TYPE` | The token's entity label. ~~str~~ |
-| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
+| Attribute | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH` | The exact verbatim text of a token. ~~str~~ |
+| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ |
+| `LOWER` | The lowercase form of the token text. ~~str~~ |
+| `LENGTH` | The length of the token text. ~~int~~ |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
+| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
+| `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
@@ -236,6 +236,8 @@ following rich comparison attributes are available:
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
+| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
+| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
#### Regular expressions {#regex new="2.1"}
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index cdfe2e46d..f9895334d 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -11,12 +11,24 @@ import { Table, Tr, Td, Th } from '../components/table'
import Tag from '../components/tag'
import { H2, Label } from '../components/typography'
import Icon from '../components/icon'
-import Link from '../components/link'
+import Link, { OptionalLink } from '../components/link'
import Infobox from '../components/infobox'
import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
import { isString, isEmptyObj } from '../components/util'
+const COMPONENT_LINKS = {
+ tok2vec: '/api/tok2vec',
+ transformer: '/api/transformer',
+ tagger: '/api/tagger',
+ parser: '/api/dependencyparser',
+ ner: '/api/entityrecognizer',
+ lemmatizer: '/api/lemmatizer',
+ attribute_ruler: '/api/attributeruler',
+ senter: '/api/sentencerecognizer',
+ morphologizer: '/api/morphologizer',
+}
+
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
core_sm: 'Vocabulary, syntax, entities',
@@ -78,10 +90,15 @@ function isStableVersion(v) {
return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
}
-function getLatestVersion(modelId, compatibility) {
+function getLatestVersion(modelId, compatibility, prereleases) {
for (let [version, models] of Object.entries(compatibility)) {
if (isStableVersion(version) && models[modelId]) {
- return models[modelId][0]
+ const modelVersions = models[modelId]
+ for (let modelVersion of modelVersions) {
+ if (isStableVersion(modelVersion) || prereleases) {
+ return modelVersion
+ }
+ }
}
}
}
@@ -141,18 +158,44 @@ function formatSources(data = []) {
))
}
+function linkComponents(components = []) {
+ return join(
+ components.map(c => (
+
+
+ {c}
+
+
+ ))
+ )
+}
+
const Help = ({ children }) => (
)
-const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => {
+const Model = ({
+ name,
+ langId,
+ langName,
+ baseUrl,
+ repo,
+ compatibility,
+ hasExamples,
+ licenses,
+ prereleases,
+}) => {
const [initialized, setInitialized] = useState(false)
const [isError, setIsError] = useState(true)
const [meta, setMeta] = useState({})
const { type, genre, size } = getModelComponents(name)
- const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility])
+ const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
+ name,
+ compatibility,
+ prereleases,
+ ])
useEffect(() => {
window.dispatchEvent(new Event('resize')) // scroll position for progress
@@ -173,10 +216,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
- const pipeline =
- meta.pipeline && join(meta.pipeline.map(p => {p}))
- const components =
- meta.components && join(meta.components.map(p => {p}))
+ const pipeline = linkComponents(meta.pipeline)
+ const components = linkComponents(meta.components)
const sources = formatSources(meta.sources)
const author = !meta.url ? meta.author : {meta.author}
const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
@@ -332,7 +373,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({})
- const { id, title, meta, hasExamples } = pageContext
+ const { id, title, meta } = pageContext
const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@@ -381,6 +422,7 @@ const Models = ({ pageContext, repo, children }) => {
repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples}
+ prereleases={site.siteMetadata.nightly}
/>
))
}
@@ -397,6 +439,7 @@ const query = graphql`
query ModelsQuery {
site {
siteMetadata {
+ nightly
licenses {
id
url