Merge branch 'develop' into nightly.spacy.io

2025-08-01 19:00:20 +03:00 · 2020-09-25 13:21:55 +02:00 · 2020-09-25 13:21:55 +02:00 · f3aba49830
commit f3aba49830
parent 35cfe09348 02a1b6ab83
24 changed files with 651 additions and 152 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a24"
+__version__ = "3.0.0a25"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
-                commands.remove(i)
+                commands.pop(i)
                break
        else:
            # If we didn't break the for loop, break the while loop.
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -401,10 +401,6 @@ class Errors:
            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
            "instead of list(nlp.tokenizer.pipe()).")
-    E156 = ("The pipeline needs to include a parser in order to use "
-            "Matcher or PhraseMatcher with the attribute DEP. Try using "
-            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
-            "list(nlp.tokenizer.pipe()).")
    E157 = ("Can't render negative values for dependency arc start or end. "
            "Make sure that you're passing in absolute token indices, not "
            "relative token offsets.\nstart: {start}, end: {end}, label: "
@ -517,8 +513,8 @@ class Errors:
            "instead.")
    E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
            "property or default function argument?")
-    E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
-            "provided argument {loc} is an existing directory.")
+    E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
+            "but the provided argument {loc} points to a file.")
    E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
            "not seem to exist.")
    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -10,6 +10,8 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings

+from spacy.strings import StringStore
+
 from spacy import util

 from .typedefs cimport hash_t
@ -83,6 +85,9 @@ cdef class KnowledgeBase:
    DOCS: https://nightly.spacy.io/api/kb
    """

+    contents_loc = "contents"
+    strings_loc = "strings.json"
+
    def __init__(self, Vocab vocab, entity_vector_length):
        """Create a KnowledgeBase."""
        self.mem = Pool()
@ -319,15 +324,29 @@ cdef class KnowledgeBase:

        return 0.0

-
    def to_disk(self, path):
+        """Save the knowledge base into a directory.
+
+        path: Location of the target directory. It is created, including
+            missing parents, if nothing exists at that location yet.
+        RAISES (ValueError): E928 if the location exists but is not a
+            directory.
+        """
        path = util.ensure_path(path)
-        if path.is_dir():
+        if not path.exists():
+            path.mkdir(parents=True)
+        if not path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
-        if not path.parent.exists():
-            path.parent.mkdir(parents=True)
+        # The KB contents and the vocab's strings go into two separate
+        # entries inside the directory (`contents_loc` and `strings_loc`).
+        self.write_contents(path / self.contents_loc)
+        self.vocab.strings.to_disk(path / self.strings_loc)

-        cdef Writer writer = Writer(path)
+    def from_disk(self, path):
+        """Load the knowledge base from a directory laid out as in `to_disk`.
+
+        path: Location of the directory holding the `contents_loc` and
+            `strings_loc` entries.
+        RAISES (ValueError): E929 if the location does not exist, E928 if it
+            exists but is not a directory.
+        """
+        kb_dir = util.ensure_path(path)
+        if not kb_dir.exists():
+            raise ValueError(Errors.E929.format(loc=kb_dir))
+        if not kb_dir.is_dir():
+            raise ValueError(Errors.E928.format(loc=kb_dir))
+        self.read_contents(kb_dir / self.contents_loc)
+        # Read the stored strings into a scratch store, then add each one to
+        # the vocab's own string store.
+        stored_strings = StringStore()
+        stored_strings.from_disk(kb_dir / self.strings_loc)
+        for text in stored_strings:
+            self.vocab.strings.add(text)
+
+    def write_contents(self, file_path):
+        cdef Writer writer = Writer(file_path)
        writer.write_header(self.get_size_entities(), self.entity_vector_length)

        # dumping the entity vectors in their original order
@ -366,13 +385,7 @@ cdef class KnowledgeBase:

        writer.close()

-    def from_disk(self, path):
-        path = util.ensure_path(path)
-        if path.is_dir():
-            raise ValueError(Errors.E928.format(loc=path))
-        if not path.exists():
-            raise ValueError(Errors.E929.format(loc=path))
-
+    def read_contents(self, file_path):
        cdef hash_t entity_hash
        cdef hash_t alias_hash
        cdef int64_t entry_index
@ -382,7 +395,7 @@ cdef class KnowledgeBase:
        cdef AliasC alias
        cdef float vector_element

-        cdef Reader reader = Reader(path)
+        cdef Reader reader = Reader(file_path)

        # STEP 0: load header and initialize KB
        cdef int64_t nr_entities
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -17,6 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
+from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH

 from ..schemas import validate_token_pattern
@ -124,7 +125,7 @@ cdef class Matcher:
        key = self._normalize_key(key)
        for pattern in patterns:
            try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                    self._extensions, self._extra_predicates)
                self.patterns.push_back(init_pattern(self.mem, key, specs))
                for spec in specs:
@ -195,7 +196,7 @@ cdef class Matcher:
                else:
                    yield doc

-    def __call__(self, object doclike, *, as_spans=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
        """Find all token sequences matching the supplied pattern.

        doclike (Doc or Span): The document to match over.
@ -215,16 +216,19 @@ cdef class Matcher:
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
        cdef Pool tmp_pool = Pool()
-        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
-            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-        if POS in self._seen_attrs and not doc.has_annotation("POS"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
-            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
-            raise ValueError(Errors.E156.format())
+        if not allow_missing:
+            for attr in (TAG, POS, MORPH, LEMMA, DEP):
+                if attr in self._seen_attrs and not doc.has_annotation(attr):
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
        final_matches = []
@ -660,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
    return id_attr.value


-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
    """This function interprets the pattern, converting the various bits of
    syntactic sugar before we compile it into a struct with init_pattern.

@ -675,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
        extra_predicates.
    """
    tokens = []
+    string_store = vocab.strings
    for spec in token_specs:
        if not spec:
            # Signifier for 'any token'
@ -685,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
        ops = _get_operators(spec)
        attr_values = _get_attr_values(spec, string_store)
        extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
        for op in ops:
            tokens.append((op, list(attr_values), list(extensions), list(predicates)))
    return tokens
@ -729,7 +734,7 @@ def _get_attr_values(spec, string_store):
 class _RegexPredicate:
    operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
        self.attr = attr
        self.value = re.compile(value)
@ -747,13 +752,18 @@ class _RegexPredicate:
        return bool(self.value.search(value))


-class _SetMemberPredicate:
-    operators = ("IN", "NOT_IN")
+class _SetPredicate:
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
        self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
+        if self.attr == MORPH:
+            # normalize morph strings
+            self.value = set(self.vocab.morphology.add(v) for v in value)
+        else:
+            self.value = set(get_string_id(v) for v in value)
        self.predicate = predicate
        self.is_extension = is_extension
        self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@ -765,19 +775,32 @@ class _SetMemberPredicate:
            value = get_string_id(token._.get(self.attr))
        else:
            value = get_token_attr_for_matcher(token.c, self.attr)
+
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+            if self.attr == MORPH:
+                # break up MORPH into individual Feat=Val values
+                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            else:
+                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                value = set([value])
        if self.predicate == "IN":
            return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
            return value not in self.value
+        elif self.predicate == "IS_SUBSET":
+            return value <= self.value
+        elif self.predicate == "IS_SUPERSET":
+            return value >= self.value

    def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))


 class _ComparisonPredicate:
    operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
        self.attr = attr
        self.value = value
@ -806,11 +829,13 @@ class _ComparisonPredicate:
            return value < self.value


-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
    predicate_types = {
        "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
+        "NOT_IN": _SetPredicate,
+        "IS_SUBSET": _SetPredicate,
+        "IS_SUPERSET": _SetPredicate,
        "==": _ComparisonPredicate,
        "!=": _ComparisonPredicate,
        ">=": _ComparisonPredicate,
@ -838,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
            value_with_upper_keys = {k.upper(): v for k, v in value.items()}
            for type_, cls in predicate_types.items():
                if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                    # Don't create a redundant predicates.
                    # This helps with efficiency, as we're caching the results.
                    if predicate.key in seen_predicates:
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -186,16 +186,18 @@ cdef class PhraseMatcher:
            if isinstance(doc, Doc):
                attrs = (TAG, POS, MORPH, LEMMA, DEP)
                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                if self.attr == TAG and not has_annotation[TAG]:
-                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-                if self.attr == POS and not has_annotation[POS]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-                if self.attr == MORPH and not has_annotation[MORPH]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-                if self.attr == LEMMA and not has_annotation[LEMMA]:
-                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-                if self.attr == DEP and not has_annotation[DEP]:
-                    raise ValueError(Errors.E156.format())
+                for attr in attrs:
+                    if self.attr == attr and not has_annotation[attr]:
+                        if attr == TAG:
+                            pipe = "tagger"
+                        elif attr in (POS, MORPH):
+                            pipe = "morphologizer"
+                        elif attr == LEMMA:
+                            pipe = "lemmatizer"
+                        elif attr == DEP:
+                            pipe = "parser"
+                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                        raise ValueError(error_msg)
                if self._validate and any(has_annotation.values()) \
                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@ -79,7 +79,7 @@ class AttributeRuler(Pipe):

        DOCS: https://nightly.spacy.io/api/attributeruler#call
        """
-        matches = sorted(self.matcher(doc))
+        matches = sorted(self.matcher(doc, allow_missing=True))

        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
        for tag, attrs in tag_map.items():
            pattern = [{"TAG": tag}]
            attrs, morph_attrs = _split_morph_attrs(attrs)
-            morph = self.vocab.morphology.add(morph_attrs)
-            attrs["MORPH"] = self.vocab.strings[morph]
+            if "MORPH" not in attrs:
+                morph = self.vocab.morphology.add(morph_attrs)
+                attrs["MORPH"] = self.vocab.strings[morph]
+            else:
+                morph = self.vocab.morphology.add(attrs["MORPH"])
+                attrs["MORPH"] = self.vocab.strings[morph]
            self.add([pattern], attrs)

    def load_from_morph_rules(
@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
                pattern = [{"ORTH": word, "TAG": tag}]
                attrs = morph_rules[tag][word]
                attrs, morph_attrs = _split_morph_attrs(attrs)
-                morph = self.vocab.morphology.add(morph_attrs)
-                attrs["MORPH"] = self.vocab.strings[morph]
+                if "MORPH" in attrs:
+                    morph = self.vocab.morphology.add(attrs["MORPH"])
+                    attrs["MORPH"] = self.vocab.strings[morph]
+                elif morph_attrs:
+                    morph = self.vocab.morphology.add(morph_attrs)
+                    attrs["MORPH"] = self.vocab.strings[morph]
                self.add([pattern], attrs)

    def add(
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@ -16,6 +16,7 @@ from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
+from ..scorer import Scorer


 default_model_config = """
@ -47,6 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
        "incl_context": True,
        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
    },
+    default_score_weights={
+        "nel_micro_f": 1.0,
+        "nel_micro_r": None,
+        "nel_micro_p": None,
+    },
 )
 def make_entity_linker(
    nlp: Language,
@ -209,12 +215,11 @@ class EntityLinker(Pipe):
            # it does run the model twice :(
            predictions = self.model.predict(docs)
        for eg in examples:
-            sentences = [s for s in eg.predicted.sents]
+            sentences = [s for s in eg.reference.sents]
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[
-                    ent.start
-                ]  # KB ID of the first token is the same as the whole span
+            for ent in eg.reference.ents:
+                # KB ID of the first token is the same as the whole span
+                kb_id = kb_ids[ent.start]
                if kb_id:
                    try:
                        # find the sentence in the list of sentences.
@ -253,7 +258,7 @@ class EntityLinker(Pipe):
        entity_encodings = []
        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
+            for ent in eg.reference.ents:
                kb_id = kb_ids[ent.start]
                if kb_id:
                    entity_encoding = self.kb.get_vector(kb_id)
@ -415,6 +420,17 @@ class EntityLinker(Pipe):
                for token in ent:
                    token.ent_kb_id_ = kb_id

+    def score(self, examples, **kwargs):
+        """Evaluate the entity links on a batch of examples.
+
+        examples (Iterable[Example]): The examples to score.
+        **kwargs: Accepted but not used by this method.
+        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_links
+            with this component's NIL value as the only negative label.
+
+        DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
+        """
+        validate_examples(examples, "EntityLinker.score")
+        nil_labels = [self.NIL]
+        link_scores = Scorer.score_links(examples, negative_labels=nil_labels)
+        return link_scores
+
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@ -6,7 +6,7 @@ from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown

 from ..language import Language
-from ..scorer import Scorer
+from ..scorer import get_ner_prf, PRFScore
 from ..training import validate_examples


@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
+        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.

        DOCS: https://nightly.spacy.io/api/entityrecognizer#score
        """
        validate_examples(examples, "EntityRecognizer.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        score_per_type = get_ner_prf(examples)
+        totals = PRFScore()
+        for prf in score_per_type.values():
+            totals += prf
+        return {
+            "ents_p": totals.precision,
+            "ents_r": totals.recall,
+            "ents_f": totals.fscore,
+            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictStr]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")

    class Config:
        extra = "forbid"
@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictInt]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
    lower: Optional[StringValue] = None
    pos: Optional[StringValue] = None
    tag: Optional[StringValue] = None
+    morph: Optional[StringValue] = None
    dep: Optional[StringValue] = None
    lemma: Optional[StringValue] = None
    shape: Optional[StringValue] = None
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -1,5 +1,6 @@
 from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
 import numpy as np
+from collections import defaultdict

 from .training import Example
 from .tokens import Token, Doc, Span
@ -23,6 +24,19 @@ class PRFScore:
        self.fp = 0
        self.fn = 0

+    def __iadd__(self, other):
+        """Add the tp/fp/fn counts of `other` to this score in place."""
+        for count in ("tp", "fp", "fn"):
+            setattr(self, count, getattr(self, count) + getattr(other, count))
+        return self
+
+    def __add__(self, other):
+        """Return a new PRFScore holding the summed tp/fp/fn counts.
+
+        Neither operand is modified.
+        """
+        # Build an empty score and fill in the counts afterwards, so this
+        # does not depend on __init__ accepting tp/fp/fn keyword arguments.
+        total = PRFScore()
+        total.tp = self.tp + other.tp
+        total.fp = self.fp + other.fp
+        total.fn = self.fn + other.fn
+        return total
+
    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
@ -295,12 +309,6 @@ class Scorer:
            # Find all predidate labels, for all and per type
            gold_spans = set()
            pred_spans = set()
-            # Special case for ents:
-            # If we have missing values in the gold, we can't easily tell
-            # whether our NER predictions are true.
-            # It seems bad but it's what we've always done.
-            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
-                continue
            for span in getter(gold_doc, attr):
                gold_span = (span.label_, span.start, span.end - 1)
                gold_spans.add(gold_span)
@ -451,6 +459,74 @@ class Scorer:
            results[f"{attr}_score_desc"] = "macro AUC"
        return results

+    @staticmethod
+    def score_links(
+        examples: Iterable[Example], *, negative_labels: Iterable[str]
+    ) -> Dict[str, Any]:
+        """Returns PRF for predicted links on the entity level.
+        To disentangle the performance of the NEL from the NER,
+        this method only evaluates NEL links for predicted entities whose
+        character offsets are identical to those of an entity in the gold
+        reference. Predicted entities without such a counterpart are skipped.
+
+        examples (Iterable[Example]): Examples to score
+        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
+        RETURNS (Dict[str, Any]): A dictionary containing the scores.
+
+        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        """
+        f_per_type = {}
+        for example in examples:
+            gold_ent_by_offset = {}
+            for gold_ent in example.reference.ents:
+                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
+
+            for pred_ent in example.predicted.ents:
+                gold_span = gold_ent_by_offset.get(
+                    (pred_ent.start_char, pred_ent.end_char), None
+                )
+                # only evaluating entities that overlap between gold and pred,
+                # to disentangle the performance of the NEL from the NER
+                if gold_span is None:
+                    continue
+                label = gold_span.label_
+                if label not in f_per_type:
+                    f_per_type[label] = PRFScore()
+                gold = gold_span.kb_id_
+                if gold is not None:
+                    pred = pred_ent.kb_id_
+                    if gold in negative_labels and pred in negative_labels:
+                        # ignore true negatives
+                        pass
+                    elif gold == pred:
+                        f_per_type[label].tp += 1
+                    elif gold in negative_labels:
+                        f_per_type[label].fp += 1
+                    elif pred in negative_labels:
+                        f_per_type[label].fn += 1
+                    else:
+                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
+                        f_per_type[label].fp += 1
+                        f_per_type[label].fn += 1
+        micro_prf = PRFScore()
+        for label_prf in f_per_type.values():
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
+        # The tiny epsilon keeps the macro averages at 0.0 instead of raising
+        # ZeroDivisionError when no label was scored at all.
+        n_labels = len(f_per_type) + 1e-100
+        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
+        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
+        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
+        results = {
+            "nel_score": micro_prf.fscore,
+            "nel_score_desc": "micro F",
+            "nel_micro_p": micro_prf.precision,
+            "nel_micro_r": micro_prf.recall,
+            "nel_micro_f": micro_prf.fscore,
+            "nel_macro_p": macro_p,
+            "nel_macro_r": macro_r,
+            "nel_macro_f": macro_f,
+            "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+        }
+        return results
+
    @staticmethod
    def score_deps(
        examples: Iterable[Example],
@ -545,6 +621,39 @@ class Scorer:
        }


+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
+    """Collect one PRFScore per entity label over a sequence of examples.
+
+    examples (Iterable[Example]): The examples to score.
+    RETURNS (Dict[str, PRFScore]): Scores keyed by entity label. Adding the
+        PRFScore objects together gives the micro-averaged total.
+    """
+    per_label = defaultdict(PRFScore)
+    for example in examples:
+        gold_doc = example.y
+        # Skip examples whose gold doc reports no ENT_IOB annotation.
+        if not gold_doc.has_annotation("ENT_IOB"):
+            continue
+        unmatched_gold = {(ent.label_, ent.start, ent.end) for ent in gold_doc.ents}
+        x2y = example.alignment.x2y
+        for predicted in example.x.ents:
+            # Indexing the defaultdict registers the label even when nothing
+            # ends up being counted for it below.
+            label_score = per_label[predicted.label_]
+            y_tokens = x2y[predicted.start : predicted.end].dataXd.ravel()
+            if not len(y_tokens):
+                continue
+            y_start, y_end = y_tokens[0], y_tokens[-1] + 1
+            # If annotation is missing anywhere on the aligned gold span, the
+            # prediction is neither right nor wrong, so it is ignored.
+            if any(token.ent_iob == 0 for token in gold_doc[y_start:y_end]):
+                continue
+            candidate = (predicted.label_, y_start, y_end)
+            if candidate in unmatched_gold:
+                label_score.tp += 1
+                unmatched_gold.remove(candidate)
+            else:
+                label_score.fp += 1
+        # Gold entities still left over were never matched by a prediction.
+        for gold_label, _, _ in unmatched_gold:
+            per_label[gold_label].fn += 1
+    return per_label
+
+
 #############################################################################
 #
 # The following implementation of roc_auc_score() is adapted from
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
    assert len(matches) == 1


+def test_matcher_subset_value_operator(en_vocab):
+    """Check the IS_SUBSET operator on MORPH and on a single-valued attr (TAG)."""
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    # Only doc[0] is modified below: the count stays at 3 while its features
+    # are all in the pattern's list and drops to 2 once it carries a feature
+    # (Feat3, Feat4) that is not.
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 2
+
+    # IS_SUBSET acts like "IN" for attrs other than MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUBSET with an empty list matches nothing
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+
+def test_matcher_superset_value_operator(en_vocab):
+    """Check the IS_SUPERSET operator on MORPH and on a single-valued attr (TAG)."""
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    # Only doc[0] is modified below: it starts matching once its features
+    # include all three from the pattern, and extra features do not matter.
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with more than one value only matches for MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+    # IS_SUPERSET with one value is the same as ==
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with an empty value matches everything
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 3
+
+
+def test_matcher_morph_handling(en_vocab):
+    """Check MORPH matching regardless of feature order and with multi-value features."""
+    # order of features in pattern doesn't matter
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    # Both patterns ("M" and "N") are expected to match doc[0], whichever
+    # order the features are written in on the token.
+    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+    # multiple values are split
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    # With Val3 attached to Feat2 only pattern1 is expected to match; with
+    # Val3 attached to Feat1 both patterns are.
+    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+
 def test_matcher_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
@ -316,6 +416,9 @@ def test_attr_pipeline_checks(en_vocab):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
+    # errors can be suppressed if desired
+    matcher(doc2, allow_missing=True)
+    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -2,8 +2,10 @@ from typing import Callable, Iterable
 import pytest

 from spacy.kb import KnowledgeBase, get_candidates, Candidate
+from spacy.vocab import Vocab

 from spacy import util, registry
+from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.tests.util import make_tempdir
@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb.from_disk(d / "kb")
-        mykb.to_disk(d / "kb.file")
-        mykb.from_disk(d / "kb.file")
        mykb.to_disk(d / "new" / "kb")
        mykb.from_disk(d / "new" / "kb")
        # allow overwriting an existing file
-        mykb.to_disk(d / "kb.file")
-        with pytest.raises(ValueError):
-            # can not write to a directory
-            mykb.to_disk(d)
-        with pytest.raises(ValueError):
-            # can not read from a directory
-            mykb.from_disk(d)
+        mykb.to_disk(d / "kb")
        with pytest.raises(ValueError):
            # can not read from an unknown file
            mykb.from_disk(d / "unknown" / "kb")

+
 def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
    assert doc[2].ent_kb_id_ == "Q2"


+def test_vocab_serialization(nlp):
+    """Test that string information is retained across storage"""
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+    # adding entities
+    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+    # adding aliases
+    douglas_hash = mykb.add_alias(
+        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
+    )
+    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    # baseline: before any serialization, the candidate exposes both the
+    # hash values and the corresponding strings
+    candidates = mykb.get_alias_candidates("adam")
+    assert len(candidates) == 1
+    assert candidates[0].entity == q2_hash
+    assert candidates[0].entity_ == "Q2"
+    assert candidates[0].alias == adam_hash
+    assert candidates[0].alias_ == "adam"
+
+    with make_tempdir() as d:
+        mykb.to_disk(d / "kb")
+        # load into a KB that is built on a fresh Vocab() rather than
+        # nlp.vocab, then repeat exactly the same checks as above
+        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+        kb_new_vocab.from_disk(d / "kb")
+
+        candidates = kb_new_vocab.get_alias_candidates("adam")
+        assert len(candidates) == 1
+        assert candidates[0].entity == q2_hash
+        assert candidates[0].entity_ == "Q2"
+        assert candidates[0].alias == adam_hash
+        assert candidates[0].alias_ == "adam"
+
+
 def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
 TRAIN_DATA = [
    ("Russ Cochran captured his first major title with his son as caddie.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran has been publishing comic art.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
+         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
-    nlp.add_pipe("sentencizer")
    vector_length = 3

-    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
-    ]
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
-
    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
@ -446,6 +472,16 @@ def test_overfitting_IO():
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
@ -465,3 +501,46 @@ def test_overfitting_IO():
            for ent in doc2.ents:
                predictions.append(ent.kb_id_)
        assert predictions == GOLD_entities
+
+
+def test_scorer_links():
+    """Check the precision/recall values returned by Scorer.score_links,
+    both per entity label and micro-averaged, on three small examples."""
+    train_examples = []
+    nlp = English()
+    # example 1: the PERSON link differs (Q2 vs. Q70), the LOC link agrees (Q3)
+    ref1 = nlp("Julia lives in London happily.")
+    ref1.ents = [
+        Span(ref1, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    pred1 = nlp("Julia lives in London happily.")
+    pred1.ents = [
+        Span(pred1, 0, 1, label="PERSON", kb_id="Q70"),
+        Span(pred1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    train_examples.append(Example(pred1, ref1))
+
+    # example 2: the PERSON link agrees (Q2), the LOC link is Q13 in the
+    # reference but "NIL" in the prediction
+    ref2 = nlp("She loves London.")
+    ref2.ents = [
+        Span(ref2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref2, 2, 3, label="LOC", kb_id="Q13"),
+    ]
+    pred2 = nlp("She loves London.")
+    pred2.ents = [
+        Span(pred2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(pred2, 2, 3, label="LOC", kb_id="NIL"),
+    ]
+    train_examples.append(Example(pred2, ref2))
+
+    # example 3: reference and prediction both carry "NIL" for the only entity
+    ref3 = nlp("London is great.")
+    ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")]
+    pred3 = nlp("London is great.")
+    pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")]
+    train_examples.append(Example(pred3, ref3))
+
+    # "NIL" is passed as a negative label. The expected values below are
+    # consistent with NIL links not being counted as links at all:
+    # 3 predicted links, 4 reference links, 2 of them in agreement.
+    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
+    assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2
+    assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2
+    assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1
+    assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2
+
+    assert scores["nel_micro_p"] == 2 / 3
+    assert scores["nel_micro_r"] == 2 / 4
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@ -345,12 +345,13 @@ def test_language_factories_invalid():
            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
        ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
    ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
    result = combine_score_weights(weights)
-    assert sum(result.values()) in (0.99, 1.0)
+    assert sum(result.values()) in (0.99, 1.0, 0.0)
    assert result == expected


--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
+
+
+def test_Example_from_dict_sentences():
+    """Check that "sent_starts" annotations passed to Example.from_dict
+    determine the number of sentences in the reference doc."""
+    vocab = Vocab()
+    predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
+    # two tokens are marked with 1, so two sentences are expected
+    annots = {"sent_starts": [1, 0, 0, 1, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 2
+
+    # this currently throws an error - bug or feature?
+    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
+    # ex = Example.from_dict(predicted, annots)
+    # assert len(list(ex.reference.sents)) == 1
+
+    # same words as the disabled case above, but with an explicit -1 on the
+    # second token; this variant yields a single sentence
+    predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    annots = {"sent_starts": [1, -1, 0, 0, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 1
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@ -1,4 +1,5 @@
 from ..tokens.doc cimport Doc
+from libc.stdint cimport uint64_t


 cdef class Example:
@ -7,3 +8,5 @@ cdef class Example:
    cdef readonly object _cached_alignment
    cdef readonly object _cached_words_x
    cdef readonly object _cached_words_y
+    cdef readonly uint64_t _x_sig
+    cdef readonly uint64_t _y_sig
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -1,6 +1,7 @@
 from collections import Iterable as IterableInstance
 import warnings
 import numpy
+from murmurhash.mrmr cimport hash64

 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
@ -97,15 +98,36 @@ cdef class Example:

    @property
    def alignment(self):
-        words_x = [token.text for token in self.x]
-        words_y = [token.text for token in self.y]
-        if self._cached_alignment is None or \
-                words_x != self._cached_words_x or \
-                words_y != self._cached_words_y:
-            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+        x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
+        y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
+        if self._cached_alignment is None:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            self._x_sig = x_sig
+            self._y_sig = y_sig
            self._cached_words_x = words_x
            self._cached_words_y = words_y
-        return self._cached_alignment
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            return self._cached_alignment
+        elif self._x_sig == x_sig and self._y_sig == y_sig:
+            # If we have a cached alignment, check whether the cache is invalid
+            # due to retokenization. To make this check fast in loops, we first
+            # check a hash of the TokenC arrays.
+            return self._cached_alignment
+        else:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            if words_x == self._cached_words_x and words_y == self._cached_words_y:
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
+            else:
+                self._cached_alignment = Alignment.from_strings(words_x, words_y)
+                self._cached_words_x = words_x
+                self._cached_words_y = words_y
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment

    def get_aligned(self, field, as_string=False):
        """Return an aligned array for a token attribute."""
@ -288,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):


 def _add_entities_to_doc(doc, ner_data):
-    print(ner_data)
    if ner_data is None:
        return
    elif ner_data == []:
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1233,8 +1233,13 @@ def combine_score_weights(
        # components.
        total = sum(w_dict.values())
        for key, value in w_dict.items():
-            weight = round(value / total / len(all_weights), 2)
-            result[key] = result.get(key, 0.0) + weight
+            if total == 0:
+                weight = 0.0
+            else:
+                weight = round(value / total / len(all_weights), 2)
+            prev_weight = result.get(key, 0.0)
+            prev_weight = 0.0 if prev_weight is None else prev_weight
+            result[key] = prev_weight + weight
    return result


--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
 | `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
 | **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |

+## EntityLinker.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = entity_linker.score(examples)
+> ```
+
+| Name        | Description                                                                                    |
+| ----------- | ---------------------------------------------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~                                                   |
+| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~  |
+
 ## EntityLinker.create_optimizer {#create_optimizer tag="method"}

 Create an optimizer for the pipeline component.
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@ -242,10 +242,10 @@ Score a batch of examples.
 > scores = ner.score(examples)
 > ```

-| Name        | Description                                                                                                            |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `examples`  | The examples to score. ~~Iterable[Example]~~                                                                           |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~              |
+| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}

--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@ -30,20 +30,20 @@ pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:

-| Attribute                              |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
+| Attribute                                       |  Description                                                                                                              |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP`                                            | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |

 Operators and quantifiers define **how often** a token pattern should be
 matched:
@ -79,6 +79,8 @@ it compares to another value.
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |

 ## Matcher.\_\_init\_\_ {#init tag="method"}
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@ -206,3 +206,26 @@ depends on the scorer settings:
 | `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
 | **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
+
+## Scorer.score_links {#score_links tag="staticmethod" new="3"}
+
+Returns PRF for predicted links on the entity level. To disentangle the
+performance of the NEL from the NER, this method only evaluates NEL links for
+entities that overlap between the gold reference and the predictions.
+
+> #### Example
+>
+> ```python
+> scores = Scorer.score_links(
+>     examples,
+>     negative_labels=["NIL", ""]
+> )
+> print(scores["nel_micro_f"])
+> ```
+
+| Name              | Description                                                                                                         |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples`        | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_    |                                                                                                                     |
+| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~                                       |
+| **RETURNS**       | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~                                                  |
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:

-| Attribute                              |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
+| Attribute                                       |  Description                                                                                                              |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP`                                            | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |

 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">

@ -236,6 +236,8 @@ following rich comparison attributes are available:
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |

 #### Regular expressions {#regex new="2.1"}
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@ -11,12 +11,24 @@ import { Table, Tr, Td, Th } from '../components/table'
 import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
-import Link from '../components/link'
+import Link, { OptionalLink } from '../components/link'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
 import { isString, isEmptyObj } from '../components/util'

+// Lookup from pipeline component name to the site-relative path of its API
+// page. Names without an entry resolve to `undefined`.
+const COMPONENT_LINKS = {
+    tok2vec: '/api/tok2vec',
+    transformer: '/api/transformer',
+    tagger: '/api/tagger',
+    parser: '/api/dependencyparser',
+    ner: '/api/entityrecognizer',
+    lemmatizer: '/api/lemmatizer',
+    attribute_ruler: '/api/attributeruler',
+    senter: '/api/sentencerecognizer',
+    morphologizer: '/api/morphologizer',
+}
+
 const MODEL_META = {
    core: 'Vocabulary, syntax, entities, vectors',
    core_sm: 'Vocabulary, syntax, entities',
@ -78,10 +90,15 @@ function isStableVersion(v) {
    return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
 }

-function getLatestVersion(modelId, compatibility) {
+function getLatestVersion(modelId, compatibility, prereleases) {
    for (let [version, models] of Object.entries(compatibility)) {
        if (isStableVersion(version) && models[modelId]) {
-            return models[modelId][0]
+            const modelVersions = models[modelId]
+            for (let modelVersion of modelVersions) {
+                if (isStableVersion(modelVersion) || prereleases) {
+                    return modelVersion
+                }
+            }
        }
    }
 }
@ -141,18 +158,44 @@ function formatSources(data = []) {
    ))
 }

+function linkComponents(components = []) { // components: list of component names; an omitted argument is treated as an empty list
+    return join( // join is the helper imported from ../components/util
+        components.map(c => ( // each name is rendered as InlineCode inside an OptionalLink; `to` is undefined for names missing from COMPONENT_LINKS
+            <Fragment key={c}>
+                <OptionalLink to={COMPONENT_LINKS[c]} hideIcon>
+                    <InlineCode>{c}</InlineCode>
+                </OptionalLink>
+            </Fragment>
+        ))
+    )
+}
+
 const Help = ({ children }) => ( // help icon; children is passed as the span data-tooltip attribute instead of being rendered as content
    <span data-tooltip={children}>
        <Icon name="help2" width={16} variant="subtle" inline />
    </span>
 )

-const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => {
+const Model = ({
+    name,
+    langId,
+    langName,
+    baseUrl,
+    repo,
+    compatibility,
+    hasExamples,
+    licenses,
+    prereleases,
+}) => {
    const [initialized, setInitialized] = useState(false)
    const [isError, setIsError] = useState(true)
    const [meta, setMeta] = useState({})
    const { type, genre, size } = getModelComponents(name)
-    const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility])
+    const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
+        name,
+        compatibility,
+        prereleases,
+    ])

    useEffect(() => {
        window.dispatchEvent(new Event('resize')) // scroll position for progress
@ -173,10 +216,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl

    const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
    const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
-    const pipeline =
-        meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>))
-    const components =
-        meta.components && join(meta.components.map(p => <InlineCode key={p}>{p}</InlineCode>))
+    const pipeline = linkComponents(meta.pipeline)
+    const components = linkComponents(meta.components)
    const sources = formatSources(meta.sources)
    const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link>
    const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
@ -332,7 +373,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
 const Models = ({ pageContext, repo, children }) => {
    const [initialized, setInitialized] = useState(false)
    const [compatibility, setCompatibility] = useState({})
-    const { id, title, meta, hasExamples } = pageContext
+    const { id, title, meta } = pageContext
    const { models, isStarters } = meta
    const baseUrl = `https://raw.githubusercontent.com/${repo}/master`

@ -381,6 +422,7 @@ const Models = ({ pageContext, repo, children }) => {
                            repo={repo}
                            licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
                            hasExamples={meta.hasExamples}
+                            prereleases={site.siteMetadata.nightly}
                        />
                    ))
                }
@ -397,6 +439,7 @@ const query = graphql`
    query ModelsQuery {
        site {
            siteMetadata {
+                nightly
                licenses {
                    id
                    url