Merge branch 'develop' into nightly.spacy.io

2025-10-29 06:57:49 +03:00 · 2020-09-25 13:21:55 +02:00 · 2020-09-25 13:21:55 +02:00 · f3aba49830
commit f3aba49830
parent 35cfe09348 02a1b6ab83
24 changed files with 651 additions and 152 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a24"
+__version__ = "3.0.0a25"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
-                commands.remove(i)
+                commands.pop(i)
                break
        else:
            # If we didn't break the for loop, break the while loop.
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -401,10 +401,6 @@ class Errors:
            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
            "instead of list(nlp.tokenizer.pipe()).")
    E156 = ("The pipeline needs to include a parser in order to use "
            "Matcher or PhraseMatcher with the attribute DEP. Try using "
            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
            "list(nlp.tokenizer.pipe()).")
    E157 = ("Can't render negative values for dependency arc start or end. "
            "Make sure that you're passing in absolute token indices, not "
            "relative token offsets.\nstart: {start}, end: {end}, label: "
@ -517,8 +513,8 @@ class Errors:
            "instead.")
    E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
            "property or default function argument?")
-    E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
+    E928 = ("A 'KnowledgeBase' can only be serialized to/from from a directory, "
-            "provided argument {loc} is an existing directory.")
+            "but the provided argument {loc} points to a file.")
    E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
            "not seem to exist.")
    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -10,6 +10,8 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings
 from spacy.strings import StringStore
 from spacy import util
 from .typedefs cimport hash_t
@ -83,6 +85,9 @@ cdef class KnowledgeBase:
    DOCS: https://nightly.spacy.io/api/kb
    """
    contents_loc = "contents"
    strings_loc = "strings.json"
    def __init__(self, Vocab vocab, entity_vector_length):
        """Create a KnowledgeBase."""
        self.mem = Pool()
@ -319,15 +324,29 @@ cdef class KnowledgeBase:
        return 0.0
    def to_disk(self, path):
        path = util.ensure_path(path)
-        if path.is_dir():
+        if not path.exists():
            path.mkdir(parents=True)
        if not path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
-        if not path.parent.exists():
+        self.write_contents(path / self.contents_loc)
-            path.parent.mkdir(parents=True)
+        self.vocab.strings.to_disk(path / self.strings_loc)
-        cdef Writer writer = Writer(path)
+    def from_disk(self, path):
        """Load the knowledge base from the directory at `path`.

        The contents are read from `path / self.contents_loc` and the
        serialized strings from `path / self.strings_loc`; every loaded
        string is added to `self.vocab.strings`.

        path: Location of the KB directory; normalized with util.ensure_path.
        RAISES (ValueError): E929 if `path` does not exist, E928 if it exists
            but is not a directory.
        """
        path = util.ensure_path(path)
        if not path.exists():
            raise ValueError(Errors.E929.format(loc=path))
        if not path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
        self.read_contents(path / self.contents_loc)
        # Load the stored strings into a fresh store first, then merge them
        # into the vocab's existing string store one by one.
        kb_strings = StringStore()
        kb_strings.from_disk(path / self.strings_loc)
        for string in kb_strings:
            self.vocab.strings.add(string)
    def write_contents(self, file_path):
        cdef Writer writer = Writer(file_path)
        writer.write_header(self.get_size_entities(), self.entity_vector_length)
        # dumping the entity vectors in their original order
@ -366,13 +385,7 @@ cdef class KnowledgeBase:
        writer.close()
-    def from_disk(self, path):
+    def read_contents(self, file_path):
        path = util.ensure_path(path)
        if path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
        if not path.exists():
            raise ValueError(Errors.E929.format(loc=path))
        cdef hash_t entity_hash
        cdef hash_t alias_hash
        cdef int64_t entry_index
@ -382,7 +395,7 @@ cdef class KnowledgeBase:
        cdef AliasC alias
        cdef float vector_element
-        cdef Reader reader = Reader(path)
+        cdef Reader reader = Reader(file_path)
        # STEP 0: load header and initialize KB
        cdef int64_t nr_entities
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -17,6 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
 from ..schemas import validate_token_pattern
@ -124,7 +125,7 @@ cdef class Matcher:
        key = self._normalize_key(key)
        for pattern in patterns:
            try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                    self._extensions, self._extra_predicates)
                self.patterns.push_back(init_pattern(self.mem, key, specs))
                for spec in specs:
@ -195,7 +196,7 @@ cdef class Matcher:
                else:
                    yield doc
-    def __call__(self, object doclike, *, as_spans=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
        """Find all token sequences matching the supplied pattern.
        doclike (Doc or Span): The document to match over.
@ -215,16 +216,19 @@ cdef class Matcher:
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
        cdef Pool tmp_pool = Pool()
-        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
+        if not allow_missing:
-            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+            for attr in (TAG, POS, MORPH, LEMMA, DEP):
-        if POS in self._seen_attrs and not doc.has_annotation("POS"):
+                if attr in self._seen_attrs and not doc.has_annotation(attr):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+                    if attr == TAG:
-        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
+                        pipe = "tagger"
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+                    elif attr in (POS, MORPH):
-        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
+                        pipe = "morphologizer"
-            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+                    elif attr == LEMMA:
-        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
+                        pipe = "lemmatizer"
-            raise ValueError(Errors.E156.format())
+                    elif attr == DEP:
                        pipe = "parser"
                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                    raise ValueError(error_msg)
        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
        final_matches = []
@ -660,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
    return id_attr.value
-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
    """This function interprets the pattern, converting the various bits of
    syntactic sugar before we compile it into a struct with init_pattern.
@ -675,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
        extra_predicates.
    """
    tokens = []
    string_store = vocab.strings
    for spec in token_specs:
        if not spec:
            # Signifier for 'any token'
@ -685,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
        ops = _get_operators(spec)
        attr_values = _get_attr_values(spec, string_store)
        extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
        for op in ops:
            tokens.append((op, list(attr_values), list(extensions), list(predicates)))
    return tokens
@ -729,7 +734,7 @@ def _get_attr_values(spec, string_store):
 class _RegexPredicate:
    operators = ("REGEX",)
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
        self.attr = attr
        self.value = re.compile(value)
@ -747,13 +752,18 @@ class _RegexPredicate:
        return bool(self.value.search(value))
-class _SetMemberPredicate:
+class _SetPredicate:
-    operators = ("IN", "NOT_IN")
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
        self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
        if self.attr == MORPH:
            # normalize morph strings
            self.value = set(self.vocab.morphology.add(v) for v in value)
        else:
            self.value = set(get_string_id(v) for v in value)
        self.predicate = predicate
        self.is_extension = is_extension
        self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@ -765,19 +775,32 @@ class _SetMemberPredicate:
            value = get_string_id(token._.get(self.attr))
        else:
            value = get_token_attr_for_matcher(token.c, self.attr)
        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
            if self.attr == MORPH:
                # break up MORPH into individual Feat=Val values
                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
            else:
                # IS_SUBSET for other attrs will be equivalent to "IN"
                # IS_SUPERSET will only match for other attrs with 0 or 1 values
                value = set([value])
        if self.predicate == "IN":
            return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
            return value not in self.value
        elif self.predicate == "IS_SUBSET":
            return value <= self.value
        elif self.predicate == "IS_SUPERSET":
            return value >= self.value
    def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
 class _ComparisonPredicate:
    operators = ("==", "!=", ">=", "<=", ">", "<")
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
        self.attr = attr
        self.value = value
@ -806,11 +829,13 @@ class _ComparisonPredicate:
            return value < self.value
-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
    predicate_types = {
        "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "NOT_IN": _SetPredicate,
        "IS_SUBSET": _SetPredicate,
        "IS_SUPERSET": _SetPredicate,
        "==": _ComparisonPredicate,
        "!=": _ComparisonPredicate,
        ">=": _ComparisonPredicate,
@ -838,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
            value_with_upper_keys = {k.upper(): v for k, v in value.items()}
            for type_, cls in predicate_types.items():
                if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                    # Don't create a redundant predicates.
                    # This helps with efficiency, as we're caching the results.
                    if predicate.key in seen_predicates:
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -186,16 +186,18 @@ cdef class PhraseMatcher:
            if isinstance(doc, Doc):
                attrs = (TAG, POS, MORPH, LEMMA, DEP)
                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                if self.attr == TAG and not has_annotation[TAG]:
+                for attr in attrs:
-                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+                    if self.attr == attr and not has_annotation[attr]:
-                if self.attr == POS and not has_annotation[POS]:
+                        if attr == TAG:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+                            pipe = "tagger"
-                if self.attr == MORPH and not has_annotation[MORPH]:
+                        elif attr in (POS, MORPH):
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+                            pipe = "morphologizer"
-                if self.attr == LEMMA and not has_annotation[LEMMA]:
+                        elif attr == LEMMA:
-                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+                            pipe = "lemmatizer"
-                if self.attr == DEP and not has_annotation[DEP]:
+                        elif attr == DEP:
-                    raise ValueError(Errors.E156.format())
+                            pipe = "parser"
                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                        raise ValueError(error_msg)
                if self._validate and any(has_annotation.values()) \
                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
        DOCS: https://nightly.spacy.io/api/attributeruler#call
        """
-        matches = sorted(self.matcher(doc))
+        matches = sorted(self.matcher(doc, allow_missing=True))
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
        for tag, attrs in tag_map.items():
            pattern = [{"TAG": tag}]
            attrs, morph_attrs = _split_morph_attrs(attrs)
-            morph = self.vocab.morphology.add(morph_attrs)
+            if "MORPH" not in attrs:
-            attrs["MORPH"] = self.vocab.strings[morph]
+                morph = self.vocab.morphology.add(morph_attrs)
                attrs["MORPH"] = self.vocab.strings[morph]
            else:
                morph = self.vocab.morphology.add(attrs["MORPH"])
                attrs["MORPH"] = self.vocab.strings[morph]
            self.add([pattern], attrs)
    def load_from_morph_rules(
@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
                pattern = [{"ORTH": word, "TAG": tag}]
                attrs = morph_rules[tag][word]
                attrs, morph_attrs = _split_morph_attrs(attrs)
-                morph = self.vocab.morphology.add(morph_attrs)
+                if "MORPH" in attrs:
-                attrs["MORPH"] = self.vocab.strings[morph]
+                    morph = self.vocab.morphology.add(attrs["MORPH"])
                    attrs["MORPH"] = self.vocab.strings[morph]
                elif morph_attrs:
                    morph = self.vocab.morphology.add(morph_attrs)
                    attrs["MORPH"] = self.vocab.strings[morph]
                self.add([pattern], attrs)
    def add(
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@ -16,6 +16,7 @@ from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
 from ..scorer import Scorer
 default_model_config = """
@ -47,6 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
        "incl_context": True,
        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
    },
    default_score_weights={
        "nel_micro_f": 1.0,
        "nel_micro_r": None,
        "nel_micro_p": None,
    },
 )
 def make_entity_linker(
    nlp: Language,
@ -209,12 +215,11 @@ class EntityLinker(Pipe):
            # it does run the model twice :(
            predictions = self.model.predict(docs)
        for eg in examples:
-            sentences = [s for s in eg.predicted.sents]
+            sentences = [s for s in eg.reference.sents]
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
+            for ent in eg.reference.ents:
-                kb_id = kb_ids[
+                # KB ID of the first token is the same as the whole span
-                    ent.start
+                kb_id = kb_ids[ent.start]
                ]  # KB ID of the first token is the same as the whole span
                if kb_id:
                    try:
                        # find the sentence in the list of sentences.
@ -253,7 +258,7 @@ class EntityLinker(Pipe):
        entity_encodings = []
        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
+            for ent in eg.reference.ents:
                kb_id = kb_ids[ent.start]
                if kb_id:
                    entity_encoding = self.kb.get_vector(kb_id)
@ -415,6 +420,17 @@ class EntityLinker(Pipe):
                for token in ent:
                    token.ent_kb_id_ = kb_id
    def score(self, examples, **kwargs):
        """Compute entity-linking scores for a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The scores.

        DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
        """
        validate_examples(examples, "EntityLinker.score")
        # The NIL label is handed to the scorer as the "no annotation" value.
        no_link_labels = [self.NIL]
        link_scores = Scorer.score_links(examples, negative_labels=no_link_labels)
        return link_scores
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@ -6,7 +6,7 @@ from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown
 from ..language import Language
-from ..scorer import Scorer
+from ..scorer import get_ner_prf, PRFScore
 from ..training import validate_examples
@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
        """Score a batch of examples.
        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
+        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        DOCS: https://nightly.spacy.io/api/entityrecognizer#score
        """
        validate_examples(examples, "EntityRecognizer.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        score_per_type = get_ner_prf(examples)
        totals = PRFScore()
        for prf in score_per_type.values():
            totals += prf
        return {
            "ents_p": totals.precision,
            "ents_r": totals.recall,
            "ents_f": totals.fscore,
            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
        }
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictStr]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
    class Config:
        extra = "forbid"
@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictInt]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
    # Field names match the "IS_SUBSET" / "IS_SUPERSET" operator keys that the
    # matcher's set predicate handles and that TokenPatternString declares.
    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
    lower: Optional[StringValue] = None
    pos: Optional[StringValue] = None
    tag: Optional[StringValue] = None
    morph: Optional[StringValue] = None
    dep: Optional[StringValue] = None
    lemma: Optional[StringValue] = None
    shape: Optional[StringValue] = None
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -1,5 +1,6 @@
 from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
 import numpy as np
 from collections import defaultdict
 from .training import Example
 from .tokens import Token, Doc, Span
@ -23,6 +24,19 @@ class PRFScore:
        self.fp = 0
        self.fn = 0
    def __iadd__(self, other):
        """Add the tp/fp/fn counts of `other` onto this score, in place.

        other: Object exposing `tp`, `fp` and `fn` counters.
        RETURNS: This same instance, with updated counters.
        """
        for counter in ("tp", "fp", "fn"):
            setattr(self, counter, getattr(self, counter) + getattr(other, counter))
        return self
    def __add__(self, other):
        """Return a new PRFScore holding the summed counts of both operands.

        Neither operand is modified.

        other: Object exposing `tp`, `fp` and `fn` counters.
        RETURNS (PRFScore): A fresh score object with the combined counters.
        """
        # Build the result through attribute assignment instead of constructor
        # keywords: every other call site constructs PRFScore() without
        # arguments, and the visible end of __init__ only zeroes the counters,
        # so keyword support is not guaranteed (TODO confirm against __init__).
        total = PRFScore()
        total.tp = self.tp + other.tp
        total.fp = self.fp + other.fp
        total.fn = self.fn + other.fn
        return total
    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
@ -295,12 +309,6 @@ class Scorer:
            # Find all predidate labels, for all and per type
            gold_spans = set()
            pred_spans = set()
            # Special case for ents:
            # If we have missing values in the gold, we can't easily tell
            # whether our NER predictions are true.
            # It seems bad but it's what we've always done.
            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
                continue
            for span in getter(gold_doc, attr):
                gold_span = (span.label_, span.start, span.end - 1)
                gold_spans.add(gold_span)
@ -451,6 +459,74 @@ class Scorer:
            results[f"{attr}_score_desc"] = "macro AUC"
        return results
    @staticmethod
    def score_links(
        examples: Iterable[Example], *, negative_labels: Iterable[str]
    ) -> Dict[str, Any]:
        """Returns PRF for predicted links on the entity level.
        To disentangle the performance of the NEL from the NER,
        this method only evaluates NEL links for entities that overlap
        between the gold reference and the predictions.

        Predicted entities whose character offsets do not exactly match a
        gold entity are skipped.

        examples (Iterable[Example]): Examples to score
        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
        RETURNS (Dict[str, Any]): A dictionary containing the scores.

        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
        """
        f_per_type = {}
        for example in examples:
            # Index the gold entities by their character span for exact lookup.
            gold_ent_by_offset = {}
            for gold_ent in example.reference.ents:
                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent

            for pred_ent in example.predicted.ents:
                gold_span = gold_ent_by_offset.get(
                    (pred_ent.start_char, pred_ent.end_char)
                )
                # No gold entity with the same offsets: this is an NER
                # mismatch, not an NEL error, so it is not scored here.
                if gold_span is None:
                    continue
                label = gold_span.label_
                if label not in f_per_type:
                    f_per_type[label] = PRFScore()
                gold = gold_span.kb_id_
                # only evaluating entities that overlap between gold and pred,
                # to disentangle the performance of the NEL from the NER
                if gold is not None:
                    pred = pred_ent.kb_id_
                    if gold in negative_labels and pred in negative_labels:
                        # ignore true negatives
                        pass
                    elif gold == pred:
                        f_per_type[label].tp += 1
                    elif gold in negative_labels:
                        f_per_type[label].fp += 1
                    elif pred in negative_labels:
                        f_per_type[label].fn += 1
                    else:
                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                        f_per_type[label].fp += 1
                        f_per_type[label].fn += 1

        # Micro average: pool the raw counts over all labels.
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
            micro_prf.fn += label_prf.fn
            micro_prf.fp += label_prf.fp

        # Macro average: mean of the per-label scores. The tiny epsilon keeps
        # the division defined when no label was scored at all.
        n_labels = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels

        results = {
            "nel_score": micro_prf.fscore,
            "nel_score_desc": "micro F",
            "nel_micro_p": micro_prf.precision,
            "nel_micro_r": micro_prf.recall,
            "nel_micro_f": micro_prf.fscore,
            "nel_macro_p": macro_p,
            "nel_macro_r": macro_r,
            "nel_macro_f": macro_f,
            "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
        }
        return results
    @staticmethod
    def score_deps(
        examples: Iterable[Example],
@ -545,6 +621,39 @@ class Scorer:
        }
 def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
    """Collect one PRFScore per entity label over a sequence of examples.

    Examples whose reference doc carries no ENT_IOB annotation are skipped.
    The returned dictionary is keyed by entity label; summing its values
    yields the micro-averaged total.
    """
    per_label = defaultdict(PRFScore)
    for example in examples:
        reference = example.y
        if not reference.has_annotation("ENT_IOB"):
            continue
        unmatched_gold = {(ent.label_, ent.start, ent.end) for ent in reference.ents}
        x2y = example.alignment.x2y
        for predicted in example.x.ents:
            label = predicted.label_
            # Register the label even if the prediction ends up unscored.
            if label not in per_label:
                per_label[label] = PRFScore()
            aligned = x2y[predicted.start : predicted.end].dataXd.ravel()
            if not len(aligned):
                continue
            first, stop = aligned[0], aligned[-1] + 1
            # A gold span with missing annotation makes the prediction
            # neither right nor wrong, so it is left out of the counts.
            if any(token.ent_iob == 0 for token in reference[first:stop]):
                continue
            candidate = (label, first, stop)
            if candidate in unmatched_gold:
                per_label[label].tp += 1
                unmatched_gold.remove(candidate)
            else:
                per_label[label].fp += 1
        # Whatever gold entities were never matched are false negatives.
        for label, _start, _end in unmatched_gold:
            per_label[label].fn += 1
    return per_label
 #############################################################################
 #
 # The following implementation of roc_auc_score() is adapted from
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
    assert len(matches) == 1
 def test_matcher_subset_value_operator(en_vocab):
    """Check the IS_SUBSET operator for MORPH and for a single-valued attr."""
    words = ["a", "b", "c"]

    morph_matcher = Matcher(en_vocab)
    morph_matcher.add("M", [[{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]])
    morph_doc = Doc(en_vocab, words=words)
    # with no morph set, all three tokens match
    assert len(morph_matcher(morph_doc)) == 3
    morph_cases = [
        ("Feat=Val", 3),
        ("Feat=Val|Feat2=Val2", 3),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 2),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 2),
    ]
    for morph, expected in morph_cases:
        morph_doc[0].morph_ = morph
        assert len(morph_matcher(morph_doc)) == expected

    def count_tag_matches(subset_values):
        tag_matcher = Matcher(en_vocab)
        tag_matcher.add("M", [[{"TAG": {"IS_SUBSET": subset_values}}]])
        tagged = Doc(en_vocab, words=words)
        tagged[0].tag_ = "A"
        return len(tag_matcher(tagged))

    # IS_SUBSET acts like "IN" for attrs other than MORPH
    assert count_tag_matches(["A", "B"]) == 1
    # IS_SUBSET with an empty list matches nothing
    assert count_tag_matches([]) == 0
 def test_matcher_superset_value_operator(en_vocab):
    """Check the IS_SUPERSET operator for MORPH and for a single-valued attr."""
    words = ["a", "b", "c"]

    required = ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]
    morph_matcher = Matcher(en_vocab)
    morph_matcher.add("M", [[{"MORPH": {"IS_SUPERSET": required}}]])
    morph_doc = Doc(en_vocab, words=words)
    # with no morph set, nothing matches
    assert len(morph_matcher(morph_doc)) == 0
    morph_cases = [
        ("Feat=Val|Feat2=Val2", 0),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 1),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 1),
    ]
    for morph, expected in morph_cases:
        morph_doc[0].morph_ = morph
        assert len(morph_matcher(morph_doc)) == expected

    tag_cases = [
        # IS_SUPERSET with more than one value only matches for MORPH
        (["A", "B"], 0),
        # IS_SUPERSET with one value is the same as ==
        (["A"], 1),
        # IS_SUPERSET with an empty value matches everything
        ([], 3),
    ]
    for superset_values, expected in tag_cases:
        tag_matcher = Matcher(en_vocab)
        tag_matcher.add("M", [[{"TAG": {"IS_SUPERSET": superset_values}}]])
        tagged = Doc(en_vocab, words=words)
        tagged[0].tag_ = "A"
        assert len(tag_matcher(tagged)) == expected
 def test_matcher_morph_handling(en_vocab):
    """Check feature-order independence and value splitting for MORPH patterns."""
    words = ["a", "b", "c"]

    # order of features in pattern doesn't matter
    order_matcher = Matcher(en_vocab)
    order_matcher.add("M", [[{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]])
    order_matcher.add("N", [[{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]])
    order_doc = Doc(en_vocab, words=words)
    assert len(order_matcher(order_doc)) == 0
    for morph in ("Feat2=Val2|Feat1=Val1", "Feat1=Val1|Feat2=Val2"):
        order_doc[0].morph_ = morph
        assert len(order_matcher(order_doc)) == 2

    # multiple values are split
    split_matcher = Matcher(en_vocab)
    two_feats = ["Feat1=Val1", "Feat2=Val2"]
    three_feats = ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]
    split_matcher.add("M", [[{"MORPH": {"IS_SUPERSET": two_feats}}]])
    split_matcher.add("N", [[{"MORPH": {"IS_SUPERSET": three_feats}}]])
    split_doc = Doc(en_vocab, words=words)
    assert len(split_matcher(split_doc)) == 0
    split_cases = [
        ("Feat2=Val2,Val3|Feat1=Val1", 1),
        ("Feat1=Val1,Val3|Feat2=Val2", 2),
    ]
    for morph, expected in split_cases:
        split_doc[0].morph_ = morph
        assert len(split_matcher(split_doc)) == expected
 def test_matcher_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
@ -316,6 +416,9 @@ def test_attr_pipeline_checks(en_vocab):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # errors can be suppressed if desired
    matcher(doc2, allow_missing=True)
    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -2,8 +2,10 @@ from typing import Callable, Iterable
 import pytest
 from spacy.kb import KnowledgeBase, get_candidates, Candidate
 from spacy.vocab import Vocab
 from spacy import util, registry
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.tests.util import make_tempdir
@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb.from_disk(d / "kb")
        mykb.to_disk(d / "kb.file")
        mykb.from_disk(d / "kb.file")
        mykb.to_disk(d / "new" / "kb")
        mykb.from_disk(d / "new" / "kb")
        # allow overwriting an existing file
-        mykb.to_disk(d / "kb.file")
+        mykb.to_disk(d / "kb")
        with pytest.raises(ValueError):
            # can not write to a directory
            mykb.to_disk(d)
        with pytest.raises(ValueError):
            # can not read from a directory
            mykb.from_disk(d)
        with pytest.raises(ValueError):
            # can not read from an unknown file
            mykb.from_disk(d / "unknown" / "kb")
 def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
    assert doc[2].ent_kb_id_ == "Q2"
 def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
    # adding aliases
    douglas_hash = mykb.add_alias(
        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
    )
    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
    # baseline: the in-memory KB resolves "adam" to Q2, by hash and by string
    candidates = mykb.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"
    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        # load into a KB built on a fresh Vocab: the string forms checked
        # below can only come from what was written to disk
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")
        # same lookups as above must give identical hashes and strings
        candidates = kb_new_vocab.get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
        assert candidates[0].alias == adam_hash
        assert candidates[0].alias_ == "adam"
 def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
 TRAIN_DATA = [
    ("Russ Cochran captured his first major title with his son as caddie.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran has been publishing comic art.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
+         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
    nlp.add_pipe("sentencizer")
    vector_length = 3
    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
    ]
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
@ -446,6 +472,16 @@ def test_overfitting_IO():
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001
    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)
    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
@ -465,3 +501,46 @@ def test_overfitting_IO():
            for ent in doc2.ents:
                predictions.append(ent.kb_id_)
        assert predictions == GOLD_entities
 def test_scorer_links():
    """Check per-type and micro-averaged P/R from `Scorer.score_links`.

    Builds three (predicted, reference) examples with hand-set entity KB IDs
    and scores them with "NIL" passed as a negative label.
    """
    train_examples = []
    nlp = English()
    # Example 1: PERSON link is wrong (Q70 vs. gold Q2), LOC link is right (Q3)
    ref1 = nlp("Julia lives in London happily.")
    ref1.ents = [
        Span(ref1, 0, 1, label="PERSON", kb_id="Q2"),
        Span(ref1, 3, 4, label="LOC", kb_id="Q3"),
    ]
    pred1 = nlp("Julia lives in London happily.")
    pred1.ents = [
        Span(pred1, 0, 1, label="PERSON", kb_id="Q70"),
        Span(pred1, 3, 4, label="LOC", kb_id="Q3"),
    ]
    train_examples.append(Example(pred1, ref1))
    # Example 2: PERSON link is right (Q2); LOC is predicted "NIL" but gold is Q13
    ref2 = nlp("She loves London.")
    ref2.ents = [
        Span(ref2, 0, 1, label="PERSON", kb_id="Q2"),
        Span(ref2, 2, 3, label="LOC", kb_id="Q13"),
    ]
    pred2 = nlp("She loves London.")
    pred2.ents = [
        Span(pred2, 0, 1, label="PERSON", kb_id="Q2"),
        Span(pred2, 2, 3, label="LOC", kb_id="NIL"),
    ]
    train_examples.append(Example(pred2, ref2))
    # Example 3: both gold and prediction carry the negative label "NIL"
    ref3 = nlp("London is great.")
    ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")]
    pred3 = nlp("London is great.")
    pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")]
    train_examples.append(Example(pred3, ref3))
    # NOTE(review): the expected values below are consistent with "NIL" links
    # being left out of both the predicted and the gold counts - confirm
    # against Scorer.score_links
    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
    # PERSON: predictions Q70 (wrong) and Q2 (right); gold Q2 twice
    assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2
    assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2
    # LOC: one non-NIL prediction (Q3, right); non-NIL gold links are Q3 and Q13
    assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1
    assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2
    # micro average over both labels: 2 correct, 3 non-NIL predictions, 4 non-NIL gold
    assert scores["nel_micro_p"] == 2 / 3
    assert scores["nel_micro_r"] == 2 / 4
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@ -345,12 +345,13 @@ def test_language_factories_invalid():
            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
        ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
    ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
    result = combine_score_weights(weights)
-    assert sum(result.values()) in (0.99, 1.0)
+    assert sum(result.values()) in (0.99, 1.0, 0.0)
    assert result == expected
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
 def test_Example_from_dict_sentences():
    """Check that `sent_starts` annotations set the sentences of the reference doc."""
    vocab = Vocab()
    predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
    # a 1 at positions 0 and 3 yields two sentences
    annots = {"sent_starts": [1, 0, 0, 1, 0]}
    ex = Example.from_dict(predicted, annots)
    assert len(list(ex.reference.sents)) == 2
    # this currently throws an error - bug or feature?
    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
    # ex = Example.from_dict(predicted, annots)
    # assert len(list(ex.reference.sents)) == 1
    predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    # same words as the disabled case above, but with -1 on the second token;
    # this variant yields a single sentence (presumably -1 marks an explicit
    # non-start - confirm against Example.from_dict)
    annots = {"sent_starts": [1, -1, 0, 0, 0]}
    ex = Example.from_dict(predicted, annots)
    assert len(list(ex.reference.sents)) == 1
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@ -1,4 +1,5 @@
 from ..tokens.doc cimport Doc
 from libc.stdint cimport uint64_t
 cdef class Example:
@ -7,3 +8,5 @@ cdef class Example:
    cdef readonly object _cached_alignment
    cdef readonly object _cached_words_x
    cdef readonly object _cached_words_y
    cdef readonly uint64_t _x_sig
    cdef readonly uint64_t _y_sig
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -1,6 +1,7 @@
 from collections import Iterable as IterableInstance
 import warnings
 import numpy
 from murmurhash.mrmr cimport hash64
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
@ -97,15 +98,36 @@ cdef class Example:
    @property
    def alignment(self):
-        words_x = [token.text for token in self.x]
+        x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
-        words_y = [token.text for token in self.y]
+        y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
-        if self._cached_alignment is None or \
+        if self._cached_alignment is None:
-                words_x != self._cached_words_x or \
+            words_x = [token.text for token in self.x]
-                words_y != self._cached_words_y:
+            words_y = [token.text for token in self.y]
-            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            self._x_sig = x_sig
            self._y_sig = y_sig
            self._cached_words_x = words_x
            self._cached_words_y = words_y
-        return self._cached_alignment
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
            return self._cached_alignment
        elif self._x_sig == x_sig and self._y_sig == y_sig:
            # If we have a cached alignment, check whether the cache is invalid
            # due to retokenization. To make this check fast in loops, we first
            # check a hash of the TokenC arrays.
            return self._cached_alignment
        else:
            words_x = [token.text for token in self.x]
            words_y = [token.text for token in self.y]
            if words_x == self._cached_words_x and words_y == self._cached_words_y:
                self._x_sig = x_sig
                self._y_sig = y_sig
                return self._cached_alignment
            else:
                self._cached_alignment = Alignment.from_strings(words_x, words_y)
                self._cached_words_x = words_x
                self._cached_words_y = words_y
                self._x_sig = x_sig
                self._y_sig = y_sig
                return self._cached_alignment
    def get_aligned(self, field, as_string=False):
        """Return an aligned array for a token attribute."""
@ -288,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
 def _add_entities_to_doc(doc, ner_data):
    print(ner_data)
    if ner_data is None:
        return
    elif ner_data == []:
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1233,8 +1233,13 @@ def combine_score_weights(
        # components.
        total = sum(w_dict.values())
        for key, value in w_dict.items():
-            weight = round(value / total / len(all_weights), 2)
+            if total == 0:
-            result[key] = result.get(key, 0.0) + weight
+                weight = 0.0
            else:
                weight = round(value / total / len(all_weights), 2)
            prev_weight = result.get(key, 0.0)
            prev_weight = 0.0 if prev_weight is None else prev_weight
            result[key] = prev_weight + weight
    return result
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
 | `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
 | **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
 ## EntityLinker.score {#score tag="method" new="3"}
 Score a batch of examples.
 > #### Example
 >
 > ```python
 > scores = entity_linker.score(examples)
 > ```
 | Name        | Description                                                                                    |
 | ----------- | ---------------------------------------------------------------------------------------------- |
 | `examples`  | The examples to score. ~~Iterable[Example]~~                                                   |
 | **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~  |
 ## EntityLinker.create_optimizer {#create_optimizer tag="method"}
 Create an optimizer for the pipeline component.
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@ -242,10 +242,10 @@ Score a batch of examples.
 > scores = ner.score(examples)
 > ```
-| Name        | Description                                                                                                            |
+| Name        | Description                                               |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
+| ----------- | --------------------------------------------------------- |
-| `examples`  | The examples to score. ~~Iterable[Example]~~                                                                           |
+| `examples`  | The examples to score. ~~Iterable[Example]~~              |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@ -30,20 +30,20 @@ pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:
-| Attribute                              |  Description                                                                                                              |
+| Attribute                                       |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
+| `OP`                                            | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
 Operators and quantifiers define **how often** a token pattern should be
 matched:
@ -79,6 +79,8 @@ it compares to another value.
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
 | `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
 | `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
 ## Matcher.\_\_init\_\_ {#init tag="method"}
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@ -206,3 +206,26 @@ depends on the scorer settings:
 | `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
 | **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
 ## Scorer.score_links {#score_links tag="staticmethod" new="3"}
 Returns PRF for predicted links on the entity level. To disentangle the
 performance of the NEL from the NER, this method only evaluates NEL links for
 entities that overlap between the gold reference and the predictions.
 > #### Example
 >
 > ```python
 > scores = Scorer.score_links(
 >     examples,
 >     negative_labels=["NIL", ""]
 > )
 > print(scores["nel_micro_f"])
 > ```
 | Name              | Description                                                                                                         |
 | ----------------- | ------------------------------------------------------------------------------------------------------------------- |
 | `examples`        | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | _keyword-only_    |                                                                                                                     |
 | `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~                                       |
 | **RETURNS**       | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~                                                  |
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:
-| Attribute                              |  Description                                                                                                              |
+| Attribute                                       |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
+| `OP`                                            | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -236,6 +236,8 @@ following rich comparison attributes are available:
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
 | `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
 | `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
 #### Regular expressions {#regex new="2.1"}
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@ -11,12 +11,24 @@ import { Table, Tr, Td, Th } from '../components/table'
 import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
-import Link from '../components/link'
+import Link, { OptionalLink } from '../components/link'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
 import { isString, isEmptyObj } from '../components/util'
 // Lookup table from pipeline component name to the path of its API page.
 // linkComponents() reads it to choose the link target for each component;
 // names missing from this table resolve to `undefined`.
 const COMPONENT_LINKS = {
    tok2vec: '/api/tok2vec',
    transformer: '/api/transformer',
    tagger: '/api/tagger',
    parser: '/api/dependencyparser',
    ner: '/api/entityrecognizer',
    lemmatizer: '/api/lemmatizer',
    attribute_ruler: '/api/attributeruler',
    senter: '/api/sentencerecognizer',
    morphologizer: '/api/morphologizer',
 }
 const MODEL_META = {
    core: 'Vocabulary, syntax, entities, vectors',
    core_sm: 'Vocabulary, syntax, entities',
@ -78,10 +90,15 @@ function isStableVersion(v) {
    return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
 }
-function getLatestVersion(modelId, compatibility) {
+function getLatestVersion(modelId, compatibility, prereleases) {
    for (let [version, models] of Object.entries(compatibility)) {
        if (isStableVersion(version) && models[modelId]) {
-            return models[modelId][0]
+            const modelVersions = models[modelId]
            for (let modelVersion of modelVersions) {
                if (isStableVersion(modelVersion) || prereleases) {
                    return modelVersion
                }
            }
        }
    }
 }
@ -141,18 +158,44 @@ function formatSources(data = []) {
    ))
 }
 /**
  * Render a list of component names as inline code, each wrapped in an
  * OptionalLink pointing at its COMPONENT_LINKS entry, and combine the
  * rendered items with join().
  *
  * @param {string[]} components - Component names; defaults to an empty array
  *   so an undefined value (e.g. meta not loaded yet) can be passed safely.
  * @returns The result of join() over the rendered elements.
  */
 function linkComponents(components = []) {
    return join(
        components.map(c => (
            // keyed by component name; assumes names are unique within the list
            <Fragment key={c}>
                {/* `to` is undefined for names not in COMPONENT_LINKS; presumably
                    OptionalLink then renders its children unlinked (TODO confirm) */}
                <OptionalLink to={COMPONENT_LINKS[c]} hideIcon>
                    <InlineCode>{c}</InlineCode>
                </OptionalLink>
            </Fragment>
        ))
    )
 }
 // Small inline help icon; `children` is passed as the `data-tooltip`
 // attribute of the wrapping span rather than rendered as content.
 const Help = ({ children }) => (
    <span data-tooltip={children}>
        <Icon name="help2" width={16} variant="subtle" inline />
    </span>
 )
-const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => {
+const Model = ({
    name,
    langId,
    langName,
    baseUrl,
    repo,
    compatibility,
    hasExamples,
    licenses,
    prereleases,
 }) => {
    const [initialized, setInitialized] = useState(false)
    const [isError, setIsError] = useState(true)
    const [meta, setMeta] = useState({})
    const { type, genre, size } = getModelComponents(name)
-    const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility])
+    const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
        name,
        compatibility,
        prereleases,
    ])
    useEffect(() => {
        window.dispatchEvent(new Event('resize')) // scroll position for progress
@ -173,10 +216,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
    const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
    const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
-    const pipeline =
+    const pipeline = linkComponents(meta.pipeline)
-        meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>))
+    const components = linkComponents(meta.components)
    const components =
        meta.components && join(meta.components.map(p => <InlineCode key={p}>{p}</InlineCode>))
    const sources = formatSources(meta.sources)
    const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link>
    const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
@ -332,7 +373,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
 const Models = ({ pageContext, repo, children }) => {
    const [initialized, setInitialized] = useState(false)
    const [compatibility, setCompatibility] = useState({})
-    const { id, title, meta, hasExamples } = pageContext
+    const { id, title, meta } = pageContext
    const { models, isStarters } = meta
    const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@ -381,6 +422,7 @@ const Models = ({ pageContext, repo, children }) => {
                            repo={repo}
                            licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
                            hasExamples={meta.hasExamples}
                            prereleases={site.siteMetadata.nightly}
                        />
                    ))
                }
@ -397,6 +439,7 @@ const query = graphql`
    query ModelsQuery {
        site {
            siteMetadata {
                nightly
                licenses {
                    id
                    url