From 0bc214c1028bbc33c101c7cc48c3f1a2dff6c663 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 16:11:33 +0200
Subject: [PATCH 01/16] Fix pull

---
 spacy/cli/project/pull.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 3119d3a12..26676d5b3 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                     update_lockfile(project_dir, cmd)
                 # We remove the command from the list here, and break, so that
                 # we iterate over the loop again.
-                commands.remove(i)
+                commands.pop(i)
                 break
         else:
             # If we didn't break the for loop, break the while loop.

From d0ef4a4cf5f3d2db1e6624634731ac09b2eeda42 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 16:42:13 +0200
Subject: [PATCH 02/16] Prevent division by zero in score weights

---
 spacy/tests/pipeline/test_pipe_factories.py | 5 +++--
 spacy/util.py                               | 5 ++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 4c197005e..07648024c 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -345,12 +345,13 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
     result = combine_score_weights(weights)
-    assert sum(result.values()) in (0.99, 1.0)
+    assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 709da8d29..ad3298651 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1233,7 +1233,10 @@ def combine_score_weights(
         # components.
         total = sum(w_dict.values())
         for key, value in w_dict.items():
-            weight = round(value / total / len(all_weights), 2)
+            if total == 0:
+                weight = 0.0
+            else:
+                weight = round(value / total / len(all_weights), 2)
             result[key] = result.get(key, 0.0) + weight
     return result
 

From c7eedd3534f551d5d23b0dfddc5e2be603780ddd Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 24 Sep 2020 16:53:59 +0200
Subject: [PATCH 03/16] updates to NEL functionality (#6132)

* NEL: read sentences and ents from reference

* fiddling with sent_start annotations

* add KB serialization test

* KB write additional file with strings.json

* score_links function to calculate NEL P/R/F

* formatting

* documentation
---
 spacy/errors.py                               |   4 +-
 spacy/kb.pyx                                  |  39 ++++--
 spacy/pipeline/entity_linker.py               |  26 +++-
 spacy/scorer.py                               |  68 ++++++++++
 spacy/tests/pipeline/test_entity_linker.py    | 121 +++++++++++++++---
 .../tests/{ => training}/test_new_example.py  |  19 +++
 website/docs/api/entitylinker.md              |  15 +++
 website/docs/api/scorer.md                    |  23 ++++
 8 files changed, 273 insertions(+), 42 deletions(-)
 rename spacy/tests/{ => training}/test_new_example.py (91%)

diff --git a/spacy/errors.py b/spacy/errors.py
index 6fdf8cb57..50d2fea5f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -517,8 +517,8 @@ class Errors:
             "instead.")
     E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
             "property or default function argument?")
-    E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
-            "provided argument {loc} is an existing directory.")
+    E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
+            "but the provided argument {loc} points to a file.")
     E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
             "not seem to exist.")
     E930 = ("Received invalid get_examples callback in {name}.begin_training. "
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ff5382c24..bdf652766 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -10,6 +10,8 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings
 
+from spacy.strings import StringStore
+
 from spacy import util
 
 from .typedefs cimport hash_t
@@ -83,6 +85,9 @@ cdef class KnowledgeBase:
     DOCS: https://nightly.spacy.io/api/kb
     """
 
+    contents_loc = "contents"
+    strings_loc = "strings.json"
+
     def __init__(self, Vocab vocab, entity_vector_length):
         """Create a KnowledgeBase."""
         self.mem = Pool()
@@ -319,15 +324,29 @@ cdef class KnowledgeBase:
 
         return 0.0
 
-
     def to_disk(self, path):
         path = util.ensure_path(path)
-        if path.is_dir():
+        if not path.exists():
+            path.mkdir(parents=True)
+        if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        if not path.parent.exists():
-            path.parent.mkdir(parents=True)
+        self.write_contents(path / self.contents_loc)
+        self.vocab.strings.to_disk(path / self.strings_loc)
 
-        cdef Writer writer = Writer(path)
+    def from_disk(self, path):
+        path = util.ensure_path(path)
+        if not path.exists():
+            raise ValueError(Errors.E929.format(loc=path))
+        if not path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        self.read_contents(path / self.contents_loc)
+        kb_strings = StringStore()
+        kb_strings.from_disk(path / self.strings_loc)
+        for string in kb_strings:
+            self.vocab.strings.add(string)
+
+    def write_contents(self, file_path):
+        cdef Writer writer = Writer(file_path)
         writer.write_header(self.get_size_entities(), self.entity_vector_length)
 
         # dumping the entity vectors in their original order
@@ -366,13 +385,7 @@ cdef class KnowledgeBase:
 
         writer.close()
 
-    def from_disk(self, path):
-        path = util.ensure_path(path)
-        if path.is_dir():
-            raise ValueError(Errors.E928.format(loc=path))
-        if not path.exists():
-            raise ValueError(Errors.E929.format(loc=path))
-
+    def read_contents(self, file_path):
         cdef hash_t entity_hash
         cdef hash_t alias_hash
         cdef int64_t entry_index
@@ -382,7 +395,7 @@ cdef class KnowledgeBase:
         cdef AliasC alias
         cdef float vector_element
 
-        cdef Reader reader = Reader(path)
+        cdef Reader reader = Reader(file_path)
 
         # STEP 0: load header and initialize KB
         cdef int64_t nr_entities
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 1debadd82..fec53c77a 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,6 +16,7 @@ from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
+from ..scorer import Scorer
 
 
 default_model_config = """
@@ -47,6 +48,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "incl_context": True,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
     },
+    scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"],
+    default_score_weights={"nel_micro_f": 1.0},
 )
 def make_entity_linker(
     nlp: Language,
@@ -209,12 +212,11 @@ class EntityLinker(Pipe):
             # it does run the model twice :(
             predictions = self.model.predict(docs)
         for eg in examples:
-            sentences = [s for s in eg.predicted.sents]
+            sentences = [s for s in eg.reference.sents]
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[
-                    ent.start
-                ]  # KB ID of the first token is the same as the whole span
+            for ent in eg.reference.ents:
+                # KB ID of the first token is the same as the whole span
+                kb_id = kb_ids[ent.start]
                 if kb_id:
                     try:
                         # find the sentence in the list of sentences.
@@ -253,7 +255,7 @@ class EntityLinker(Pipe):
         entity_encodings = []
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
+            for ent in eg.reference.ents:
                 kb_id = kb_ids[ent.start]
                 if kb_id:
                     entity_encoding = self.kb.get_vector(kb_id)
@@ -415,6 +417,18 @@ class EntityLinker(Pipe):
                 for token in ent:
                     token.ent_kb_id_ = kb_id
 
+    def score(self, examples, **kwargs):
+        """Score a batch of examples.
+
+        examples (Iterable[Example]): The examples to score.
+        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_links.
+
+        DOCS TODO: https://nightly.spacy.io/api/entitylinker#score
+        """
+        validate_examples(examples, "EntityLinker.score")
+        return Scorer.score_links(examples, negative_labels=[self.NIL])
+
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
diff --git a/spacy/scorer.py b/spacy/scorer.py
index c50de3d43..cd3b013cd 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -451,6 +451,74 @@ class Scorer:
             results[f"{attr}_score_desc"] = "macro AUC"
         return results
 
+    @staticmethod
+    def score_links(
+        examples: Iterable[Example], *, negative_labels: Iterable[str]
+    ) -> Dict[str, Any]:
+        """Returns PRF for predicted links on the entity level.
+        To disentangle the performance of the NEL from the NER, this method
+        only evaluates NEL links for predicted entities whose character
+        offsets exactly match an entity in the gold reference.
+
+        examples (Iterable[Example]): Examples to score
+        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
+        RETURNS (Dict[str, Any]): A dictionary containing the scores.
+
+        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        """
+        f_per_type = {}
+        for example in examples:
+            gold_ent_by_offset = {}
+            for gold_ent in example.reference.ents:
+                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
+
+            for pred_ent in example.predicted.ents:
+                offsets = (pred_ent.start_char, pred_ent.end_char)
+                gold_span = gold_ent_by_offset.get(offsets)
+                # skip predicted entities that have no gold entity at the same offsets
+                if gold_span is None:
+                    continue
+                label = gold_span.label_
+                if label not in f_per_type:
+                    f_per_type[label] = PRFScore()
+                gold = gold_span.kb_id_
+                if gold is not None:
+                    pred = pred_ent.kb_id_
+                    if gold in negative_labels and pred in negative_labels:
+                        # ignore true negatives
+                        pass
+                    elif gold == pred:
+                        f_per_type[label].tp += 1
+                    elif gold in negative_labels:
+                        f_per_type[label].fp += 1
+                    elif pred in negative_labels:
+                        f_per_type[label].fn += 1
+                    else:
+                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
+                        f_per_type[label].fp += 1
+                        f_per_type[label].fn += 1
+        micro_prf = PRFScore()
+        for label_prf in f_per_type.values():
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
+        n_labels = len(f_per_type) + 1e-100
+        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
+        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
+        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
+        results = {
+            "nel_score": micro_prf.fscore,
+            "nel_score_desc": "micro F",
+            "nel_micro_p": micro_prf.precision,
+            "nel_micro_r": micro_prf.recall,
+            "nel_micro_f": micro_prf.fscore,
+            "nel_macro_p": macro_p,
+            "nel_macro_r": macro_r,
+            "nel_macro_f": macro_f,
+            "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+        }
+        return results
+
     @staticmethod
     def score_deps(
         examples: Iterable[Example],
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 88e0646b3..878f41a28 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -2,8 +2,10 @@ from typing import Callable, Iterable
 import pytest
 
 from spacy.kb import KnowledgeBase, get_candidates, Candidate
+from spacy.vocab import Vocab
 
 from spacy import util, registry
+from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.tests.util import make_tempdir
@@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
         # normal read-write behaviour
         mykb.to_disk(d / "kb")
         mykb.from_disk(d / "kb")
-        mykb.to_disk(d / "kb.file")
-        mykb.from_disk(d / "kb.file")
         mykb.to_disk(d / "new" / "kb")
         mykb.from_disk(d / "new" / "kb")
         # allow overwriting an existing file
-        mykb.to_disk(d / "kb.file")
-        with pytest.raises(ValueError):
-            # can not write to a directory
-            mykb.to_disk(d)
-        with pytest.raises(ValueError):
-            # can not read from a directory
-            mykb.from_disk(d)
+        mykb.to_disk(d / "kb")
         with pytest.raises(ValueError):
             # can not read from an unknown file
             mykb.from_disk(d / "unknown" / "kb")
 
+
 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
     assert doc[2].ent_kb_id_ == "Q2"
 
 
+def test_vocab_serialization(nlp):
+    """Test that string information is retained across storage"""
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+    # adding entities
+    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+    # adding aliases
+    douglas_hash = mykb.add_alias(
+        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
+    )
+    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    candidates = mykb.get_alias_candidates("adam")
+    assert len(candidates) == 1
+    assert candidates[0].entity == q2_hash
+    assert candidates[0].entity_ == "Q2"
+    assert candidates[0].alias == adam_hash
+    assert candidates[0].alias_ == "adam"
+
+    with make_tempdir() as d:
+        mykb.to_disk(d / "kb")
+        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+        kb_new_vocab.from_disk(d / "kb")
+
+        candidates = kb_new_vocab.get_alias_candidates("adam")
+        assert len(candidates) == 1
+        assert candidates[0].entity == q2_hash
+        assert candidates[0].entity_ == "Q2"
+        assert candidates[0].alias == adam_hash
+        assert candidates[0].alias_ == "adam"
+
+
 def test_append_alias(nlp):
     """Test that we can append additional alias-entity pairs"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
 TRAIN_DATA = [
     ("Russ Cochran captured his first major title with his son as caddie.",
         {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
     ("Russ Cochran his reprints include EC Comics.",
         {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
     ("Russ Cochran has been publishing comic art.",
         {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
     ("Russ Cochran was a member of University of Kentucky's golf team.",
         {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
+         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
-    nlp.add_pipe("sentencizer")
     vector_length = 3
 
-    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
-    ]
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
-
     # Convert the texts to docs to make sure we have doc.ents set for the training examples
     train_examples = []
     for text, annotation in TRAIN_DATA:
@@ -446,6 +472,16 @@ def test_overfitting_IO():
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["entity_linker"] < 0.001
 
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
     # test the trained model
     predictions = []
     for text, annotation in TRAIN_DATA:
@@ -465,3 +501,46 @@ def test_overfitting_IO():
             for ent in doc2.ents:
                 predictions.append(ent.kb_id_)
         assert predictions == GOLD_entities
+
+
+def test_scorer_links():
+    train_examples = []
+    nlp = English()
+    ref1 = nlp("Julia lives in London happily.")
+    ref1.ents = [
+        Span(ref1, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    pred1 = nlp("Julia lives in London happily.")
+    pred1.ents = [
+        Span(pred1, 0, 1, label="PERSON", kb_id="Q70"),
+        Span(pred1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    train_examples.append(Example(pred1, ref1))
+
+    ref2 = nlp("She loves London.")
+    ref2.ents = [
+        Span(ref2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref2, 2, 3, label="LOC", kb_id="Q13"),
+    ]
+    pred2 = nlp("She loves London.")
+    pred2.ents = [
+        Span(pred2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(pred2, 2, 3, label="LOC", kb_id="NIL"),
+    ]
+    train_examples.append(Example(pred2, ref2))
+
+    ref3 = nlp("London is great.")
+    ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")]
+    pred3 = nlp("London is great.")
+    pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")]
+    train_examples.append(Example(pred3, ref3))
+
+    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
+    assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2
+    assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2
+    assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1
+    assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2
+
+    assert scores["nel_micro_p"] == 2 / 3
+    assert scores["nel_micro_r"] == 2 / 4
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/training/test_new_example.py
similarity index 91%
rename from spacy/tests/test_new_example.py
rename to spacy/tests/training/test_new_example.py
index 597809286..81207b640 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
     predicted = Doc(vocab, words=annots["words"])
     with pytest.raises(ValueError):
         Example.from_dict(predicted, annots)
+
+
+def test_Example_from_dict_sentences():
+    vocab = Vocab()
+    predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
+    annots = {"sent_starts": [1, 0, 0, 1, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 2
+
+    # this currently throws an error - bug or feature?
+    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
+    # ex = Example.from_dict(predicted, annots)
+    # assert len(list(ex.reference.sents)) == 1
+
+    predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    annots = {"sent_starts": [1, -1, 0, 0, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 1
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 9cb35b487..945a1568a 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
 | `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
 | **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
 
+## EntityLinker.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = entity_linker.score(examples)
+> ```
+
+| Name        | Description                                                                                    |
+| ----------- | ---------------------------------------------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~                                                   |
+| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~  |
+
 ## EntityLinker.create_optimizer {#create_optimizer tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 1c0895bcf..0dbc0de33 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -206,3 +206,26 @@ depends on the scorer settings:
 | `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
 | **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
+
+## Scorer.score_links {#score_links tag="staticmethod" new="3"}
+
+Returns PRF for predicted links on the entity level. To disentangle the
+performance of the NEL from the NER, this method only evaluates NEL links for
+entities that overlap between the gold reference and the predictions.
+
+> #### Example
+>
+> ```python
+> scores = Scorer.score_links(
+>     examples,
+>     negative_labels=["NIL", ""]
+> )
+> print(scores["nel_micro_f"])
+> ```
+
+| Name              | Description                                                                                                         |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples`        | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_    |                                                                                                                     |
+| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~                                       |
+| **RETURNS**       | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~                                                  |

From 59340606b7881928c924e4c11bc59192522fedb8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 16:54:39 +0200
Subject: [PATCH 04/16] Add option to disable Matcher errors (#6125)

* Add option to disable Matcher errors

* Add option to disable Matcher errors when a doc doesn't contain a
particular type of annotation

Minor additional change:

* Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH`
values

* Rename suppress_errors to allow_missing

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/errors.py                         |  4 ----
 spacy/matcher/matcher.pyx               | 25 ++++++++++++++-----------
 spacy/matcher/phrasematcher.pyx         | 22 ++++++++++++----------
 spacy/pipeline/attributeruler.py        | 18 +++++++++++++-----
 spacy/tests/matcher/test_matcher_api.py |  3 +++
 5 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 50d2fea5f..4216e3936 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -401,10 +401,6 @@ class Errors:
             "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
             "instead of list(nlp.tokenizer.pipe()).")
-    E156 = ("The pipeline needs to include a parser in order to use "
-            "Matcher or PhraseMatcher with the attribute DEP. Try using "
-            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
-            "list(nlp.tokenizer.pipe()).")
     E157 = ("Can't render negative values for dependency arc start or end. "
             "Make sure that you're passing in absolute token indices, not "
             "relative token offsets.\nstart: {start}, end: {end}, label: "
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index d83f58181..39c7168e4 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -195,7 +195,7 @@ cdef class Matcher:
                 else:
                     yield doc
 
-    def __call__(self, object doclike, *, as_spans=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
         """Find all token sequences matching the supplied pattern.
 
         doclike (Doc or Span): The document to match over.
@@ -215,16 +215,19 @@ cdef class Matcher:
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
         cdef Pool tmp_pool = Pool()
-        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
-            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-        if POS in self._seen_attrs and not doc.has_annotation("POS"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
-            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
-            raise ValueError(Errors.E156.format())
+        if not allow_missing:
+            for attr in (TAG, POS, MORPH, LEMMA, DEP):
+                if attr in self._seen_attrs and not doc.has_annotation(attr):
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
         matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                 extensions=self._extensions, predicates=self._extra_predicates)
         final_matches = []
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index b00ba157f..7e99859b5 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -186,16 +186,18 @@ cdef class PhraseMatcher:
             if isinstance(doc, Doc):
                 attrs = (TAG, POS, MORPH, LEMMA, DEP)
                 has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                if self.attr == TAG and not has_annotation[TAG]:
-                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-                if self.attr == POS and not has_annotation[POS]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-                if self.attr == MORPH and not has_annotation[MORPH]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-                if self.attr == LEMMA and not has_annotation[LEMMA]:
-                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-                if self.attr == DEP and not has_annotation[DEP]:
-                    raise ValueError(Errors.E156.format())
+                for attr in attrs:
+                    if self.attr == attr and not has_annotation[attr]:
+                        if attr == TAG:
+                            pipe = "tagger"
+                        elif attr in (POS, MORPH):
+                            pipe = "morphologizer"
+                        elif attr == LEMMA:
+                            pipe = "lemmatizer"
+                        elif attr == DEP:
+                            pipe = "parser"
+                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                        raise ValueError(error_msg)
                 if self._validate and any(has_annotation.values()) \
                         and self.attr not in attrs:
                     string_attr = self.vocab.strings[self.attr]
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index f64fcbc54..0d59a1ba0 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
-        matches = sorted(self.matcher(doc))
+        matches = sorted(self.matcher(doc, allow_missing=True))
 
         for match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
@@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
         for tag, attrs in tag_map.items():
             pattern = [{"TAG": tag}]
             attrs, morph_attrs = _split_morph_attrs(attrs)
-            morph = self.vocab.morphology.add(morph_attrs)
-            attrs["MORPH"] = self.vocab.strings[morph]
+            if "MORPH" not in attrs:
+                morph = self.vocab.morphology.add(morph_attrs)
+                attrs["MORPH"] = self.vocab.strings[morph]
+            else:
+                morph = self.vocab.morphology.add(attrs["MORPH"])
+                attrs["MORPH"] = self.vocab.strings[morph]
             self.add([pattern], attrs)
 
     def load_from_morph_rules(
@@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
                 pattern = [{"ORTH": word, "TAG": tag}]
                 attrs = morph_rules[tag][word]
                 attrs, morph_attrs = _split_morph_attrs(attrs)
-                morph = self.vocab.morphology.add(morph_attrs)
-                attrs["MORPH"] = self.vocab.strings[morph]
+                if "MORPH" in attrs:
+                    morph = self.vocab.morphology.add(attrs["MORPH"])
+                    attrs["MORPH"] = self.vocab.strings[morph]
+                elif morph_attrs:
+                    morph = self.vocab.morphology.add(morph_attrs)
+                    attrs["MORPH"] = self.vocab.strings[morph]
                 self.add([pattern], attrs)
 
     def add(
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 04f9585f1..c407595e5 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
         matcher(doc2)
     with pytest.raises(ValueError):
         matcher(doc3)
+    # errors can be suppressed if desired
+    matcher(doc2, allow_missing=True)
+    matcher(doc3, allow_missing=True)
     # TAG, POS, LEMMA require those values
     for attr in ("TAG", "POS", "LEMMA"):
         matcher = Matcher(en_vocab)

From 3c062b3911d70f0f9521653cac6d0a7b85bc272f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 16:55:09 +0200
Subject: [PATCH 05/16] Add MORPH handling to Matcher (#6107)

* Add MORPH handling to Matcher

* Add `MORPH` to `Matcher` schema
* Rename `_SetMemberPredicate` to `_SetPredicate`
* Add `ISSUBSET` and `ISSUPERSET` operators to `_SetPredicate`
  * Add special handling for normalization and conversion of morph
    values into sets
  * For other attrs, `ISSUBSET` acts like `IN` and `ISSUPERSET` only
    matches for 0 or 1 values

* Update test

* Rename to IS_SUBSET and IS_SUPERSET
---
 spacy/matcher/matcher.pyx                 |  52 +++++++----
 spacy/schemas.py                          |   5 ++
 spacy/tests/matcher/test_matcher_api.py   | 100 ++++++++++++++++++++++
 website/docs/api/matcher.md               |  30 ++++---
 website/docs/usage/rule-based-matching.md |  30 ++++---
 5 files changed, 174 insertions(+), 43 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 39c7168e4..a4d20ec55 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -17,6 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
+from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
 
 from ..schemas import validate_token_pattern
@@ -124,7 +125,7 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                     self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
@@ -663,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value
 
 
-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
 
@@ -678,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         extra_predicates.
     """
     tokens = []
+    string_store = vocab.strings
     for spec in token_specs:
         if not spec:
             # Signifier for 'any token'
@@ -688,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates)))
     return tokens
@@ -732,7 +734,7 @@ def _get_attr_values(spec, string_store):
 class _RegexPredicate:
     operators = ("REGEX",)
 
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
@@ -750,13 +752,18 @@ class _RegexPredicate:
         return bool(self.value.search(value))
 
 
-class _SetMemberPredicate:
-    operators = ("IN", "NOT_IN")
+class _SetPredicate:
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
 
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
+        if self.attr == MORPH:
+            # normalize morph strings
+            self.value = set(self.vocab.morphology.add(v) for v in value)
+        else:
+            self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@@ -768,19 +775,32 @@ class _SetMemberPredicate:
             value = get_string_id(token._.get(self.attr))
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)
+
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+            if self.attr == MORPH:
+                # break up MORPH into individual Feat=Val values
+                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            else:
+                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                value = set([value])
         if self.predicate == "IN":
             return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
             return value not in self.value
+        elif self.predicate == "IS_SUBSET":
+            return value <= self.value
+        elif self.predicate == "IS_SUPERSET":
+            return value >= self.value
 
     def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
 
 
 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")
 
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -809,11 +829,13 @@ class _ComparisonPredicate:
             return value < self.value
 
 
-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
     predicate_types = {
         "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
+        "NOT_IN": _SetPredicate,
+        "IS_SUBSET": _SetPredicate,
+        "IS_SUPERSET": _SetPredicate,
         "==": _ComparisonPredicate,
         "!=": _ComparisonPredicate,
         ">=": _ComparisonPredicate,
@@ -841,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
             value_with_upper_keys = {k.upper(): v for k, v in value.items()}
             for type_, cls in predicate_types.items():
                 if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                     # Don't create a redundant predicates.
                     # This helps with efficiency, as we're caching the results.
                     if predicate.key in seen_predicates:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index eea6639d3..0c85dfe57 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
 
     class Config:
         extra = "forbid"
@@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
+    ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
+    ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
     EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
     NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
     GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
     lower: Optional[StringValue] = None
     pos: Optional[StringValue] = None
     tag: Optional[StringValue] = None
+    morph: Optional[StringValue] = None
     dep: Optional[StringValue] = None
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index c407595e5..627110cdd 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
     assert len(matches) == 1
 
 
+def test_matcher_subset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 2
+
+    # IS_SUBSET acts like "IN" for attrs other than MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUBSET with an empty list matches nothing
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+
+def test_matcher_superset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with more than one value only matches for MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+    # IS_SUPERSET with one value is the same as ==
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with an empty value matches everything
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 3
+
+
+def test_matcher_morph_handling(en_vocab):
+    # order of features in pattern doesn't matter
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+    # multiple values are split
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+
 def test_matcher_regex(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 1f1946be5..3f7076a1c 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -30,20 +30,20 @@ pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:
 
-| Attribute                              |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
+| Attribute                                       |  Description                                                                                                              |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP`                                            | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
 
 Operators and quantifiers define **how often** a token pattern should be
 matched:
@@ -79,6 +79,8 @@ it compares to another value.
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
 
 ## Matcher.\_\_init\_\_ {#init tag="method"}
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 7e979b32e..256f4ccb4 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:
 
-| Attribute                              |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
+| Attribute                                       |  Description                                                                                                              |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP`                                            | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
 
 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
 
@@ -236,6 +236,8 @@ following rich comparison attributes are available:
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
 
 #### Regular expressions {#regex new="2.1"}

From 20b89a97176a5fc2d2c2c01e4f725f3a1d1e928b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 16:57:02 +0200
Subject: [PATCH 06/16] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 56b05257a..ea9f9f33e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a24"
+__version__ = "3.0.0a25"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 0b52b6904c78cc9e12db962d89db1ab2db38d545 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 17:10:35 +0200
Subject: [PATCH 07/16] Update entity_linker.py

---
 spacy/pipeline/entity_linker.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index fec53c77a..039e2a891 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -48,8 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "incl_context": True,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
     },
-    scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"],
-    default_score_weights={"nel_micro_f": 1.0},
+    default_score_weights={
+        "nel_micro_f": 1.0,
+        "nel_micro_r": None,
+        "nel_micro_p": None,
+    },
 )
 def make_entity_linker(
     nlp: Language,
@@ -428,7 +431,6 @@ class EntityLinker(Pipe):
         validate_examples(examples, "EntityLinker.score")
         return Scorer.score_links(examples, negative_labels=[self.NIL])
 
-
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:

From 26e28ed4134734dbc86fedb97339eec47282025a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 17:11:13 +0200
Subject: [PATCH 08/16] Fix combined scores if multiple components report it

---
 spacy/util.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/util.py b/spacy/util.py
index ad3298651..378ec2823 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1237,7 +1237,9 @@ def combine_score_weights(
                 weight = 0.0
             else:
                 weight = round(value / total / len(all_weights), 2)
-            result[key] = result.get(key, 0.0) + weight
+            prev_weight = result.get(key, 0.0)
+            prev_weight = 0.0 if prev_weight is None else prev_weight
+            result[key] = prev_weight + weight
     return result
 
 

From 2abb4ba9db0d0ec074a7336be8a7395da78eaaa4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 18:13:39 +0200
Subject: [PATCH 09/16] Make a pre-check to speed up alignment cache (#6139)

* Dirty trick to fast-track alignment cache

* Improve alignment cache check

* Fix header

* Fix align cache

* Fix align logic
---
 spacy/training/example.pxd |  3 +++
 spacy/training/example.pyx | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/spacy/training/example.pxd b/spacy/training/example.pxd
index e06e36287..49e239757 100644
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@@ -1,4 +1,5 @@
 from ..tokens.doc cimport Doc
+from libc.stdint cimport uint64_t
 
 
 cdef class Example:
@@ -7,3 +8,5 @@ cdef class Example:
     cdef readonly object _cached_alignment
     cdef readonly object _cached_words_x
     cdef readonly object _cached_words_y
+    cdef readonly uint64_t _x_sig
+    cdef readonly uint64_t _y_sig
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1e7bea5df..6a9815c44 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,7 @@
 from collections import Iterable as IterableInstance
 import warnings
 import numpy
+from murmurhash.mrmr cimport hash64
 
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
@@ -97,15 +98,36 @@ cdef class Example:
 
     @property
     def alignment(self):
-        words_x = [token.text for token in self.x]
-        words_y = [token.text for token in self.y]
-        if self._cached_alignment is None or \
-                words_x != self._cached_words_x or \
-                words_y != self._cached_words_y:
-            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+        x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
+        y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
+        if self._cached_alignment is None:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            self._x_sig = x_sig
+            self._y_sig = y_sig
             self._cached_words_x = words_x
             self._cached_words_y = words_y
-        return self._cached_alignment
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            return self._cached_alignment
+        elif self._x_sig == x_sig and self._y_sig == y_sig:
+            # Fast path: both TokenC arrays hash to the signatures stored with
+            # the cached alignment, so we reuse it without rebuilding the word
+            # lists. This keeps repeated access in loops cheap.
+            return self._cached_alignment
+        else:  # signature mismatch: fall back to comparing the token texts
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            if words_x == self._cached_words_x and words_y == self._cached_words_y:
+                self._x_sig = x_sig  # same words: only refresh the signatures
+                self._y_sig = y_sig
+                return self._cached_alignment
+            else:
+                self._cached_alignment = Alignment.from_strings(words_x, words_y)
+                self._cached_words_x = words_x
+                self._cached_words_y = words_y
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
 
     def get_aligned(self, field, as_string=False):
         """Return an aligned array for a token attribute."""

From 16475528f735114370d2db48b576106b1a6451e5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 20:38:57 +0200
Subject: [PATCH 10/16] Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
---
 spacy/pipeline/ner.pyx | 15 ++++++++--
 spacy/scorer.py        | 64 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c9b0a5031..fc0dda40d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -6,7 +6,7 @@ from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown
 
 from ..language import Language
-from ..scorer import Scorer
+from ..scorer import get_ner_prf, PRFScore
 from ..training import validate_examples
 
 
@@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
+        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
 
         DOCS: https://nightly.spacy.io/api/entityrecognizer#score
         """
         validate_examples(examples, "EntityRecognizer.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        score_per_type = get_ner_prf(examples)
+        totals = PRFScore()
+        for prf in score_per_type.values():
+            totals += prf
+        return {
+            "ents_p": totals.precision,
+            "ents_r": totals.recall,
+            "ents_f": totals.fscore,
+            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
diff --git a/spacy/scorer.py b/spacy/scorer.py
index cd3b013cd..c1795847d 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -1,5 +1,6 @@
 from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
 import numpy as np
+from collections import defaultdict
 
 from .training import Example
 from .tokens import Token, Doc, Span
@@ -23,6 +24,19 @@ class PRFScore:
         self.fp = 0
         self.fn = 0
 
+    def __iadd__(self, other):
+        self.tp += other.tp  # fold the other score's counts into this one
+        self.fp += other.fp
+        self.fn += other.fn
+        return self  # "+=" rebinds the target to whatever is returned
+
+    def __add__(self, other):
+        """Return a new PRFScore whose counts are the sum of both operands."""
+        total = PRFScore()  # no-arg form; counts are then accumulated below
+        total += self
+        total += other
+        return total
+
     def score_set(self, cand: set, gold: set) -> None:
         self.tp += len(cand.intersection(gold))
         self.fp += len(cand - gold)
@@ -295,20 +309,19 @@ class Scorer:
             # Find all predidate labels, for all and per type
             gold_spans = set()
             pred_spans = set()
-            # Special case for ents:
-            # If we have missing values in the gold, we can't easily tell
-            # whether our NER predictions are true.
-            # It seems bad but it's what we've always done.
-            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
-                continue
             for span in getter(gold_doc, attr):
                 gold_span = (span.label_, span.start, span.end - 1)
                 gold_spans.add(gold_span)
                 gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
             pred_per_type = {label: set() for label in labels}
-            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
-                pred_spans.add((span.label_, span.start, span.end - 1))
-                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+            align_x2y = example.alignment.x2y
+            for pred_span in getter(pred_doc, attr):
+                indices = align_x2y[pred_span.start : pred_span.end].dataXd.ravel()
+                if len(indices):
+                    g_span = gold_doc[indices[0] : indices[-1]]
+                    span = (pred_span.label_, indices[0], indices[-1])
+                    pred_spans.add(span)
+                    pred_per_type[pred_span.label_].add(span)
             # Scores per label
             for k, v in score_per_type.items():
                 if k in pred_per_type:
@@ -613,6 +626,39 @@ class Scorer:
         }
 
 
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
+    """Compute per-entity-type PRFScore objects for a sequence of examples,
+    skipping examples whose `y` doc has no ENT_IOB annotation. The result is
+    keyed by entity label; add the PRFScores to get a micro-averaged total.
+    """
+    scores = defaultdict(PRFScore)
+    for eg in examples:
+        if not eg.y.has_annotation("ENT_IOB"):
+            continue
+        golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
+        align_x2y = eg.alignment.x2y
+        preds = set()  # NOTE(review): never read in this function
+        for pred_ent in eg.x.ents:
+            if pred_ent.label_ not in scores:
+                scores[pred_ent.label_] = PRFScore()
+            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
+            if len(indices):
+                g_span = eg.y[indices[0] : indices[-1] + 1]
+                # Check we aren't missing annotation on this span. If so,
+                # our prediction is neither right nor wrong, we just
+                # ignore it.
+                if all(token.ent_iob != 0 for token in g_span):
+                    key = (pred_ent.label_, indices[0], indices[-1] + 1)
+                    if key in golds:
+                        scores[pred_ent.label_].tp += 1
+                        golds.remove(key)
+                    else:
+                        scores[pred_ent.label_].fp += 1
+        for label, start, end in golds:  # gold entities left unmatched
+            scores[label].fn += 1
+    return scores
+
+
 #############################################################################
 #
 # The following implementation of roc_auc_score() is adapted from

From 2aa4d65734dec26d09d3326bf0498a2dafd54817 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 20:41:09 +0200
Subject: [PATCH 11/16] Update docs [ci skip]

---
 website/docs/api/entityrecognizer.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 8af73f44b..6d710f425 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -242,10 +242,10 @@ Score a batch of examples.
 > scores = ner.score(examples)
 > ```
 
-| Name        | Description                                                                                                            |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `examples`  | The examples to score. ~~Iterable[Example]~~                                                                           |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~              |
+| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
 ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
 

From 93d7ff309fba4faa805ca105b56a04daefa77f5c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 21:05:27 +0200
Subject: [PATCH 12/16] Remove print

---
 spacy/training/example.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 6a9815c44..f2c78203a 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -310,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
 
 
 def _add_entities_to_doc(doc, ner_data):
-    print(ner_data)
     if ner_data is None:
         return
     elif ner_data == []:

From 50f20cf7224edefbfa789755a1415841e6cd647b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 25 Sep 2020 08:21:30 +0200
Subject: [PATCH 13/16] Revert changes to Scorer.score_spans

---
 spacy/scorer.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index c1795847d..b2f97e163 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -314,14 +314,9 @@ class Scorer:
                 gold_spans.add(gold_span)
                 gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
             pred_per_type = {label: set() for label in labels}
-            align_x2y = example.alignment.x2y
-            for pred_span in getter(pred_doc, attr):
-                indices = align_x2y[pred_span.start : pred_span.end].dataXd.ravel()
-                if len(indices):
-                    g_span = gold_doc[indices[0] : indices[-1]]
-                    span = (pred_span.label_, indices[0], indices[-1])
-                    pred_spans.add(span)
-                    pred_per_type[pred_span.label_].add(span)
+            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
+                pred_spans.add((span.label_, span.start, span.end - 1))
+                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
             # Scores per label
             for k, v in score_per_type.items():
                 if k in pred_per_type:

From c7956a40474892b8459e5241de965e46ca388980 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 09:25:46 +0200
Subject: [PATCH 14/16] Update models.js [ci skip]

---
 website/src/templates/models.js | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index cdfe2e46d..f67188c0b 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -78,10 +78,15 @@ function isStableVersion(v) {
     return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
 }
 
-function getLatestVersion(modelId, compatibility) {
+function getLatestVersion(modelId, compatibility, prereleases) {
     for (let [version, models] of Object.entries(compatibility)) {
         if (isStableVersion(version) && models[modelId]) {
-            return models[modelId][0]
+            const modelVersions = models[modelId]
+            for (let modelVersion of modelVersions) {
+                if (isStableVersion(modelVersion) || prereleases) { // any version when prereleases is truthy
+                    return modelVersion // first acceptable entry wins
+                }
+            }
         }
     }
 }
@@ -147,12 +152,26 @@ const Help = ({ children }) => (
     </span>
 )
 
-const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => {
+const Model = ({
+    name,
+    langId,
+    langName,
+    baseUrl,
+    repo,
+    compatibility,
+    hasExamples,
+    licenses,
+    prereleases,
+}) => {
     const [initialized, setInitialized] = useState(false)
     const [isError, setIsError] = useState(true)
     const [meta, setMeta] = useState({})
     const { type, genre, size } = getModelComponents(name)
-    const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility])
+    const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
+        name,
+        compatibility,
+        prereleases,
+    ])
 
     useEffect(() => {
         window.dispatchEvent(new Event('resize')) // scroll position for progress
@@ -332,7 +351,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
 const Models = ({ pageContext, repo, children }) => {
     const [initialized, setInitialized] = useState(false)
     const [compatibility, setCompatibility] = useState({})
-    const { id, title, meta, hasExamples } = pageContext
+    const { id, title, meta } = pageContext
     const { models, isStarters } = meta
     const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
 
@@ -381,6 +400,7 @@ const Models = ({ pageContext, repo, children }) => {
                             repo={repo}
                             licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
                             hasExamples={meta.hasExamples}
+                            prereleases={site.siteMetadata.nightly}
                         />
                     ))
                 }
@@ -397,6 +417,7 @@ const query = graphql`
     query ModelsQuery {
         site {
             siteMetadata {
+                nightly
                 licenses {
                     id
                     url

From 2cfe9340a1727acf9fcfd23a6ac0c0f2c0215010 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 13:21:20 +0200
Subject: [PATCH 15/16] Link model components [ci skip]

---
 website/src/templates/models.js | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index f67188c0b..8a73a6282 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -11,12 +11,23 @@ import { Table, Tr, Td, Th } from '../components/table'
 import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
-import Link from '../components/link'
+import Link, { OptionalLink } from '../components/link'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
 import { isString, isEmptyObj } from '../components/util'
 
+const COMPONENT_LINKS = {
+    tok2vec: '/api/tok2vec',
+    transformer: '/api/transformer',
+    tagger: '/api/tagger',
+    parser: '/api/dependencyparser',
+    ner: '/api/entityrecognizer',
+    lemmatizer: '/api/lemmatizer',
+    attribute_ruler: '/api/attributeruler',
+    senter: '/api/sentencerecognizer',
+}
+
 const MODEL_META = {
     core: 'Vocabulary, syntax, entities, vectors',
     core_sm: 'Vocabulary, syntax, entities',
@@ -146,6 +157,18 @@ function formatSources(data = []) {
     ))
 }
 
+function linkComponents(components = []) { // default [] covers a missing meta.pipeline / meta.components
+    return join(
+        components.map(c => (
+            <Fragment key={c}>
+                <OptionalLink to={COMPONENT_LINKS[c]} hideIcon>
+                    <InlineCode>{c}</InlineCode>
+                </OptionalLink>
+            </Fragment>
+        ))
+    )
+}
+
 const Help = ({ children }) => (
     <span data-tooltip={children}>
         <Icon name="help2" width={16} variant="subtle" inline />
@@ -192,10 +215,8 @@ const Model = ({
 
     const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
     const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
-    const pipeline =
-        meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>))
-    const components =
-        meta.components && join(meta.components.map(p => <InlineCode key={p}>{p}</InlineCode>))
+    const pipeline = linkComponents(meta.pipeline)
+    const components = linkComponents(meta.components)
     const sources = formatSources(meta.sources)
     const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link>
     const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null

From 02a1b6ab839f4a07c3cb1fb727c847f58a1c44f9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 13:21:43 +0200
Subject: [PATCH 16/16] Update links [ci skip]

---
 website/src/templates/models.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 8a73a6282..f9895334d 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -26,6 +26,7 @@ const COMPONENT_LINKS = {
     lemmatizer: '/api/lemmatizer',
     attribute_ruler: '/api/attributeruler',
     senter: '/api/sentencerecognizer',
+    morphologizer: '/api/morphologizer',
 }
 
 const MODEL_META = {