From 8b650f3a786094833cccd8686ab4d6d73330565c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Sep 2020 21:10:41 +0200 Subject: [PATCH 01/14] Modify setting missing and blocked entity tokens In order to make it easier to construct `Doc` objects as training data, modify how missing and blocked entity tokens are set to prioritize setting `O` and missing entity tokens for training purposes over setting blocked entity tokens. * `Doc.ents` setter sets tokens outside entity spans to `O` regardless of the current state of each token * For `Doc.ents`, setting a span with a missing label sets the `ent_iob` to missing instead of blocked * `Doc.block_ents(spans)` marks spans as hard `O` for use with the `EntityRecognizer` --- spacy/tests/doc/test_doc_api.py | 18 ++++++++++++++++-- spacy/tests/parser/test_ner.py | 4 ++-- spacy/tokens/doc.pyx | 25 +++++++++++++++++++------ spacy/training/example.pyx | 4 +--- spacy/training/iob_utils.py | 12 ++++-------- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..53c309ba5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer): assert len(tokens.ents) == 0 tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)] assert len(list(tokens.ents)) == 1 - assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] + assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2] assert tokens.ents[0].label_ == "PRODUCT" assert tokens.ents[0].start == 2 assert tokens.ents[0].end == 4 @@ -426,7 +426,7 @@ def test_has_annotation(en_vocab): doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] - doc.ents = [Span(doc, 0, 1, label="HELLO")] + doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")] for attr in attrs: assert doc.has_annotation(attr) @@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_nered with pytest.deprecated_call(): doc.is_sentenced + + +def test_block_ents(en_tokenizer): + doc = en_tokenizer("a b c d e") + doc.block_ents([doc[1:2], doc[3:5]]) + assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] + assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] + assert doc.ents == tuple() + + # invalid IOB repaired + doc.ents = [Span(doc, 3, 5, "ENT")] + assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] + doc.block_ents([doc[3:4]]) + assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 548cd2697..b8fdf15f9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -168,7 +168,7 @@ def test_accept_blocked_token(): ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity - doc2.ents = [(0, 3, 5)] + doc2.block_ents([doc2[3:5]]) assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.ents = [(0, self.start, self.end)] + doc.block_ents([doc[self.start:self.end]]) return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..1bae84508 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -590,17 +590,16 @@ cdef class Doc: entity_type = 0 kb_id = 0 - # Set ent_iob to Missing (0) by default unless this token was nered before - ent_iob = 0 - if self.c[i].ent_iob != 0: - ent_iob = 2 + # Set ent_iob to Outside (2) 
by default + ent_iob = 2 # overwrite if the token was part of a specified entity if i in tokens_in_ents.keys(): ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] if entity_type is None or entity_type <= 0: - # Blocking this token from being overwritten by downstream NER - ent_iob = 3 + # Empty label: Missing, unset this token + ent_iob = 0 + entity_type = 0 elif ent_start == i: # Marking the start of an entity ent_iob = 3 @@ -612,6 +611,20 @@ cdef class Doc: self.c[i].ent_kb_id = kb_id self.c[i].ent_iob = ent_iob + def block_ents(self, spans): + """Mark spans as never an entity for the EntityRecognizer. + + spans (List[Span]): The spans to block as never entities. + """ + for span in spans: + for i in range(span.start, span.end): + self.c[i].ent_iob = 3 + self.c[i].ent_type = 0 + # if the following token is I, set to B + if span.end < self.length: + if self.c[span.end].ent_iob == 1: + self.c[span.end].ent_iob = 3 + @property def noun_chunks(self): """Iterate over the base noun phrases in the document. Yields base diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 3344704bf..d396a2040 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -172,7 +172,7 @@ cdef class Example: return output def get_aligned_ner(self): - if not self.y.is_nered: + if not self.y.has_annotation("ENT_IOB"): return [None] * len(self.x) # should this be 'missing' instead of 'None' ? x_ents = self.get_aligned_spans_y2x(self.y.ents) # Default to 'None' for missing values @@ -303,9 +303,7 @@ def _add_entities_to_doc(doc, ner_data): spans_from_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], Span): - # Ugh, this is super messy. Really hard to set O entities doc.ents = ner_data - doc.ents = [span for span in ner_data if span.label_] else: raise ValueError(Errors.E973) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index ceb5e16b8..33a4733ca 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -182,22 +182,18 @@ def tags_to_entities(tags): entities = [] start = None for i, tag in enumerate(tags): - if tag is None: - continue - if tag.startswith("O"): + if tag is None or tag.startswith("-"): # TODO: We shouldn't be getting these malformed inputs. Fix this. if start is not None: start = None else: entities.append(("", i, i)) - continue - elif tag == "-": - continue + elif tag.startswith("O"): + pass elif tag.startswith("I"): if start is None: raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) - continue - if tag.startswith("U"): + elif tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i From 177df15d89da7eccc1603c33b847a12c43a56e0c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 15:54:05 +0200 Subject: [PATCH 02/14] Implement Doc.set_ents --- spacy/errors.py | 9 +++ spacy/tests/doc/test_doc_api.py | 63 +++++++++++++++-- spacy/tests/parser/test_ner.py | 4 +- spacy/tokens/doc.pyx | 122 +++++++++++++++++++++++++++++--- spacy/training/example.pyx | 10 ++- spacy/training/iob_utils.py | 5 +- 6 files changed, 192 insertions(+), 21 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 81e3616be..a21ff5476 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -682,6 +682,15 @@ class Errors: E1009 = ("String for hash '{val}' not found in StringStore. 
Set the value " "through token.morph_ instead or add the string to the " "StringStore with `nlp.vocab.strings.add(string)`.") + E1010 = ("Unable to set entity information for token {i} which is included " + "in more than one span in entities, blocked, missing or outside.") + E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " + "options: {modes}") + E1012 = ("Spans provided to doc.set_ents must be provided as a list of " + "`Span` objects.") + E1013 = ("Unable to set entity for span with empty label. Entity spans are " + "required to have a label. To set entity information as missing " + "or blocked, use the keyword arguments with doc.set_ents.") @add_codes diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index a08efe9d7..7339a9aef 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -425,7 +425,7 @@ def test_has_annotation(en_vocab): doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] - doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")] + doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing") for attr in attrs: assert doc.has_annotation(attr) @@ -455,15 +455,68 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_sentenced -def test_block_ents(en_tokenizer): +def test_set_ents(en_tokenizer): + # set ents doc = en_tokenizer("a b c d e") - doc.block_ents([doc[1:2], doc[3:5]]) + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) + assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # add ents, invalid IOB repaired + doc = en_tokenizer("a b c d e") + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) + doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified") + assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2] + assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0] + + # missing ents + doc = en_tokenizer("a b c d e") + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]]) + assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # outside ents + doc = en_tokenizer("a b c d e") + doc.set_ents( + [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], + outside=[doc[4:5]], + default="missing", + ) + assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # blocked ents + doc = en_tokenizer("a b c d e") + doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified") assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] assert doc.ents == tuple() - # invalid IOB repaired + # invalid IOB repaired after blocked doc.ents = [Span(doc, 3, 5, "ENT")] assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] - doc.block_ents([doc[3:4]]) + doc.set_ents([], blocked=[doc[3:4]], default="unmodified") assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] + + # all types + doc = en_tokenizer("a b c d e") + doc.set_ents( + [Span(doc, 0, 1, 10)], + blocked=[doc[1:2]], + missing=[doc[2:3]], + outside=[doc[3:4]], + default="unmodified", + ) + assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0] + assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0] + + doc = en_tokenizer("a b c d e") + # single span instead of a list + with pytest.raises(ValueError): + doc.set_ents([], missing=doc[1:2]) + # invalid default mode + with pytest.raises(ValueError): + doc.set_ents([], missing=[doc[1:2]], default="none") + # conflicting/overlapping specifications + with 
pytest.raises(ValueError): + doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]]) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b8fdf15f9..cd5581769 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -168,7 +168,7 @@ def test_accept_blocked_token(): ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity - doc2.block_ents([doc2[3:5]]) + doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified") assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.block_ents([doc[self.start:self.end]]) + doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cc621b443..be99bacf3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -7,6 +7,7 @@ from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter +from enum import Enum import numpy import srsly from thinc.api import get_array_module @@ -86,6 +87,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) return get_token_attr(token, feat_name) +class SetEntsDefault(str, Enum): + blocked = "blocked" + missing = "missing" + outside = "outside" + unmodified = "unmodified" + + @classmethod + def values(cls): + return list(cls.__members__.keys()) + + cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary @@ -597,9 +609,9 @@ cdef class Doc: if i in tokens_in_ents.keys(): ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] if entity_type is None or entity_type <= 0: - # Empty label: Missing, unset this token - ent_iob = 0 - entity_type = 0 + # Only allow labelled spans + print(i, ent_start, ent_end, entity_type) + raise ValueError(Errors.E1013) elif ent_start == i: # Marking the start of an entity ent_iob = 3 @@ -611,19 +623,107 @@ cdef class Doc: self.c[i].ent_kb_id = kb_id self.c[i].ent_iob = ent_iob - def block_ents(self, spans): - """Mark spans as never an entity for the EntityRecognizer. + def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): + """Set entity annotation. - spans (List[Span]): The spans to block as never entities. + entities (List[Span]): Spans with labels to set as entities. + blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an + entity) for spacy's built-in NER component. Other components may + ignore this setting. + missing (Optional[List[Span]]): Spans with missing/unknown entity + information. + outside (Optional[List[Span]]): Spans outside of entities (O in IOB). + default (str): How to set entity annotation for tokens outside of any + provided spans. Options: "blocked", "missing", "outside" and + "unmodified" (preserve current state). Defaults to "outside". 
""" - for span in spans: + if default not in SetEntsDefault.values(): + raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) + + if blocked is None: + blocked = tuple() + if missing is None: + missing = tuple() + if outside is None: + outside = tuple() + + # Find all tokens covered by spans and check that none are overlapping + seen_tokens = set() + for span in entities: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in blocked: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in missing: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in outside: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + + # Set all specified entity information + for span in entities: + for i in range(span.start, span.end): + if not span.label: + raise ValueError(Errors.E1013) + if i == span.start: + self.c[i].ent_iob = 3 + else: + self.c[i].ent_iob = 1 + self.c[i].ent_type = span.label + for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 self.c[i].ent_type = 0 - # if the following token is I, set to B - if span.end < self.length: - if self.c[span.end].ent_iob == 1: - self.c[span.end].ent_iob = 3 + for span in missing: + for i in range(span.start, span.end): + self.c[i].ent_iob = 0 + self.c[i].ent_type = 0 + for span in outside: + for i in range(span.start, span.end): + self.c[i].ent_iob = 2 + self.c[i].ent_type = 0 + + # Set tokens outside of all provided spans + if default != SetEntsDefault.unmodified: + for i in range(self.length): + if i not in seen_tokens: + self.c[i].ent_type = 0 + if default == SetEntsDefault.outside: + self.c[i].ent_iob = 2 + elif default == SetEntsDefault.missing: + self.c[i].ent_iob = 0 + elif default == SetEntsDefault.blocked: + self.c[i].ent_iob = 3 + + # Fix any resulting inconsistent annotation + for i in range(self.length - 1): + # I must follow B or I: convert I to B + if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \ + self.c[i+1].ent_iob == 1: + self.c[i+1].ent_iob = 3 + # Change of type with BI or II: convert second I to B + if self.c[i].ent_type != self.c[i+1].ent_type and \ + (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \ + self.c[i+1].ent_iob == 1: + self.c[i+1].ent_iob = 3 @property def noun_chunks(self): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index d396a2040..82d8b6fce 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot): def _add_entities_to_doc(doc, ner_data): + print(ner_data) if ner_data is None: return elif ner_data == []: @@ -303,7 +304,14 @@ def _add_entities_to_doc(doc, ner_data): spans_from_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], Span): - doc.ents = ner_data + entities = [] + missing = [] + for span in ner_data: + if span.label: + entities.append(span) + else: + 
missing.append(span) + doc.set_ents(entities, missing=missing) else: raise ValueError(Errors.E973) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 33a4733ca..b435c8ecb 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -149,9 +149,10 @@ def spans_from_biluo_tags(doc, tags): doc (Doc): The document that the BILUO tags refer to. entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or + token. Each tag string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of Span objects. + RETURNS (list): A sequence of Span objects. Each token with a missing IOB + tag is returned as a Span with an empty label. """ token_offsets = tags_to_entities(tags) spans = [] From b1a7d6c528e08c4a80594ae6338cacb22bf8b5b1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 14:42:51 +0200 Subject: [PATCH 03/14] Refactor seen token detection --- spacy/errors.py | 4 ++-- spacy/tokens/doc.pyx | 24 ++---------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index de4ffde3c..27091810d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -690,8 +690,8 @@ class Errors: "in more than one span in entities, blocked, missing or outside.") E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " "options: {modes}") - E1012 = ("Spans provided to doc.set_ents must be provided as a list of " - "`Span` objects.") + E1012 = ("Entity spans and blocked/missing/outside spans should be " + "provided to doc.set_ents as lists of `Span` objects.") E1013 = ("Unable to set entity for span with empty label. Entity spans are " "required to have a label. 
To set entity information as missing " "or blocked, use the keyword arguments with doc.set_ents.") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 34742e587..4bf6f0e5e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -8,6 +8,7 @@ from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter from enum import Enum +import itertools import numpy import srsly from thinc.api import get_array_module @@ -742,28 +743,7 @@ cdef class Doc: # Find all tokens covered by spans and check that none are overlapping seen_tokens = set() - for span in entities: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in blocked: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in missing: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in outside: + for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): if not isinstance(span, Span): raise ValueError(Errors.E1012.format(span=span)) for i in range(span.start, span.end): From 8eaacaae97f0caf77576e843a8d6bcf866c79236 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 12:36:51 +0200 Subject: [PATCH 04/14] Refactor Doc.ents setter to use Doc.set_ents Additional changes: * Entity spans with missing labels are ignored * Fix ent_kb_id setting in `Doc.set_ents` --- spacy/tests/doc/test_add_entities.py | 4 +-- spacy/tests/doc/test_doc_api.py | 2 +- spacy/tokens/doc.pyx | 50 ++++++---------------------- 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 40aff8e31..615ab9e5b 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab): ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] + doc.ents = [("ANIMAL", 3, 4)] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] - doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] + doc.ents = [("WORD", 0, 2)] assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 892b65cf4..e5e72fe2a 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -534,4 +534,4 @@ def test_doc_ents_setter(): vocab = Vocab() ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] doc = Doc(vocab, words=words, ents=ents) - assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] \ No newline at end of file + assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4bf6f0e5e..670c7440f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -673,49 +673,16 @@ cdef class Doc: # TODO: # 1. Test basic data-driven ORTH gazetteer # 2. 
Test more nuanced date and currency regex - tokens_in_ents = {} - cdef attr_t entity_type - cdef attr_t kb_id - cdef int ent_start, ent_end, token_index + cdef attr_t entity_type, kb_id + cdef int ent_start, ent_end + ent_spans = [] for ent_info in ents: entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info) if isinstance(entity_type_, str): self.vocab.strings.add(entity_type_) - entity_type = self.vocab.strings.as_int(entity_type_) - for token_index in range(ent_start, ent_end): - if token_index in tokens_in_ents: - raise ValueError(Errors.E103.format( - span1=(tokens_in_ents[token_index][0], - tokens_in_ents[token_index][1], - self.vocab.strings[tokens_in_ents[token_index][2]]), - span2=(ent_start, ent_end, self.vocab.strings[entity_type]))) - tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id) - cdef int i - for i in range(self.length): - # default values - entity_type = 0 - kb_id = 0 - - # Set ent_iob to Outside (2) by default - ent_iob = 2 - - # overwrite if the token was part of a specified entity - if i in tokens_in_ents.keys(): - ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] - if entity_type is None or entity_type <= 0: - # Only allow labelled spans - print(i, ent_start, ent_end, entity_type) - raise ValueError(Errors.E1013) - elif ent_start == i: - # Marking the start of an entity - ent_iob = 3 - else: - # Marking the inside of an entity - ent_iob = 1 - - self.c[i].ent_type = entity_type - self.c[i].ent_kb_id = kb_id - self.c[i].ent_iob = ent_iob + span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id) + ent_spans.append(span) + self.set_ents(ent_spans, default=SetEntsDefault.outside) def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): """Set entity annotation. 
@@ -734,6 +701,9 @@ cdef class Doc: if default not in SetEntsDefault.values(): raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) + # Ignore spans with missing labels + entities = [ent for ent in entities if ent.label > 0] + if blocked is None: blocked = tuple() if missing is None: @@ -742,6 +712,7 @@ cdef class Doc: outside = tuple() # Find all tokens covered by spans and check that none are overlapping + cdef int i seen_tokens = set() for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): if not isinstance(span, Span): @@ -761,6 +732,7 @@ cdef class Doc: else: self.c[i].ent_iob = 1 self.c[i].ent_type = span.label + self.c[i].ent_kb_id = span.kb_id for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 From be56c0994b09a8ba5042eb563d05ea5bb7f75a6d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:40:25 +0200 Subject: [PATCH 05/14] Add [training.before_to_disk] callback --- spacy/cli/train.py | 18 ++++++++++++++++++ spacy/default_config.cfg | 2 ++ spacy/errors.py | 3 +++ spacy/schemas.py | 1 + 4 files changed, 24 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index eabc82be0..6d61c2425 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -97,6 +97,7 @@ def train( dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] + before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T_cfg["frozen_components"] # Sourced components that require resume_training @@ -167,6 +168,7 @@ def train( with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) progress.set_description(f"Epoch {info['epoch']}") @@ -179,6 +181,7 @@ def train( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}" ) + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-final") raise e finally: @@ -233,6 +236,21 @@ def create_evaluation_callback( return evaluate +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk + + def train_while_improving( nlp: Language, optimizer: Optimizer, diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 5cd97a0eb..6f8c0aa00 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -72,6 +72,8 @@ frozen_components = [] dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined train_corpus = "corpora.train" +# Optional callback before nlp object is saved to disk after training +before_to_disk = null [training.logger] @loggers = "spacy.ConsoleLogger.v1" diff --git a/spacy/errors.py b/spacy/errors.py index dce5cf51c..d67f01a1d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,9 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E914 = ("Executing {name} callback failed. 
Expected the function to " + "returnthe nlp object but got: {value}. Maybe you forgot to return " + "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. To exclude the score from the " "final score, set its weight to null in the [training.score_weights] " diff --git a/spacy/schemas.py b/spacy/schemas.py index e34841008..6a9a82d06 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -217,6 +217,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on class Config: From 138c8d45dbd1372fafe6b280fdedf33790d20d32 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:43:39 +0200 Subject: [PATCH 06/14] Update docs --- website/docs/api/data-formats.md | 45 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e3b3900be..6f156fe37 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. 
The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. 
Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -275,8 +276,8 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function -> can help you convert entity offsets to the right format. +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can +> help you convert entity offsets to the right format. ```python ### Example structure From 1c63f02f99d6c3d663c4a9cfb0e3395986bd7598 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 12:51:16 +0200 Subject: [PATCH 07/14] Add API docs --- website/docs/api/doc.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 7175f6e7f..e10d9d077 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -219,6 +219,30 @@ alignment mode `"strict". | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +## Doc.set_ents {#ents tag="method" new="3"} + +Set the named entities in the document. + +> #### Example +> +> ```python +> from spacy.tokens import Span +> doc = nlp("Mr. Best flew to New York on Saturday morning.") +> doc.set_ents([Span(doc, 0, 2, "PERSON")]) +> ents = list(doc.ents) +> assert ents[0].label_ == "PERSON" +> assert ents[0].text == "Mr. Best" +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| entities | Spans with labels to set as entities. ~~List[Span]~~ | +| _keyword-only_ | | +| blocked | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ | +| missing | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ | +| outside | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ | +| default | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ | + ## Doc.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied. 
> ```python > doc = nlp("Mr. Best flew to New York on Saturday morning.") > ents = list(doc.ents) -> assert ents[0].label == 346 > assert ents[0].label_ == "PERSON" > assert ents[0].text == "Mr. Best" > ``` From 5c13e0cf1bdf536c54660340e71742bf0493ea07 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 13:41:55 +0200 Subject: [PATCH 08/14] Remove unused error --- spacy/errors.py | 3 --- spacy/tokens/doc.pyx | 2 -- 2 files changed, 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 27091810d..998e57f27 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -692,9 +692,6 @@ class Errors: "options: {modes}") E1012 = ("Entity spans and blocked/missing/outside spans should be " "provided to doc.set_ents as lists of `Span` objects.") - E1013 = ("Unable to set entity for span with empty label. Entity spans are " - "required to have a label. To set entity information as missing " - "or blocked, use the keyword arguments with doc.set_ents.") @add_codes diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 670c7440f..b4027f87e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -725,8 +725,6 @@ cdef class Doc: # Set all specified entity information for span in entities: for i in range(span.start, span.end): - if not span.label: - raise ValueError(Errors.E1013) if i == span.start: self.c[i].ent_iob = 3 else: From 92f8b6959a359ff4495205df42f9e86c30aeb8f6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 13:48:41 +0200 Subject: [PATCH 09/14] Fix typo --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index d67f01a1d..708b7fda8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -481,7 +481,7 @@ class Errors: # TODO: fix numbering after merging develop into master E914 = ("Executing {name} callback failed. Expected the function to " - "returnthe nlp object but got: {value}. Maybe you forgot to return " + "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. 
To exclude the score from the " From 88e54caa1275481a43b1069c8ec6d352f554e333 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:32:35 +0200 Subject: [PATCH 10/14] accuracy -> performance --- spacy/cli/info.py | 4 +++- spacy/schemas.py | 3 +-- website/docs/api/data-formats.md | 2 +- website/src/templates/models.js | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 2b87163c2..2f2515278 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -91,7 +91,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]: meta["source"] = str(model_path.resolve()) else: meta["source"] = str(model_path) - return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")} + return { + k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed") + } def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str: diff --git a/spacy/schemas.py b/spacy/schemas.py index e34841008..1ff73bccc 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -182,8 +182,7 @@ class ModelMetaSchema(BaseModel): sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") - accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") - speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") + performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e3b3900be..34565f160 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -518,7 +518,7 @@ source of truth** used for loading a pipeline. > "ner": ["PERSON", "ORG", "PRODUCT"], > "textcat": ["POSITIVE", "NEGATIVE"] > }, -> "accuracy": { +> "performance": { > "ents_f": 82.7300930714, > "ents_p": 82.135523614, > "ents_r": 83.3333333333, diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 5d705048b..413f23dc5 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -117,7 +117,7 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - accuracy: formatAccuracy(data.accuracy), + accuracy: formatAccuracy(data.performance), } } From 3b58a8be2b32b29a4a121bf0ed75ae3cd2920ee9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:32:42 +0200 Subject: [PATCH 11/14] Update docs --- website/docs/api/data-formats.md | 4 ++-- website/docs/usage/_benchmarks-models.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 34565f160..0fc3481a4 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -275,8 +275,8 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. 
The
-> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
-> can help you convert entity offsets to the right format.
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
+> help you convert entity offsets to the right format.
 
 ```python
 ### Example structure
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 5b193d3a4..88e79112f 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -7,7 +7,7 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 | Pipeline                                                    | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br />GPU <Help>words per second on GPU, higher is better</Help> |
 | ----------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
 | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                      |                                                                  6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |        |        |      |                                                                      |                                                                     |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |   92.1 |   97.4 | 87.0 |                                                                   7k |                                                                     |
 | `en_core_web_lg` (spaCy v2)                                 |   91.9 |   97.2 | 85.9 |                                                                  10k |                                                                     |

From 24e7ac3f2bbdab6a1e124c2770c7545cd08906c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:43:56 +0200 Subject: [PATCH 12/14] Fix download CLI [ci skip] --- spacy/cli/download.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 036aeab17..0e7ec2ea5 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -88,7 +88,6 @@ def get_compatibility() -> dict: def get_version(model: str, comp: dict) -> str: - model = get_base_version(model) if model not in comp: msg.fail( f"No compatible package found for '{model}' (spaCy v{about.__version__})", From 3f751e68f596d1c186e0baa125a6cba1ff6a7995 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:45:41 +0200 Subject: [PATCH 13/14] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8d019897b..56b05257a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a23" +__version__ = "3.0.0a24" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 6bc5058d137daa28184c0494f9380b7832770c59 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:53:34 +0200 Subject: [PATCH 14/14] Update models directory [ci skip] --- website/src/templates/models.js | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 413f23dc5..cdfe2e46d 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -32,11 +32,17 @@ const MODEL_META = { las: 'Labelled dependencies', token_acc: 'Tokenization', tok: 'Tokenization', + lemma: 'Statistical lemmatization', + morph: 'Morphological analysis', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)', + pos: 'Part-of-speech tags (coarse grained tags, Token.pos)', ents_f: 'Named entities (F-score)', ents_p: 'Named entities (precision)', ents_r: 'Named entities (recall)', + ner_f: 'Named entities (F-score)', + ner_p: 'Named entities (precision)', + ner_r: 'Named entities (recall)', sent_f: 'Sentence segmentation (F-score)', sent_p: 'Sentence segmentation (precision)', sent_r: 'Sentence segmentation (recall)', @@ -88,11 +94,12 @@ function formatVectors(data) { } function formatAccuracy(data) { + const exclude = ['speed'] if (!data) return [] return Object.keys(data) .map(label => { const value = data[label] - return isNaN(value) + return isNaN(value) || exclude.includes(label) ? null : { label, @@ -109,6 +116,7 @@ function formatModelMeta(data) { version: data.version, sizeFull: data.size, pipeline: data.pipeline, + components: data.components, notes: data.notes, description: data.description, sources: data.sources, @@ -117,7 +125,8 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - accuracy: formatAccuracy(data.performance), + // TODO: remove accuracy fallback + accuracy: formatAccuracy(data.accuracy || data.performance), } }
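
For anyone trying the series out, below is a minimal usage sketch of the `Doc.set_ents` API implemented in patches 02–04, run against the spacy-nightly build this series produces (v3.0.0a24). The blank pipeline and the exact IOB assertions are assumptions derived from the semantics and tests above, not part of the patches themselves.

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Mr. Best flew to New York on Saturday morning.")

# Entity spans passed to set_ents must carry a label; tokens outside
# the provided spans default to outside ("O").
doc.set_ents([Span(doc, 0, 2, "PERSON")])
assert doc.ents[0].text == "Mr. Best"
assert [t.ent_iob_ for t in doc[:3]] == ["B", "I", "O"]

# Block "New York" for the statistical NER component without touching
# the PERSON annotation set above.
doc.set_ents([], blocked=[doc[4:6]], default="unmodified")
assert [t.ent_iob_ for t in doc[4:6]] == ["B", "B"]
assert [t.ent_type_ for t in doc[4:6]] == ["", ""]

# Mark everything that is neither an entity nor blocked as missing
# (unknown) rather than "O".
doc.set_ents([Span(doc, 0, 2, "PERSON")], blocked=[doc[4:6]], default="missing")
assert doc[2].ent_iob_ == ""  # missing, not "O"
```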
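
Likewise, a sketch of wiring up the `[training.before_to_disk]` callback added in patch 05. The registry entry name `strip_description.v1` and the meta tweak are hypothetical; the only contract the patch enforces is that the callback receives the `nlp` object and returns an `nlp` object.

```python
import spacy
from spacy.language import Language

# In config.cfg (illustrative):
#
#   [training.before_to_disk]
#   @misc = "strip_description.v1"

@spacy.registry.misc("strip_description.v1")
def make_before_to_disk():
    def before_to_disk(nlp: Language) -> Language:
        # Reset a throwaway meta field right before "model-best" or
        # "model-final" is saved; returning a non-Language value
        # triggers the new E914 error.
        nlp.meta["description"] = ""
        return nlp
    return before_to_disk
```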