From f420aa1138f52c732102b6ad00825bab797792ec Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:30:09 +0200 Subject: [PATCH 01/10] use e.value to get to the ExceptionInfo value --- spacy/tests/test_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index fba362b76..2a24d368a 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -298,4 +298,4 @@ def test_language_init_invalid_vocab(value): err_fragment = "invalid value" with pytest.raises(ValueError) as e: Language(value) - assert err_fragment in str(e) + assert err_fragment in str(e.value) From 87c329c7114767d8788090a3838fce0bf36822b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 17:37:29 +0200 Subject: [PATCH 02/10] Set rule-based lemmatizers as default (#6076) For languages without provided models and with lemmatizer rules in `spacy-lookups-data`, make the rule-based lemmatizer the default: Bengali, Persian, Norwegian, Swedish --- spacy/lang/bn/__init__.py | 22 ++++++++++++++++++++++ spacy/lang/fa/__init__.py | 22 ++++++++++++++++++++++ spacy/lang/nb/__init__.py | 22 ++++++++++++++++++++++ spacy/lang/sv/__init__.py | 23 +++++++++++++++++++++++ spacy/tests/lang/test_lemmatizers.py | 2 +- 5 files changed, 90 insertions(+), 1 deletion(-) diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 6c1d66cba..270185a4b 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,7 +1,11 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class BengaliDefaults(Language.Defaults): @@ -17,4 +21,22 @@ class Bengali(Language): Defaults = BengaliDefaults +@Bengali.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Bengali"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 7fdb9d065..244534120 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,9 +1,13 @@ +from typing import Optional +from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class PersianDefaults(Language.Defaults): @@ -20,4 +24,22 @@ class Persian(Language): Defaults = PersianDefaults +@Persian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, 
model, name, mode=mode, lookups=lookups) + + __all__ = ["Persian"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index d2bb92072..28a2f0bf2 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,9 +1,13 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class NorwegianDefaults(Language.Defaults): @@ -20,4 +24,22 @@ class Norwegian(Language): Defaults = NorwegianDefaults +@Norwegian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Norwegian"] diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 0c6a1b9f4..6db74cd39 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,8 +1,13 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer + # Punctuation stolen from Danish from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES @@ -22,4 +27,22 @@ class Swedish(Language): Defaults = SwedishDefaults +@Swedish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Swedish"] diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index 14c59659a..6e7f82341 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -8,7 +8,7 @@ from spacy.util import get_lang_class # Only include languages with no external dependencies # excluded: ru, uk # excluded for custom tables: pl -LANGUAGES = ["el", "en", "fr", "nl"] +LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on From d722a439aa3bef5d4b4fa677aa6b427f7186a673 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 17:39:41 +0200 Subject: [PATCH 03/10] Remove unneeded methods in senter and morphologizer (#6074) Now that the tagger doesn't manage the tag map, the child classes senter and morphologizer don't need to override the serialization methods. 
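To illustrate the pattern this change relies on, here is a minimal, self-contained sketch (hypothetical stand-in classes, not the actual spaCy implementations): once the parent class serializes every field the child needs, the child can simply inherit the serialization methods instead of overriding them.

    import srsly

    class Tagger:
        # stand-in for the real Tagger: one shared serialization implementation
        def __init__(self, cfg=None):
            self.cfg = cfg or {}

        def to_bytes(self):
            # the real method also serializes the model and vocab
            return srsly.msgpack_dumps(self.cfg)

    class SentenceRecognizer(Tagger):
        # no to_bytes/from_bytes/to_disk/from_disk overrides needed any more:
        # the inherited Tagger implementation already covers the same fields
        pass

    assert SentenceRecognizer({"overwrite": False}).to_bytes() == Tagger({"overwrite": False}).to_bytes()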
--- spacy/pipeline/morphologizer.pyx | 76 -------------------------------- spacy/pipeline/senter.pyx | 76 -------------------------------- 2 files changed, 152 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 57bdb28d7..0e0791004 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -259,79 +259,3 @@ class Morphologizer(Tagger): results.update(Scorer.score_token_attr_per_feat(examples, "morph", **kwargs)) return results - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - - DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Morphologizer): The loaded Morphologizer. - - DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/morphologizer#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Morphologizer): The modified Morphologizer object. - - DOCS: https://nightly.spacy.io/api/morphologizer#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 00664131b..a7eb721fd 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger): results = Scorer.score_spans(examples, "sents", **kwargs) del results["sents_per_type"] return results - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. 
- - DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The loaded SentenceRecognizer. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The modified SentenceRecognizer object. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self From f3db3f6fe00455f69bf05135f941ba88d307738b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 17:45:04 +0200 Subject: [PATCH 04/10] Add vectors option to CharacterEmbed (#6069) * Add vectors option to CharacterEmbed * Update spacy/pipeline/morphologizer.pyx * Adjust default morphologizer config Co-authored-by: Matthew Honnibal --- spacy/ml/models/tok2vec.py | 39 +++++++++++++++++++++++--------- spacy/pipeline/morphologizer.pyx | 1 + spacy/tests/test_tok2vec.py | 4 ++-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 2e5f8a802..7ced4bd04 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -164,7 +164,7 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(width: int, rows: int, nM: int, nC: int): +def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool): """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is @@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): nC (int): The number of UTF-8 bytes to embed per word. 
Recommended values are between 3 and 8, although it may depend on the length of words in the language. + also_use_static_vectors (bool): Whether to also use static word vectors. + Requires a vectors table to be loaded in the Doc objects' vocab. """ - model = chain( - concatenate( - chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), - chain( - FeatureExtractor([NORM]), - list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + if also_use_static_vectors: + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), + StaticVectors(width, dropout=0.0), ), - ), - with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), - ragged2list(), + with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)), + ragged2list(), + ) + else: + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), + ), + with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + ragged2list(), ) return model diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 0e0791004..bb68a358c 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -32,6 +32,7 @@ width = 128 rows = 7000 nM = 64 nC = 8 +also_use_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index fb30c6ae5..f3f35e4a7 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): [ (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on From d31afc833485fb6fd347fd41d94a4050a69dfa96 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 16 Sep 2020 17:49:48 +0200 Subject: [PATCH 05/10] Fix Language.link_components when model is None --- spacy/language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 8f7cb1973..4c0a6d7e6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1450,8 +1450,8 @@ class Language: """ for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): - for name2, proc2 in self.pipeline[i:]: - if hasattr(proc2, "model"): + for name2, proc2 in self.pipeline[i+1:]: + if isinstance(getattr(proc2, "model", None), Model): 
proc1.find_listeners(proc2.model) @classmethod From 4a573d18b3a818d3f9de3115d5376bf564337ba5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 16 Sep 2020 17:51:29 +0200 Subject: [PATCH 06/10] Add comment --- spacy/language.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 4c0a6d7e6..3f0f850c2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1448,6 +1448,11 @@ class Language: """Register 'listeners' within pipeline components, to allow them to effectively share weights. """ + # I had though, "Why do we do this inside the Language object? Shouldn't + # it be the tok2vec/transformer/etc's job? + # The problem is we need to do it during deserialization...And the + # components don't receive the pipeline then. So this does have to be + # here :( for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): for name2, proc2 in self.pipeline[i+1:]: From c776594ab1a27f51ddb6e5ea1ea815f515ad5213 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 16 Sep 2020 18:15:14 +0200 Subject: [PATCH 07/10] Fix --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 3f0f850c2..d530e6b92 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,7 +8,7 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path import warnings -from thinc.api import get_current_ops, Config, require_gpu, Optimizer +from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle From a119667a36cced2ae5db6333e1539eb407fff70d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 20:32:38 +0200 Subject: [PATCH 08/10] Clean up spacy.tokens (#6046) * Clean up spacy.tokens * Update `set_children_from_heads`: * Don't check `dep` when setting lr_* or sentence starts * Set all non-sentence starts to `False` * Use `set_children_from_heads` in `Token.head` setter * Reduce similar/duplicate code (admittedly adds a bit of overhead) * Update sentence starts consistently * Remove unused `Doc.set_parse` * Minor changes: * Declare cython variables (to avoid cython warnings) * Clean up imports * Modify set_children_from_heads to set token range Modify `set_children_from_heads` so that it adjust tokens within a specified range rather then the whole document. Modify the `Token.head` setter to adjust only the tokens affected by the new head assignment. 
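To make the new behaviour concrete, a minimal sketch (this assumes a trained English pipeline such as `en_core_web_sm` is installed and that it splits the text below into two sentences):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("This is one sentence. This is another sentence.")
    first, second = list(doc.sents)
    print(doc[second.start].is_sent_start)   # True: two separate trees
    # Reattach the root of the second sentence to the root of the first.
    # Only the affected token range is re-annotated via set_children_from_heads.
    second.root.head = first.root
    print(doc[second.start].is_sent_start)   # False: the trees now form one sentence
    print(len(list(doc.sents)))              # 1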
--- .../pipeline/_parser_internals/arc_eager.pyx | 2 +- spacy/pipeline/_parser_internals/nonproj.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 12 +-- spacy/tests/doc/test_token_api.py | 35 ++++++- spacy/tests/parser/test_parse.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tokens/_retokenize.pyx | 5 +- spacy/tokens/doc.pxd | 9 +- spacy/tokens/doc.pyx | 63 +++++-------- spacy/tokens/span.pyx | 3 - spacy/tokens/token.pyx | 92 +++---------------- 11 files changed, 85 insertions(+), 142 deletions(-) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index bb0bf35b8..a5fc2ea0e 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -680,7 +680,7 @@ cdef class ArcEager(TransitionSystem): def finalize_doc(self, Doc doc): doc.is_parsed = True - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) def has_gold(self, Example eg, start=0, end=None): for word in eg.y[start:end]: diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 8f5fdaa71..82070cd27 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc): new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i doc.c[i].dep = doc.vocab.strings.add(new_label) - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) return doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index b37a31e43..31dbad9ca 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -265,17 +265,11 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] - heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3] # fmt: off - deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"] # fmt: on - doc = Doc(en_vocab, words=words) - for i, (dep, head) in enumerate(zip(deps, heads)): - doc[i].dep_ = dep - doc[i].head = doc[head] - if head == i: - doc[i].is_sent_start = True - doc.is_parsed + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index be56c9b71..28ef0dd7f 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -112,7 +112,6 @@ def test_doc_token_api_ancestors(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer): - # the structure of this sentence depends on the English annotation scheme text = "Yesterday I saw a dog that barked loudly." heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] tokens = en_tokenizer(text) @@ -169,6 +168,40 @@ def test_doc_token_api_head_setter(en_tokenizer): with pytest.raises(ValueError): doc[0].head = doc2[0] + # test sentence starts when two sentences are joined + text = "This is one sentence. This is another sentence." 
+ heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4] + tokens = en_tokenizer(text) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=["dep"] * len(heads), + ) + # initially two sentences + assert doc[0].is_sent_start + assert doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[4] + assert doc[5].left_edge == doc[5] + assert doc[5].right_edge == doc[9] + + # modifying with a sentence doesn't change sent starts + doc[2].head = doc[3] + assert doc[0].is_sent_start + assert doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[4] + assert doc[5].left_edge == doc[5] + assert doc[5].right_edge == doc[9] + + # attach the second sentence to the first, resulting in one sentence + doc[5].head = doc[0] + assert doc[0].is_sent_start + assert not doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[9] + def test_is_sent_start(en_tokenizer): doc = en_tokenizer("This is a sentence. This is another.") diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8d45e2132..691a7c3aa 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -184,7 +184,7 @@ def test_parser_set_sent_starts(en_vocab): if i == 0 or i == 3: assert doc[i].is_sent_start is True else: - assert doc[i].is_sent_start is None + assert not doc[i].is_sent_start for sent in doc.sents: for token in sent: assert token.head in sent diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index beb8faca1..859e4d80e 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -123,7 +123,7 @@ def test_issue2772(en_vocab): heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - assert doc[1].is_sent_start is None + assert not doc[1].is_sent_start @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 9323bb579..cd1e73a2b 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -274,7 +274,7 @@ def _merge(Doc doc, merges): for i in range(doc.length): doc.c[i].head -= i # Set the left/right children, left/right edges - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) # Make sure ent_iob remains consistent make_iob_consistent(doc.c, doc.length) # Return the merged Python object @@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): for i in range(doc.length): doc.c[i].head -= i # set children from head - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) def _validate_extensions(extensions): @@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length): def normalize_token_attrs(Vocab vocab, attrs): if "_" in attrs: # Extension attributes extensions = attrs["_"] - print("EXTENSIONS", extensions) _validate_extensions(extensions) attrs = {key: value for key, value in attrs.items() if key != "_"} attrs = intify_attrs(attrs, strings_map=vocab.strings) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 2775aa97e..9b382d687 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken: const_TokenC_ptr -cdef int set_children_from_heads(TokenC* tokens, int length) except -1 +cdef 
int set_children_from_heads(TokenC* tokens, int start, int end) except -1 -cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1 +cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 @@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 -cdef int set_children_from_heads(TokenC* tokens, int length) except -1 - - cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef class Doc: @@ -74,5 +71,3 @@ cdef class Doc: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 cpdef np.ndarray to_array(self, object features) - - cdef void set_parse(self, const TokenC* parsed) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 93520aeda..62a6dd6db 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,32 +1,27 @@ # cython: infer_types=True, bounds_check=False, profile=True cimport cython cimport numpy as np -from libc.string cimport memcpy, memset +from libc.string cimport memcpy from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter import numpy -import numpy.linalg -import struct import srsly from thinc.api import get_array_module from thinc.util import copy_array import warnings -import copy from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t -from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER +from ..attrs cimport attr_id_t from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t -from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t +from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM -from ..attrs import intify_attr, intify_attrs, IDS -from ..util import normalize_slice +from ..attrs import intify_attr, IDS from ..compat import copy_reg, pickle from ..errors import Errors, Warnings from .. import util @@ -291,7 +286,7 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#getitem """ if isinstance(i, slice): - start, stop = normalize_slice(len(self), i.start, i.stop, i.step) + start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step) return Span(self, start, stop, label=0) if i < 0: i = self.length + i @@ -627,10 +622,7 @@ cdef class Doc: @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` - objects. Sentence spans have no label. To improve accuracy on informal - texts, spaCy calculates sentence boundaries from the syntactic - dependency parse. If the parser is disabled, the `sents` iterator will - be unavailable. + objects. Sentence spans have no label. YIELDS (Span): Sentences in the document. @@ -786,14 +778,6 @@ cdef class Doc: for i in range(self.length, self.max_length + PADDING): self.c[i].lex = &EMPTY_LEXEME - cdef void set_parse(self, const TokenC* parsed) nogil: - # TODO: This method is fairly misleading atm. It's used by Parser - # to actually apply the parse calculated. Need to rethink this. - # Probably we should use from_array? - self.is_parsed = True - for i in range(self.length): - self.c[i] = parsed[i] - def from_array(self, attrs, array): """Load attributes from a numpy array. 
Write to a `Doc` object, from an `(M, N)` array of attributes. @@ -884,7 +868,7 @@ cdef class Doc: self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) # If document is parsed, set children if self.is_parsed: - set_children_from_heads(self.c, length) + set_children_from_heads(self.c, 0, length) return self @staticmethod @@ -1321,13 +1305,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2 return mid return -1 - -cdef int set_children_from_heads(TokenC* tokens, int length) except -1: +cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: + # note: end is exclusive cdef TokenC* head cdef TokenC* child cdef int i # Set number of left/right children to 0. We'll increment it in the loops. - for i in range(length): + for i in range(start, end): tokens[i].l_kids = 0 tokens[i].r_kids = 0 tokens[i].l_edge = i @@ -1341,38 +1325,40 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: # without risking getting stuck in an infinite loop if something is # terribly malformed. while not heads_within_sents: - heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) + heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count) if loop_count > 10: warnings.warn(Warnings.W026) break loop_count += 1 # Set sentence starts - for i in range(length): - if tokens[i].head == 0 and tokens[i].dep != 0: + for i in range(start, end): + tokens[i].sent_start = -1 + for i in range(start, end): + if tokens[i].head == 0: tokens[tokens[i].l_edge].sent_start = True -cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1: +cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. # Set left edges cdef TokenC* head cdef TokenC* child cdef int i, j - for i in range(length): + for i in range(start, end): child = &tokens[i] head = &tokens[i + child.head] - if child < head and loop_count == 0: + if loop_count == 0 and child < head: head.l_kids += 1 if child.l_edge < head.l_edge: head.l_edge = child.l_edge if child.r_edge > head.r_edge: head.r_edge = child.r_edge # Set right edges - same as above, but iterate in reverse - for i in range(length-1, -1, -1): + for i in range(end-1, start-1, -1): child = &tokens[i] head = &tokens[i + child.head] - if child > head and loop_count == 0: + if loop_count == 0 and child > head: head.r_kids += 1 if child.r_edge > head.r_edge: head.r_edge = child.r_edge @@ -1380,14 +1366,14 @@ cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) exce head.l_edge = child.l_edge # Get sentence start positions according to current state sent_starts = set() - for i in range(length): - if tokens[i].head == 0 and tokens[i].dep != 0: + for i in range(start, end): + if tokens[i].head == 0: sent_starts.add(tokens[i].l_edge) cdef int curr_sent_start = 0 cdef int curr_sent_end = 0 # Check whether any heads are not within the current sentence - for i in range(length): - if (i > 0 and i in sent_starts) or i == length - 1: + for i in range(start, end): + if (i > 0 and i in sent_starts) or i == end - 1: curr_sent_end = i for j in range(curr_sent_start, curr_sent_end): if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1: @@ -1436,6 +1422,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): with shape (n, n), where n = len(doc). 
""" cdef int [:,:] lca_matrix + cdef int j, k n_tokens= end - start lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) lca_mat.fill(-1) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index f06f3307d..1f42c84ee 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -4,13 +4,10 @@ cimport numpy as np from libc.math cimport sqrt import numpy -import numpy.linalg from thinc.api import get_array_module -from collections import defaultdict import warnings from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix -from .token cimport TokenC from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2474f0637..35142c35e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,4 @@ # cython: infer_types=True -from libc.string cimport memcpy -from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -14,14 +12,13 @@ from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL -from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX -from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP +from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP +from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..symbols cimport conj from .morphanalysis cimport MorphAnalysis +from .doc cimport set_children_from_heads from .. import parts_of_speech -from .. import util from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args @@ -658,78 +655,19 @@ cdef class Token: # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return - cdef Token old_head = self.head - cdef int rel_newhead_i = new_head.i - self.i - # Is the new head a descendant of the old head - cdef bint is_desc = old_head.is_ancestor(new_head) - cdef int new_edge - cdef Token anc, child - # Update number of deps of old head - if self.c.head > 0: # left dependent - old_head.c.l_kids -= 1 - if self.c.l_edge == old_head.c.l_edge: - # The token dominates the left edge so the left edge of - # the head may change when the token is reattached, it may - # not change if the new head is a descendant of the current - # head. 
- new_edge = self.c.l_edge - # The new l_edge is the left-most l_edge on any of the - # other dependents where the l_edge is left of the head, - # otherwise it is the head - if not is_desc: - new_edge = old_head.i - for child in old_head.children: - if child == self: - continue - if child.c.l_edge < new_edge: - new_edge = child.c.l_edge - old_head.c.l_edge = new_edge - # Walk up the tree from old_head and assign new l_edge to - # ancestors until an ancestor already has an l_edge that's - # further left - for anc in old_head.ancestors: - if anc.c.l_edge <= new_edge: - break - anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent - old_head.c.r_kids -= 1 - # Do the same thing as for l_edge - if self.c.r_edge == old_head.c.r_edge: - new_edge = self.c.r_edge - if not is_desc: - new_edge = old_head.i - for child in old_head.children: - if child == self: - continue - if child.c.r_edge > new_edge: - new_edge = child.c.r_edge - old_head.c.r_edge = new_edge - for anc in old_head.ancestors: - if anc.c.r_edge >= new_edge: - break - anc.c.r_edge = new_edge - # Update number of deps of new head - if rel_newhead_i > 0: # left dependent - new_head.c.l_kids += 1 - # Walk up the tree from new head and set l_edge to self.l_edge - # until you hit a token with an l_edge further to the left - if self.c.l_edge < new_head.c.l_edge: - new_head.c.l_edge = self.c.l_edge - for anc in new_head.ancestors: - if anc.c.l_edge <= self.c.l_edge: - break - anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right dependent - new_head.c.r_kids += 1 - # Do the same as for l_edge - if self.c.r_edge > new_head.c.r_edge: - new_head.c.r_edge = self.c.r_edge - for anc in new_head.ancestors: - if anc.c.r_edge >= self.c.r_edge: - break - anc.c.r_edge = self.c.r_edge + # Find the widest l/r_edges of the roots of the two tokens involved + # to limit the number of tokens for set_children_from_heads + cdef Token self_root, new_head_root + self_ancestors = list(self.ancestors) + new_head_ancestors = list(new_head.ancestors) + self_root = self_ancestors[-1] if self_ancestors else self + new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head + start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge + end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge # Set new head - self.c.head = rel_newhead_i + self.c.head = new_head.i - self.i + # Adjust parse properties and sentence starts + set_children_from_heads(self.doc.c, start, end + 1) @property def conjuncts(self): From 7e4cd7575c33929bca0d3f7d932b0968803e4a71 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Sep 2020 00:14:01 +0200 Subject: [PATCH 09/10] Refactor Docs.is_ flags (#6044) * Refactor Docs.is_ flags * Add derived `Doc.has_annotation` method * `Doc.has_annotation(attr)` returns `True` for partial annotation * `Doc.has_annotation(attr, require_complete=True)` returns `True` for complete annotation * Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced` and `is_nered` * Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The list is the `DocBin` attributes list plus `SPACY` and `LENGTH`. 
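As a minimal usage sketch of the partial/complete distinction (using a blank English vocab):

    import spacy
    from spacy.tokens import Doc

    doc = Doc(spacy.blank("en").vocab, words=["partly", "tagged"])
    doc[0].tag_ = "NN"
    print(doc.has_annotation("TAG"))                         # True: at least one token is tagged
    print(doc.has_annotation("TAG", require_complete=True))  # False: not every token is tagged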
Notes on `Doc.has_annotation`: * `HEAD` is converted to `DEP` because heads don't have an unset state * Accept `IS_SENT_START` as a synonym of `SENT_START` Additional changes: * Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for `DocBin` * In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override `SENT_START` * In `Doc.from_array()` using `attrs` other than `Doc._get_array_attrs()` (i.e., a user's custom list rather than our default internal list) with both `HEAD` and `SENT_START` shows a warning that `HEAD` will override `SENT_START` * `set_children_from_heads` does not require dependency labels to set sentence boundaries and sets `sent_start` for all non-sentence starts to `-1` * Fix call to set_children_form_heads Co-authored-by: Matthew Honnibal --- spacy/displacy/__init__.py | 2 +- spacy/errors.py | 14 +- spacy/lang/de/syntax_iterators.py | 2 +- spacy/lang/el/syntax_iterators.py | 2 +- spacy/lang/en/syntax_iterators.py | 2 +- spacy/lang/es/syntax_iterators.py | 2 +- spacy/lang/fa/syntax_iterators.py | 2 +- spacy/lang/fr/syntax_iterators.py | 2 +- spacy/lang/id/syntax_iterators.py | 2 +- spacy/lang/nb/syntax_iterators.py | 2 +- spacy/lang/sv/syntax_iterators.py | 2 +- spacy/matcher/matcher.pyx | 15 +- spacy/matcher/phrasematcher.pyx | 20 ++- .../pipeline/_parser_internals/arc_eager.pyx | 1 - spacy/pipeline/functions.py | 2 +- spacy/pipeline/morphologizer.pyx | 2 - spacy/pipeline/tagger.pyx | 1 - spacy/tests/doc/test_doc_api.py | 89 ++++++++-- spacy/tests/doc/test_span.py | 6 +- spacy/tests/doc/test_token_api.py | 9 +- spacy/tests/lang/de/test_noun_chunks.py | 4 - spacy/tests/lang/el/test_noun_chunks.py | 4 - spacy/tests/lang/en/test_noun_chunks.py | 4 - spacy/tests/lang/en/test_sbd.py | 3 +- spacy/tests/lang/es/test_noun_chunks.py | 4 - spacy/tests/lang/fa/test_noun_chunks.py | 4 - spacy/tests/lang/fr/test_noun_chunks.py | 4 - spacy/tests/lang/id/test_noun_chunks.py | 4 - spacy/tests/lang/nb/test_noun_chunks.py | 4 - spacy/tests/lang/sv/test_noun_chunks.py | 4 - spacy/tests/matcher/test_matcher_api.py | 11 +- spacy/tests/matcher/test_phrase_matcher.py | 17 +- spacy/tests/parser/test_parse.py | 5 +- spacy/tests/parser/test_parse_navigate.py | 2 +- spacy/tests/parser/test_space_attachment.py | 3 +- spacy/tests/pipeline/test_attributeruler.py | 6 + spacy/tests/pipeline/test_functions.py | 2 - spacy/tests/pipeline/test_sentencizer.py | 12 +- spacy/tests/regression/test_issue1-1000.py | 5 +- spacy/tests/regression/test_issue1501-2000.py | 27 ++- spacy/tests/regression/test_issue2001-2500.py | 5 +- spacy/tests/regression/test_issue2501-3000.py | 8 +- spacy/tests/regression/test_issue3001-3500.py | 18 +- spacy/tests/regression/test_issue3501-4000.py | 2 - spacy/tests/regression/test_issue4001-4500.py | 5 +- spacy/tests/test_scorer.py | 1 - spacy/tests/test_training.py | 20 +-- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pxd | 4 - spacy/tokens/doc.pyx | 157 +++++++++--------- spacy/tokens/span.pyx | 17 +- spacy/tokens/token.pyx | 2 +- spacy/training/converters/conllu2docs.py | 4 - spacy/training/gold_io.pyx | 12 +- website/docs/api/doc.md | 47 +++--- website/docs/usage/v3.md | 20 +++ 56 files changed, 350 insertions(+), 282 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 0e80c3b5f..48229572b 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: RETURNS (dict): Generated dependency parse keyed by words and 
arcs. """ doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) - if not doc.is_parsed: + if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): with doc.retokenize() as retokenizer: diff --git a/spacy/errors.py b/spacy/errors.py index 3bdeeccbe..173aedab9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,6 +119,11 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") + W106 = ("Both HEAD and SENT_START are included as attributes in " + "doc.from_array(). The parse trees based on the HEAD attribute " + "will override the values in SENT_START.") + W107 = ("The property Doc.{prop} is deprecated. Use " + "Doc.has_annotation(\"{attr}\") instead.") @add_codes @@ -192,11 +197,6 @@ class Errors: "Alternatively, add the dependency parser, or set sentence " "boundaries by setting doc[i].is_sent_start.") E031 = ("Invalid token: empty string ('') at position {i}.") - E032 = ("Conflicting attributes specified in doc.from_array(): " - "(HEAD, SENT_START). The HEAD attribute currently sets sentence " - "boundaries implicitly, based on the tree structure. This means " - "the HEAD attribute would potentially override the sentence " - "boundaries set by SENT_START.") E033 = ("Cannot load into non-empty Doc of length {length}.") E035 = ("Error creating span with start {start} and end {end} for Doc of " "length {length}.") @@ -397,8 +397,8 @@ class Errors: E154 = ("One of the attributes or values is not supported for token " "patterns. Please use the option validate=True with Matcher, " "PhraseMatcher, or EntityRuler for more details.") - E155 = ("The pipeline needs to include a tagger in order to use " - "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. " + E155 = ("The pipeline needs to include a {pipe} in order to use " + "Matcher or PhraseMatcher with the attribute {attr}. " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "instead of list(nlp.tokenizer.pipe()).") E156 = ("The pipeline needs to include a parser in order to use " diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index bd495f792..bd75a61eb 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_label = doc.vocab.strings.add("NP") np_deps = set(doc.vocab.strings.add(label) for label in labels) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 0a13edcc0..89cfd8b72 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 59ae733bd..2a1b0867e 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 427f1f203..ad0a1b838 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" doc = doclike.doc - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) if not len(doc): return diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index b63db3539..0be06e73c 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -19,7 +19,7 @@ def noun_chunks(doclike): ] doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d297203e3..68117a54d 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index f6d261643..0f29bfe16 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index d297203e3..68117a54d 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 662b508ed..d5ae47853 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 079cac788..d83f58181 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -17,7 +17,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -215,10 +215,15 @@ cdef class Matcher: else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef Pool tmp_pool = Pool() - if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ - and not doc.is_tagged: - raise ValueError(Errors.E155.format()) - if DEP in self._seen_attrs and not doc.is_parsed: + if TAG in self._seen_attrs and not doc.has_annotation("TAG"): + raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) + if POS in self._seen_attrs and not doc.has_annotation("POS"): + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) + if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) + if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): + raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) + if DEP in self._seen_attrs and not doc.has_annotation("DEP"): raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index fae513367..b00ba157f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings -from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA +from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH from ..structs cimport TokenC from ..tokens.token cimport Token from ..tokens.span cimport Span @@ -184,12 +184,20 @@ cdef class PhraseMatcher: if len(doc) == 0: continue if isinstance(doc, Doc): - if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: - raise ValueError(Errors.E155.format()) - if self.attr == DEP and not doc.is_parsed: + attrs = (TAG, POS, MORPH, LEMMA, DEP) + has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} + if self.attr == TAG and not has_annotation[TAG]: + raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) + if self.attr == POS and not has_annotation[POS]: + 
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) + if self.attr == MORPH and not has_annotation[MORPH]: + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) + if self.attr == LEMMA and not has_annotation[LEMMA]: + raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) + if self.attr == DEP and not has_annotation[DEP]: raise ValueError(Errors.E156.format()) - if self._validate and (doc.is_tagged or doc.is_parsed) \ - and self.attr not in (DEP, POS, TAG, LEMMA): + if self._validate and any(has_annotation.values()) \ + and self.attr not in attrs: string_attr = self.vocab.strings[self.attr] warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) keyword = self._convert_to_array(doc) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index a5fc2ea0e..dafa99bdd 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -679,7 +679,6 @@ cdef class ArcEager(TransitionSystem): st._sent[i].dep = self.root_label def finalize_doc(self, Doc doc): - doc.is_parsed = True set_children_from_heads(doc.c, 0, doc.length) def has_gold(self, Example eg, start=0, end=None): diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 7e68ea369..614608b25 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc: DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks """ - if not doc.is_parsed: + if not doc.has_annotation("DEP"): return doc with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index bb68a358c..62ad9e0eb 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -204,8 +204,6 @@ class Morphologizer(Tagger): doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph]) doc.c[j].pos = self.cfg["labels_pos"][morph] - doc.is_morphed = True - def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and their predicted scores. 
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 1f8b4eb7a..0d78047ae 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -168,7 +168,6 @@ class Tagger(Pipe): # Don't clobber preset POS tags if doc.c[j].tag == 0: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] - doc.is_tagged = True def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): """Learn from a batch of documents and gold-standard information, diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 31dbad9ca..ce979d3d1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text): tokens = en_tokenizer(text) tokens[0].lemma_ = "lemma" tokens[0].norm_ = "norm" + tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)] tokens[0].ent_kb_id_ = "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text @@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer): def test_doc_api_sents_empty_string(en_tokenizer): doc = en_tokenizer("") - doc.is_parsed = True sents = list(doc.sents) assert len(sents) == 0 @@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer): text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] + deps = ["dep"] * len(heads) # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] # fmt: off @@ -240,7 +241,9 @@ def test_doc_api_similarity_match(): ) def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): tokens = en_tokenizer(sentence) - doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + doc = get_doc( + tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads) + ) lca = doc.get_lca_matrix() assert (lca == lca_matrix).all() assert lca[1, 1] == 1 @@ -251,16 +254,16 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): def test_doc_is_nered(en_vocab): words = ["I", "live", "in", "New", "York"] doc = Doc(en_vocab, words=words) - assert not doc.is_nered + assert not doc.has_annotation("ENT_IOB") doc.ents = [Span(doc, 3, 5, label="GPE")] - assert doc.is_nered + assert doc.has_annotation("ENT_IOB") # Test creating doc from array with unknown values arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64") doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) - assert doc.is_nered + assert doc.has_annotation("ENT_IOB") # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) - assert new_doc.is_nered + assert new_doc.has_annotation("ENT_IOB") def test_doc_from_array_sent_starts(en_vocab): @@ -271,25 +274,35 @@ def test_doc_from_array_sent_starts(en_vocab): # fmt: on doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + # HEAD overrides SENT_START with warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.raises(ValueError): + with pytest.warns(UserWarning): new_doc.from_array(attrs, arr) - attrs = [SENT_START, DEP] + # no warning using default attrs + 
attrs = doc._get_array_attrs() + arr = doc.to_array(attrs) + with pytest.warns(None) as record: + new_doc.from_array(attrs, arr) + assert len(record) == 0 + + # only SENT_START uses SENT_START + attrs = [SENT_START] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] - assert not new_doc.is_parsed + assert not new_doc.has_annotation("DEP") + # only HEAD uses HEAD attrs = [HEAD, DEP] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] - assert new_doc.is_parsed + assert new_doc.has_annotation("DEP") def test_doc_from_array_morph(en_vocab): @@ -359,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) - with pytest.raises(ValueError): - # important attributes from sentenziser or parser are missing - assert list(m_doc.sents) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing assert str(m_doc) == " ".join(en_texts_without_empty) @@ -373,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx +def test_doc_api_from_docs_ents(en_tokenizer): + texts = ["Merging the docs is fun.", "They don't think alike."] + docs = [en_tokenizer(t) for t in texts] + docs[0].ents = () + docs[1].ents = (Span(docs[1], 0, 1, label="foo"),) + doc = Doc.from_docs(docs) + assert len(doc.ents) == 1 + + def test_doc_lang(en_vocab): doc = Doc(en_vocab, words=["Hello", "world"]) assert doc.lang_ == "en" @@ -393,3 +412,45 @@ def test_token_lexeme(en_vocab): assert isinstance(token.lex, Lexeme) assert token.lex.text == token.text assert en_vocab[token.orth] == token.lex + + +def test_has_annotation(en_vocab): + doc = Doc(en_vocab, words=["Hello", "world"]) + attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE") + for attr in attrs: + assert not doc.has_annotation(attr) + + doc[0].tag_ = "A" + doc[0].pos_ = "X" + doc[0].morph_ = "Feat=Val" + doc[0].lemma_ = "a" + doc[0].dep_ = "dep" + doc[0].head = doc[1] + doc.ents = [Span(doc, 0, 1, label="HELLO")] + + for attr in attrs: + assert doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + doc[1].tag_ = "A" + doc[1].pos_ = "X" + doc[1].morph_ = "" + doc[1].lemma_ = "a" + doc[1].dep_ = "dep" + doc.ents = [Span(doc, 0, 2, label="HELLO")] + + for attr in attrs: + assert doc.has_annotation(attr) + assert doc.has_annotation(attr, require_complete=True) + + +def test_is_flags_deprecated(en_tokenizer): + doc = en_tokenizer("test") + with pytest.deprecated_call(): + doc.is_tagged + with pytest.deprecated_call(): + doc.is_parsed + with pytest.deprecated_call(): + doc.is_nered + with pytest.deprecated_call(): + doc.is_sentenced diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 1e9623484..ad4f49042 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer): text = "This is a sentence. This is another sentence. And a third." 
tokens = en_tokenizer(text) doc = Doc(tokens.vocab, words=[t.text for t in tokens]) - doc.is_parsed = False return doc @@ -71,8 +70,9 @@ def test_spans_string_fn(doc): def test_spans_root2(en_tokenizer): text = "through North and South Carolina" heads = [0, 3, -1, -2, -4] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[-2:].root.text == "Carolina" @@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0]) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4) lca = doc[:2].get_lca_matrix() assert lca.shape == (2, 2) assert lca[0, 0] == 0 # the & the -> the diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 28ef0dd7f..1308df67b 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -114,8 +114,9 @@ def test_doc_token_api_ancestors(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer): text = "Yesterday I saw a dog that barked loudly." heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 @@ -208,7 +209,6 @@ def test_is_sent_start(en_tokenizer): assert doc[5].is_sent_start is None doc[5].is_sent_start = True assert doc[5].is_sent_start is True - doc.is_parsed = True assert len(list(doc.sents)) == 2 @@ -217,7 +217,6 @@ def test_is_sent_end(en_tokenizer): assert doc[4].is_sent_end is None doc[5].is_sent_start = True assert doc[4].is_sent_end is True - doc.is_parsed = True assert len(list(doc.sents)) == 2 @@ -242,14 +241,14 @@ def test_token0_has_sent_start_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_start is True assert doc[1].is_sent_start is None - assert not doc.is_sentenced + assert not doc.has_annotation("SENT_START") def test_tokenlast_has_sent_end_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_end is None assert doc[1].is_sent_end is True - assert not doc.is_sentenced + assert not doc.has_annotation("SENT_START") def test_token_api_conjuncts_chain(en_vocab): diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index ff9f8d5e5..0ed12d208 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_de(de_tokenizer): """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = de_tokenizer("Er lag auf seinem") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index 38e72b0b2..2d376c612 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_el(el_tokenizer): """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 5395dbabe..fa3a134bd 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -11,12 +11,8 @@ from ...util import get_doc def test_noun_chunks_is_parsed(en_tokenizer): """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = en_tokenizer("This is a sentence") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 38c8d94d8..ee1e6be17 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence @pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_tokenizer, text, punct): heads = [2, 1, 0, -1] if punct else [2, 1, 0] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text + punct) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(doc) == 4 if punct else 3 assert len(list(doc.sents)) == 1 assert sum(len(sent) for sent in doc.sents) == len(doc) diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index a7ec4e562..db89fd903 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_es(es_tokenizer): """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = es_tokenizer("en Oxford este verano") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py index 767e91f6b..53b39d9a1 100644 --- a/spacy/tests/lang/fa/test_noun_chunks.py +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -3,12 +3,8 @@ import pytest def test_noun_chunks_is_parsed_fa(fa_tokenizer): """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = fa_tokenizer("این یک جمله نمونه می باشد.") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 5fd6897f7..d81199a3e 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = fr_tokenizer("trouver des travaux antérieurs") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index 445643933..fef1524f1 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_id(id_tokenizer): """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = id_tokenizer("sebelas") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index c6a00354b..9965fcd14 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_nb(nb_tokenizer): """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = nb_tokenizer("Smørsausen brukes bl.a. til") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index f352ca648..458cdadd5 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -5,12 +5,8 @@ from ...util import get_doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = sv_tokenizer("Studenten läste den bästa boken") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e0f335a19..04f9585f1 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab): def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" + doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) - # DEP requires is_parsed + # DEP requires DEP matcher = Matcher(en_vocab) matcher.add("TEST", [[{"DEP": "a"}]]) matcher(doc1) @@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) - # TAG, POS, LEMMA require is_tagged + # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) matcher.add("TEST", [[{attr: "a"}]]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 4b7027f87..9caf284a3 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab): def test_phrase_matcher_validation(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" doc3 = Doc(en_vocab, words=["Test"]) matcher = PhraseMatcher(en_vocab, validate=True) with pytest.warns(UserWarning): @@ -212,18 +214,21 @@ def test_attr_validation(en_vocab): def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" + doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) - # DEP requires is_parsed + # DEP requires DEP matcher = PhraseMatcher(en_vocab, attr="DEP") matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): matcher.add("TEST3", [doc3]) - # TAG, POS, LEMMA require is_tagged + # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = PhraseMatcher(en_vocab, attr=attr) matcher.add("TEST2", [doc2]) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 691a7c3aa..9e760c1e7 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser): def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(list(doc[2].lefts)) == 2 assert len(list(doc[2].rights)) == 1 assert len(list(doc[2].children)) == 3 @@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab): if i == 0 or i == 3: assert doc[i].is_sent_start is True else: - assert not 
doc[i].is_sent_start + assert doc[i].is_sent_start is False for sent in doc.sents: for token in sent: assert token.head in sent diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index f42601a85..db1e98ba0 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)) lefts = {} rights = {} diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 3a0a6b943..3672dabea 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence def test_parser_space_attachment(en_tokenizer): text = "This is a test.\nTo ensure spaces are attached well." heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) for sent in doc.sents: if len(sent) == 1: assert not sent[-1].is_space diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 9254688cc..a66b34bc0 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") def test_attributeruler_init_patterns(nlp, pattern_dicts): @@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") # initialize with patterns from asset nlp.add_pipe( @@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") def test_attributeruler_score(nlp, pattern_dicts): diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 0ec8a5ec2..ee9e34df3 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -35,8 +35,6 @@ def doc2(en_tokenizer): deps=deps, ) doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] - doc.is_parsed = True - doc.is_tagged = True return doc diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 1b1c51f34..5dd0fef43 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -9,7 +9,7 @@ def test_sentencizer(en_vocab): doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."]) sentencizer = Sentencizer(punct_chars=None) doc = 
sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] sent_ends = [t.is_sent_end for t in doc] assert sent_starts == [True, False, True, False, False, False, False] @@ -22,13 +22,13 @@ def test_sentencizer_pipe(): nlp = English() nlp.add_pipe("sentencizer") for doc in nlp.pipe(texts): - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 for ex in nlp.pipe(texts): doc = ex.doc - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 @@ -42,7 +42,7 @@ def test_sentencizer_empty_docs(): nlp.add_pipe("sentencizer") for texts in [one_empty_text, many_empty_texts, some_empty_texts]: for doc in nlp.pipe(texts): - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] if len(doc) == 0: assert sent_starts == [] @@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=None) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents @@ -115,7 +115,7 @@ def test_sentencizer_custom_punct( doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index ed5bcc1a5..30f66fb1d 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -94,7 +94,6 @@ def test_issue309(en_tokenizer): doc = get_doc( tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] ) - doc.is_parsed = True assert len(doc) == 1 sents = list(doc.sents) assert len(sents) == 1 @@ -170,11 +169,9 @@ def test_issue595(): def test_issue599(en_vocab): doc = Doc(en_vocab) - doc.is_tagged = True - doc.is_parsed = True doc2 = Doc(doc.vocab) doc2.from_bytes(doc.to_bytes()) - assert doc2.is_parsed + assert doc2.has_annotation("DEP") def test_issue600(): diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index c1d726db6..e226c8524 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token from spacy.attrs import HEAD, DEP from spacy.matcher import Matcher -from ..util import make_tempdir +from ..util import make_tempdir, get_doc def test_issue1506(): @@ -198,17 +198,26 @@ def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" string = "This is a first sentence . 
And another one" - doc = Doc(Vocab(), words=string.split()) - doc[6].sent_start = True + words = string.split() + doc = get_doc(Vocab(), words=words) + doc[6].is_sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start - assert not new_doc.is_parsed - assert not new_doc.is_tagged - doc.is_parsed = True - doc.is_tagged = True + assert not new_doc.has_annotation("DEP") + assert not new_doc.has_annotation("TAG") + doc = get_doc( + Vocab(), + words=words, + tags=["TAG"] * len(words), + heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], + deps=["dep"] * len(words), + ) + print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc]) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc.is_parsed - assert new_doc.is_tagged + print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc]) + assert new_doc[6].sent_start + assert new_doc.has_annotation("DEP") + assert new_doc.has_annotation("TAG") def test_issue1868(): diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 357fbb84e..3bea5d3f6 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -72,8 +72,6 @@ def test_issue2219(en_vocab): def test_issue2361(de_tokenizer): chars = ("<", ">", "&", """) doc = de_tokenizer('< > & " ') - doc.is_parsed = True - doc.is_tagged = True html = render(doc) for char in chars: assert char in html @@ -108,6 +106,7 @@ def test_issue2385_biluo(tags): def test_issue2396(en_vocab): words = ["She", "created", "a", "test", "for", "spacy"] heads = [1, 0, 1, -2, -1, -1] + deps = ["dep"] * len(heads) matrix = numpy.array( [ [0, 1, 1, 1, 1, 1], @@ -119,7 +118,7 @@ def test_issue2396(en_vocab): ], dtype=numpy.int32, ) - doc = get_doc(en_vocab, words=words, heads=heads) + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) span = doc[:] assert (doc.get_lca_matrix() == matrix).all() assert (span.get_lca_matrix() == matrix).all() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 859e4d80e..9267a7346 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -16,16 +16,16 @@ from ..util import get_doc def test_issue2564(): - """Test the tagger sets is_tagged correctly when used via Language.pipe.""" + """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") nlp.begin_training() doc = nlp("hello world") - assert doc.is_tagged + assert doc.has_annotation("TAG") docs = nlp.pipe(["hello", "world"]) piped_doc = next(docs) - assert piped_doc.is_tagged + assert piped_doc.has_annotation("TAG") def test_issue2569(en_tokenizer): @@ -123,7 +123,7 @@ def test_issue2772(en_vocab): heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - assert not doc[1].is_sent_start + assert doc[1].is_sent_start is False @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 3059eb5ab..d848467dd 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -63,7 +63,7 @@ def test_issue3012(en_vocab): pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] 
ents = [(2, 4, "PERCENT")] doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.is_tagged + assert doc.has_annotation("TAG") expected = ("10", "NUM", "CD", "PERCENT") assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected @@ -83,10 +83,14 @@ def test_issue3012(en_vocab): def test_issue3199(): """Test that Span.noun_chunks works correctly if no noun chunks iterator is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and setting is_parsed to make sure the noun chunks run. + with a new Vocab here and a parse tree to make sure the noun chunks run. """ - doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) - doc.is_parsed = True + doc = get_doc( + Vocab(), + words=["This", "is", "a", "sentence"], + heads=[0, -1, -2, -3], + deps=["dep"] * 4, + ) assert list(doc[0:3].noun_chunks) == [] @@ -250,16 +254,16 @@ def test_issue3456(): def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.is_sentenced can + """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can be restored after serialization.""" nlp = English() nlp.add_pipe("sentencizer") doc = nlp("Hello world") assert doc[0].is_sent_start - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert len(list(doc.sents)) == 1 doc_bytes = doc.to_bytes() new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) assert new_doc[0].is_sent_start - assert new_doc.is_sentenced + assert new_doc.has_annotation("SENT_START") assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index d36e693c7..8c483d877 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -356,7 +356,6 @@ def test_issue3882(en_vocab): copy of the Doc. 
""" doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True doc.user_data["test"] = set() parse_deps(doc) @@ -386,7 +385,6 @@ def test_issue3959(): doc[0].pos_ = "NOUN" assert doc[0].pos_ == "NOUN" # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True with make_tempdir() as tmp_dir: file_path = tmp_dir / "my_doc" doc.to_disk(file_path) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 2beccedcf..4e58c347e 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -189,7 +189,6 @@ def test_issue4133(en_vocab): for i, token in enumerate(doc): token.pos_ = pos[i] # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True doc_bytes = doc.to_bytes() vocab = Vocab() vocab = vocab.from_bytes(vocab_bytes) @@ -249,7 +248,7 @@ def test_issue4267(): assert "ner" in nlp.pipe_names # assert that we have correct IOB annotations doc1 = nlp("hi") - assert doc1.is_nered + assert doc1.has_annotation("ENT_IOB") for token in doc1: assert token.ent_iob == 2 # add entity ruler and run again @@ -260,7 +259,7 @@ def test_issue4267(): assert "ner" in nlp.pipe_names # assert that we still have correct IOB annotations doc2 = nlp("hi") - assert doc2.is_nered + assert doc2.has_annotation("ENT_IOB") for token in doc2: assert token.ent_iob == 2 diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index fb96c0361..6e3604ce8 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -80,7 +80,6 @@ def tagged_doc(): doc[i].morph_ = morphs[i] if i > 0: doc[i].is_sent_start = False - doc.is_tagged = True return doc diff --git a/spacy/tests/test_training.py b/spacy/tests/test_training.py index 1926aca1f..5fd40a0dc 100644 --- a/spacy/tests/test_training.py +++ b/spacy/tests/test_training.py @@ -12,7 +12,7 @@ from thinc.api import compounding import pytest import srsly -from .util import make_tempdir +from .util import make_tempdir, get_doc @pytest.fixture @@ -26,24 +26,16 @@ def doc(): "NounType=prop|Number=sing", "PunctType=peri"] # head of '.' 
is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] + heads = [head - i for i, head in enumerate(heads)] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] - biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE")) cats = {"TRAVEL": 1.0, "BAKING": 0.0} # fmt: on nlp = English() - doc = nlp(text) - for i in range(len(tags)): - doc[i].tag_ = tags[i] - doc[i].pos_ = pos[i] - doc[i].morph_ = morphs[i] - doc[i].lemma_ = lemmas[i] - doc[i].dep_ = deps[i] - doc[i].head = doc[heads[i]] - doc.ents = spans_from_biluo_tags(doc, biluo_tags) + words = [t.text for t in nlp.make_doc(text)] + doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents) doc.cats = cats - doc.is_tagged = True - doc.is_parsed = True return doc @@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab): docs = json2docs(data) assert len(docs) == 1 for doc in docs: - assert not doc.is_nered + assert not doc.has_annotation("ENT_IOB") for token in doc: assert token.ent_iob == 0 eg = Example( diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index cd8c81939..c9a20f6c0 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -13,7 +13,7 @@ from ..errors import Errors from ..util import ensure_path, SimpleFrozenList # fmt: off -ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") +ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") # fmt: on diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 9b382d687..08f795b1a 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -46,10 +46,6 @@ cdef class Doc: cdef TokenC* c - cdef public bint is_tagged - cdef public bint is_parsed - cdef public bint is_morphed - cdef public float sentiment cdef public dict user_hooks diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 62a6dd6db..5c5443258 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -24,9 +24,11 @@ from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM from ..attrs import intify_attr, IDS from ..compat import copy_reg, pickle from ..errors import Errors, Warnings +from ..morphology import Morphology from .. import util from .underscore import Underscore, get_ext_args from ._retokenize import Retokenizer +from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS DEF PADDING = 5 @@ -185,8 +187,6 @@ cdef class Doc: self.c = data_start + PADDING self.max_length = size self.length = 0 - self.is_tagged = False - self.is_parsed = False self.sentiment = 0.0 self.cats = {} self.user_hooks = {} @@ -216,11 +216,6 @@ cdef class Doc: else: lexeme = self.vocab.get_by_orth(self.mem, word) self.push_back(lexeme, has_space) - # Tough to decide on policy for this. Is an empty doc tagged and parsed? - # There's no information we'd like to add to it, so I guess so? - if self.length == 0: - self.is_tagged = True - self.is_parsed = True @property def _(self): @@ -228,37 +223,61 @@ cdef class Doc: return Underscore(Underscore.doc_extensions, self) @property - def is_sentenced(self): - """Check if the document has sentence boundaries assigned. 
This is - defined as having at least one of the following: + def is_tagged(self): + warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning) + return self.has_annotation("TAG") - a) An entry "sents" in doc.user_hooks"; - b) Doc.is_parsed is set to True; - c) At least one token other than the first where sent_start is not None. - """ - if "sents" in self.user_hooks: - return True - if self.is_parsed: - return True - if len(self) < 2: - return True - for i in range(1, self.length): - if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: - return True - return False + @property + def is_parsed(self): + warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning) + return self.has_annotation("DEP") @property def is_nered(self): - """Check if the document has named entities set. Will return True if - *any* of the tokens has a named entity tag set (even if the others are - unknown values), or if the document is empty. + warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning) + return self.has_annotation("ENT_IOB") + + @property + def is_sentenced(self): + warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning) + return self.has_annotation("SENT_START") + + def has_annotation(self, attr, *, require_complete=False): + """Check whether the doc contains annotation on a token attribute. + + attr (Union[int, str]): The attribute string name or int ID. + require_complete (bool): Whether to check that the attribute is set on + every token in the doc. + RETURNS (bool): Whether annotation is present. + + DOCS: https://nightly.spacy.io/api/doc#has_annotation """ - if len(self) == 0: + + # empty docs are always annotated + if self.length == 0: return True - for i in range(self.length): - if self.c[i].ent_iob != 0: + cdef int i + cdef int range_start = 0 + attr = intify_attr(attr) + # adjust attributes + if attr == HEAD: + # HEAD does not have an unset state, so rely on DEP + attr = DEP + elif attr == self.vocab.strings["IS_SENT_START"]: + # as in Matcher, allow IS_SENT_START as an alias of SENT_START + attr = SENT_START + # special cases for sentence boundaries + if attr == SENT_START: + if "sents" in self.user_hooks: return True - return False + # docs of length 1 always have sentence boundaries + if self.length == 1: + return True + range_start = 1 + if require_complete: + return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length)) + else: + return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length)) def __getitem__(self, object i): """Get a `Token` or `Span` object. @@ -628,7 +647,7 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#sents """ - if not self.is_sentenced: + if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: yield from self.user_hooks["sents"](self) @@ -652,10 +671,6 @@ cdef class Doc: return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: - if self.length == 0: - # Flip these to false when we see the first token. 
- self.is_tagged = False - self.is_parsed = False if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.c[self.length] @@ -802,8 +817,8 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if SENT_START in attrs and HEAD in attrs: - raise ValueError(Errors.E032) + if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: + warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id @@ -863,18 +878,17 @@ cdef class Doc: # add morph to morphology table self.vocab.morphology.add(self.vocab.strings[value]) Token.set_struct_attr(token, attr_ids[j], value) - # Set flags - self.is_parsed = bool(self.is_parsed or HEAD in attrs) - self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) - # If document is parsed, set children - if self.is_parsed: - set_children_from_heads(self.c, 0, length) + # If document is parsed, set children and sentence boundaries + if HEAD in attrs and DEP in attrs: + col = attrs.index(DEP) + if array[:, col].any(): + set_children_from_heads(self.c, 0, length) return self @staticmethod def from_docs(docs, ensure_whitespace=True, attrs=None): - """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share - the same `Vocab`. + """Concatenate multiple Doc objects to form a new one. Raises an error + if the `Doc` objects do not all share the same `Vocab`. docs (list): A list of Doc objects. ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. @@ -892,16 +906,7 @@ cdef class Doc: (vocab,) = vocab if attrs is None: - attrs = [LEMMA, NORM] - if all(doc.is_nered for doc in docs): - attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE]) - # TODO: separate for is_morphed? - if all(doc.is_tagged for doc in docs): - attrs.extend([TAG, POS, MORPH]) - if all(doc.is_parsed for doc in docs): - attrs.extend([HEAD, DEP]) - else: - attrs.append(SENT_START) + attrs = Doc._get_array_attrs() else: if any(isinstance(attr, str) for attr in attrs): # resolve attribute names attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs @@ -973,9 +978,6 @@ cdef class Doc: other.tensor = copy.deepcopy(self.tensor) other.cats = copy.deepcopy(self.cats) other.user_data = copy.deepcopy(self.user_data) - other.is_tagged = self.is_tagged - other.is_parsed = self.is_parsed - other.is_morphed = self.is_morphed other.sentiment = self.sentiment other.has_unknown_spaces = self.has_unknown_spaces other.user_hooks = dict(self.user_hooks) @@ -1049,22 +1051,16 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID] - if self.is_tagged: - array_head.extend([TAG, POS]) - # If doc parsed add head and dep attribute - if self.is_parsed: - array_head.extend([HEAD, DEP]) - # Otherwise add sent_start - else: - array_head.append(SENT_START) + array_head = Doc._get_array_attrs() strings = set() for token in self: strings.add(token.tag_) strings.add(token.lemma_) + strings.add(token.morph_) strings.add(token.dep_) strings.add(token.ent_type_) strings.add(token.ent_kb_id_) + strings.add(token.ent_id_) strings.add(token.norm_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. 
As a best guess, we *know* that within @@ -1214,22 +1210,29 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#to_json """ data = {"text": self.text} - if self.is_nered: + if self.has_annotation("ENT_IOB"): data["ents"] = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in self.ents] - if self.is_sentenced: + if self.has_annotation("SENT_START"): sents = list(self.sents) data["sents"] = [{"start": sent.start_char, "end": sent.end_char} for sent in sents] if self.cats: data["cats"] = self.cats data["tokens"] = [] + attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"] + include_annotation = {attr: self.has_annotation(attr) for attr in attrs} for token in self: token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} - if self.is_tagged: - token_data["pos"] = token.pos_ + if include_annotation["TAG"]: token_data["tag"] = token.tag_ - if self.is_parsed: + if include_annotation["POS"]: + token_data["pos"] = token.pos_ + if include_annotation["MORPH"]: + token_data["morph"] = token.morph_ + if include_annotation["LEMMA"]: + token_data["lemma"] = token.lemma_ + if include_annotation["DEP"]: token_data["dep"] = token.dep_ token_data["head"] = token.head.i data["tokens"].append(token_data) @@ -1275,6 +1278,12 @@ cdef class Doc: j += 1 return output + @staticmethod + def _get_array_attrs(): + attrs = [LENGTH, SPACY] + attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS) + return tuple(attrs) + cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int i = token_by_char(tokens, length, start_char) @@ -1335,7 +1344,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: tokens[i].sent_start = -1 for i in range(start, end): if tokens[i].head == 0: - tokens[tokens[i].l_edge].sent_start = True + tokens[tokens[i].l_edge].sent_start = 1 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 1f42c84ee..781474d3a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -201,7 +201,7 @@ cdef class Span: return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) - def as_doc(self, bint copy_user_data=False): + def as_doc(self, *, bint copy_user_data=False): """Create a `Doc` object with a copy of the `Span`'s data. copy_user_data (bool): Whether or not to copy the original doc's user data. 
@@ -209,19 +209,10 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#as_doc """ - # TODO: make copy_user_data a keyword-only argument (Python 3 only) words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID] - if self.doc.is_tagged: - array_head.append(TAG) - # If doc parsed add head and dep attribute - if self.doc.is_parsed: - array_head.extend([HEAD, DEP]) - # Otherwise add sent_start - else: - array_head.append(SENT_START) + array_head = self.doc._get_array_attrs() array = self.doc.to_array(array_head) array = array[self.start : self.end] self._fix_dep_copy(array_head, array) @@ -375,7 +366,7 @@ cdef class Span: self.doc.sents # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 - if self.doc.is_sentenced: + if self.doc.has_annotation("SENT_START"): # Find start of the sentence start = self.start while self.doc.c[start].sent_start != 1 and start > 0: @@ -507,8 +498,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#noun_chunks """ - if not self.doc.is_parsed: - raise ValueError(Errors.E029) # Accumulate the result before beginning to iterate over it. This # prevents the tokenisation from being changed out from under us # during the iteration. The tricky thing here is that Span accepts diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 35142c35e..239de4559 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -486,7 +486,7 @@ cdef class Token: return True def __set__(self, value): - if self.doc.is_parsed: + if self.doc.has_annotation("DEP"): raise ValueError(Errors.E043) if value is None: self.c.sent_start = 0 diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py index 85afdeef3..ebd123375 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -212,8 +212,6 @@ def doc_from_conllu_sentence( doc[i]._.merged_spaceafter = spaces[i] ents = get_entities(lines, ner_tag_pattern, ner_map) doc.ents = spans_from_biluo_tags(doc, ents) - doc.is_parsed = True - doc.is_tagged = True if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -243,8 +241,6 @@ def doc_from_conllu_sentence( doc_x[i].dep_ = deps[i] doc_x[i].head = doc_x[heads[i]] doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] - doc_x.is_parsed = True - doc_x.is_tagged = True return doc_x diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 5dc39eb31..b58df0d71 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) + attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") + include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} for j, sent in enumerate(doc.sents): json_sent = {"tokens": [], "brackets": []} for token in sent: json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} - if doc.is_tagged: + if include_annotation["TAG"]: json_token["tag"] = token.tag_ + if include_annotation["POS"]: json_token["pos"] = token.pos_ + if include_annotation["MORPH"]: json_token["morph"] = token.morph_ + if include_annotation["LEMMA"]: 
json_token["lemma"] = token.lemma_ - if doc.is_parsed: + if include_annotation["DEP"]: json_token["head"] = token.head.i-token.i json_token["dep"] = token.dep_ - json_token["ner"] = biluo_tags[token.i] + if include_annotation["ENT_IOB"]: + json_token["ner"] = biluo_tags[token.i] json_sent["tokens"].append(json_token) json_para["sentences"].append(json_sent) json_doc["paragraphs"].append(json_para) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 88dc62c2a..380f6a172 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | ----------- | -------------------------------------------------------------------------------------- | | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | +## Doc.has_annotation {#has_annotation tag="method"} + +Check whether the doc contains annotation on a token attribute. + +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------- | +| `attr` | The attribute string name or int ID. ~~Union[int, str]~~ | +| _keyword-only_ | | +| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ | +| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ | + ## Doc.to_array {#to_array tag="method"} Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence @@ -609,26 +620,22 @@ The L2 norm of the document's vector representation. ## Attributes {#attributes} -| Name | Description | -| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `text` | A string representation of the document text. ~~str~~ | -| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | -| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | -| `vocab` | The store of lexical types. ~~Vocab~~ | -| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | -| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | -| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | -| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | -| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | -| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_nered` 2.1 | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | -| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. 
~~Dict[str, Callable]~~ | -| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | -| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | +| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | +| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 44810da58..346b44600 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0. | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. 
| @@ -763,6 +764,25 @@ nlp = spacy.blank("en") + ruler.load_from_tag_map(YOUR_TAG_MAP) ``` +### Migrating Doc flags {#migrating-doc-flags} + +The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and +`Doc.is_sentenced` are deprecated in v3 and replaced by +[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the +token attribute symbols (the same symbols used in `Matcher` patterns): + +```diff +doc = nlp(text) +- doc.is_parsed ++ doc.has_annotation("DEP") +- doc.is_tagged ++ doc.has_annotation("TAG") +- doc.is_sentenced ++ doc.has_annotation("SENT_START") +- doc.is_nered ++ doc.has_annotation("ENT_IOB") +``` + ### Training pipelines and models {#migrating-training} To train your pipelines, you should now pretty much always use the From 8303d101a5327e96ecddb28d7dc668d75db56b50 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 00:18:49 +0200 Subject: [PATCH 10/10] Set version to v3.0.0a19 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4ed3dd327..4fb6dfff1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a18" +__version__ = "3.0.0a19" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
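To make the migration section added to `website/docs/usage/v3.md` concrete: the four deprecated flags now forward to `Doc.has_annotation` (emitting a `DeprecationWarning`, per the `doc.pyx` hunks above), and `require_complete=True` distinguishes partial from complete annotation. A small sketch, assuming a blank English pipeline rather than a trained one:

```python
import spacy

nlp = spacy.blank("en")          # placeholder pipeline; any Language works
doc = nlp("This is a sentence.")

# v2-style flags map onto token attribute names (the same symbols used in
# Matcher patterns):
#   doc.is_tagged     -> doc.has_annotation("TAG")
#   doc.is_parsed     -> doc.has_annotation("DEP")
#   doc.is_nered      -> doc.has_annotation("ENT_IOB")
#   doc.is_sentenced  -> doc.has_annotation("SENT_START")

assert not doc.has_annotation("TAG")
doc[0].tag_ = "DT"               # hand-annotate a single token
assert doc.has_annotation("TAG")                             # any token is enough
assert not doc.has_annotation("TAG", require_complete=True)  # not every token yet
```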
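The `Doc.from_array` hunk above also relaxes the old `E032` error: passing both `HEAD` and `SENT_START` (outside the default attribute set) now emits the `W106` warning and lets the head-derived sentence boundaries win, as `test_doc_from_array_sent_starts` exercises. A rough sketch of that round trip; the toy words, heads and deps are invented for illustration:

```python
import warnings

from spacy.attrs import HEAD, SENT_START
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["Hello", "world", "!"]
doc = Doc(vocab, words=words)
# Hand-annotate a tiny parse so both HEAD and SENT_START carry values.
for token, dep in zip(doc, ["ROOT", "dep", "punct"]):
    token.dep_ = dep
doc[1].head = doc[0]
doc[2].head = doc[0]

arr = doc.to_array([SENT_START, HEAD])
new_doc = Doc(vocab, words=words)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    new_doc.from_array([SENT_START, HEAD], arr)  # warns: HEAD overrides SENT_START
assert caught  # at least the W106 warning was raised
```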