Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced` and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for `DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override `SENT_START`

* In `Doc.from_array()` using `attrs` other than `Doc._get_array_attrs()` (i.e., a user's custom list rather than our default internal list) with both `HEAD` and `SENT_START` shows a warning that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set sentence boundaries and sets `sent_start` for all non-sentence starts to `-1`

* Fix call to `set_children_from_heads`

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2025-12-15 14:14:31 +03:00 · 2020-09-17 00:14:01 +02:00 · 2020-09-17 00:14:01 +02:00 · 7e4cd7575c
commit 7e4cd7575c
parent a119667a36
56 changed files with 350 additions and 282 deletions
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        warnings.warn(Warnings.W005)
    if options.get("collapse_phrases", False):
        with doc.retokenize() as retokenizer:
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -119,6 +119,11 @@ class Warnings:
    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
            "need to match on a stream of documents, you can use nlp.pipe and "
            "call the {matcher} on each Doc object.")
    W106 = ("Both HEAD and SENT_START are included as attributes in "
            "doc.from_array(). The parse trees based on the HEAD attribute "
            "will override the values in SENT_START.")
    W107 = ("The property Doc.{prop} is deprecated. Use "
            "Doc.has_annotation(\"{attr}\") instead.")
@add_codes
@ -192,11 +197,6 @@ class Errors:
            "Alternatively, add the dependency parser, or set sentence "
            "boundaries by setting doc[i].is_sent_start.")
    E031 = ("Invalid token: empty string ('') at position {i}.")
    E032 = ("Conflicting attributes specified in doc.from_array(): "
            "(HEAD, SENT_START). The HEAD attribute currently sets sentence "
            "boundaries implicitly, based on the tree structure. This means "
            "the HEAD attribute would potentially override the sentence "
            "boundaries set by SENT_START.")
    E033 = ("Cannot load into non-empty Doc of length {length}.")
    E035 = ("Error creating span with start {start} and end {end} for Doc of "
            "length {length}.")
@ -397,8 +397,8 @@ class Errors:
    E154 = ("One of the attributes or values is not supported for token "
            "patterns. Please use the option validate=True with Matcher, "
            "PhraseMatcher, or EntityRuler for more details.")
-    E155 = ("The pipeline needs to include a tagger in order to use "
+    E155 = ("The pipeline needs to include a {pipe} in order to use "
-            "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
+            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
            "instead of list(nlp.tokenizer.pipe()).")
    E156 = ("The pipeline needs to include a parser in order to use "
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_label = doc.vocab.strings.add("NP")
    np_deps = set(doc.vocab.strings.add(label) for label in labels)
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    # Further improvement of the models will eliminate the need for this tag.
    labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
    doc = doclike.doc
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    if not len(doc):
        return
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@ -19,7 +19,7 @@ def noun_chunks(doclike):
    ]
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings.add(label) for label in labels]
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -17,7 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@ -215,10 +215,15 @@ cdef class Matcher:
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
        cdef Pool tmp_pool = Pool()
-        if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
+        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
-          and not doc.is_tagged:
+            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-            raise ValueError(Errors.E155.format())
+        if POS in self._seen_attrs and not doc.has_annotation("POS"):
-        if DEP in self._seen_attrs and not doc.is_parsed:
+            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
            raise ValueError(Errors.E156.format())
        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 import warnings
-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
 from ..tokens.span cimport Span
@ -184,12 +184,20 @@ cdef class PhraseMatcher:
            if len(doc) == 0:
                continue
            if isinstance(doc, Doc):
-                if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
+                attrs = (TAG, POS, MORPH, LEMMA, DEP)
-                    raise ValueError(Errors.E155.format())
+                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                if self.attr == DEP and not doc.is_parsed:
+                if self.attr == TAG and not has_annotation[TAG]:
                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
                if self.attr == POS and not has_annotation[POS]:
                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
                if self.attr == MORPH and not has_annotation[MORPH]:
                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
                if self.attr == LEMMA and not has_annotation[LEMMA]:
                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
                if self.attr == DEP and not has_annotation[DEP]:
                    raise ValueError(Errors.E156.format())
-                if self._validate and (doc.is_tagged or doc.is_parsed) \
+                if self._validate and any(has_annotation.values()) \
-                  and self.attr not in (DEP, POS, TAG, LEMMA):
+                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
                keyword = self._convert_to_array(doc)
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -679,7 +679,6 @@ cdef class ArcEager(TransitionSystem):
                st._sent[i].dep = self.root_label
    def finalize_doc(self, Doc doc):
        doc.is_parsed = True
        set_children_from_heads(doc.c, 0, doc.length)
    def has_gold(self, Example eg, start=0, end=None):
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
    DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
    """
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        return doc
    with doc.retokenize() as retokenizer:
        for np in doc.noun_chunks:
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -204,8 +204,6 @@ class Morphologizer(Tagger):
                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
                doc.c[j].pos = self.cfg["labels_pos"][morph]
            doc.is_morphed = True
    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -168,7 +168,6 @@ class Tagger(Pipe):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0:
                    doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
            doc.is_tagged = True
    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
        """Learn from a batch of documents and gold-standard information,
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text):
    tokens = en_tokenizer(text)
    tokens[0].lemma_ = "lemma"
    tokens[0].norm_ = "norm"
    tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)]
    tokens[0].ent_kb_id_ = "ent_kb_id"
    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert tokens.text == new_tokens.text
@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer):
 def test_doc_api_sents_empty_string(en_tokenizer):
    doc = en_tokenizer("")
    doc.is_parsed = True
    sents = list(doc.sents)
    assert len(sents) == 0
@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer):
    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
    deps = ["dep"] * len(heads)
    # fmt: on
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[6].text == "for"
    subtree = [w.text for w in doc[6].subtree]
    # fmt: off
@ -240,7 +241,9 @@ def test_doc_api_similarity_match():
 )
 def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
    tokens = en_tokenizer(sentence)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    doc = get_doc(
        tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
    )
    lca = doc.get_lca_matrix()
    assert (lca == lca_matrix).all()
    assert lca[1, 1] == 1
@ -251,16 +254,16 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
 def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
-    assert not doc.is_nered
+    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
-    assert doc.is_nered
+    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
-    assert doc.is_nered
+    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
-    assert new_doc.is_nered
+    assert new_doc.has_annotation("ENT_IOB")
 def test_doc_from_array_sent_starts(en_vocab):
@ -271,25 +274,35 @@ def test_doc_from_array_sent_starts(en_vocab):
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    # HEAD overrides SENT_START with warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
-    with pytest.raises(ValueError):
+    with pytest.warns(UserWarning):
        new_doc.from_array(attrs, arr)
-    attrs = [SENT_START, DEP]
+    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
        new_doc.from_array(attrs, arr)
        assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-    assert not new_doc.is_parsed
+    assert not new_doc.has_annotation("DEP")
    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-    assert new_doc.is_parsed
+    assert new_doc.has_annotation("DEP")
 def test_doc_from_array_morph(en_vocab):
@ -359,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    assert m_doc[9].idx == think_idx
    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
    with pytest.raises(ValueError):
        # important attributes from sentenziser or parser are missing
        assert list(m_doc.sents)
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert str(m_doc) == " ".join(en_texts_without_empty)
@ -373,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    assert m_doc[9].idx == think_idx
 def test_doc_api_from_docs_ents(en_tokenizer):
    texts = ["Merging the docs is fun.", "They don't think alike."]
    docs = [en_tokenizer(t) for t in texts]
    docs[0].ents = ()
    docs[1].ents = (Span(docs[1], 0, 1, label="foo"),)
    doc = Doc.from_docs(docs)
    assert len(doc.ents) == 1
 def test_doc_lang(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    assert doc.lang_ == "en"
@ -393,3 +412,45 @@ def test_token_lexeme(en_vocab):
    assert isinstance(token.lex, Lexeme)
    assert token.lex.text == token.text
    assert en_vocab[token.orth] == token.lex
 def test_has_annotation(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)
    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
    doc[0].morph_ = "Feat=Val"
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
    doc.ents = [Span(doc, 0, 1, label="HELLO")]
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)
    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
    doc[1].morph_ = ""
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)
 def test_is_flags_deprecated(en_tokenizer):
    doc = en_tokenizer("test")
    with pytest.deprecated_call():
        doc.is_tagged
    with pytest.deprecated_call():
        doc.is_parsed
    with pytest.deprecated_call():
        doc.is_nered
    with pytest.deprecated_call():
        doc.is_sentenced
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer):
    text = "This is a sentence. This is another sentence. And a third."
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
    doc.is_parsed = False
    return doc
@ -71,8 +70,9 @@ def test_spans_string_fn(doc):
 def test_spans_root2(en_tokenizer):
    text = "through North and South Carolina"
    heads = [0, 3, -1, -2, -4]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[-2:].root.text == "Carolina"
@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
    """Test span's lca matrix generation"""
    tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
    lca = doc[:2].get_lca_matrix()
    assert lca.shape == (2, 2)
    assert lca[0, 0] == 0  # the & the -> the
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@ -114,8 +114,9 @@ def test_doc_token_api_ancestors(en_tokenizer):
 def test_doc_token_api_head_setter(en_tokenizer):
    text = "Yesterday I saw a dog that barked loudly."
    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[6].n_lefts == 1
    assert doc[6].n_rights == 1
@ -208,7 +209,6 @@ def test_is_sent_start(en_tokenizer):
    assert doc[5].is_sent_start is None
    doc[5].is_sent_start = True
    assert doc[5].is_sent_start is True
    doc.is_parsed = True
    assert len(list(doc.sents)) == 2
@ -217,7 +217,6 @@ def test_is_sent_end(en_tokenizer):
    assert doc[4].is_sent_end is None
    doc[5].is_sent_start = True
    assert doc[4].is_sent_end is True
    doc.is_parsed = True
    assert len(list(doc.sents)) == 2
@ -242,14 +241,14 @@ def test_token0_has_sent_start_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_start is True
    assert doc[1].is_sent_start is None
-    assert not doc.is_sentenced
+    assert not doc.has_annotation("SENT_START")
 def test_tokenlast_has_sent_end_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_end is None
    assert doc[1].is_sent_end is True
-    assert not doc.is_sentenced
+    assert not doc.has_annotation("SENT_START")
 def test_token_api_conjuncts_chain(en_vocab):
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@ -3,11 +3,7 @@ import pytest
 def test_noun_chunks_is_parsed_de(de_tokenizer):
    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = de_tokenizer("Er lag auf seinem")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@ -3,11 +3,7 @@ import pytest
 def test_noun_chunks_is_parsed_el(el_tokenizer):
    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@ -11,12 +11,8 @@ from ...util import get_doc
 def test_noun_chunks_is_parsed(en_tokenizer):
    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = en_tokenizer("This is a sentence")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/en/test_sbd.py
+++ b/spacy/tests/lang/en/test_sbd.py
@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
 def test_en_sbd_single_punct(en_tokenizer, text, punct):
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text + punct)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert len(doc) == 4 if punct else 3
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@ -3,11 +3,7 @@ import pytest
 def test_noun_chunks_is_parsed_es(es_tokenizer):
    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = es_tokenizer("en Oxford este verano")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/fa/test_noun_chunks.py
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@ -3,12 +3,8 @@ import pytest
 def test_noun_chunks_is_parsed_fa(fa_tokenizer):
    """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = fa_tokenizer("این یک جمله نمونه می باشد.")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@ -3,11 +3,7 @@ import pytest
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = fr_tokenizer("trouver des travaux antérieurs")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@ -3,11 +3,7 @@ import pytest
 def test_noun_chunks_is_parsed_id(id_tokenizer):
    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = id_tokenizer("sebelas")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@ -3,11 +3,7 @@ import pytest
 def test_noun_chunks_is_parsed_nb(nb_tokenizer):
    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = nb_tokenizer("Smørsausen brukes bl.a. til")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@ -5,12 +5,8 @@ from ...util import get_doc
 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = sv_tokenizer("Studenten läste den bästa boken")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab):
 def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].morph_ = "Feat=Val"
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
-    # DEP requires is_parsed
+    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
-    # TAG, POS, LEMMA require is_tagged
+    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab):
 def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].morph_ = "Feat=Val"
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
@ -212,18 +214,21 @@ def test_attr_validation(en_vocab):
 def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].morph_ = "Feat=Val"
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
-    # DEP requires is_parsed
+    # DEP requires DEP
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
-    # TAG, POS, LEMMA require is_tagged
+    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser):
 def test_parser_parse_subtrees(en_tokenizer, en_parser):
    text = "The four wheels on the bus turned quickly"
    heads = [2, 1, 4, -1, 1, -2, 0, -1]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert len(list(doc[2].lefts)) == 2
    assert len(list(doc[2].rights)) == 1
    assert len(list(doc[2].children)) == 3
@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab):
        if i == 0 or i == 3:
            assert doc[i].is_sent_start is True
        else:
-            assert not doc[i].is_sent_start
+            assert doc[i].is_sent_start is False
    for sent in doc.sents:
        for token in sent:
            assert token.head in sent
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
    lefts = {}
    rights = {}
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence
 def test_parser_space_attachment(en_tokenizer):
    text = "This is a test.\nTo ensure  spaces are attached well."
    heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    for sent in doc.sents:
        if len(sent) == 1:
            assert not sent[-1].is_space
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")
 def test_attributeruler_init_patterns(nlp, pattern_dicts):
@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")
    nlp.remove_pipe("attribute_ruler")
    # initialize with patterns from asset
    nlp.add_pipe(
@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
    assert doc[2].morph_ == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert doc[3].morph_ == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")
 def test_attributeruler_score(nlp, pattern_dicts):
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@ -35,8 +35,6 @@ def doc2(en_tokenizer):
        deps=deps,
    )
    doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
    doc.is_parsed = True
    doc.is_tagged = True
    return doc
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@ -9,7 +9,7 @@ def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
    sent_starts = [t.is_sent_start for t in doc]
    sent_ends = [t.is_sent_end for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
@ -22,13 +22,13 @@ def test_sentencizer_pipe():
    nlp = English()
    nlp.add_pipe("sentencizer")
    for doc in nlp.pipe(texts):
-        assert doc.is_sentenced
+        assert doc.has_annotation("SENT_START")
        sent_starts = [t.is_sent_start for t in doc]
        assert sent_starts == [True, False, True, False, False, False, False]
        assert len(list(doc.sents)) == 2
    for ex in nlp.pipe(texts):
        doc = ex.doc
-        assert doc.is_sentenced
+        assert doc.has_annotation("SENT_START")
        sent_starts = [t.is_sent_start for t in doc]
        assert sent_starts == [True, False, True, False, False, False, False]
        assert len(list(doc.sents)) == 2
@ -42,7 +42,7 @@ def test_sentencizer_empty_docs():
    nlp.add_pipe("sentencizer")
    for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
        for doc in nlp.pipe(texts):
-            assert doc.is_sentenced
+            assert doc.has_annotation("SENT_START")
            sent_starts = [t.is_sent_start for t in doc]
            if len(doc) == 0:
                assert sent_starts == []
@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
    assert [t.is_sent_start for t in doc] == sent_starts
    assert [t.is_sent_end for t in doc] == sent_ends
    assert len(list(doc.sents)) == n_sents
@ -115,7 +115,7 @@ def test_sentencizer_custom_punct(
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
    assert [t.is_sent_start for t in doc] == sent_starts
    assert [t.is_sent_end for t in doc] == sent_ends
    assert len(list(doc.sents)) == n_sents
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@ -94,7 +94,6 @@ def test_issue309(en_tokenizer):
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
    )
    doc.is_parsed = True
    assert len(doc) == 1
    sents = list(doc.sents)
    assert len(sents) == 1
@ -170,11 +169,9 @@ def test_issue595():
 def test_issue599(en_vocab):
    """An empty Doc must survive a to_bytes/from_bytes round trip and still
    report dependency annotation afterwards."""
    # `is_tagged` / `is_parsed` are getter-only properties now, so they are
    # not assigned here; has_annotation() reports True for an empty doc.
    doc = Doc(en_vocab)
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.has_annotation("DEP")
 def test_issue600():
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
 from spacy.attrs import HEAD, DEP
 from spacy.matcher import Matcher
-from ..util import make_tempdir
+from ..util import make_tempdir, get_doc
 def test_issue1506():
@ -198,17 +198,26 @@ def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
-    doc = Doc(Vocab(), words=string.split())
+    words = string.split()
-    doc[6].sent_start = True
+    doc = get_doc(Vocab(), words=words)
    doc[6].is_sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
-    assert not new_doc.is_parsed
+    assert not new_doc.has_annotation("DEP")
-    assert not new_doc.is_tagged
+    assert not new_doc.has_annotation("TAG")
-    doc.is_parsed = True
+    doc = get_doc(
-    doc.is_tagged = True
+        Vocab(),
        words=words,
        tags=["TAG"] * len(words),
        heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
        deps=["dep"] * len(words),
    )
    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    assert new_doc.is_parsed
+    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
-    assert new_doc.is_tagged
+    assert new_doc[6].sent_start
    assert new_doc.has_annotation("DEP")
    assert new_doc.has_annotation("TAG")
 def test_issue1868():
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@ -72,8 +72,6 @@ def test_issue2219(en_vocab):
 def test_issue2361(de_tokenizer):
    """The rendered markup must contain the escaped form of each HTML
    special character present in the input text."""
    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
    # No parse/tag flags are set on the doc: `is_parsed` / `is_tagged` are
    # getter-only properties and cannot be assigned.
    doc = de_tokenizer('< > & " ')
    html = render(doc)
    for char in chars:
        assert char in html
@ -108,6 +106,7 @@ def test_issue2385_biluo(tags):
 def test_issue2396(en_vocab):
    words = ["She", "created", "a", "test", "for", "spacy"]
    heads = [1, 0, 1, -2, -1, -1]
    deps = ["dep"] * len(heads)
    matrix = numpy.array(
        [
            [0, 1, 1, 1, 1, 1],
@ -119,7 +118,7 @@ def test_issue2396(en_vocab):
        ],
        dtype=numpy.int32,
    )
-    doc = get_doc(en_vocab, words=words, heads=heads)
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@ -16,16 +16,16 @@ from ..util import get_doc
 def test_issue2564():
-    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
+    """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.begin_training()
    doc = nlp("hello world")
-    assert doc.is_tagged
+    assert doc.has_annotation("TAG")
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
-    assert piped_doc.is_tagged
+    assert piped_doc.has_annotation("TAG")
 def test_issue2569(en_tokenizer):
@ -123,7 +123,7 @@ def test_issue2772(en_vocab):
    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
    deps = ["dep"] * len(heads)
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-    assert not doc[1].is_sent_start
+    assert doc[1].is_sent_start is False
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -63,7 +63,7 @@ def test_issue3012(en_vocab):
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = [(2, 4, "PERCENT")]
    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
-    assert doc.is_tagged
+    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
@ -83,10 +83,14 @@ def test_issue3012(en_vocab):
 def test_issue3199():
    """Test that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
-    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
+    with a new Vocab here and a parse tree to make sure the noun chunks run.
    """
-    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
+    doc = get_doc(
-    doc.is_parsed = True
+        Vocab(),
        words=["This", "is", "a", "sentence"],
        heads=[0, -1, -2, -3],
        deps=["dep"] * 4,
    )
    assert list(doc[0:3].noun_chunks) == []
@ -250,16 +254,16 @@ def test_issue3456():
 def test_issue3468():
-    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
+    """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
-    assert new_doc.is_sentenced
+    assert new_doc.has_annotation("SENT_START")
    assert len(list(new_doc.sents)) == 1
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@ -356,7 +356,6 @@ def test_issue3882(en_vocab):
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)
@ -386,7 +385,6 @@ def test_issue3959():
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@ -189,7 +189,6 @@ def test_issue4133(en_vocab):
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
@ -249,7 +248,7 @@ def test_issue4267():
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
-    assert doc1.is_nered
+    assert doc1.has_annotation("ENT_IOB")
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again
@ -260,7 +259,7 @@ def test_issue4267():
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
-    assert doc2.is_nered
+    assert doc2.has_annotation("ENT_IOB")
    for token in doc2:
        assert token.ent_iob == 2
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@ -80,7 +80,6 @@ def tagged_doc():
        doc[i].morph_ = morphs[i]
        if i > 0:
            doc[i].is_sent_start = False
    doc.is_tagged = True
    return doc
--- a/spacy/tests/test_training.py
+++ b/spacy/tests/test_training.py
@ -12,7 +12,7 @@ from thinc.api import compounding
 import pytest
 import srsly
-from .util import make_tempdir
+from .util import make_tempdir, get_doc
@pytest.fixture
@ -26,24 +26,16 @@ def doc():
              "NounType=prop|Number=sing", "PunctType=peri"]
    # head of '.' is intentionally nonprojective for testing
    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
    heads = [head - i for i, head in enumerate(heads)]
    deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    # fmt: on
    nlp = English()
-    doc = nlp(text)
+    words = [t.text for t in nlp.make_doc(text)]
-    for i in range(len(tags)):
+    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
        doc[i].tag_ = tags[i]
        doc[i].pos_ = pos[i]
        doc[i].morph_ = morphs[i]
        doc[i].lemma_ = lemmas[i]
        doc[i].dep_ = deps[i]
        doc[i].head = doc[heads[i]]
    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
    doc.cats = cats
    doc.is_tagged = True
    doc.is_parsed = True
    return doc
@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab):
    docs = json2docs(data)
    assert len(docs) == 1
    for doc in docs:
-        assert not doc.is_nered
+        assert not doc.has_annotation("ENT_IOB")
    for token in doc:
        assert token.ent_iob == 0
    eg = Example(
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@ -13,7 +13,7 @@ from ..errors import Errors
 from ..util import ensure_path, SimpleFrozenList
 # fmt: off
-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
+ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
 # fmt: on
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@ -46,10 +46,6 @@ cdef class Doc:
    cdef TokenC* c
    cdef public bint is_tagged
    cdef public bint is_parsed
    cdef public bint is_morphed
    cdef public float sentiment
    cdef public dict user_hooks
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -24,9 +24,11 @@ from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 from ..attrs import intify_attr, IDS
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
 from .. import util
 from .underscore import Underscore, get_ext_args
 from ._retokenize import Retokenizer
 from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
 DEF PADDING = 5
@ -185,8 +187,6 @@ cdef class Doc:
        self.c = data_start + PADDING
        self.max_length = size
        self.length = 0
        self.is_tagged = False
        self.is_parsed = False
        self.sentiment = 0.0
        self.cats = {}
        self.user_hooks = {}
@ -216,11 +216,6 @@ cdef class Doc:
            else:
                lexeme = self.vocab.get_by_orth(self.mem, word)
            self.push_back(lexeme, has_space)
        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
        # There's no information we'd like to add to it, so I guess so?
        if self.length == 0:
            self.is_tagged = True
            self.is_parsed = True
    @property
    def _(self):
@ -228,37 +223,61 @@ cdef class Doc:
        return Underscore(Underscore.doc_extensions, self)
    @property
-    def is_sentenced(self):
+    def is_tagged(self):
-        """Check if the document has sentence boundaries assigned. This is
+        warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning)
-        defined as having at least one of the following:
+        return self.has_annotation("TAG")
-        a) An entry "sents" in doc.user_hooks";
+    @property
-        b) Doc.is_parsed is set to True;
+    def is_parsed(self):
-        c) At least one token other than the first where sent_start is not None.
+        warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning)
-        """
+        return self.has_annotation("DEP")
        if "sents" in self.user_hooks:
            return True
        if self.is_parsed:
            return True
        if len(self) < 2:
            return True
        for i in range(1, self.length):
            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                return True
        return False
    @property
    def is_nered(self):
-        """Check if the document has named entities set. Will return True if
+        warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning)
-        *any* of the tokens has a named entity tag set (even if the others are
+        return self.has_annotation("ENT_IOB")
-        unknown values), or if the document is empty.
+
    @property
    def is_sentenced(self):
        warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning)
        return self.has_annotation("SENT_START")
    def has_annotation(self, attr, *, require_complete=False):
        """Check whether the doc contains annotation on a token attribute.

        Special cases: an empty doc always counts as annotated; `HEAD` is
        checked via `DEP`; `IS_SENT_START` is treated as `SENT_START`; and
        for `SENT_START` a "sents" user hook or a doc of length 1 always
        counts as annotated, with the first token left out of the check.

        attr (Union[int, str]): The attribute string name or int ID.
        require_complete (bool): Whether to check that the attribute is set on
            every token in the doc.
        RETURNS (bool): Whether annotation is present.
        DOCS: https://nightly.spacy.io/api/doc#has_annotation
        """
-        if len(self) == 0:
+
        # empty docs are always annotated
        if self.length == 0:
            return True
-        for i in range(self.length):
+        cdef int i
-            if self.c[i].ent_iob != 0:
+        cdef int range_start = 0
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
            # HEAD does not have an unset state, so rely on DEP
            attr = DEP
        elif attr == self.vocab.strings["IS_SENT_START"]:
            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
            attr = SENT_START
        # special cases for sentence boundaries
        if attr == SENT_START:
            if "sents" in self.user_hooks:
                return True
-        return False
+            # docs of length 1 always have sentence boundaries
            if self.length == 1:
                return True
            # the first token is left out of the sentence boundary check
            range_start = 1
        # a falsy attribute value counts as "not set": complete annotation
        # needs a truthy value on every checked token, partial annotation
        # needs it on at least one
        if require_complete:
            return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
        else:
            return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.
@ -628,7 +647,7 @@ cdef class Doc:
        DOCS: https://nightly.spacy.io/api/doc#sents
        """
-        if not self.is_sentenced:
+        if not self.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        if "sents" in self.user_hooks:
            yield from self.user_hooks["sents"](self)
@ -652,10 +671,6 @@ cdef class Doc:
        return self.vocab.lang
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
            # Flip these to false when we see the first token.
            self.is_tagged = False
            self.is_parsed = False
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.c[self.length]
@ -802,8 +817,8 @@ cdef class Doc:
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W028.format(type=array.dtype))
-        if SENT_START in attrs and HEAD in attrs:
+        if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
-            raise ValueError(Errors.E032)
+            warnings.warn(Warnings.W106)
        cdef int i, col
        cdef int32_t abs_head_index
        cdef attr_id_t attr_id
@ -863,18 +878,17 @@ cdef class Doc:
                    # add morph to morphology table
                    self.vocab.morphology.add(self.vocab.strings[value])
                Token.set_struct_attr(token, attr_ids[j], value)
-        # Set flags
+        # If document is parsed, set children and sentence boundaries
-        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
+        if HEAD in attrs and DEP in attrs:
-        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
+            col = attrs.index(DEP)
-        # If document is parsed, set children
+            if array[:, col].any():
-        if self.is_parsed:
+                set_children_from_heads(self.c, 0, length)
            set_children_from_heads(self.c, 0, length)
        return self
    @staticmethod
    def from_docs(docs, ensure_whitespace=True, attrs=None):
-        """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
+        """Concatenate multiple Doc objects to form a new one. Raises an error
-        the same `Vocab`.
+        if the `Doc` objects do not all share the same `Vocab`.
        docs (list): A list of Doc objects.
        ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
@ -892,16 +906,7 @@ cdef class Doc:
        (vocab,) = vocab
        if attrs is None:
-            attrs = [LEMMA, NORM]
+            attrs = Doc._get_array_attrs()
            if all(doc.is_nered for doc in docs):
                attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
            # TODO: separate for is_morphed?
            if all(doc.is_tagged for doc in docs):
                attrs.extend([TAG, POS, MORPH])
            if all(doc.is_parsed for doc in docs):
                attrs.extend([HEAD, DEP])
            else:
                attrs.append(SENT_START)
        else:
            if any(isinstance(attr, str) for attr in attrs):     # resolve attribute names
                attrs = [intify_attr(attr) for attr in attrs]    # intify_attr returns None for invalid attrs
@ -973,9 +978,6 @@ cdef class Doc:
        other.tensor = copy.deepcopy(self.tensor)
        other.cats = copy.deepcopy(self.cats)
        other.user_data = copy.deepcopy(self.user_data)
        other.is_tagged = self.is_tagged
        other.is_parsed = self.is_parsed
        other.is_morphed = self.is_morphed
        other.sentiment = self.sentiment
        other.has_unknown_spaces = self.has_unknown_spaces
        other.user_hooks = dict(self.user_hooks)
@ -1049,22 +1051,16 @@ cdef class Doc:
        DOCS: https://nightly.spacy.io/api/doc#to_bytes
        """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
+        array_head = Doc._get_array_attrs()
        if self.is_tagged:
            array_head.extend([TAG, POS])
        # If doc parsed add head and dep attribute
        if self.is_parsed:
            array_head.extend([HEAD, DEP])
        # Otherwise add sent_start
        else:
            array_head.append(SENT_START)
        strings = set()
        for token in self:
            strings.add(token.tag_)
            strings.add(token.lemma_)
            strings.add(token.morph_)
            strings.add(token.dep_)
            strings.add(token.ent_type_)
            strings.add(token.ent_kb_id_)
            strings.add(token.ent_id_)
            strings.add(token.norm_)
        # Msgpack doesn't distinguish between lists and tuples, which is
        # vexing for user data. As a best guess, we *know* that within
@ -1214,22 +1210,29 @@ cdef class Doc:
        DOCS: https://nightly.spacy.io/api/doc#to_json
        """
        data = {"text": self.text}
-        if self.is_nered:
+        if self.has_annotation("ENT_IOB"):
            data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                            "label": ent.label_} for ent in self.ents]
-        if self.is_sentenced:
+        if self.has_annotation("SENT_START"):
            sents = list(self.sents)
            data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                             for sent in sents]
        if self.cats:
            data["cats"] = self.cats
        data["tokens"] = []
        attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"]
        include_annotation = {attr: self.has_annotation(attr) for attr in attrs}
        for token in self:
            token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
-            if self.is_tagged:
+            if include_annotation["TAG"]:
                token_data["pos"] = token.pos_
                token_data["tag"] = token.tag_
-            if self.is_parsed:
+            if include_annotation["POS"]:
                token_data["pos"] = token.pos_
            if include_annotation["MORPH"]:
                token_data["morph"] = token.morph_
            if include_annotation["LEMMA"]:
                token_data["lemma"] = token.lemma_
            if include_annotation["DEP"]:
                token_data["dep"] = token.dep_
                token_data["head"] = token.head.i
            data["tokens"].append(token_data)
@ -1275,6 +1278,12 @@ cdef class Doc:
                    j += 1
        return output
    @staticmethod
    def _get_array_attrs():
        """Build the attribute ID list: LENGTH and SPACY first, followed by
        the int ID of every entry in the `DocBin` attribute list.

        RETURNS (tuple): The attribute IDs, in that order.
        """
        docbin_ids = [intify_attr(name) for name in DOCBIN_ALL_ATTRS]
        return tuple([LENGTH, SPACY] + docbin_ids)
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
    cdef int i = token_by_char(tokens, length, start_char)
@ -1335,7 +1344,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
        tokens[i].sent_start = -1
    for i in range(start, end):
        if tokens[i].head == 0:
-            tokens[tokens[i].l_edge].sent_start = True
+            tokens[tokens[i].l_edge].sent_start = 1
 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -201,7 +201,7 @@ cdef class Span:
        return Underscore(Underscore.span_extensions, self,
                          start=self.start_char, end=self.end_char)
-    def as_doc(self, bint copy_user_data=False):
+    def as_doc(self, *, bint copy_user_data=False):
        """Create a `Doc` object with a copy of the `Span`'s data.
        copy_user_data (bool): Whether or not to copy the original doc's user data.
@ -209,19 +209,10 @@ cdef class Span:
        DOCS: https://nightly.spacy.io/api/span#as_doc
        """
        # NOTE: copy_user_data is a keyword-only argument.
        words = [t.text for t in self]
        spaces = [bool(t.whitespace_) for t in self]
        cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
+        array_head = self.doc._get_array_attrs()
        if self.doc.is_tagged:
            array_head.append(TAG)
        # If doc parsed add head and dep attribute
        if self.doc.is_parsed:
            array_head.extend([HEAD, DEP])
        # Otherwise add sent_start
        else:
            array_head.append(SENT_START)
        array = self.doc.to_array(array_head)
        array = array[self.start : self.end]
        self._fix_dep_copy(array_head, array)
@ -375,7 +366,7 @@ cdef class Span:
        self.doc.sents
        # Use `sent_start` token attribute to find sentence boundaries
        cdef int n = 0
-        if self.doc.is_sentenced:
+        if self.doc.has_annotation("SENT_START"):
            # Find start of the sentence
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0:
@ -507,8 +498,6 @@ cdef class Span:
        DOCS: https://nightly.spacy.io/api/span#noun_chunks
        """
        if not self.doc.is_parsed:
            raise ValueError(Errors.E029)
        # Accumulate the result before beginning to iterate over it. This
        # prevents the tokenisation from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -486,7 +486,7 @@ cdef class Token:
                return True
        def __set__(self, value):
-            if self.doc.is_parsed:
+            if self.doc.has_annotation("DEP"):
                raise ValueError(Errors.E043)
            if value is None:
                self.c.sent_start = 0
--- a/spacy/training/converters/conllu2docs.py
+++ b/spacy/training/converters/conllu2docs.py
@ -212,8 +212,6 @@ def doc_from_conllu_sentence(
        doc[i]._.merged_spaceafter = spaces[i]
    ents = get_entities(lines, ner_tag_pattern, ner_map)
    doc.ents = spans_from_biluo_tags(doc, ents)
    doc.is_parsed = True
    doc.is_tagged = True
    if merge_subtokens:
        doc = merge_conllu_subtokens(lines, doc)
@ -243,8 +241,6 @@ def doc_from_conllu_sentence(
        doc_x[i].dep_ = deps[i]
        doc_x[i].head = doc_x[heads[i]]
    doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
    doc_x.is_parsed = True
    doc_x.is_tagged = True
    return doc_x
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                json_para["links"].append(link_dict)
        biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
        attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
        include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
        for j, sent in enumerate(doc.sents):
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
-                if doc.is_tagged:
+                if include_annotation["TAG"]:
                    json_token["tag"] = token.tag_
                if include_annotation["POS"]:
                    json_token["pos"] = token.pos_
                if include_annotation["MORPH"]:
                    json_token["morph"] = token.morph_
                if include_annotation["LEMMA"]:
                    json_token["lemma"] = token.lemma_
-                if doc.is_parsed:
+                if include_annotation["DEP"]:
                    json_token["head"] = token.head.i-token.i
                    json_token["dep"] = token.dep_
-                json_token["ner"] = biluo_tags[token.i]
+                if include_annotation["ENT_IOB"]:
                    json_token["ner"] = biluo_tags[token.i]
                json_sent["tokens"].append(json_token)
            json_para["sentences"].append(json_sent)
        json_doc["paragraphs"].append(json_para)
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
 | ----------- | -------------------------------------------------------------------------------------- |
 | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
 ## Doc.has_annotation {#has_annotation tag="method"}
 Check whether the doc contains annotation on a token attribute.
 | Name               | Description                                                                                         |
 | ------------------ | --------------------------------------------------------------------------------------------------- |
 | `attr`             | The attribute string name or int ID. ~~Union[int, str]~~                                            |
 | _keyword-only_     |                                                                                                     |
 | `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ |
 | **RETURNS**        | Whether specified annotation is present in the doc. ~~bool~~                                        |
 ## Doc.to_array {#to_array tag="method"}
 Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
@ -609,26 +620,22 @@ The L2 norm of the document's vector representation.
 ## Attributes {#attributes}
-| Name                                    | Description                                                                                                                                                                              |
+| Name                                 | Description                                                                                                                                 |
-| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text`                                  | A string representation of the document text. ~~str~~                                                                                                                                    |
+| `text`                               | A string representation of the document text. ~~str~~                                                                                       |
-| `text_with_ws`                          | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                                                            |
+| `text_with_ws`                       | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                               |
-| `mem`                                   | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                                                                 |
+| `mem`                                | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                    |
-| `vocab`                                 | The store of lexical types. ~~Vocab~~                                                                                                                                                    |
+| `vocab`                              | The store of lexical types. ~~Vocab~~                                                                                                       |
-| `tensor` <Tag variant="new">2</Tag>     | Container for dense vector representations. ~~numpy.ndarray~~                                                                                                                            |
+| `tensor` <Tag variant="new">2</Tag>  | Container for dense vector representations. ~~numpy.ndarray~~                                                                               |
-| `cats` <Tag variant="new">2</Tag>       | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~                                              |
+| `cats` <Tag variant="new">2</Tag>    | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
-| `user_data`                             | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                                                                         |
+| `user_data`                          | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                            |
-| `lang` <Tag variant="new">2.1</Tag>     | Language of the document's vocabulary. ~~int~~                                                                                                                                           |
+| `lang` <Tag variant="new">2.1</Tag>  | Language of the document's vocabulary. ~~int~~                                                                                              |
-| `lang_` <Tag variant="new">2.1</Tag>    | Language of the document's vocabulary. ~~str~~                                                                                                                                           |
+| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~                                                                                              |
-| `is_tagged`                             | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~                                                                       |
+| `sentiment`                          | The document's positivity/negativity score, if available. ~~float~~                                                                         |
-| `is_parsed`                             | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~                                                                        |
+| `user_hooks`                         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                   |
-| `is_sentenced`                          | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~                                                             |
+| `user_token_hooks`                   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                           |
-| `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ |
+| `user_span_hooks`                    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                            |
-| `sentiment`                             | The document's positivity/negativity score, if available. ~~float~~                                                                                                                      |
+| `_`                                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~               |
-| `user_hooks`                            | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                                                                |
-| `user_token_hooks`                      | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                                                                        |
-| `user_span_hooks`                       | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                                                                         |
-| `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                            |
 ## Serialization fields {#serialization-fields}
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | [`Token.lex`](/api/token#attributes)                                                                                            | Access a token's [`Lexeme`](/api/lexeme).                                                                                                                                                        |
 | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes)                                                 | Access a token's morphological analysis.                                                                                                                                                         |
 | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                 | Check whether a doc has annotation on a token attribute.                                                                                                                                         |
 | [`Language.select_pipes`](/api/language#select_pipes)                                                                           | Context manager for enabling or disabling specific pipeline components for a block.                                                                                                              |
 | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe)                      | Disable or enable a loaded pipeline component (but don't remove it).                                                                                                                             |
 | [`Language.analyze_pipes`](/api/language#analyze_pipes)                                                                         | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies.                                                                                                          |
@ -763,6 +764,25 @@ nlp = spacy.blank("en")
 + ruler.load_from_tag_map(YOUR_TAG_MAP)
 ```
 ### Migrating Doc flags {#migrating-doc-flags}
 The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
 `Doc.is_sentenced` are deprecated in v3 and replaced by the
 [`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
 token attribute symbols (the same symbols used in `Matcher` patterns):
 ```diff
 doc = nlp(text)
 - doc.is_parsed
 + doc.has_annotation("DEP")
 - doc.is_tagged
 + doc.has_annotation("TAG")
 - doc.is_sentenced
 + doc.has_annotation("SENT_START")
 - doc.is_nered
 + doc.has_annotation("ENT_IOB")
 ```
 ### Training pipelines and models {#migrating-training}
 To train your pipelines, you should now pretty much always use the