Merge

2025-07-16 03:02:41 +03:00 · 2020-10-01 23:07:53 +02:00 · 2020-10-01 23:07:53 +02:00 · 75a1569908
commit 75a1569908
parent b854bca15c 300e5a9928
59 changed files with 576 additions and 342 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a28"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@ -7,6 +7,7 @@ import srsly

 from .. import util
 from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu

@ -19,9 +20,9 @@ def init_vectors_cli(
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
 ):
    """Convert word vectors for use with spaCy. Will export an nlp object that
@ -32,12 +33,7 @@ def init_vectors_cli(
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
+        update_lexemes(nlp, jsonl_loc)
    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
@ -48,6 +44,16 @@ def init_vectors_cli(
    )


+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Apply lexeme attributes read from a JSONL file to nlp.vocab. Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)  # iterated below as one attribute mapping per record
+    for attrs in lex_attrs:
+        if "settings" in attrs:  # records carrying a "settings" key are skipped, not applied
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]  # look the lexeme up by the record's "orth" value
+        lexeme.set_attrs(**attrs)  # every key in the record is passed on as a keyword argument
+
+
@init_cli.command(
    "nlp",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
@ -89,7 +95,7 @@ def init_labels_cli(
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
 ):
-    """Generate a JSON file for labels in the data. This helps speed up the
+    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -2,7 +2,6 @@
 train = null
 dev = null
 vectors = null
-vocab_data = null
 init_tok2vec = null

 [system]
@ -11,8 +10,13 @@ gpu_allocator = null

 [nlp]
 lang = null
+# List of pipeline component names, in order. The names should correspond to
+# components defined in the [components] block
 pipeline = []
+# Components that are loaded but disabled by default
 disabled = []
+# Optional callbacks to modify the nlp object before it's initialized, after
+# it's created and after the pipeline has been set up
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@ -20,6 +24,7 @@ after_pipeline_creation = null
 [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

+# The pipeline components and their models
 [components]

 # Readers for corpora like dev and train.
@ -38,8 +43,7 @@ max_length = 0
 limit = 0
 # Apply some simple data augmentation, where we replace tokens with variations.
 # This is especially useful for punctuation and case replacement, to help
-# generalize beyond corpora that don't have smart-quotes, or only have smart
-# quotes, etc.
+# generalize beyond corpora that have no smart quotes, or only smart quotes, etc.
 augmenter = null

 [corpora.dev]
@ -53,6 +57,7 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Optional callback for data augmentation
 augmenter = null

 # Training hyper-parameters and additional features.
@ -102,17 +107,18 @@ use_averages = false
 eps = 1e-8
 learn_rate = 0.001

-# The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own arguments via their .initialize
-# methods that are populated by the config. This lets them gather resources like
-# lookup tables and build label sets, construct vocabularies, etc.
+# These settings are used when nlp.initialize() is called (typically before
+# training or pretraining). Components and the tokenizer can each define their
+# own arguments via their initialize methods that are populated by the config.
+# This lets them gather data resources, build label sets etc.
 [initialize]
-vocab_data = ${paths.vocab_data}
-lookups = null
 vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Data and lookups for vocabulary
+vocab_data = null
+lookups = null
 # Arguments passed to the tokenizer's initialize method
 tokenizer = {}
-# Arguments passed to the initialize methods of the components (keyed by component name)
+# Arguments for initialize methods of the components (keyed by component)
 components = {}
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -710,6 +710,9 @@ class Errors:
             "options: {modes}")
    E1012 = ("Entity spans and blocked/missing/outside spans should be "
             "provided to doc.set_ents as lists of `Span` objects.")
+    E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
+             "token itself. To set the morph from this MorphAnalysis, set from "
+             "the string value with: `token.set_morph(str(other_morph))`.")


@add_codes
--- a/spacy/lang/da/init.py
+++ b/spacy/lang/da/init.py
@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class DanishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
--- a/spacy/lang/de/init.py
+++ b/spacy/lang/de/init.py
@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class GermanDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
--- a/spacy/lang/el/init.py
+++ b/spacy/lang/el/init.py
@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class GreekDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
--- a/spacy/lang/id/init.py
+++ b/spacy/lang/id/init.py
@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class IndonesianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
--- a/spacy/lang/lb/init.py
+++ b/spacy/lang/lb/init.py
@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class LuxembourgishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    lex_attr_getters = LEX_ATTRS
--- a/spacy/lang/pt/init.py
+++ b/spacy/lang/pt/init.py
@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class PortugueseDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
--- a/spacy/lang/ru/init.py
+++ b/spacy/lang/ru/init.py
@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class RussianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
--- a/spacy/lang/sr/init.py
+++ b/spacy/lang/sr/init.py
@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class SerbianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
--- a/spacy/lang/ta/init.py
+++ b/spacy/lang/ta/init.py
@ -1,21 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class TamilDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS

--- a/spacy/lang/th/init.py
+++ b/spacy/lang/th/init.py
@ -10,13 +10,6 @@ DEFAULT_CONFIG = """

 [nlp.tokenizer]
@tokenizers = "spacy.th.ThaiTokenizer"
-
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
 """


--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@ -0,0 +1,25 @@
+from typing import List, Union, Callable, Tuple
+from thinc.types import Ints2d, Doc
+from thinc.api import Model, registry
+
+
+
+@registry.layers("spacy.FeatureExtractor.v1")  # registered in the layers registry under this name
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
+    return Model("extract_features", forward, attrs={"columns": columns})  # the requested columns are stored in attrs, where `forward` reads them
+
+
+def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+    columns = model.attrs["columns"]  # the attribute columns to extract for every input
+    features: List[Ints2d] = []
+    for doc in docs:
+        if hasattr(doc, "to_array"):
+            attrs = doc.to_array(columns)
+        else:  # no to_array: slice rows [start, end) out of the parent doc's array (presumably a Span; confirm)
+            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
+        if attrs.ndim == 1:  # a 1-D result is reshaped into a single-column 2-D array
+            attrs = attrs.reshape((attrs.shape[0], 1))
+        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []  # backward pass ignores its input and returns an empty list
+    return features, backprop
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window, FeatureExtractor
+from thinc.api import Relu, residual, expand_window

 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor


@registry.architectures.register("spacy.TextCatCNN.v1")
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@ -1,16 +1,16 @@
 from typing import Optional, List, Union
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list
-from thinc.api import FeatureExtractor, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d
+from thinc.api import chain, clone, concatenate, with_array, with_padded
+from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
+from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM

 from ...tokens import Doc
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr


@registry.architectures.register("spacy.Tok2VecListener.v1")
@ -98,7 +98,7 @@ def MultiHashEmbed(
    attributes using hash embedding, concatenates the results, and passes it
    through a feed-forward subnetwork to build a mixed representations.

-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
    varying definitions depending on the Vocab of the Doc object passed in.
    Vectors from pretrained static vectors can also be incorporated into the
    concatenated representation.
@ -115,7 +115,7 @@ def MultiHashEmbed(
    also_use_static_vectors (bool): Whether to also use static word vectors.
        Requires a vectors table to be loaded in the Doc objects' vocab.
    """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
    seed = 7

    def make_hash_embed(feature):
@ -123,7 +123,7 @@ def MultiHashEmbed(
        seed += 1
        return HashEmbed(
            width,
-            rows if feature == NORM else rows // 2,
+            rows if feature == LOWER else rows // 2,
            column=cols.index(feature),
            seed=seed,
            dropout=0.0,
@ -131,13 +131,13 @@ def MultiHashEmbed(

    if also_embed_subwords:
        embeddings = [
-            make_hash_embed(NORM),
+            make_hash_embed(LOWER),
            make_hash_embed(PREFIX),
            make_hash_embed(SUFFIX),
            make_hash_embed(SHAPE),
        ]
    else:
-        embeddings = [make_hash_embed(NORM)]
+        embeddings = [make_hash_embed(LOWER)]
    concat_size = width * (len(embeddings) + also_use_static_vectors)
    if also_use_static_vectors:
        model = chain(
@ -180,13 +180,17 @@ def CharacterEmbed(
    of being in an arbitrary position depending on the word length.

    The characters are embedded in a embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
    also concatenated on, and the result is then passed through a feed-forward
    network to construct a single vector to represent the information.

    feature (int or str): An attribute to embed, to concatenate with the characters.
    width (int): The width of the output vector and the feature embedding.
-    rows (int): The number of rows in the NORM hash embedding table.
+    rows (int): The number of rows in the LOWER hash embedding table.
    nM (int): The dimensionality of the character embeddings. Recommended values
        are between 16 and 64.
    nC (int): The number of UTF-8 bytes to embed per word. Recommended values
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -149,7 +149,7 @@ class Morphologizer(Tagger):
        for example in get_examples():
            for i, token in enumerate(example.reference):
                pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                # create and add the combined morph+POS label
                morph_dict = Morphology.feats_to_dict(morph)
                if pos:
@ -167,7 +167,7 @@ class Morphologizer(Tagger):
            gold_array = []
            for i, token in enumerate(example.reference):
                pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                morph_dict = Morphology.feats_to_dict(morph)
                if pos:
                    morph_dict[self.POS_FEAT] = pos
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -268,6 +268,9 @@ class Tagger(Pipe):
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

        DOCS: https://nightly.spacy.io/api/tagger#initialize
        """
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
        """
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
    words = ["Eat", "blue", "ham"]
    morph = ["Feat=V", "Feat=J", "Feat=N"]
    doc = Doc(en_vocab, words=words, morphs=morph)
-    assert morph[0] == doc[0].morph_
-    assert morph[1] == doc[1].morph_
-    assert morph[2] == doc[2].morph_
+    assert morph[0] == str(doc[0].morph)
+    assert morph[1] == str(doc[1].morph)
+    assert morph[2] == str(doc[2].morph)

    feats_array = doc.to_array((ORTH, MORPH))
    assert feats_array[0][1] == doc[0].morph.key
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
    words = ["I", "live", "in", "New", "York", "."]
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    # fmt: on
-    doc = Doc(en_vocab, words=words)
-    for i, morph in enumerate(morphs):
-        doc[i].morph_ = morph
+    doc = Doc(en_vocab, words=words, morphs=morphs)
    attrs = [MORPH]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
-    assert [t.morph_ for t in new_doc] == morphs
-    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
+    assert [str(t.morph) for t in new_doc] == morphs
+    assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]


 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):

    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):

    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
-    doc[1].morph_ = ""
+    doc[1].set_morph("")
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]
@ -533,5 +531,78 @@ def test_doc_ents_setter():
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
    vocab = Vocab()
    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
    doc = Doc(vocab, words=words, ents=ents)
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+
+
+def test_doc_morph_setter(en_tokenizer, de_tokenizer):
+    doc1 = en_tokenizer("a b")  # doc1 and doc1b are produced by the same tokenizer fixture
+    doc1b = en_tokenizer("c d")
+    doc2 = de_tokenizer("a b")  # produced by a different tokenizer fixture (presumably a different vocab; confirm)
+
+    # an unset morph (key 0) can be copied from one token to another
+    doc1[0].morph = doc1[1].morph
+    assert doc1[0].morph.key == 0
+    assert doc1[1].morph.key == 0
+
+    # a morph value set on one token can be assigned to another token of the same doc
+    doc1[0].set_morph("Feat=Val")
+    doc1[1].morph = doc1[0].morph
+    assert doc1[0].morph == doc1[1].morph
+
+    # ... and also to a token of another doc created by the same tokenizer
+    doc1b[0].morph = doc1[0].morph
+    assert doc1[0].morph == doc1b[0].morph
+
+    doc2[0].set_morph("Feat2=Val2")
+
+    # assigning a morph that comes from a different vocab must raise ValueError
+    with pytest.raises(ValueError):
+        doc1[0].morph = doc2[0].morph
+
+
+def test_doc_init_iob():
+    """Test validation/normalization of IOB-style string `ents` passed to Doc.__init__."""
+    words = ["a", "b", "c", "d", "e"]
+    ents = ["O"] * len(words)  # every token tagged "O": no entities expected
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert doc.ents == ()
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]  # an I- tag right after "O" still yields a second entity
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]  # a change of entity type between I- tags yields a third entity
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 3
+
+    # a None tag counts as missing annotation (two entities result)
+    ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # an empty-string tag also counts as missing annotation
+    ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # invalid IOB prefix ("Q-") raises ValueError
+    ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # tag without a dash ("OPERSON") raises ValueError
+    ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # IOB prefix with an empty entity type ("B-") raises ValueError
+    ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # tags that are neither strings nor None (here the int 0) raise ValueError
+    ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@ -4,13 +4,13 @@ import pytest
@pytest.fixture
 def i_has(en_tokenizer):
    doc = en_tokenizer("I has")
-    doc[0].morph_ = {"PronType": "prs"}
-    doc[1].morph_ = {
+    doc[0].set_morph({"PronType": "prs"})
+    doc[1].set_morph({
        "VerbForm": "fin",
        "Tense": "pres",
        "Number": "sing",
        "Person": "three",
-    }
+    })

    return doc

@ -47,20 +47,20 @@ def test_morph_get(i_has):
 def test_morph_set(i_has):
    assert i_has[0].morph.get("PronType") == ["prs"]
    # set by string
-    i_has[0].morph_ = "PronType=unk"
+    i_has[0].set_morph("PronType=unk")
    assert i_has[0].morph.get("PronType") == ["unk"]
    # set by string, fields are alphabetized
-    i_has[0].morph_ = "PronType=123|NounType=unk"
-    assert i_has[0].morph_ == "NounType=unk|PronType=123"
+    i_has[0].set_morph("PronType=123|NounType=unk")
+    assert str(i_has[0].morph) == "NounType=unk|PronType=123"
    # set by dict
-    i_has[0].morph_ = {"AType": "123", "BType": "unk"}
-    assert i_has[0].morph_ == "AType=123|BType=unk"
+    i_has[0].set_morph({"AType": "123", "BType": "unk"})
+    assert str(i_has[0].morph) == "AType=123|BType=unk"
    # set by string with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = "BType=c|AType=b,a"
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph("BType=c|AType=b,a")
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"
    # set by dict with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph({"AType": "b,a", "BType": "c"})
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"


 def test_morph_str(i_has):
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
    doc = tokenizer("a dog")

    # set through token.set_morph
-    doc[0].morph_ = "PronType=prs"
-    assert doc[0].morph_ == "PronType=prs"
+    doc[0].set_morph("PronType=prs")
+    assert str(doc[0].morph) == "PronType=prs"
    assert doc.to_array(["MORPH"])[0] != 0

    # unset with token.set_morph(0)
-    doc[0].morph = 0
+    doc[0].set_morph(0)
    assert doc.to_array(["MORPH"])[0] == 0

    # empty morph is equivalent to "_"
-    doc[0].morph_ = ""
-    assert doc[0].morph_ == ""
+    doc[0].set_morph("")
+    assert str(doc[0].morph) == ""
    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]

    # "_" morph is also equivalent to empty morph
-    doc[0].morph_ = "_"
-    assert doc[0].morph_ == ""
+    doc[0].set_morph("_")
+    assert str(doc[0].morph) == ""
    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]

    # set through existing hash with token.set_morph
    tokenizer.vocab.strings.add("Feat=Val")
-    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
-    assert doc[0].morph_ == "Feat=Val"
+    doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
+    assert str(doc[0].morph) == "Feat=Val"
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
    assert doc[4].text == "the beach boys"
    assert doc[4].text_with_ws == "the beach boys "
    assert doc[4].tag_ == "NAMED"
-    assert doc[4].morph_ == "Number=Plur"
+    assert str(doc[4].morph) == "Number=Plur"
    assert doc[5].text == "all night"
    assert doc[5].text_with_ws == "all night"
    assert doc[5].tag_ == "NAMED"
-    assert doc[5].morph_ == "Number=Plur"
+    assert str(doc[5].morph) == "Number=Plur"


 def test_doc_retokenize_merge_children(en_tokenizer):
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
    tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
    ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
+    ents = ["O"] * len(heads)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[10] = "B-GPE"
+    ents[13] = "B-PERSON"
+    ents[14] = "I-PERSON"
    # fmt: on
    tokens = en_tokenizer(text)
    doc = Doc(
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-fg"
+    ents[6] = "I-ent-fg"
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-de"
+    ents[6] = "I-ent-de"
    deps = ["dep"] * len(words)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
    assert doc[0].text == "Los"
    assert doc[0].head.text == "Angeles"
    assert doc[0].idx == 0
-    assert doc[0].morph_ == "Number=Sing"
+    assert str(doc[0].morph) == "Number=Sing"
    assert doc[1].idx == 3
    assert doc[1].text == "Angeles"
    assert doc[1].head.text == "start"
-    assert doc[1].morph_ == "Number=Sing"
+    assert str(doc[1].morph) == "Number=Sing"
    assert doc[2].text == "start"
    assert doc[2].head.text == "."
    assert doc[3].text == "."
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@ -9,7 +9,7 @@ def doc(en_vocab):
    tags = ["VBP", "NN", "NN"]
    heads = [0, 0, 0]
    deps = ["ROOT", "dobj", "dobj"]
-    ents = [("ORG", 1, 2)]
+    ents = ["O", "B-ORG", "O"]
    return Doc(
        en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
    )
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
    assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 2
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 2

    # IS_SUBSET acts like "IN" for attrs other than MORPH
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
-    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 0
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1

    # IS_SUPERSET with more than one value only matches for MORPH
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0

-    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    doc[0].set_morph("Feat2=Val2|Feat1=Val1")
    assert len(matcher(doc)) == 2
-    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    doc[0].set_morph("Feat1=Val1|Feat2=Val2")
    assert len(matcher(doc)) == 2

    # multiple values are split
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0

-    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
    assert len(matcher(doc)) == 1
-    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
    assert len(matcher(doc)) == 2


@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    nlp.add_pipe("ner")
-    nlp.config["initialize"]["lookups"] = None
    with caplog.at_level(logging.DEBUG):
        nlp.initialize()
        assert "W033" in caplog.text
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
        a.add(**p)
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")

@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")
    nlp.remove_pipe("attribute_ruler")
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
    )
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
    assert doc.has_annotation("LEMMA")
    assert doc.has_annotation("MORPH")

@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"

    dev_examples = [
        Example.from_dict(
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
    for i in range(len(doc)):
        if i == 4:
            assert doc[i].pos_ == "PUNCT"
-            assert doc[i].morph_ == "PunctType=peri"
+            assert str(doc[i].morph) == "PunctType=peri"
        else:
            assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""


 def test_attributeruler_morph_rules(nlp, morph_rules):
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
    for i in range(len(doc)):
        if i != 2:
            assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
        else:
            assert doc[2].pos_ == "DET"
            assert doc[2].lemma_ == "a"
-            assert doc[2].morph_ == "Case=Nom"
+            assert str(doc[2].morph) == "Case=Nom"


 def test_attributeruler_indices(nlp):
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
    for i in range(len(doc)):
        if i == 1:
            assert doc[i].lemma_ == "was"
-            assert doc[i].morph_ == "Case=Nom|Number=Sing"
+            assert str(doc[i].morph) == "Case=Nom|Number=Sing"
        elif i == 2:
            assert doc[i].lemma_ == "the"
-            assert doc[i].morph_ == "Case=Nom|Number=Plur"
+            assert str(doc[i].morph) == "Case=Nom|Number=Plur"
        elif i == 3:
            assert doc[i].lemma_ == "cat"
        else:
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
    # raises an error when trying to modify a token outside of the match
    a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
    with pytest.raises(ValueError):
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -91,7 +91,7 @@ def test_overfitting_IO():
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
-    assert [t.morph_ for t in doc] == gold_morphs
+    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Also test the results are still the same after IO
@ -99,5 +99,5 @@ def test_overfitting_IO():
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
-        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [str(t.morph) for t in doc2] == gold_morphs
        assert [t.pos_ for t in doc2] == gold_pos_tags
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [("PERCENT", 2, 4)]
+    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@ -76,7 +76,7 @@ def tagged_doc():
    for i in range(len(tags)):
        doc[i].tag_ = tags[i]
        doc[i].pos_ = pos[i]
-        doc[i].morph_ = morphs[i]
+        doc[i].set_morph(morphs[i])
        if i > 0:
            doc[i].is_sent_start = False
    return doc
@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
        doc = Doc(
            en_vocab,
            words=input_.split(" "),
-            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
+            ents=["B-CARDINAL", "O", "B-CARDINAL"],
        )
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
        doc = Doc(
            en_vocab,
            words=input_.split(" "),
-            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
+            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
        )
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
    gold = {
        "tags": [t.tag_ for t in tagged_doc],
        "pos": [t.pos_ for t in tagged_doc],
-        "morphs": [t.morph_ for t in tagged_doc],
+        "morphs": [str(t.morph) for t in tagged_doc],
        "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
    }
    example = Example.from_dict(tagged_doc, gold)
@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
    tags[0] = "NN"
    pos = [t.pos_ for t in tagged_doc]
    pos[1] = "X"
-    morphs = [t.morph_ for t in tagged_doc]
+    morphs = [str(t.morph) for t in tagged_doc]
    morphs[1] = "Number=sing"
    morphs[2] = "Number=plur"
    gold = {
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    for i, token in enumerate(example.reference):
-        assert token.morph_ == annots["morphs"][i]
+        assert str(token.morph) == annots["morphs"][i]


@pytest.mark.parametrize(
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@ -30,7 +30,12 @@ def doc(en_vocab):
    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
    deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
+    ents = ["O"] * len(words)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[5] = "B-LOC"
+    ents[6] = "I-LOC"
+    ents[8] = "B-GPE"
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    # fmt: on
    doc = Doc(
@ -455,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
    idx = [t.idx for t in doc]
    tags = [t.tag_ for t in doc]
    pos = [t.pos_ for t in doc]
-    morphs = [t.morph_ for t in doc]
+    morphs = [str(t.morph) for t in doc]
    lemmas = [t.lemma_ for t in doc]
    deps = [t.dep_ for t in doc]
    heads = [t.head.i for t in doc]
@ -477,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
    assert idx == [t.idx for t in reloaded_example.reference]
    assert tags == [t.tag_ for t in reloaded_example.reference]
    assert pos == [t.pos_ for t in reloaded_example.reference]
-    assert morphs == [t.morph_ for t in reloaded_example.reference]
+    assert morphs == [str(t.morph) for t in reloaded_example.reference]
    assert lemmas == [t.lemma_ for t in reloaded_example.reference]
    assert deps == [t.dep_ for t in reloaded_example.reference]
    assert heads == [t.head.i for t in reloaded_example.reference]
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@ -101,7 +101,7 @@ class DocBin:
            self.strings.add(token.text)
            self.strings.add(token.tag_)
            self.strings.add(token.lemma_)
-            self.strings.add(token.morph_)
+            self.strings.add(str(token.morph))
            self.strings.add(token.dep_)
            self.strings.add(token.ent_type_)
            self.strings.add(token.ent_kb_id_)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -213,8 +213,9 @@ cdef class Doc:
        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
            the same length as words, to assign as token.is_sent_start. Will be
            overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
-            (label, start, end) tuples to assign as doc.ents. Defaults to None.
+        ents (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, as IOB tags to assign as token.ent_iob and
+            token.ent_type. Defaults to None.

        DOCS: https://nightly.spacy.io/api/doc#init
        """
@ -275,16 +276,55 @@ cdef class Doc:
                    sent_starts[i] = -1
                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                    sent_starts[i] = 0
+        ent_iobs = None
+        ent_types = None
+        if ents is not None:
+            # Work on a normalized copy so the caller's list is never mutated;
+            # empty strings are treated as missing values (None).
+            ents = [None if ent == "" else ent for ent in ents]
+            # Validate every tag up front so the look-ahead below can safely
+            # call str methods on ents[i+1].
+            for ent in ents:
+                if ent is not None and not isinstance(ent, str):
+                    raise ValueError(Errors.E177.format(tag=ent))
+            iob_strings = Token.iob_strings()
+            # make valid IOB2 out of IOB1 or IOB2
+            for i in range(len(ents) - 1):
+                # Read ents[i] fresh: it may have been rewritten on the
+                # previous iteration.
+                ent = ents[i]
+                next_ent = ents[i+1]
+                if next_ent is None or not next_ent.startswith("I"):
+                    continue
+                # OI -> OB (a missing value counts as outside an entity)
+                if ent is None or ent.startswith("O"):
+                    ents[i+1] = "B" + next_ent[1:]
+                # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
+                elif ent.startswith(("B", "I")) and ent[1:] != next_ent[1:]:
+                    ents[i+1] = "B" + next_ent[1:]
+            # Split each tag into an IOB code (index into iob_strings) and an
+            # entity type string; missing and "O" tags carry no type.
+            ent_iobs = []
+            ent_types = []
+            for ent in ents:
+                if ent is None:
+                    ent_iobs.append(iob_strings.index(""))
+                    ent_types.append("")
+                elif ent == "O":
+                    ent_iobs.append(iob_strings.index(ent))
+                    ent_types.append("")
+                else:
+                    # Anything else must look like "<IOB>-<TYPE>".
+                    if len(ent) < 3 or ent[1] != "-":
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob, ent_type = ent.split("-", 1)
+                    if ent_iob not in iob_strings:
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iobs.append(iob_strings.index(ent_iob))
+                    ent_types.append(ent_type)
        headings = []
        values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
        for a, annot in enumerate(annotations):
            if annot is not None:
                if len(annot) != len(words):
                    raise ValueError(Errors.E189)
                headings.append(possible_headings[a])
-                if annot is not heads and annot is not sent_starts:
+                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                    values.extend(annot)
        for value in values:
            self.vocab.strings.add(value)
@ -296,7 +336,7 @@ cdef class Doc:
            j = 0
            for annot in annotations:
                if annot:
-                    if annot is heads or annot is sent_starts:
+                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]
@ -317,8 +357,6 @@ cdef class Doc:
                                attrs[i, j] = self.vocab.strings[annot[i]]
                    j += 1
            self.from_array(headings, attrs)
-        if ents is not None:
-            self.ents = ents

    @property
    def _(self):
@ -1210,7 +1248,7 @@ cdef class Doc:
        for token in self:
            strings.add(token.tag_)
            strings.add(token.lemma_)
-            strings.add(token.morph_)
+            strings.add(str(token.morph))
            strings.add(token.dep_)
            strings.add(token.ent_type_)
            strings.add(token.ent_kb_id_)
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -215,20 +215,20 @@ cdef class Token:
        def __get__(self):
            return MorphAnalysis.from_id(self.vocab, self.c.morph)

-        def __set__(self, attr_t morph):
-            if morph == 0:
-                self.c.morph = morph
-            elif morph in self.vocab.strings:
-                self.morph_ = self.vocab.strings[morph]
+        def __set__(self, MorphAnalysis morph):
+            # Check that the morph has the same vocab
+            if self.vocab != morph.vocab:
+                raise ValueError(Errors.E1013)
+            self.c.morph = morph.c.key
+
+    def set_morph(self, features):
+        cdef hash_t key
+        if features is 0:
+            self.c.morph = 0
        else:
-                raise ValueError(Errors.E1009.format(val=morph))
-
-    property morph_:
-        def __get__(self):
-            return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
-
-        def __set__(self, features):
-            cdef hash_t key = self.vocab.morphology.add(features)
+            if isinstance(features, int):
+                features = self.vocab.strings[features]
+            key = self.vocab.morphology.add(features)
            self.c.morph = key

    @property
--- a/spacy/training/converters/conllu_to_docs.py
+++ b/spacy/training/converters/conllu_to_docs.py
@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
        pos=poses,
        deps=deps,
        lemmas=lemmas,
+        morphs=morphs,
        heads=heads,
    )
    for i in range(len(doc)):
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -1,4 +1,4 @@
-from collections import Iterable as IterableInstance
+from collections.abc import Iterable as IterableInstance
 import warnings
 import numpy
 from murmurhash.mrmr cimport hash64
@ -226,7 +226,7 @@ cdef class Example:
                "TAG": [t.tag_ for t in self.reference],
                "LEMMA": [t.lemma_ for t in self.reference],
                "POS": [t.pos_ for t in self.reference],
-                "MORPH": [t.morph_ for t in self.reference],
+                "MORPH": [str(t.morph) for t in self.reference],
                "HEAD": [t.head.i for t in self.reference],
                "DEP": [t.dep_ for t in self.reference],
                "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                if include_annotation["POS"]:
                    json_token["pos"] = token.pos_
                if include_annotation["MORPH"]:
-                    json_token["morph"] = token.morph_
+                    json_token["morph"] = str(token.morph)
                if include_annotation["LEMMA"]:
                    json_token["lemma"] = token.lemma_
                if include_annotation["DEP"]:
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
 a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
-depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
-static vectors can also be incorporated into the concatenated representation.
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE` attributes, which are extracted with
+a [FeatureExtractor](/api/architectures#FeatureExtractor) layer. Pretrained static
+vectors can also be incorporated into the concatenated representation.

 | Name                      | Description                                                                                                                                                                                                       |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
 | `key_attr`  | Defaults to `"ORTH"`. ~~str~~                                                                                                                                                                                           |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~                                                                                                                                                          |

+### spacy.FeatureExtractor.v1 {#FeatureExtractor}
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.FeatureExtractor.v1"
+> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+> ```
+
+Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
+of feature names to extract, which should refer to token attributes.
+
+| Name        |  Description                                                             |
+| ----------- | ------------------------------------------------------------------------ |
+| `columns`   | The token attributes to extract. ~~List[Union[int, str]]~~               |
+| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
+
 ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}

 The following architectures are provided by the package
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -186,15 +186,14 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>

 ```cli
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
 ```

 | Name               | Description                                                                                                                                                                                                                                                         |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
-| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~                                                                                                                         |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
 | `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
 | `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
@ -202,6 +201,39 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
 | **CREATES**        | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |

+### init labels {#init-labels new="3" tag="command"}
+
+Generate JSON files for the labels in the data. This helps speed up the training
+process, since spaCy won't have to preprocess the data to extract the labels.
+After generating the labels, you can provide them to components that accept a
+`labels` argument on initialization via the
+[`[initialize]`](/api/data-formats#config-initialize) block of your config.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+```cli
+$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name              | Description                                                                                                                                                                                |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                |
+| `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                       |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                               |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES**       | One JSON file of labels per component in the output path.                                                                                                                                  |
+
 ## convert {#convert tag="command"}

 Convert files into spaCy's
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
 > data_path = "/path/to/component_data"
 > ```

-<!-- TODO: -->
-
 | Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                    |
 | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `components`   | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~                                                                      |
@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)

 ## Lexical data for vocabulary {#vocab-jsonl new="2"}

-To populate a pipeline's vocabulary, you can use the
-[`spacy init vectors`](/api/cli#init-vectors) command and load in a
-[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
-lexical entry per line via the `--jsonl-loc` option. The first line defines the
-language and vocabulary settings. All other lines are expected to be JSON
-objects describing an individual lexeme. The lexical attributes will be then set
-as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
-command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
-lexical data.
+This data file can be provided via the `vocab_data` setting in the
+`[initialize]` block of the training config to pre-define the lexical data to
+initialize the `nlp` object's vocabulary with. The file should contain one
+lexical entry per line. The first line defines the language and vocabulary
+settings. All other lines are expected to be JSON objects describing an
+individual lexeme. The lexical attributes will then be set as attributes on
+spaCy's [`Lexeme`](/api/lexeme#attributes) object.
+
+> #### Example config
+>
+> ```ini
+> [initialize]
+> vocab_data = "/path/to/vocab-data.jsonl"
+> ```

 ```python
 ### First line
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@ -21,8 +21,9 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
-that more than one action may be optimal for a given state.
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions are increased, while scores assigned to other actions are
+decreased. Note that more than one action may be optimal for a given state.

 ## Config and implementation {#config}

@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## DependencyParser.initialize {#initialize tag="method"}
+## DependencyParser.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@ -162,12 +166,22 @@ This method was previously called `begin_training`.
 > parser = nlp.add_pipe("parser")
 > parser.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.parser]
+>
+> [initialize.components.parser.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/parser.json"
+> ```

 | Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
 | _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## DependencyParser.predict {#predict tag="method"}

--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > ```

 | Name                                     | Description                                                                                                                                                                                        |
-| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                   |
 | `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                 |
 | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~       |
@ -45,7 +45,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
 | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
 | `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
-| `ents` <Tag variant="new">3</Tag>        | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
+| `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}

@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
+using the `"_"` key and specifying a dictionary that maps attribute names to
+values.

 > #### Example
 >
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## EntityLinker.initialize {#initialize tag="method"}
+## EntityLinker.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## EntityRecognizer.initialize {#initialize tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@ -152,12 +155,22 @@ This method was previously called `begin_training`.
 > ner = nlp.add_pipe("ner")
 > ner.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```

 | Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
 | _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## EntityRecognizer.predict {#predict tag="method"}

--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               |
 | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                |

-## Language.initialize {#initialize tag="method"}
+## Language.initialize {#initialize tag="method" new="3"}

 Initialize the pipeline for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 > #### Example
 >
@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
 > morphologizer = nlp.add_pipe("morphologizer")
 > morphologizer.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.morphologizer]
+>
+> [initialize.components.morphologizer.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/morphologizer.json"
+> ```

 | Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
 | _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## Morphologizer.predict {#predict tag="method"}

--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Pipe.initialize {#initialize tag="method"}
+## Pipe.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Tagger.initialize {#initialize tag="method"}
+## Tagger.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@ -135,12 +138,22 @@ This method was previously called `begin_training`.
 > tagger = nlp.add_pipe("tagger")
 > tagger.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.tagger]
+>
+> [initialize.components.tagger.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/tagger.json"
+> ```

 | Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
 | _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |

 ## Tagger.predict {#predict tag="method"}

--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## TextCategorizer.initialize {#initialize tag="method"}
+## TextCategorizer.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@ -148,12 +151,22 @@ This method was previously called `begin_training`.
 > textcat = nlp.add_pipe("textcat")
 > textcat.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.textcat]
+>
+> [initialize.components.textcat.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/textcat.json"
+> ```

 | Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
 | _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## TextCategorizer.predict {#predict tag="method"}

--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
 | **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |

+### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description                                                                                                                                                                                                               |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels loaded from the file, or `None` if the file doesn't exist and `require` is `False`.                                                                                                                           |
+
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}

 A data batcher implements a batching strategy that essentially turns a stream of
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
 embeddings.

 ```python
-from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
+from thinc.api import add, chain, remap_ids, Embed
 from spacy.ml.staticvectors import StaticVectors
+from spacy.ml.featureextractor import FeatureExtractor
 from spacy.util import registry

@registry.architectures("my_example.MyEmbedding.v1")
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -204,7 +204,19 @@ initialize it.

 ![Illustration of pipeline lifecycle](../images/lifecycle.svg)

-<!-- TODO: explain lifecycle and initialization -->
+At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
+config and load all data, including tokenization rules, model weights and other
+resources from the pipeline directory. The `[training]` block contains the
+settings for training the model and is only used during training. Similarly, the
+`[initialize]` block defines how the initial `nlp` object should be set up
+before training and whether it should be initialized with vectors or pretrained
+tok2vec weights, or any other data needed by the components.
+
+The initialization settings are only loaded and used when
+[`nlp.initialize`](/api/language#initialize) is called (typically right before
+training). This allows you to set up your pipeline using local data resources
+and custom functions, and preserve the information in your config – but without
+requiring it to be available at runtime.

 ### Overwriting config settings on the command line {#config-overrides}

@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
    return create_model(output_width)
 ```

+<!-- TODO:
+### Customizing the initialization {#initialization}
+-->
+
 ## Data utilities {#data}

 spaCy includes various features and utilities to make it easy to train models
@ -853,7 +869,7 @@ nlp = spacy.blank("en")
 docbin = DocBin(nlp.vocab)
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
-ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
 doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
 docbin.add(doc)
 docbin.to_disk("./train.spacy")
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 >
 > ```ini
 > [training]
-> vectors = null
 > accumulate_gradient = 3
 >
 > [training.optimizer]
@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config)                        | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config).                                                                     |
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models)                                                         | Names of all pipeline packages installed in the environment.                                                                                                                                     |
 | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training).                                                                                                   |
+| [`init vectors`](/api/cli#init-vectors)                                                                                         | Convert word vectors for use with spaCy.                                                                                                                                                         |
+| [`init labels`](/api/cli#init-labels)                                                                                           | Generate JSON files for the labels in the data to speed up training.                                                                                                                             |
 | [`project`](/api/cli#project)                                                                                                   | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects).                                                                                                       |
 | [`ray`](/api/cli#ray)                                                                                                           | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package.                                |

--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@ -1,6 +1,11 @@
 const autoprefixer = require('autoprefixer')
 const path = require('path')

+// Work around segfaults in Gatsby builds on Netlify: https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
+const sharp = require('sharp')
+sharp.cache(false)
+sharp.simd(false)
+
 // Markdown plugins
 const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
 const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')