Merge branch 'develop' into nightly.spacy.io

2025-07-15 02:32:37 +03:00 · 2020-12-08 18:07:03 +11:00 · 2020-12-08 18:07:03 +11:00 · bf6992c2dd
commit bf6992c2dd
parent 24f5fe8839 ef59ce783b
19 changed files with 292 additions and 129 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0rc0,<8.1.0",
+    "thinc>=8.0.0rc2,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy"
 ]
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0rc0,<8.1.0
+thinc>=8.0.0rc2,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
--- a/setup.cfg
+++ b/setup.cfg
@ -34,13 +34,13 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
 install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
    blis>=0.4.0,<0.8.0
    wasabi>=0.8.0,<1.1.0
    srsly>=2.3.0,<3.0.0
--- a/spacy/init.py
+++ b/spacy/init.py
@ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa
 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu  # noqa: F401
+from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
 from thinc.api import Config
 from . import pipeline  # noqa: F401
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@ -17,7 +17,9 @@ tolerance = 0.2
 get_length = null
 [pretraining.objective]
-type = "characters"
+@architectures = "spacy.PretrainCharacters.v1"
 maxout_pieces = 3
 hidden_size = 300
 n_characters = 4
 [pretraining.optimizer]
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -484,8 +484,8 @@ class Errors:
            "has been applied.")
    E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
            "dimension refers to the width of the vectors table.")
-    E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
+    E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
-    E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
+            "are: {supported}")
    E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
    E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
    E910 = ("Encountered NaN value when computing loss for component '{name}'.")
--- a/spacy/ml/models/init.py
+++ b/spacy/ml/models/init.py
@ -1,4 +1,5 @@
 from .entity_linker import *  # noqa
 from .multi_task import *  # noqa
 from .parser import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@ -1,7 +1,14 @@
-from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
+from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
 import numpy
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
 from ...util import registry
 from ...errors import Errors
 from ...attrs import ID
 import numpy
 from functools import partial
 if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
@ -9,6 +16,74 @@ if TYPE_CHECKING:
    from ...tokens import Doc  # noqa: F401
@registry.architectures.register("spacy.PretrainVectors.v1")
 def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
    """Create a factory for the pretraining objective that predicts vectors.

    maxout_pieces (int): Forwarded to build_cloze_multi_task_model.
    hidden_size (int): Forwarded to build_cloze_multi_task_model.
    loss (str): Name of the distance used for the loss: "cosine" or "L2".
    RETURNS (Callable[[Vocab, Model], Model]): A function that builds the
        pretraining model from a vocab and a tok2vec model and stores the
        loss callable under the "loss" key of model.attrs.
    """
    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        # Build the model first, then attach the loss so the caller can read
        # it back from model.attrs["loss"].
        model = build_cloze_multi_task_model(
            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
        model.attrs["loss"] = create_vectors_loss()
        return model
    def create_vectors_loss() -> Callable:
        # Map the configured loss name to a distance and bind it to
        # get_vectors_loss. Note that this only runs when the objective is
        # built, so an unsupported name raises ValueError (E906) at that point
        # and not when create_pretrain_vectors itself is called.
        if loss == "cosine":
            distance = CosineDistance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        elif loss == "L2":
            distance = L2Distance(normalize=True)
            return partial(get_vectors_loss, distance=distance)
        else:
            raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
    return create_vectors_objective
@registry.architectures.register("spacy.PretrainCharacters.v1")
 def create_pretrain_characters(
    maxout_pieces: int, hidden_size: int, n_characters: int
 ) -> Callable[["Vocab", Model], Model]:
    """Create a factory for the pretraining objective that predicts characters.

    maxout_pieces (int): Forwarded to build_cloze_characters_multi_task_model.
    hidden_size (int): Forwarded to build_cloze_characters_multi_task_model.
    n_characters (int): Forwarded to the model builder as nr_char and bound
        as nr_char on the loss function.
    RETURNS (Callable[[Vocab, Model], Model]): A function that builds the
        pretraining model from a vocab and a tok2vec model and stores the
        loss callable under the "loss" key of model.attrs.
    """
    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        model = build_cloze_characters_multi_task_model(
            vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=n_characters,
        )
        # The same n_characters value is used for the model and for the loss.
        model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
        return model
    return create_characters_objective
 def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.

    ops: Backend object; only its flatten method is used here.
    docs: The documents whose token vectors form the target. Must not be
        empty, because the vectors table is taken from docs[0].vocab.
    prediction: The model output that is compared against the target.
    distance (Callable): Called as distance(prediction, target). Its result
        is unpacked as (d_target, loss).
    RETURNS (tuple): (loss, d_target), i.e. the reverse of the order in which
        distance returns the two values.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
    # Instead we fetch the index into the vectors table for each of our tokens,
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    # NOTE(review): assumes all docs share the vocab of the first doc - confirm.
    target = docs[0].vocab.vectors.data[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target
 def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a loss based on a number of characters predicted from the docs.

    ops: Backend object; only its asarray method is used here.
    docs: The documents that provide the targets through
        doc.to_utf8_array(nr_char=nr_char).
    prediction: The model output. It is subtracted from a target of shape
        (-1, 256 * nr_char), so it has to be compatible with that shape.
    nr_char (int): Number of characters per token used to build the target.
    RETURNS (tuple): (loss, d_target), where loss is the sum of the squared
        differences between prediction and target and d_target is the
        difference divided by prediction.shape[0].
    """
    # Stack the per-doc arrays and flatten them into one vector of ids.
    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
    target_ids = target_ids.reshape((-1,))
    # Encode each id over 256 classes (presumably one-hot, one class per byte
    # value - confirm against to_categorical), then regroup so that every row
    # holds the nr_char encodings that belong together.
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    # Scaled by the number of rows in the prediction.
    d_target = diff / float(prediction.shape[0])
    return loss, d_target
 def build_multi_task_model(
    tok2vec: Model,
    maxout_pieces: int,
@ -33,23 +108,19 @@ def build_multi_task_model(
 def build_cloze_multi_task_model(
-    vocab: "Vocab",
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
    tok2vec: Model,
    maxout_pieces: int,
    hidden_size: int,
    nO: Optional[int] = None,
 ) -> Model:
-    # nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.data.shape[1]
    output_layer = chain(
        list2array(),
        Maxout(
-            nO=nO,
+            nO=hidden_size,
            nI=tok2vec.get_dim("nO"),
            nP=maxout_pieces,
            normalize=True,
            dropout=0.0,
        ),
-        Linear(nO=nO, nI=nO, init_W=zero_init),
+        Linear(nO=nO, nI=hidden_size, init_W=zero_init),
    )
    model = chain(tok2vec, output_layer)
    model = build_masked_language_model(vocab, model)
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
    batcher: Batcher = Field(..., title="Batcher for the training data")
    component: str = Field(..., title="Component to find the layer to pretrain")
    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-
+    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
    # TODO: use a more detailed schema for this?
    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
    # fmt: on
    class Config:
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.language import Language, DEFAULT_CONFIG
+from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.util import registry, load_model_from_config
+from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
-from spacy.schemas import ConfigSchema
+from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
 from ..util import make_tempdir
 nlp_config_string = """
 [paths]
 train = null
@ -63,6 +63,59 @@ factory = "tagger"
 width = ${components.tok2vec.model.width}
 """
 pretrain_config_string = """
 [paths]
 train = null
 dev = null
 [corpora]
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 [training]
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 size = 666
 [nlp]
 lang = "en"
 pipeline = ["tok2vec", "tagger"]
 [components]
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
 pretrained_vectors = null
 width = 342
 depth = 4
 window_size = 1
 embed_size = 2000
 maxout_pieces = 3
 subword_features = true
 [components.tagger]
 factory = "tagger"
 [components.tagger.model]
@architectures = "spacy.Tagger.v1"
 [components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.width}
 [pretraining]
 """
 parser_config_string = """
 [model]
@ -126,6 +179,14 @@ def test_create_nlp_from_config():
        load_model_from_config(Config(bad_cfg), auto_fill=True)
 def test_create_nlp_from_pretraining_config():
    """Test that the default pretraining config validates properly"""
    # Start from the pipeline config whose [pretraining] section is empty and
    # merge the default pretraining settings into it.
    config = Config().from_str(pretrain_config_string)
    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    filled = config.merge(pretrain_config)
    # There is no assertion on the result: the test passes as long as
    # resolving the section against ConfigSchemaPretrain does not raise.
    resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
 def test_create_nlp_from_config_multiple_instances():
    """Test that the nlp object is created correctly for a config with multiple
    instances of the same component."""
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -4,7 +4,7 @@ import ctypes
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
-from spacy import prefer_gpu, require_gpu
+from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList
@ -15,6 +15,8 @@ from spacy.lang.nl import Dutch
 from spacy.language import DEFAULT_CONFIG_PATH
 from spacy.schemas import ConfigSchemaTraining
 from thinc.api import get_current_ops, NumpyOps, CupyOps
 from .util import get_random_doc
@ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 def test_prefer_gpu():
    """Check prefer_gpu() both with and without cupy being importable."""
    try:
        import cupy  # noqa: F401
        # With cupy importable, the current ops must be CupyOps afterwards.
        prefer_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        # Without cupy, prefer_gpu() must return a falsy value.
        assert not prefer_gpu()
@ -88,10 +92,24 @@ def test_prefer_gpu():
 def test_require_gpu():
    """Check require_gpu() both with and without cupy being importable."""
    try:
        import cupy  # noqa: F401
        # With cupy importable, the current ops must be CupyOps afterwards.
        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        # Without cupy, require_gpu() must raise a ValueError.
        with pytest.raises(ValueError):
            require_gpu()
 def test_require_cpu():
    """Check that require_cpu() makes NumpyOps the current ops, both at the
    start and after switching to the GPU (when cupy can be imported)."""
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
    try:
        import cupy  # noqa: F401
        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        # No cupy: the switch to the GPU is skipped.
        pass
    # The current ops must be NumpyOps again, whichever branch ran above.
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
 def test_ascii_filenames():
    """Test that all filenames in the project are ASCII.
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
 from spacy.lang.en import English
 def test_tokenizer_handles_no_word(tokenizer):
@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
    ]
 def test_tokenizer_special_cases_with_affixes_preserve_spacy():
    """Check that the text of the tokenized doc equals the input text,
    including whitespace, after special cases for `''` and `ab` are added."""
    tokenizer = English().tokenizer
    # reset all special cases
    tokenizer.rules = {}
    # in-place modification (only merges)
    # The input ends in a space, so the final token's whitespace must survive.
    text = "''a'' "
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    assert tokenizer(text).text == text
    # not in-place (splits and merges)
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    text = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(text).text == text
 def test_tokenizer_special_cases_with_period(tokenizer):
    text = "_SPECIAL_."
    tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
            ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
        ),
        ([" ", "a"], ["a"], ([[], [0]], [[1]])),
        (
            ["a", "''", "'", ","],
            ["a'", "''", ","],
            ([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
        ),
    ],
 )
 def test_align(tokens_a, tokens_b, expected):  # noqa
@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
    assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
    # multiple leading whitespace tokens
@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
    assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
    # both with leading whitespace, not identical
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -338,7 +338,7 @@ cdef class Tokenizer:
                    # Copy special case tokens into doc and adjust token and
                    # character offsets
                    idx_offset = 0
-                    orig_final_spacy = doc.c[span_end + offset - 1].spacy
+                    orig_final_spacy = doc.c[span_end - 1].spacy
                    orig_idx = doc.c[i].idx
                    for j in range(cached.length):
                        tokens[i + offset + j] = cached.data.tokens[j]
--- a/spacy/training/align.pyx
+++ b/spacy/training/align.pyx
@ -7,8 +7,8 @@ from ..errors import Errors
 def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    # Create character-to-token mappings
-    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
+    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
-    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
        if prev_token_idx_b != token_idx_b:
            b2a.append(set())
        # Process the alignment at the current position
-        if A[token_idx_a] == B[token_idx_b]:
+        if A[token_idx_a] == B[token_idx_b] and \
-            # Current tokens are identical
+                (char_idx_a == 0 or \
                    char_to_token_a[char_idx_a - 1] < token_idx_a) and \
                (char_idx_b == 0 or \
                    char_to_token_b[char_idx_b - 1] < token_idx_b):
            # Current tokens are identical and both character offsets are the
            # start of a token (either at the beginning of the document or the
            # previous character belongs to a different token)
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += len(A[token_idx_a])
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@ -28,7 +28,7 @@ def train(
    use_gpu: int = -1,
    stdout: IO = sys.stdout,
    stderr: IO = sys.stderr,
-) -> None:
+) -> Tuple["Language", Optional[Path]]:
    """Train a pipeline.
    nlp (Language): The initialized nlp object with the full config.
@ -40,7 +40,7 @@ def train(
    stderr (file): A second file-like object to write output messages. To disable
        printing, set to io.StringIO.
-    RETURNS (Path / None): The path to the final exported model.
+    RETURNS (tuple): The final nlp object and the path to the exported model.
    """
    # We use no_print here so we can respect the stdout/stderr options.
    msg = Printer(no_print=True)
@ -105,17 +105,18 @@ def train(
        raise e
    finally:
        finalize_logger()
        if optimizer.averages:
            nlp.use_params(optimizer.averages)
        if output_path is not None:
            final_model_path = output_path / DIR_MODEL_LAST
-            if optimizer.averages:
+            nlp.to_disk(final_model_path)
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(final_model_path)
            else:
                nlp.to_disk(final_model_path)
            # This will only run if we don't hit an error
            stdout.write(
                msg.good("Saved pipeline to output directory", final_model_path) + "\n"
            )
            return (nlp, final_model_path)
        else:
            return (nlp, None)
 def train_while_improving(
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@ -1,22 +1,16 @@
 from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
-from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from thinc.api import set_dropout_rate
 from pathlib import Path
 from functools import partial
 from collections import Counter
 import srsly
 import numpy
 import time
 import re
 from wasabi import Printer
 from .example import Example
 from ..tokens import Doc
 from ..attrs import ID
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..errors import Errors
 from ..util import registry, load_model_from_config, dot_to_object
@ -49,6 +43,7 @@ def pretrain(
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0
    objective = model.attrs["loss"]
    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@ -69,7 +64,6 @@ def pretrain(
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")
    objective = create_objective(P["objective"])
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
@ -132,58 +126,6 @@ def make_update(
    return float(loss)
 def create_objective(config: Config):
    """Create the objective for pretraining.

    We'd like to replace this with a registry function but it's tricky because
    we're also making a model choice based on this. For now we hard-code support
    for two types (characters, vectors). For characters you can specify
    n_characters, for vectors you can specify the loss.

    config (Config): The objective settings. Reads the "type" key and,
        depending on its value, "n_characters" or "loss".
    RETURNS (Callable): get_characters_loss or get_vectors_loss with nr_char
        or distance already bound.
    RAISES (ValueError): E907 for an unsupported type, E906 for an
        unsupported loss.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    elif objective_type == "vectors":
        if config["loss"] == "cosine":
            distance = CosineDistance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        elif config["loss"] == "L2":
            # NOTE(review): this passes ignore_zeros=True, while the
            # L2Distance call in create_pretrain_vectors
            # (ml/models/multi_task.py) passes normalize=True only. Confirm
            # which keywords L2Distance accepts.
            distance = L2Distance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        else:
            raise ValueError(Errors.E906.format(loss_type=config["loss"]))
    else:
        raise ValueError(Errors.E907.format(objective_type=objective_type))
 def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.

    ops: Backend object; only its flatten method is used here.
    docs: The documents whose token vectors form the target. Must not be
        empty, because the vectors table is taken from docs[0].vocab.
    prediction: The model output that is compared against the target.
    distance (Callable): Called as distance(prediction, target). Its result
        is unpacked as (d_target, loss).
    RETURNS (tuple): (loss, d_target), i.e. the reverse of the order in which
        distance returns the two values.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
    # Instead we fetch the index into the vectors table for each of our tokens,
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target
 def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a loss based on a number of characters predicted from the docs.

    ops: Backend object; only its asarray method is used here.
    docs: The documents that provide the targets through
        doc.to_utf8_array(nr_char=nr_char).
    prediction: The model output. It is subtracted from a target of shape
        (-1, 256 * nr_char), so it has to be compatible with that shape.
    nr_char (int): Number of characters per token used to build the target.
    RETURNS (tuple): (loss, d_target), where loss is the sum of the squared
        differences between prediction and target and d_target is the
        difference divided by prediction.shape[0].
    """
    # Stack the per-doc arrays and flatten them into one vector of ids.
    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
    target_ids = target_ids.reshape((-1,))
    # Encode each id over 256 classes, then regroup so that every row holds
    # the nr_char encodings that belong together.
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    # Scaled by the number of rows in the prediction.
    d_target = diff / float(prediction.shape[0])
    return loss, d_target
 def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
    nlp.initialize()
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model
-    # TODO
+    create_function = pretrain_config["objective"]
-    maxout_pieces = 3
+    model = create_function(nlp.vocab, tok2vec)
    hidden_size = 300
    if pretrain_config["objective"]["type"] == "vectors":
        model = build_cloze_multi_task_model(
            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
    elif pretrain_config["objective"]["type"] == "characters":
        model = build_cloze_characters_multi_task_model(
            nlp.vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=pretrain_config["objective"]["n_characters"],
        )
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -171,6 +171,25 @@ and _before_ loading any pipelines.
 | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
 | **RETURNS** | `True` ~~bool~~                                  |
 ### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"}
 Allocate data and perform operations on CPU. If data has already been allocated
 on GPU, it will not be moved. Ideally, this function should be called right
 after importing spaCy and _before_ loading any pipelines.
 > #### Example
 >
 > ```python
 > import spacy
 > spacy.require_cpu()
 > nlp = spacy.load("en_core_web_sm")
 > ```
 | Name        | Description                                      |
 | ----------- | ------------------------------------------------ |
 | **RETURNS** | `True` ~~bool~~                                  |
 ## displaCy {#displacy source="spacy/displacy"}
 As of v2.0, spaCy comes with a built-in visualization suite. For more info and
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@ -158,29 +158,37 @@ The other way to install spaCy is to clone its
 source. That is the common way if you want to make changes to the code base.
 You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
-[pip](https://pip.pypa.io/en/latest/installing/),
+[pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed.
-[virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com)
+The compiler part is the trickiest. How to do that depends on your system. See
-installed. The compiler part is the trickiest. How to do that depends on your
+notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
 system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
 [Windows](#source-windows) for details.
 ```bash
-$ python -m pip install -U pip                  # update pip
+$ python -m pip install -U pip setuptools wheel # install/update build tools
 $ git clone https://github.com/explosion/spaCy  # clone spaCy
 $ cd spaCy                                      # navigate into dir
 $ python -m venv .env                           # create environment in .env
 $ source .env/bin/activate                      # activate virtual env
-$ export PYTHONPATH=`pwd`                       # set Python path to spaCy dir
+$ pip install .                                 # compile and install spaCy
 $ pip install -r requirements.txt               # install all requirements
 $ python setup.py build_ext --inplace           # compile spaCy
 $ python setup.py install                       # install spaCy
 ```
-Compared to regular install via pip, the
+To install with extras:
-[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs
+
-developer dependencies such as Cython. See the [quickstart widget](#quickstart)
+```bash
-to get the right commands for your platform and Python version.
+$ pip install .[lookups,cuda102]                # install spaCy with extras
 ```
 To install all dependencies required for development:
 ```bash
 $ pip install -r requirements.txt
 ```
 Compared to a regular install via pip, the
 [`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes
 developer dependencies such as Cython and the libraries required to run the test
 suite. See the [quickstart widget](#quickstart) to get the right commands for
 your platform and Python version.
 <a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>
@ -195,6 +203,32 @@ to get the right commands for your platform and Python version.
  [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
  that matches the version that was used to compile your Python interpreter.
 #### Additional options for developers {#source-developers}
 Some additional options may be useful for spaCy developers who are editing the
 source code and recompiling frequently.
 - Install in editable mode. Changes to `.py` files will be reflected as soon as
  the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
  the `pip install` or `python setup.py build_ext` command below to be run
  again. Before installing in editable mode, be sure you have removed any
  previous installs with `pip uninstall spacy`, which you may need to run
  multiple times to remove all traces of earlier installs.
  ```bash
  $ pip install -r requirements.txt
  $ pip install --no-build-isolation --editable .
  ```
 - Build in parallel using `N` CPUs to speed up compilation and then install in
  editable mode:
  ```bash
  $ pip install -r requirements.txt
  $ python setup.py build_ext --inplace -j N
  $ pip install --no-build-isolation --editable .
  ```
 ### Building an executable {#executable}
 The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that