Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-14 02:20:34 +03:00)

Commit bf6992c2dd: Merge branch 'develop' into nightly.spacy.io

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0rc0,<8.1.0",
+    "thinc>=8.0.0rc2,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy"
 ]

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0rc0,<8.1.0
+thinc>=8.0.0rc2,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0

@@ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu  # noqa: F401
+from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
 from thinc.api import Config

 from . import pipeline  # noqa: F401

@@ -17,7 +17,9 @@ tolerance = 0.2
 get_length = null

 [pretraining.objective]
-type = "characters"
+@architectures = "spacy.PretrainCharacters.v1"
+maxout_pieces = 3
+hidden_size = 300
 n_characters = 4

 [pretraining.optimizer]

@@ -484,8 +484,8 @@ class Errors:
             "has been applied.")
     E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
             "dimension refers to the width of the vectors table.")
-    E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
-    E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
+    E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
+            "are: {supported}")
     E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
     E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
     E910 = ("Encountered NaN value when computing loss for component '{name}'.")

@@ -1,4 +1,5 @@
 from .entity_linker import *  # noqa
+from .multi_task import *  # noqa
 from .parser import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa

@@ -1,7 +1,14 @@
-from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
-import numpy
+from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
+from thinc.api import to_categorical, CosineDistance, L2Distance

 from ...util import registry
+from ...errors import Errors
 from ...attrs import ID
+
+import numpy
+from functools import partial

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports

@@ -9,6 +16,74 @@ if TYPE_CHECKING:
     from ...tokens import Doc  # noqa: F401


+@registry.architectures.register("spacy.PretrainVectors.v1")
+def create_pretrain_vectors(
+    maxout_pieces: int, hidden_size: int, loss: str
+) -> Callable[["Vocab", Model], Model]:
+    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_multi_task_model(
+            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+        model.attrs["loss"] = create_vectors_loss()
+        return model
+
+    def create_vectors_loss() -> Callable:
+        if loss == "cosine":
+            distance = CosineDistance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        elif loss == "L2":
+            distance = L2Distance(normalize=True)
+            return partial(get_vectors_loss, distance=distance)
+        else:
+            raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
+
+    return create_vectors_objective
+
+
+@registry.architectures.register("spacy.PretrainCharacters.v1")
+def create_pretrain_characters(
+    maxout_pieces: int, hidden_size: int, n_characters: int
+) -> Callable[["Vocab", Model], Model]:
+    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_characters_multi_task_model(
+            vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=n_characters,
+        )
+        model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
+        return model
+
+    return create_characters_objective
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
 def build_multi_task_model(
     tok2vec: Model,
     maxout_pieces: int,

@@ -33,23 +108,19 @@ def build_multi_task_model(


 def build_cloze_multi_task_model(
-    vocab: "Vocab",
-    tok2vec: Model,
-    maxout_pieces: int,
-    hidden_size: int,
-    nO: Optional[int] = None,
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    # nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         list2array(),
         Maxout(
-            nO=nO,
+            nO=hidden_size,
             nI=tok2vec.get_dim("nO"),
             nP=maxout_pieces,
             normalize=True,
             dropout=0.0,
         ),
-        Linear(nO=nO, nI=nO, init_W=zero_init),
+        Linear(nO=nO, nI=hidden_size, init_W=zero_init),
     )
     model = chain(tok2vec, output_layer)
     model = build_masked_language_model(vocab, model)

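For reference, a minimal sketch of how the newly registered vectors objective could be selected in a user config. The block name and parameters come from `create_pretrain_vectors` above; the values shown are illustrative, not defaults shipped with spaCy:

```ini
[pretraining.objective]
@architectures = "spacy.PretrainVectors.v1"
maxout_pieces = 3
hidden_size = 300
loss = "cosine"
```
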
@@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-
-    # TODO: use a more detailed schema for this?
-    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
+    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on

     class Config:

@@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.language import Language, DEFAULT_CONFIG
-from spacy.util import registry, load_model_from_config
+from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
-from spacy.schemas import ConfigSchema
+from spacy.schemas import ConfigSchema, ConfigSchemaPretrain

 from ..util import make_tempdir


 nlp_config_string = """
 [paths]
 train = null

@@ -63,6 +63,59 @@ factory = "tagger"
 width = ${components.tok2vec.model.width}
 """

+pretrain_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[pretraining]
+"""
+
+
 parser_config_string = """
 [model]

@@ -126,6 +179,14 @@ def test_create_nlp_from_config():
         load_model_from_config(Config(bad_cfg), auto_fill=True)


+def test_create_nlp_from_pretraining_config():
+    """Test that the default pretraining config validates properly"""
+    config = Config().from_str(pretrain_config_string)
+    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = config.merge(pretrain_config)
+    resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
+
+
 def test_create_nlp_from_config_multiple_instances():
     """Test that the nlp object is created correctly for a config with multiple
     instances of the same component."""

@@ -4,7 +4,7 @@ import ctypes
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
-from spacy import prefer_gpu, require_gpu
+from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList

@@ -15,6 +15,8 @@ from spacy.lang.nl import Dutch
 from spacy.language import DEFAULT_CONFIG_PATH
 from spacy.schemas import ConfigSchemaTraining

+from thinc.api import get_current_ops, NumpyOps, CupyOps
+
 from .util import get_random_doc


@@ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 def test_prefer_gpu():
     try:
         import cupy  # noqa: F401
+
+        prefer_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         assert not prefer_gpu()

@@ -88,10 +92,24 @@ def test_prefer_gpu():
 def test_require_gpu():
     try:
         import cupy  # noqa: F401
+
+        require_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         with pytest.raises(ValueError):
             require_gpu()


+def test_require_cpu():
+    require_cpu()
+    assert isinstance(get_current_ops(), NumpyOps)
+    try:
+        import cupy  # noqa: F401
+
+        require_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
+    except ImportError:
+        pass
+    require_cpu()
+    assert isinstance(get_current_ops(), NumpyOps)
+
+
 def test_ascii_filenames():
     """Test that all filenames in the project are ASCII.

@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English


 def test_tokenizer_handles_no_word(tokenizer):

@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]


+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])

@@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
         ),
         ([" ", "a"], ["a"], ([[], [0]], [[1]])),
+        (
+            ["a", "''", "'", ","],
+            ["a'", "''", ","],
+            ([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
+        ),
     ],
 )
 def test_align(tokens_a, tokens_b, expected):  # noqa

@@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]

     # multiple leading whitespace tokens

@@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]

     # both with leading whitespace, not identical

@@ -338,7 +338,7 @@ cdef class Tokenizer:
                 # Copy special case tokens into doc and adjust token and
                 # character offsets
                 idx_offset = 0
-                orig_final_spacy = doc.c[span_end + offset - 1].spacy
+                orig_final_spacy = doc.c[span_end - 1].spacy
                 orig_idx = doc.c[i].idx
                 for j in range(cached.length):
                     tokens[i + offset + j] = cached.data.tokens[j]

@@ -7,8 +7,8 @@ from ..errors import Errors

 def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
     # Create character-to-token mappings
-    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
-    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
     str_a = "".join(A).lower()
     str_b = "".join(B).lower()
     cdef int len_str_a = len(str_a)

@@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
         if prev_token_idx_b != token_idx_b:
             b2a.append(set())
         # Process the alignment at the current position
-        if A[token_idx_a] == B[token_idx_b]:
-            # Current tokens are identical
+        if A[token_idx_a] == B[token_idx_b] and \
+                (char_idx_a == 0 or \
+                char_to_token_a[char_idx_a - 1] < token_idx_a) and \
+                (char_idx_b == 0 or \
+                char_to_token_b[char_idx_b - 1] < token_idx_b):
+            # Current tokens are identical and both character offsets are the
+            # start of a token (either at the beginning of the document or the
+            # previous character belongs to a different token)
             a2b[-1].add(token_idx_b)
             b2a[-1].add(token_idx_a)
             char_idx_a += len(A[token_idx_a])

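A minimal sketch of the behavior the stricter token-start check above enables, using the `get_alignments` signature shown in this hunk; the `spacy.training.align` import path is assumed for illustration:

```python
from spacy.training.align import get_alignments  # assumed import path

# Token pair taken from the new parametrized test case added above
a2b, b2a = get_alignments(["a", "''", "'", ","], ["a'", "''", ","])
# Per that test case, the expected alignment is
# a2b == [[0], [0, 1], [1], [2]] and b2a == [[0, 1], [1, 2], [3]]:
# "''" only aligns to "''" when both character offsets sit at a token start.
print(a2b, b2a)
```
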
@@ -28,7 +28,7 @@ def train(
     use_gpu: int = -1,
     stdout: IO = sys.stdout,
     stderr: IO = sys.stderr,
-) -> None:
+) -> Tuple["Language", Optional[Path]]:
     """Train a pipeline.

     nlp (Language): The initialized nlp object with the full config.

@@ -40,7 +40,7 @@ def train(
     stderr (file): A second file-like object to write output messages. To disable
         printing, set to io.StringIO.

-    RETURNS (Path / None): The path to the final exported model.
+    RETURNS (tuple): The final nlp object and the path to the exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
     msg = Printer(no_print=True)

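A hedged sketch of consuming the new return value. Only `nlp`, `use_gpu`, `stdout` and `stderr` appear in this diff, so the `output_path` argument name below is assumed for illustration:

```python
# nlp is an initialized Language object with a full config (see docstring above)
nlp, final_model_path = train(nlp, output_path, use_gpu=-1)
if final_model_path is not None:
    print(f"Last trained pipeline saved to {final_model_path}")
```
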
@@ -105,17 +105,18 @@ def train(
             raise e
     finally:
         finalize_logger()
+        if optimizer.averages:
+            nlp.use_params(optimizer.averages)
         if output_path is not None:
             final_model_path = output_path / DIR_MODEL_LAST
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
+            nlp.to_disk(final_model_path)
             # This will only run if we don't hit an error
             stdout.write(
                 msg.good("Saved pipeline to output directory", final_model_path) + "\n"
             )
+            return (nlp, final_model_path)
+        else:
+            return (nlp, None)


 def train_while_improving(

@@ -1,22 +1,16 @@
 from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
-from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from thinc.api import set_dropout_rate
 from pathlib import Path
-from functools import partial
 from collections import Counter
 import srsly
-import numpy
 import time
 import re
 from wasabi import Printer

 from .example import Example
 from ..tokens import Doc
-from ..attrs import ID
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..errors import Errors
 from ..util import registry, load_model_from_config, dot_to_object


@@ -49,6 +43,7 @@ def pretrain(
     else:
         # Without '--resume-path' the '--epoch-resume' argument is ignored
         epoch_resume = 0
+    objective = model.attrs["loss"]
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")

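A hedged sketch of how the loss callable stored on the model is typically applied during an update; `make_update` itself is not shown in this diff, so the call below is illustrative only:

```python
# The objective is a partial of get_vectors_loss / get_characters_loss,
# so it takes (ops, docs, prediction) and returns (loss, d_target).
predictions, backprop = model.begin_update(docs)
loss, gradients = objective(model.ops, docs, predictions)
backprop(gradients)
```
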
@@ -69,7 +64,6 @@ def pretrain(
         with (output_dir / "log.jsonl").open("a") as file_:
             file_.write(srsly.json_dumps(log) + "\n")

-    objective = create_objective(P["objective"])
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
     for epoch in range(epoch_resume, P["max_epochs"]):

@@ -132,58 +126,6 @@ def make_update(
     return float(loss)


-def create_objective(config: Config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            distance = CosineDistance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        elif config["loss"] == "L2":
-            distance = L2Distance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        else:
-            raise ValueError(Errors.E906.format(loss_type=config["loss"]))
-    else:
-        raise ValueError(Errors.E907.format(objective_type=objective_type))
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
 def create_pretraining_model(nlp, pretrain_config):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that

@@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
     The actual tok2vec layer is stored as a reference, and only this bit will be
     serialized to file and read back in when calling the 'train' command.
     """
     nlp.initialize()
     component = nlp.get_pipe(pretrain_config["component"])
     if pretrain_config.get("layer"):
         tok2vec = component.model.get_ref(pretrain_config["layer"])
     else:
         tok2vec = component.model

-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
+    create_function = pretrain_config["objective"]
+    model = create_function(nlp.vocab, tok2vec)
     model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
     set_dropout_rate(model, pretrain_config["dropout"])
     return model

@@ -171,6 +171,25 @@ and _before_ loading any pipelines.
 | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
 | **RETURNS** | `True` ~~bool~~                                  |

+### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"}
+
+Allocate data and perform operations on CPU. If data has already been allocated
+on GPU, it will not be moved. Ideally, this function should be called right
+after importing spaCy and _before_ loading any pipelines.
+
+> #### Example
+>
+> ```python
+> import spacy
+> spacy.require_cpu()
+> nlp = spacy.load("en_core_web_sm")
+> ```
+
+| Name        | Description     |
+| ----------- | --------------- |
+| **RETURNS** | `True` ~~bool~~ |
+
 ## displaCy {#displacy source="spacy/displacy"}

 As of v2.0, spaCy comes with a built-in visualization suite. For more info and

@@ -158,29 +158,37 @@ The other way to install spaCy is to clone its
 source. That is the common way if you want to make changes to the code base.
 You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
-[pip](https://pip.pypa.io/en/latest/installing/),
-[virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com)
-installed. The compiler part is the trickiest. How to do that depends on your
-system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
+[pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed.
+The compiler part is the trickiest. How to do that depends on your system. See
+notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
 [Windows](#source-windows) for details.

 ```bash
-$ python -m pip install -U pip # update pip
+$ python -m pip install -U pip setuptools wheel # install/update build tools
 $ git clone https://github.com/explosion/spaCy # clone spaCy
 $ cd spaCy # navigate into dir

 $ python -m venv .env # create environment in .env
 $ source .env/bin/activate # activate virtual env
-$ export PYTHONPATH=`pwd` # set Python path to spaCy dir
-$ pip install -r requirements.txt # install all requirements
-$ python setup.py build_ext --inplace # compile spaCy
-$ python setup.py install # install spaCy
+$ pip install . # compile and install spaCy
 ```

-Compared to regular install via pip, the
-[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs
-developer dependencies such as Cython. See the [quickstart widget](#quickstart)
-to get the right commands for your platform and Python version.
+To install with extras:
+
+```bash
+$ pip install .[lookups,cuda102] # install spaCy with extras
+```
+
+To install all dependencies required for development:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Compared to a regular install via pip, the
+[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes
+developer dependencies such as Cython and the libraries required to run the test
+suite. See the [quickstart widget](#quickstart) to get the right commands for
+your platform and Python version.

 <a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>

@@ -195,6 +203,32 @@ to get the right commands for your platform and Python version.
 [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
 that matches the version that was used to compile your Python interpreter.

+#### Additional options for developers {#source-developers}
+
+Some additional options may be useful for spaCy developers who are editing the
+source code and recompiling frequently.
+
+- Install in editable mode. Changes to `.py` files will be reflected as soon as
+  the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
+  the `pip install` or `python setup.py build_ext` command below to be run
+  again. Before installing in editable mode, be sure you have removed any
+  previous installs with `pip uninstall spacy`, which you may need to run
+  multiple times to remove all traces of earlier installs.
+
+  ```bash
+  $ pip install -r requirements.txt
+  $ pip install --no-build-isolation --editable .
+  ```
+
+- Build in parallel using `N` CPUs to speed up compilation and then install in
+  editable mode:
+
+  ```bash
+  $ pip install -r requirements.txt
+  $ python setup.py build_ext --inplace -j N
+  $ pip install --no-build-isolation --editable .
+  ```
+
 ### Building an executable {#executable}

 The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that