Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-12-08 18:07:03 +11:00
commit bf6992c2dd
19 changed files with 292 additions and 129 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0rc0,<8.1.0", "thinc>=8.0.0rc2,<8.1.0",
"blis>=0.4.0,<0.8.0", "blis>=0.4.0,<0.8.0",
"pathy" "pathy"
] ]

View File

@ -1,7 +1,7 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0rc0,<8.1.0 thinc>=8.0.0rc2,<8.1.0
blis>=0.4.0,<0.8.0 blis>=0.4.0,<0.8.0
ml_datasets==0.2.0a0 ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0rc0,<8.1.0 thinc>=8.0.0rc2,<8.1.0
install_requires = install_requires =
# Our libraries # Our libraries
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0rc0,<8.1.0 thinc>=8.0.0rc2,<8.1.0
blis>=0.4.0,<0.8.0 blis>=0.4.0,<0.8.0
wasabi>=0.8.0,<1.1.0 wasabi>=0.8.0,<1.1.0
srsly>=2.3.0,<3.0.0 srsly>=2.3.0,<3.0.0

View File

@ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa
warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa
# These are imported as part of the API # These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu # noqa: F401 from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
from thinc.api import Config from thinc.api import Config
from . import pipeline # noqa: F401 from . import pipeline # noqa: F401

View File

@ -17,7 +17,9 @@ tolerance = 0.2
get_length = null get_length = null
[pretraining.objective] [pretraining.objective]
type = "characters" @architectures = "spacy.PretrainCharacters.v1"
maxout_pieces = 3
hidden_size = 300
n_characters = 4 n_characters = 4
[pretraining.optimizer] [pretraining.optimizer]

View File

@ -484,8 +484,8 @@ class Errors:
"has been applied.") "has been applied.")
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This " E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
"dimension refers to the width of the vectors table.") "dimension refers to the width of the vectors table.")
E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}") E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}") "are: {supported}")
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.") E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.") E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
E910 = ("Encountered NaN value when computing loss for component '{name}'.") E910 = ("Encountered NaN value when computing loss for component '{name}'.")

View File

@ -1,4 +1,5 @@
from .entity_linker import * # noqa from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa from .parser import * # noqa
from .tagger import * # noqa from .tagger import * # noqa
from .textcat import * # noqa from .textcat import * # noqa

View File

@ -1,7 +1,14 @@
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
import numpy
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance
from ...util import registry
from ...errors import Errors
from ...attrs import ID
import numpy
from functools import partial
if TYPE_CHECKING: if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports # This lets us add type hints for mypy etc. without causing circular imports
@ -9,6 +16,74 @@ if TYPE_CHECKING:
from ...tokens import Doc # noqa: F401 from ...tokens import Doc # noqa: F401
@registry.architectures.register("spacy.PretrainVectors.v1")
def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
    """Create the factory for the vectors pretraining objective.

    maxout_pieces (int): Forwarded to build_cloze_multi_task_model.
    hidden_size (int): Forwarded to build_cloze_multi_task_model.
    loss (str): Name of the distance to use, either "cosine" or "L2".
    RETURNS (Callable): A function that takes a vocab and a tok2vec model and
        returns the cloze model, with the loss callable stored under
        model.attrs["loss"].
    """

    def create_vectors_loss() -> Callable:
        # Pick the distance first so the partial is only built in one place.
        if loss == "cosine":
            metric = CosineDistance(normalize=True, ignore_zeros=True)
        elif loss == "L2":
            metric = L2Distance(normalize=True)
        else:
            raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
        return partial(get_vectors_loss, distance=metric)

    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        cloze_model = build_cloze_multi_task_model(
            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
        # The loss is resolved only here, after the model was built, so an
        # unsupported `loss` name raises when the objective is created.
        cloze_model.attrs["loss"] = create_vectors_loss()
        return cloze_model

    return create_vectors_objective
@registry.architectures.register("spacy.PretrainCharacters.v1")
def create_pretrain_characters(
    maxout_pieces: int, hidden_size: int, n_characters: int
) -> Callable[["Vocab", Model], Model]:
    """Create the factory for the characters pretraining objective.

    maxout_pieces (int): Forwarded to build_cloze_characters_multi_task_model.
    hidden_size (int): Forwarded to build_cloze_characters_multi_task_model.
    n_characters (int): Passed to the model builder as nr_char and bound as
        nr_char on the loss function.
    RETURNS (Callable): A function that takes a vocab and a tok2vec model and
        returns the model, with the loss callable stored under
        model.attrs["loss"].
    """

    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        builder_settings = {
            "hidden_size": hidden_size,
            "maxout_pieces": maxout_pieces,
            "nr_char": n_characters,
        }
        objective_model = build_cloze_characters_multi_task_model(
            vocab, tok2vec, **builder_settings
        )
        # Model and loss are configured with the same number of characters.
        objective_model.attrs["loss"] = partial(
            get_characters_loss, nr_char=n_characters
        )
        return objective_model

    return create_characters_objective
def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss from the distance between the prediction and the
    vectors of the tokens in the docs.

    ops: Backend object; only its flatten method is used here.
    docs: The documents; the vectors table is taken from docs[0].vocab.
    prediction: The model output that is compared to the target vectors.
    distance: Callable invoked as distance(prediction, target); its first
        return value is used as the gradient, its second as the loss.
    RETURNS (tuple): The loss and the gradient, in that order.
    """
    # Stacking token.vector values would be the simplest implementation, but
    # that's a bit inefficient, especially on GPU. Fetching each token's index
    # into the vectors table and doing one lookup prevents data copying.
    per_doc_ids = [doc.to_array(ID).ravel() for doc in docs]
    row_indices = ops.flatten(per_doc_ids)
    vectors_table = docs[0].vocab.vectors.data
    target = vectors_table[row_indices]
    gradient, loss = distance(prediction, target)
    return loss, gradient
def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a squared-error loss between the prediction and the one-hot
    encoded UTF-8 values obtained from the docs.

    ops: Backend object; only its asarray method is used here.
    docs: The documents; each provides its target via to_utf8_array.
    prediction: The model output, compared against a target reshaped to
        (-1, 256 * nr_char).
    nr_char (int): Number of characters requested from to_utf8_array.
    RETURNS (tuple): The summed squared difference and the difference divided
        by prediction.shape[0], in that order.
    """
    utf8_rows = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
    flat_ids = numpy.vstack(utf8_rows).reshape((-1,))
    # One class per possible byte value, hence 256.
    one_hot = ops.asarray(to_categorical(flat_ids, n_classes=256), dtype="f")
    target = one_hot.reshape((-1, 256 * nr_char))
    residual = prediction - target
    loss = (residual ** 2).sum()
    return loss, residual / float(prediction.shape[0])
def build_multi_task_model( def build_multi_task_model(
tok2vec: Model, tok2vec: Model,
maxout_pieces: int, maxout_pieces: int,
@ -33,23 +108,19 @@ def build_multi_task_model(
def build_cloze_multi_task_model( def build_cloze_multi_task_model(
vocab: "Vocab", vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
tok2vec: Model,
maxout_pieces: int,
hidden_size: int,
nO: Optional[int] = None,
) -> Model: ) -> Model:
# nO = vocab.vectors.data.shape[1] nO = vocab.vectors.data.shape[1]
output_layer = chain( output_layer = chain(
list2array(), list2array(),
Maxout( Maxout(
nO=nO, nO=hidden_size,
nI=tok2vec.get_dim("nO"), nI=tok2vec.get_dim("nO"),
nP=maxout_pieces, nP=maxout_pieces,
normalize=True, normalize=True,
dropout=0.0, dropout=0.0,
), ),
Linear(nO=nO, nI=nO, init_W=zero_init), Linear(nO=nO, nI=hidden_size, init_W=zero_init),
) )
model = chain(tok2vec, output_layer) model = chain(tok2vec, output_layer)
model = build_masked_language_model(vocab, model) model = build_masked_language_model(vocab, model)

View File

@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
batcher: Batcher = Field(..., title="Batcher for the training data") batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain") component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
# fmt: on # fmt: on
class Config: class Config:

View File

@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
import spacy import spacy
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German
from spacy.language import Language, DEFAULT_CONFIG from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
from spacy.util import registry, load_model_from_config from spacy.util import registry, load_model_from_config, load_config
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
from spacy.schemas import ConfigSchema from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from ..util import make_tempdir from ..util import make_tempdir
nlp_config_string = """ nlp_config_string = """
[paths] [paths]
train = null train = null
@ -63,6 +63,59 @@ factory = "tagger"
width = ${components.tok2vec.model.width} width = ${components.tok2vec.model.width}
""" """
pretrain_config_string = """
[paths]
train = null
dev = null
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
[training]
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}
[pretraining]
"""
parser_config_string = """ parser_config_string = """
[model] [model]
@ -126,6 +179,14 @@ def test_create_nlp_from_config():
load_model_from_config(Config(bad_cfg), auto_fill=True) load_model_from_config(Config(bad_cfg), auto_fill=True)
def test_create_nlp_from_pretraining_config():
    """Test that the default pretraining config validates properly"""
    base_config = Config().from_str(pretrain_config_string)
    default_pretrain = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = base_config.merge(default_pretrain)
    # There is no explicit assertion: the test passes as long as resolving the
    # [pretraining] block against the schema does not raise.
    registry.resolve(merged["pretraining"], schema=ConfigSchemaPretrain)
def test_create_nlp_from_config_multiple_instances(): def test_create_nlp_from_config_multiple_instances():
"""Test that the nlp object is created correctly for a config with multiple """Test that the nlp object is created correctly for a config with multiple
instances of the same component.""" instances of the same component."""

View File

@ -4,7 +4,7 @@ import ctypes
from pathlib import Path from pathlib import Path
from spacy.about import __version__ as spacy_version from spacy.about import __version__ as spacy_version
from spacy import util from spacy import util
from spacy import prefer_gpu, require_gpu from spacy import prefer_gpu, require_gpu, require_cpu
from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList from spacy.util import dot_to_object, SimpleFrozenList
@ -15,6 +15,8 @@ from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining from spacy.schemas import ConfigSchemaTraining
from thinc.api import get_current_ops, NumpyOps, CupyOps
from .util import get_random_doc from .util import get_random_doc
@ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
def test_prefer_gpu(): def test_prefer_gpu():
try: try:
import cupy # noqa: F401 import cupy # noqa: F401
prefer_gpu()
assert isinstance(get_current_ops(), CupyOps)
except ImportError: except ImportError:
assert not prefer_gpu() assert not prefer_gpu()
@ -88,10 +92,24 @@ def test_prefer_gpu():
def test_require_gpu(): def test_require_gpu():
try: try:
import cupy # noqa: F401 import cupy # noqa: F401
require_gpu()
assert isinstance(get_current_ops(), CupyOps)
except ImportError: except ImportError:
with pytest.raises(ValueError): with pytest.raises(ValueError):
require_gpu() require_gpu()
def test_require_cpu():
    """Test that require_cpu selects NumpyOps, both initially and after
    switching to the GPU in between (when cupy can be imported)."""

    def current_ops_are(ops_type) -> bool:
        return isinstance(get_current_ops(), ops_type)

    require_cpu()
    assert current_ops_are(NumpyOps)
    try:
        import cupy  # noqa: F401

        require_gpu()
        assert current_ops_are(CupyOps)
    except ImportError:
        # Without cupy the GPU round trip is skipped; the CPU checks still run.
        pass
    require_cpu()
    assert current_ops_are(NumpyOps)
def test_ascii_filenames(): def test_ascii_filenames():
"""Test that all filenames in the project are ASCII. """Test that all filenames in the project are ASCII.

View File

@ -2,6 +2,7 @@ import pytest
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer from spacy.tokenizer import Tokenizer
from spacy.util import ensure_path from spacy.util import ensure_path
from spacy.lang.en import English
def test_tokenizer_handles_no_word(tokenizer): def test_tokenizer_handles_no_word(tokenizer):
@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
] ]
def test_tokenizer_special_cases_with_affixes_preserve_spacy():
    """Test that the text of the tokenized doc equals the input text when
    special cases are applied next to affixes."""
    tokenizer = English().tokenizer
    # reset all special cases
    tokenizer.rules = {}

    # in-place modification (only merges)
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    merge_only_text = "''a'' "
    assert tokenizer(merge_only_text).text == merge_only_text

    # not in-place (splits and merges)
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    split_and_merge_text = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(split_and_merge_text).text == split_and_merge_text
def test_tokenizer_special_cases_with_period(tokenizer): def test_tokenizer_special_cases_with_period(tokenizer):
text = "_SPECIAL_." text = "_SPECIAL_."
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])

View File

@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
([[0], [1], [2, 3]], [[0], [1], [2], [2]]), ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
), ),
([" ", "a"], ["a"], ([[], [0]], [[1]])), ([" ", "a"], ["a"], ([[], [0]], [[1]])),
(
["a", "''", "'", ","],
["a'", "''", ","],
([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
),
], ],
) )
def test_align(tokens_a, tokens_b, expected): # noqa def test_align(tokens_a, tokens_b, expected): # noqa
@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
align = Alignment.from_strings(other_tokens, spacy_tokens) align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6] assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
# multiple leading whitespace tokens # multiple leading whitespace tokens
@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
align = Alignment.from_strings(other_tokens, spacy_tokens) align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1] assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7] assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
# both with leading whitespace, not identical # both with leading whitespace, not identical

View File

@ -338,7 +338,7 @@ cdef class Tokenizer:
# Copy special case tokens into doc and adjust token and # Copy special case tokens into doc and adjust token and
# character offsets # character offsets
idx_offset = 0 idx_offset = 0
orig_final_spacy = doc.c[span_end + offset - 1].spacy orig_final_spacy = doc.c[span_end - 1].spacy
orig_idx = doc.c[i].idx orig_idx = doc.c[i].idx
for j in range(cached.length): for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j] tokens[i + offset + j] = cached.data.tokens[j]

View File

@ -7,8 +7,8 @@ from ..errors import Errors
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]: def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
# Create character-to-token mappings # Create character-to-token mappings
char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A)))) char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B)))) char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
str_a = "".join(A).lower() str_a = "".join(A).lower()
str_b = "".join(B).lower() str_b = "".join(B).lower()
cdef int len_str_a = len(str_a) cdef int len_str_a = len(str_a)
@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
if prev_token_idx_b != token_idx_b: if prev_token_idx_b != token_idx_b:
b2a.append(set()) b2a.append(set())
# Process the alignment at the current position # Process the alignment at the current position
if A[token_idx_a] == B[token_idx_b]: if A[token_idx_a] == B[token_idx_b] and \
# Current tokens are identical (char_idx_a == 0 or \
char_to_token_a[char_idx_a - 1] < token_idx_a) and \
(char_idx_b == 0 or \
char_to_token_b[char_idx_b - 1] < token_idx_b):
# Current tokens are identical and both character offsets are the
# start of a token (either at the beginning of the document or the
# previous character belongs to a different token)
a2b[-1].add(token_idx_b) a2b[-1].add(token_idx_b)
b2a[-1].add(token_idx_a) b2a[-1].add(token_idx_a)
char_idx_a += len(A[token_idx_a]) char_idx_a += len(A[token_idx_a])

View File

@ -28,7 +28,7 @@ def train(
use_gpu: int = -1, use_gpu: int = -1,
stdout: IO = sys.stdout, stdout: IO = sys.stdout,
stderr: IO = sys.stderr, stderr: IO = sys.stderr,
) -> None: ) -> Tuple["Language", Optional[Path]]:
"""Train a pipeline. """Train a pipeline.
nlp (Language): The initialized nlp object with the full config. nlp (Language): The initialized nlp object with the full config.
@ -40,7 +40,7 @@ def train(
stderr (file): A second file-like object to write output messages. To disable stderr (file): A second file-like object to write output messages. To disable
printing, set to io.StringIO. printing, set to io.StringIO.
RETURNS (Path / None): The path to the final exported model. RETURNS (tuple): The final nlp object and the path to the exported model.
""" """
# We use no_print here so we can respect the stdout/stderr options. # We use no_print here so we can respect the stdout/stderr options.
msg = Printer(no_print=True) msg = Printer(no_print=True)
@ -105,17 +105,18 @@ def train(
raise e raise e
finally: finally:
finalize_logger() finalize_logger()
if optimizer.averages:
nlp.use_params(optimizer.averages)
if output_path is not None: if output_path is not None:
final_model_path = output_path / DIR_MODEL_LAST final_model_path = output_path / DIR_MODEL_LAST
if optimizer.averages: nlp.to_disk(final_model_path)
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
else:
nlp.to_disk(final_model_path)
# This will only run if we don't hit an error # This will only run if we don't hit an error
stdout.write( stdout.write(
msg.good("Saved pipeline to output directory", final_model_path) + "\n" msg.good("Saved pipeline to output directory", final_model_path) + "\n"
) )
return (nlp, final_model_path)
else:
return (nlp, None)
def train_while_improving( def train_while_improving(

View File

@ -1,22 +1,16 @@
from typing import Optional, Callable, Iterable, Union, List from typing import Optional, Callable, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance from thinc.api import set_dropout_rate
from pathlib import Path from pathlib import Path
from functools import partial
from collections import Counter from collections import Counter
import srsly import srsly
import numpy
import time import time
import re import re
from wasabi import Printer from wasabi import Printer
from .example import Example from .example import Example
from ..tokens import Doc from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..errors import Errors
from ..util import registry, load_model_from_config, dot_to_object from ..util import registry, load_model_from_config, dot_to_object
@ -49,6 +43,7 @@ def pretrain(
else: else:
# Without '--resume-path' the '--epoch-resume' argument is ignored # Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0 epoch_resume = 0
objective = model.attrs["loss"]
# TODO: move this to logger function? # TODO: move this to logger function?
tracker = ProgressTracker(frequency=10000) tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@ -69,7 +64,6 @@ def pretrain(
with (output_dir / "log.jsonl").open("a") as file_: with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n") file_.write(srsly.json_dumps(log) + "\n")
objective = create_objective(P["objective"])
# TODO: I think we probably want this to look more like the # TODO: I think we probably want this to look more like the
# 'create_train_batches' function? # 'create_train_batches' function?
for epoch in range(epoch_resume, P["max_epochs"]): for epoch in range(epoch_resume, P["max_epochs"]):
@ -132,58 +126,6 @@ def make_update(
return float(loss) return float(loss)
def create_objective(config: Config):
    """Create the objective for pretraining.

    We'd like to replace this with a registry function but it's tricky because
    we're also making a model choice based on this. For now we hard-code support
    for two types (characters, vectors). For characters you can specify
    n_characters, for vectors you can specify the loss.

    config (Config): The objective settings. "type" is always read; for
        "characters" the "n_characters" value is read, for "vectors" the
        "loss" value ("cosine" or "L2").
    RETURNS (Callable): get_characters_loss or get_vectors_loss with the
        configured setting bound via functools.partial.
    RAISES (ValueError): If the objective type or the loss name is not one of
        the supported values.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    if objective_type != "vectors":
        raise ValueError(Errors.E907.format(objective_type=objective_type))
    loss_name = config["loss"]
    if loss_name == "cosine":
        distance = CosineDistance(normalize=True, ignore_zeros=True)
    elif loss_name == "L2":
        # L2Distance only takes `normalize`; passing `ignore_zeros` (a
        # CosineDistance option) made this branch fail with a TypeError.
        distance = L2Distance(normalize=True)
    else:
        raise ValueError(Errors.E906.format(loss_type=loss_name))
    return partial(get_vectors_loss, distance=distance)
def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss from the distance between the prediction and the
    vectors of the tokens in the docs.

    ops: Backend object; only its flatten method is used here.
    docs: The documents; the vectors table is taken from docs[0].vocab.
    prediction: The model output that is compared to the target vectors.
    distance: Callable invoked as distance(prediction, target); its first
        return value is used as the gradient, its second as the loss.
    RETURNS (tuple): The loss and the gradient, in that order.
    """
    # Stacking token.vector values would be the simplest implementation, but
    # that's a bit inefficient, especially on GPU. Fetching each token's index
    # into the vectors table and doing one lookup prevents data copying.
    per_doc_ids = [doc.to_array(ID).ravel() for doc in docs]
    row_indices = ops.flatten(per_doc_ids)
    vectors_table = docs[0].vocab.vectors.data
    target = vectors_table[row_indices]
    gradient, loss = distance(prediction, target)
    return loss, gradient
def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a squared-error loss between the prediction and the one-hot
    encoded UTF-8 values obtained from the docs.

    ops: Backend object; only its asarray method is used here.
    docs: The documents; each provides its target via to_utf8_array.
    prediction: The model output, compared against a target reshaped to
        (-1, 256 * nr_char).
    nr_char (int): Number of characters requested from to_utf8_array.
    RETURNS (tuple): The summed squared difference and the difference divided
        by prediction.shape[0], in that order.
    """
    utf8_rows = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
    flat_ids = numpy.vstack(utf8_rows).reshape((-1,))
    # One class per possible byte value, hence 256.
    one_hot = ops.asarray(to_categorical(flat_ids, n_classes=256), dtype="f")
    target = one_hot.reshape((-1, 256 * nr_char))
    residual = prediction - target
    loss = (residual ** 2).sum()
    return loss, residual / float(prediction.shape[0])
def create_pretraining_model(nlp, pretrain_config): def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto """Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that the tok2vec input model. The tok2vec input model needs to be a model that
@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
The actual tok2vec layer is stored as a reference, and only this bit will be The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command. serialized to file and read back in when calling the 'train' command.
""" """
nlp.initialize()
component = nlp.get_pipe(pretrain_config["component"]) component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"): if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"]) tok2vec = component.model.get_ref(pretrain_config["layer"])
else: else:
tok2vec = component.model tok2vec = component.model
# TODO create_function = pretrain_config["objective"]
maxout_pieces = 3 model = create_function(nlp.vocab, tok2vec)
hidden_size = 300
if pretrain_config["objective"]["type"] == "vectors":
model = build_cloze_multi_task_model(
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
)
elif pretrain_config["objective"]["type"] == "characters":
model = build_cloze_characters_multi_task_model(
nlp.vocab,
tok2vec,
hidden_size=hidden_size,
maxout_pieces=maxout_pieces,
nr_char=pretrain_config["objective"]["n_characters"],
)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
set_dropout_rate(model, pretrain_config["dropout"]) set_dropout_rate(model, pretrain_config["dropout"])
return model return model

View File

@ -171,6 +171,25 @@ and _before_ loading any pipelines.
| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ | | `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
| **RETURNS** | `True` ~~bool~~ | | **RETURNS** | `True` ~~bool~~ |
### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"}
Allocate data and perform operations on CPU.
If data has already been allocated on GPU, it will not
be moved. Ideally, this function should be called right after importing spaCy
and _before_ loading any pipelines.
> #### Example
>
> ```python
> import spacy
> spacy.require_cpu()
> nlp = spacy.load("en_core_web_sm")
> ```
| Name | Description |
| ----------- | ------------------------------------------------ |
| **RETURNS** | `True` ~~bool~~ |
## displaCy {#displacy source="spacy/displacy"} ## displaCy {#displacy source="spacy/displacy"}
As of v2.0, spaCy comes with a built-in visualization suite. For more info and As of v2.0, spaCy comes with a built-in visualization suite. For more info and

View File

@ -158,29 +158,37 @@ The other way to install spaCy is to clone its
source. That is the common way if you want to make changes to the code base. source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development environment consisting of a You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler, Python distribution including header files, a compiler,
[pip](https://pip.pypa.io/en/latest/installing/), [pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed.
[virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com) The compiler part is the trickiest. How to do that depends on your system. See
installed. The compiler part is the trickiest. How to do that depends on your notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
[Windows](#source-windows) for details. [Windows](#source-windows) for details.
```bash ```bash
$ python -m pip install -U pip # update pip $ python -m pip install -U pip setuptools wheel # install/update build tools
$ git clone https://github.com/explosion/spaCy # clone spaCy $ git clone https://github.com/explosion/spaCy # clone spaCy
$ cd spaCy # navigate into dir $ cd spaCy # navigate into dir
$ python -m venv .env # create environment in .env $ python -m venv .env # create environment in .env
$ source .env/bin/activate # activate virtual env $ source .env/bin/activate # activate virtual env
$ export PYTHONPATH=`pwd` # set Python path to spaCy dir $ pip install . # compile and install spaCy
$ pip install -r requirements.txt # install all requirements
$ python setup.py build_ext --inplace # compile spaCy
$ python setup.py install # install spaCy
``` ```
Compared to regular install via pip, the To install with extras:
[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs
developer dependencies such as Cython. See the [quickstart widget](#quickstart) ```bash
to get the right commands for your platform and Python version. $ pip install .[lookups,cuda102] # install spaCy with extras
```
To install all dependencies required for development:
```bash
$ pip install -r requirements.txt
```
Compared to a regular install via pip, the
[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes
developer dependencies such as Cython and the libraries required to run the test
suite. See the [quickstart widget](#quickstart) to get the right commands for
your platform and Python version.
<a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a> <a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>
@ -195,6 +203,32 @@ to get the right commands for your platform and Python version.
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/) [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
that matches the version that was used to compile your Python interpreter. that matches the version that was used to compile your Python interpreter.
#### Additional options for developers {#source-developers}
Some additional options may be useful for spaCy developers who are editing the
source code and recompiling frequently.
- Install in editable mode. Changes to `.py` files will be reflected as soon as
the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
the `pip install` or `python setup.py build_ext` command below to be run
again. Before installing in editable mode, be sure you have removed any
previous installs with `pip uninstall spacy`, which you may need to run
multiple times to remove all traces of earlier installs.
```bash
$ pip install -r requirements.txt
$ pip install --no-build-isolation --editable .
```
- Build in parallel using `N` CPUs to speed up compilation and then install in
editable mode:
```bash
$ pip install -r requirements.txt
$ python setup.py build_ext --inplace -j N
$ pip install --no-build-isolation --editable .
```
### Building an executable {#executable} ### Building an executable {#executable}
The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that