Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-14 02:20:34 +03:00)

Commit bf6992c2dd: Merge branch 'develop' into nightly.spacy.io

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0rc0,<8.1.0",
+    "thinc>=8.0.0rc2,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy"
 ]

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0rc0,<8.1.0
+thinc>=8.0.0rc2,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0

@@ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu  # noqa: F401
+from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
 from thinc.api import Config

 from . import pipeline  # noqa: F401

@@ -17,7 +17,9 @@ tolerance = 0.2
 get_length = null

 [pretraining.objective]
-type = "characters"
+@architectures = "spacy.PretrainCharacters.v1"
+maxout_pieces = 3
+hidden_size = 300
 n_characters = 4

 [pretraining.optimizer]

@@ -484,8 +484,8 @@ class Errors:
             "has been applied.")
     E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
             "dimension refers to the width of the vectors table.")
-    E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
-    E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
+    E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
+            "are: {supported}")
     E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
     E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
     E910 = ("Encountered NaN value when computing loss for component '{name}'.")

@@ -1,4 +1,5 @@
 from .entity_linker import *  # noqa
+from .multi_task import *  # noqa
 from .parser import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa

@@ -1,7 +1,14 @@
-from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
-import numpy
+from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
+from thinc.api import to_categorical, CosineDistance, L2Distance

 from ...util import registry
+from ...errors import Errors
 from ...attrs import ID
+
+import numpy
+from functools import partial

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports

@@ -9,6 +16,74 @@ if TYPE_CHECKING:
     from ...tokens import Doc  # noqa: F401


+@registry.architectures.register("spacy.PretrainVectors.v1")
+def create_pretrain_vectors(
+    maxout_pieces: int, hidden_size: int, loss: str
+) -> Callable[["Vocab", Model], Model]:
+    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_multi_task_model(
+            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+        model.attrs["loss"] = create_vectors_loss()
+        return model
+
+    def create_vectors_loss() -> Callable:
+        if loss == "cosine":
+            distance = CosineDistance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        elif loss == "L2":
+            distance = L2Distance(normalize=True)
+            return partial(get_vectors_loss, distance=distance)
+        else:
+            raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
+
+    return create_vectors_objective
+
+
+@registry.architectures.register("spacy.PretrainCharacters.v1")
+def create_pretrain_characters(
+    maxout_pieces: int, hidden_size: int, n_characters: int
+) -> Callable[["Vocab", Model], Model]:
+    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_characters_multi_task_model(
+            vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=n_characters,
+        )
+        model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
+        return model
+
+    return create_characters_objective
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
 def build_multi_task_model(
     tok2vec: Model,
     maxout_pieces: int,

@@ -33,23 +108,19 @@ def build_multi_task_model(


 def build_cloze_multi_task_model(
-    vocab: "Vocab",
-    tok2vec: Model,
-    maxout_pieces: int,
-    hidden_size: int,
-    nO: Optional[int] = None,
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    # nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         list2array(),
         Maxout(
-            nO=nO,
+            nO=hidden_size,
             nI=tok2vec.get_dim("nO"),
             nP=maxout_pieces,
             normalize=True,
             dropout=0.0,
         ),
-        Linear(nO=nO, nI=nO, init_W=zero_init),
+        Linear(nO=nO, nI=hidden_size, init_W=zero_init),
     )
     model = chain(tok2vec, output_layer)
     model = build_masked_language_model(vocab, model)

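For reference, a minimal sketch of how the newly registered vectors objective could be selected in a user config. The block name and parameters come from `create_pretrain_vectors` above; the values shown are illustrative, not defaults shipped with spaCy:

```ini
[pretraining.objective]
@architectures = "spacy.PretrainVectors.v1"
maxout_pieces = 3
hidden_size = 300
loss = "cosine"
```
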
@@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-
-    # TODO: use a more detailed schema for this?
-    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
+    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on

     class Config:

@@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.language import Language, DEFAULT_CONFIG
-from spacy.util import registry, load_model_from_config
+from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
-from spacy.schemas import ConfigSchema
+from spacy.schemas import ConfigSchema, ConfigSchemaPretrain

 from ..util import make_tempdir


 nlp_config_string = """
 [paths]
 train = null

@@ -63,6 +63,59 @@ factory = "tagger"
 width = ${components.tok2vec.model.width}
 """

+pretrain_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[pretraining]
+"""
+
+
 parser_config_string = """
 [model]

@@ -126,6 +179,14 @@ def test_create_nlp_from_config():
         load_model_from_config(Config(bad_cfg), auto_fill=True)


+def test_create_nlp_from_pretraining_config():
+    """Test that the default pretraining config validates properly"""
+    config = Config().from_str(pretrain_config_string)
+    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = config.merge(pretrain_config)
+    resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
+
+
 def test_create_nlp_from_config_multiple_instances():
     """Test that the nlp object is created correctly for a config with multiple
     instances of the same component."""

@@ -4,7 +4,7 @@ import ctypes
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
-from spacy import prefer_gpu, require_gpu
+from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList

@@ -15,6 +15,8 @@ from spacy.lang.nl import Dutch
 from spacy.language import DEFAULT_CONFIG_PATH
 from spacy.schemas import ConfigSchemaTraining

+from thinc.api import get_current_ops, NumpyOps, CupyOps
+
 from .util import get_random_doc


@@ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 def test_prefer_gpu():
     try:
         import cupy  # noqa: F401
+
+        prefer_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         assert not prefer_gpu()

@@ -88,10 +92,24 @@ def test_prefer_gpu():
 def test_require_gpu():
     try:
         import cupy  # noqa: F401
+
+        require_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         with pytest.raises(ValueError):
             require_gpu()


+def test_require_cpu():
+    require_cpu()
+    assert isinstance(get_current_ops(), NumpyOps)
+    try:
+        import cupy  # noqa: F401
+
+        require_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
+    except ImportError:
+        pass
+    require_cpu()
+    assert isinstance(get_current_ops(), NumpyOps)
+
+
 def test_ascii_filenames():
     """Test that all filenames in the project are ASCII.

@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English


 def test_tokenizer_handles_no_word(tokenizer):

@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]


+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])

@@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
         ),
         ([" ", "a"], ["a"], ([[], [0]], [[1]])),
+        (
+            ["a", "''", "'", ","],
+            ["a'", "''", ","],
+            ([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
+        ),
     ],
 )
 def test_align(tokens_a, tokens_b, expected):  # noqa

@@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]

     # multiple leading whitespace tokens

@@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]

     # both with leading whitespace, not identical

@@ -338,7 +338,7 @@ cdef class Tokenizer:
                 # Copy special case tokens into doc and adjust token and
                 # character offsets
                 idx_offset = 0
-                orig_final_spacy = doc.c[span_end + offset - 1].spacy
+                orig_final_spacy = doc.c[span_end - 1].spacy
                 orig_idx = doc.c[i].idx
                 for j in range(cached.length):
                     tokens[i + offset + j] = cached.data.tokens[j]

@@ -7,8 +7,8 @@ from ..errors import Errors

 def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
     # Create character-to-token mappings
-    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
-    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
     str_a = "".join(A).lower()
     str_b = "".join(B).lower()
     cdef int len_str_a = len(str_a)

@@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
         if prev_token_idx_b != token_idx_b:
             b2a.append(set())
         # Process the alignment at the current position
-        if A[token_idx_a] == B[token_idx_b]:
-            # Current tokens are identical
+        if A[token_idx_a] == B[token_idx_b] and \
+                (char_idx_a == 0 or \
+                char_to_token_a[char_idx_a - 1] < token_idx_a) and \
+                (char_idx_b == 0 or \
+                char_to_token_b[char_idx_b - 1] < token_idx_b):
+            # Current tokens are identical and both character offsets are the
+            # start of a token (either at the beginning of the document or the
+            # previous character belongs to a different token)
             a2b[-1].add(token_idx_b)
             b2a[-1].add(token_idx_a)
             char_idx_a += len(A[token_idx_a])

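A minimal sketch of the behavior the stricter token-start check above enables, using the `get_alignments` signature shown in this hunk; the `spacy.training.align` import path is assumed for illustration:

```python
from spacy.training.align import get_alignments  # assumed import path

# Token pair taken from the new parametrized test case added above
a2b, b2a = get_alignments(["a", "''", "'", ","], ["a'", "''", ","])
# Per that test case, the expected alignment is
# a2b == [[0], [0, 1], [1], [2]] and b2a == [[0, 1], [1, 2], [3]]:
# "''" only aligns to "''" when both character offsets sit at a token start.
print(a2b, b2a)
```
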
@@ -28,7 +28,7 @@ def train(
     use_gpu: int = -1,
     stdout: IO = sys.stdout,
     stderr: IO = sys.stderr,
-) -> None:
+) -> Tuple["Language", Optional[Path]]:
     """Train a pipeline.

     nlp (Language): The initialized nlp object with the full config.

@@ -40,7 +40,7 @@ def train(
     stderr (file): A second file-like object to write output messages. To disable
         printing, set to io.StringIO.

-    RETURNS (Path / None): The path to the final exported model.
+    RETURNS (tuple): The final nlp object and the path to the exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
     msg = Printer(no_print=True)

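A hedged sketch of consuming the new return value. Only `nlp`, `use_gpu`, `stdout` and `stderr` appear in this diff, so the `output_path` argument name below is assumed for illustration:

```python
# nlp is an initialized Language object with a full config (see docstring above)
nlp, final_model_path = train(nlp, output_path, use_gpu=-1)
if final_model_path is not None:
    print(f"Last trained pipeline saved to {final_model_path}")
```
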
@@ -105,17 +105,18 @@ def train(
             raise e
     finally:
         finalize_logger()
+        if optimizer.averages:
+            nlp.use_params(optimizer.averages)
         if output_path is not None:
             final_model_path = output_path / DIR_MODEL_LAST
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
+            nlp.to_disk(final_model_path)
             # This will only run if we don't hit an error
             stdout.write(
                 msg.good("Saved pipeline to output directory", final_model_path) + "\n"
             )
+            return (nlp, final_model_path)
+        else:
+            return (nlp, None)


 def train_while_improving(

@@ -1,22 +1,16 @@
 from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
-from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from thinc.api import set_dropout_rate
 from pathlib import Path
-from functools import partial
 from collections import Counter
 import srsly
-import numpy
 import time
 import re
 from wasabi import Printer

 from .example import Example
 from ..tokens import Doc
-from ..attrs import ID
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..errors import Errors
 from ..util import registry, load_model_from_config, dot_to_object


@@ -49,6 +43,7 @@ def pretrain(
     else:
         # Without '--resume-path' the '--epoch-resume' argument is ignored
         epoch_resume = 0
+    objective = model.attrs["loss"]
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")

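A hedged sketch of how the loss callable stored on the model is typically applied during an update; `make_update` itself is not shown in this diff, so the call below is illustrative only:

```python
# The objective is a partial of get_vectors_loss / get_characters_loss,
# so it takes (ops, docs, prediction) and returns (loss, d_target).
predictions, backprop = model.begin_update(docs)
loss, gradients = objective(model.ops, docs, predictions)
backprop(gradients)
```
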
@@ -69,7 +64,6 @@ def pretrain(
         with (output_dir / "log.jsonl").open("a") as file_:
             file_.write(srsly.json_dumps(log) + "\n")

-    objective = create_objective(P["objective"])
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
     for epoch in range(epoch_resume, P["max_epochs"]):

@@ -132,58 +126,6 @@ def make_update(
     return float(loss)


-def create_objective(config: Config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            distance = CosineDistance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        elif config["loss"] == "L2":
-            distance = L2Distance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        else:
-            raise ValueError(Errors.E906.format(loss_type=config["loss"]))
-    else:
-        raise ValueError(Errors.E907.format(objective_type=objective_type))
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
 def create_pretraining_model(nlp, pretrain_config):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that

@@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
     The actual tok2vec layer is stored as a reference, and only this bit will be
     serialized to file and read back in when calling the 'train' command.
     """
     nlp.initialize()
     component = nlp.get_pipe(pretrain_config["component"])
     if pretrain_config.get("layer"):
         tok2vec = component.model.get_ref(pretrain_config["layer"])
     else:
         tok2vec = component.model

-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
+    create_function = pretrain_config["objective"]
+    model = create_function(nlp.vocab, tok2vec)
     model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
     set_dropout_rate(model, pretrain_config["dropout"])
     return model

@@ -171,6 +171,25 @@ and _before_ loading any pipelines.
 | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
 | **RETURNS** | `True` ~~bool~~                                  |

+### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"}
+
+Allocate data and perform operations on CPU. If data has already been allocated
+on GPU, it will not be moved. Ideally, this function should be called right
+after importing spaCy and _before_ loading any pipelines.
+
+> #### Example
+>
+> ```python
+> import spacy
+> spacy.require_cpu()
+> nlp = spacy.load("en_core_web_sm")
+> ```
+
+| Name        | Description     |
+| ----------- | --------------- |
+| **RETURNS** | `True` ~~bool~~ |
+
 ## displaCy {#displacy source="spacy/displacy"}

 As of v2.0, spaCy comes with a built-in visualization suite. For more info and

@@ -158,29 +158,37 @@ The other way to install spaCy is to clone its
 source. That is the common way if you want to make changes to the code base.
 You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
-[pip](https://pip.pypa.io/en/latest/installing/),
-[virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com)
-installed. The compiler part is the trickiest. How to do that depends on your
-system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
+[pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed.
+The compiler part is the trickiest. How to do that depends on your system. See
+notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
 [Windows](#source-windows) for details.

 ```bash
-$ python -m pip install -U pip # update pip
+$ python -m pip install -U pip setuptools wheel # install/update build tools
 $ git clone https://github.com/explosion/spaCy # clone spaCy
 $ cd spaCy # navigate into dir

 $ python -m venv .env # create environment in .env
 $ source .env/bin/activate # activate virtual env
-$ export PYTHONPATH=`pwd` # set Python path to spaCy dir
-$ pip install -r requirements.txt # install all requirements
-$ python setup.py build_ext --inplace # compile spaCy
-$ python setup.py install # install spaCy
+$ pip install . # compile and install spaCy
 ```

-Compared to regular install via pip, the
-[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs
-developer dependencies such as Cython. See the [quickstart widget](#quickstart)
-to get the right commands for your platform and Python version.
+To install with extras:
+
+```bash
+$ pip install .[lookups,cuda102] # install spaCy with extras
+```
+
+To install all dependencies required for development:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Compared to a regular install via pip, the
+[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes
+developer dependencies such as Cython and the libraries required to run the test
+suite. See the [quickstart widget](#quickstart) to get the right commands for
+your platform and Python version.

 <a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>

@@ -195,6 +203,32 @@ to get the right commands for your platform and Python version.
 [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
 that matches the version that was used to compile your Python interpreter.

+#### Additional options for developers {#source-developers}
+
+Some additional options may be useful for spaCy developers who are editing the
+source code and recompiling frequently.
+
+- Install in editable mode. Changes to `.py` files will be reflected as soon as
+  the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
+  the `pip install` or `python setup.py build_ext` command below to be run
+  again. Before installing in editable mode, be sure you have removed any
+  previous installs with `pip uninstall spacy`, which you may need to run
+  multiple times to remove all traces of earlier installs.
+
+  ```bash
+  $ pip install -r requirements.txt
+  $ pip install --no-build-isolation --editable .
+  ```
+
+- Build in parallel using `N` CPUs to speed up compilation and then install in
+  editable mode:
+
+  ```bash
+  $ pip install -r requirements.txt
+  $ python setup.py build_ext --inplace -j N
+  $ pip install --no-build-isolation --editable .
+  ```
+
 ### Building an executable {#executable}

 The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that