Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

Matthew Honnibal 2020-09-29 16:22:53 +02:00
commit e957d66b92
5 changed files with 99 additions and 67 deletions

View File

@@ -19,13 +19,18 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize.vocab] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
-    convert_vectors(
-        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
-    )
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "

View File

@@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1161,7 +1163,6 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1198,28 +1199,38 @@
             if not valid_examples:
                 err = Errors.E930.format(name="Language", obj="empty list")
                 raise ValueError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        V = I["vocab"]
+        init_vocab(
+            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, V)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        self._optimizer = sgd
         if hasattr(self.tokenizer, "initialize"):
-            tok_settings = settings.get("tokenizer", {})
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,
-                tok_settings,
+                I["tokenizer"],
                 section="tokenizer",
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = proc_settings.get(name, {})
+                p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
-        self._optimizer = sgd
         if sgd is not None:
             self._optimizer = sgd
         elif self._optimizer is None:
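With the settings argument gone from Language.initialize, tokenizer and per-component settings are resolved from the [initialize] block of the pipeline config instead. A minimal sketch of the new calling pattern (the component name and its settings are hypothetical, mirroring the test below):

import spacy

nlp = spacy.blank("en")
# Hypothetical: assume "my_component" was registered via @Language.factory and
# defines initialize(self, get_examples, nlp, custom1: str, custom2: bool = False).
# nlp.add_pipe("my_component")
nlp.config["initialize"].update(
    {"components": {"my_component": {"custom1": "x", "custom2": True}}}
)
# Settings are now read from nlp.config["initialize"] instead of a `settings` kwarg:
nlp.initialize()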

View File

@@ -9,34 +9,61 @@ from pydantic import StrictBool
 def test_initialize_arguments():
     name = "test_initialize_arguments"

+    class CustomTokenizer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.from_initialize = None
+
+        def __call__(self, text):
+            return self.tokenizer(text)
+
+        def initialize(self, get_examples, nlp, custom: int):
+            self.from_initialize = custom
+
     class Component:
         def __init__(self):
-            ...
+            self.from_initialize = None

         def initialize(
             self, get_examples, nlp, custom1: str, custom2: StrictBool = False
         ):
-            ...
+            self.from_initialize = (custom1, custom2)

     Language.factory(name, func=lambda nlp, name: Component())
     nlp = English()
+    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
     example = Example.from_dict(nlp("x"), {})
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Empty settings, no required custom1 argument
-        nlp.initialize(get_examples, settings={"components": {name: {}}})
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Wrong type
-        settings = {"components": {name: {"custom1": "x", "custom2": 1}}}
-        nlp.initialize(get_examples, settings=settings)
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": True}},
+    }
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
+    assert nlp.tokenizer.from_initialize == 1
+    pipe = nlp.get_pipe(name)
+    assert pipe.from_initialize == ("x", True)
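The custom2 failure asserted above comes from pydantic's strict types: StrictBool rejects values that are merely truthy. A standalone sketch of the same behaviour, independent of spaCy (assuming pydantic v1, which spaCy used at the time):

from pydantic import BaseModel, StrictBool, ValidationError

class Settings(BaseModel):
    custom2: StrictBool = False

Settings(custom2=True)  # passes: a real bool
try:
    Settings(custom2=1)  # fails: 1 is truthy but not a bool under strict validation
except ValidationError as err:
    print(err.errors()[0]["type"])  # "value_error.strictbool" in pydantic v1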

View File

@@ -1,8 +1,7 @@
-from typing import Union, Dict, Optional, Any, List, IO
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
-from wasabi import Printer
 import srsly
 import numpy
 import tarfile
@@ -11,17 +10,18 @@ import zipfile
 import tqdm

 from .loop import create_before_to_disk_callback
-from ..language import Language
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..schemas import ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB

+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+

-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
-    msg = Printer(no_print=silent)
+def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -32,39 +32,29 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
+    logger.info("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(
-        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
-    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
+            logger.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    msg.good("Initialized pipeline components")
+    logger.info("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, V)
-        if loaded and P["component"]:
-            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
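Because the Printer/silent plumbing is removed throughout this file, these messages now go through spaCy's shared logger and stay silent unless logging is configured. A minimal sketch of making them visible again (the logger name "spacy" matches util.logger in spaCy, but treat it as an assumption):

import logging

# Enable INFO-level output so the logger.info(...) calls in this diff are shown.
logging.basicConfig(level=logging.INFO)
logging.getLogger("spacy").setLevel(logging.INFO)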
@@ -75,17 +65,15 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 def init_vocab(
-    nlp: Language,
+    nlp: "Language",
     *,
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent)
+) -> "Language":
     if lookups:
         nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
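For reference, the lookups argument consumed above is a spacy.lookups.Lookups object. A small self-contained sketch of building one (the table name and contents are made up for illustration):

from spacy.lang.en import English
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lexeme_norm", {"Dog": "dog"})  # hypothetical table
nlp = English()
nlp.vocab.lookups = lookups  # this assignment is what init_vocab does when lookups is set
print(", ".join(lookups.tables))  # -> lexeme_norm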
@@ -101,15 +89,15 @@ def init_vocab(
     else:
         oov_prob = DEFAULT_OOV_PROB
     nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
+    logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
+        logger.info(f"Added vectors: {vectors}")


 def load_vectors_into_model(
-    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -132,8 +120,8 @@ def load_vectors_into_model(
         nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
@@ -171,7 +159,7 @@ add_tok2vec_weights(
     return False


-def verify_config(nlp: Language) -> None:
+def verify_config(nlp: "Language") -> None:
     """Perform additional checks based on the config, loaded nlp object and training data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?
@@ -182,7 +170,7 @@ def verify_config(nlp: Language) -> None:
         verify_textcat_config(nlp, pipe_config)


-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):
@@ -211,15 +199,13 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 def convert_vectors(
-    nlp: Language,
+    nlp: "Language",
     vectors_loc: Optional[Path],
     *,
     truncate: int,
     prune: int,
     name: Optional[str] = None,
-    silent: bool = True,
 ) -> None:
-    msg = Printer(no_print=silent)
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -228,9 +214,9 @@ def convert_vectors(
             nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
-            msg.good(f"Loaded vectors from {vectors_loc}")
+            logger.info(f"Reading vectors from {vectors_loc}")
+            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            logger.info(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
         if vector_keys is not None:
@@ -247,7 +233,6 @@ def convert_vectors(
         nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1:
         nlp.vocab.prune_vectors(prune)
-    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")


 def read_vectors(vectors_loc: Path, truncate_vectors: int):
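With the silent keyword removed, convert_vectors can be called directly on an nlp object and reports progress through the logger. A rough usage sketch (the import path is assumed from this module's relative imports, and the vectors file path is made up):

from spacy.lang.en import English
from spacy.training.initialize import convert_vectors  # path assumed from this diff

nlp = English()
# Reads a vectors text file (or a .npz matrix) into nlp.vocab.vectors; progress
# messages go to the logger instead of being toggled by a silent= flag.
convert_vectors(nlp, "/path/to/vectors.txt", truncate=0, prune=-1, name="en_demo.vectors")
nlp.to_disk("/path/to/output")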

View File

@@ -1,5 +1,5 @@
 from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
@@ -9,13 +9,15 @@ from wasabi import Printer
 from .example import Example
 from ..schemas import ConfigSchemaTraining
-from ..language import Language
 from ..errors import Errors
 from ..util import resolve_dot_names, registry

+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+

 def train(
-    nlp: Language,
+    nlp: "Language",
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
@@ -110,7 +112,7 @@ def train(


 def train_while_improving(
-    nlp: Language,
+    nlp: "Language",
     optimizer: Optimizer,
     train_data,
     evaluate,
@@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):


 def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
     weights = {key: value for key, value in weights.items() if value is not None}
@@ -277,7 +279,7 @@ def create_train_batches(


 def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
@@ -288,8 +290,10 @@ def update_meta(


 def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    from ..language import Language  # noqa: F811
+
     def before_to_disk(nlp: Language) -> Language:
         if not callback:
             return nlp
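The recurring change in this file and in initialize.py — replacing the top-level `from ..language import Language` with a TYPE_CHECKING guard and quoted annotations — is the standard way to keep type hints while breaking an import cycle at runtime. A generic, self-contained sketch of the pattern (module names are made up, not spaCy's):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers, so importing here cannot create
    # a circular import when the module is executed.
    from mypackage.language import Language  # noqa: F401


def train(nlp: "Language") -> "Language":
    # The quoted ("forward reference") annotation is resolved lazily, so the
    # real class is only looked up when a tool like mypy inspects the signature.
    return nlp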