Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-01 18:33:12 +03:00)

Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

Commit e957d66b92

@@ -19,13 +19,18 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize.vocab] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
-    convert_vectors(
-        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
-    )
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "

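The docstring added above points users at the [initialize.vocab] block. As a rough, illustrative sketch of that wiring (not taken from this commit; the "vectors" key mirrors the V["vectors"] lookup in the Language.initialize hunk below, and the path is a placeholder):

    import spacy

    nlp = spacy.blank("en")
    # Assumed layout: an [initialize.vocab] section with a "vectors" entry that
    # points at the directory written by the vectors CLI command above.
    nlp.config["initialize"]["vocab"] = {"vectors": "./my_vectors_dir"}
    nlp.initialize()
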
@@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER

@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about

@@ -1161,7 +1163,6 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.

@@ -1198,28 +1199,38 @@ class Language:
             if not valid_examples:
                 err = Errors.E930.format(name="Language", obj="empty list")
                 raise ValueError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        V = I["vocab"]
+        init_vocab(
+            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, V)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        self._optimizer = sgd
         if hasattr(self.tokenizer, "initialize"):
-            tok_settings = settings.get("tokenizer", {})
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,
-                tok_settings,
+                I["tokenizer"],
                 section="tokenizer",
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = proc_settings.get(name, {})
+                p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
+        self._optimizer = sgd
         if sgd is not None:
             self._optimizer = sgd
         elif self._optimizer is None:

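A minimal sketch of the calling convention this hunk switches to (the component name and its argument are illustrative, and the pattern mirrors the test change further down): tokenizer and per-component arguments are no longer passed via a settings= keyword but are read from the resolved [initialize] block of nlp.config:

    import spacy

    nlp = spacy.blank("en")
    # "my_component" and "custom1" are placeholders for a factory registered the
    # way the test below registers its Component.
    nlp.config["initialize"].update({"components": {"my_component": {"custom1": "x"}}})
    # Before this change, the same values were passed directly:
    #     nlp.initialize(get_examples, settings={"components": {"my_component": {"custom1": "x"}}})
    nlp.initialize()
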
@@ -9,34 +9,61 @@ from pydantic import StrictBool
 def test_initialize_arguments():
     name = "test_initialize_arguments"
 
+    class CustomTokenizer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.from_initialize = None
+
+        def __call__(self, text):
+            return self.tokenizer(text)
+
+        def initialize(self, get_examples, nlp, custom: int):
+            self.from_initialize = custom
+
     class Component:
         def __init__(self):
-            ...
+            self.from_initialize = None
 
         def initialize(
             self, get_examples, nlp, custom1: str, custom2: StrictBool = False
         ):
-            ...
+            self.from_initialize = (custom1, custom2)
 
     Language.factory(name, func=lambda nlp, name: Component())
 
     nlp = English()
+    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
     example = Example.from_dict(nlp("x"), {})
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Empty settings, no required custom1 argument
-        nlp.initialize(get_examples, settings={"components": {name: {}}})
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Wrong type
-        settings = {"components": {name: {"custom1": "x", "custom2": 1}}}
-        nlp.initialize(get_examples, settings=settings)
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": True}},
+    }
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
+    assert nlp.tokenizer.from_initialize == 1
+    pipe = nlp.get_pipe(name)
+    assert pipe.from_initialize == ("x", True)

@@ -1,8 +1,7 @@
-from typing import Union, Dict, Optional, Any, List, IO
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
-from wasabi import Printer
 import srsly
 import numpy
 import tarfile

@@ -11,17 +10,18 @@ import zipfile
 import tqdm
 
 from .loop import create_before_to_disk_callback
-from ..language import Language
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..schemas import ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
-    msg = Printer(no_print=silent)
+def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:

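The TYPE_CHECKING import added here (and in the training loop module further down) is the standard way to keep a type-only dependency on Language without importing it at runtime, which avoids a circular import between the training modules and the module that defines Language. A generic sketch of the pattern, with a hypothetical module path and helper name:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by static type checkers, never executed at runtime,
        # so it cannot trigger a circular import.
        from mypackage.language import Language  # noqa: F401

    def training_helper(nlp: "Language") -> None:
        # The quoted annotation is not resolved at runtime, so the real
        # Language class is never needed when this module is imported.
        ...
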
@@ -32,39 +32,29 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
+    logger.info("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(
-        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
-    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
+            logger.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
-        msg.good("Initialized pipeline components")
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
+        logger.info("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, V)
-        if loaded and P["component"]:
-            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
 

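With wasabi's Printer and the silent= flags removed, init_nlp and the helpers below report progress through the shared logger instead. A small sketch of how a caller might surface those messages (assuming the logger imported from ..util is the standard library logger that spaCy exposes as spacy.util.logger, as the util.logger.setLevel call in the CLI hunk suggests):

    import logging

    import spacy

    logging.basicConfig(level=logging.INFO)
    spacy.util.logger.setLevel(logging.INFO)
    # Calls that go through init_nlp / init_vocab now emit messages such as
    # "Set up nlp object from config" and "Initialized pipeline components"
    # via logging instead of printing them directly.
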
@@ -75,17 +65,15 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 
 
 def init_vocab(
-    nlp: Language,
+    nlp: "Language",
     *,
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent)
+) -> "Language":
     if lookups:
         nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)

@@ -101,15 +89,15 @@ def init_vocab(
         else:
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
+        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
+        logger.info(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
-    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:

@@ -132,8 +120,8 @@ def load_vectors_into_model(
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config

@@ -171,7 +159,7 @@ def add_tok2vec_weights(
     return False
 
 
-def verify_config(nlp: Language) -> None:
+def verify_config(nlp: "Language") -> None:
     """Perform additional checks based on the config, loaded nlp object and training data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?

@@ -182,7 +170,7 @@ def verify_config(nlp: Language) -> None:
             verify_textcat_config(nlp, pipe_config)
 
 
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):

@@ -211,15 +199,13 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 
 
 def convert_vectors(
-    nlp: Language,
+    nlp: "Language",
     vectors_loc: Optional[Path],
     *,
     truncate: int,
     prune: int,
     name: Optional[str] = None,
-    silent: bool = True,
 ) -> None:
-    msg = Printer(no_print=silent)
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))

@@ -228,9 +214,9 @@ def convert_vectors(
                 nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
-            msg.good(f"Loaded vectors from {vectors_loc}")
+            logger.info(f"Reading vectors from {vectors_loc}")
+            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            logger.info(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
         if vector_keys is not None:

@@ -247,7 +233,6 @@ def convert_vectors(
         nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1:
         nlp.vocab.prune_vectors(prune)
-    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 
 
 def read_vectors(vectors_loc: Path, truncate_vectors: int):

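The convert_vectors signature above no longer takes silent=, matching the updated call in the CLI hunk at the top, and the success message now lives with the CLI caller. A hedged call sketch (the import path assumes this file is spacy.training.initialize, which the relative imports suggest but the diff does not name; vectors.txt is a placeholder for a word2vec-style text file):

    import spacy
    from spacy.training.initialize import convert_vectors

    nlp = spacy.blank("en")
    convert_vectors(nlp, "vectors.txt", truncate=0, prune=-1, name=None)
    print(f"Converted {len(nlp.vocab.vectors)} vectors")
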
@@ -1,5 +1,5 @@
 from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator

@@ -9,13 +9,15 @@ from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
-from ..language import Language
 from ..errors import Errors
 from ..util import resolve_dot_names, registry
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
 def train(
-    nlp: Language,
+    nlp: "Language",
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,

@@ -110,7 +112,7 @@ def train(
 
 
 def train_while_improving(
-    nlp: Language,
+    nlp: "Language",
     optimizer: Optimizer,
     train_data,
     evaluate,

@@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
 
 
 def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
     weights = {key: value for key, value in weights.items() if value is not None}
 

@@ -277,7 +279,7 @@ def create_train_batches(
 
 
 def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:

@@ -288,8 +290,10 @@ def update_meta(
 
 
 def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    from ..language import Language  # noqa: F811
+
     def before_to_disk(nlp: Language) -> Language:
         if not callback:
             return nlp