mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-05 08:13:41 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
3b8f352eda
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a29"
|
__version__ = "3.0.0a30"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -3,6 +3,7 @@ from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code, setup_gpu
|
from ._util import import_code, setup_gpu
|
||||||
|
@ -39,7 +40,12 @@ def train_cli(
|
||||||
DOCS: https://nightly.spacy.io/api/cli#train
|
DOCS: https://nightly.spacy.io/api/cli#train
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
verify_cli_args(config_path, output_path)
|
# Make sure all files and paths exists if they are needed
|
||||||
|
if not config_path or not config_path.exists():
|
||||||
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
|
if output_path is not None and not output_path.exists():
|
||||||
|
output_path.mkdir()
|
||||||
|
msg.good(f"Created output directory: {output_path}")
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
setup_gpu(use_gpu)
|
setup_gpu(use_gpu)
|
||||||
|
@ -50,14 +56,4 @@ def train_cli(
|
||||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
msg.good("Initialized pipeline")
|
msg.good("Initialized pipeline")
|
||||||
msg.divider("Training pipeline")
|
msg.divider("Training pipeline")
|
||||||
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
|
|
||||||
# Make sure all files and paths exists if they are needed
|
|
||||||
if not config_path or not config_path.exists():
|
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
|
||||||
if output_path is not None:
|
|
||||||
if not output_path.exists():
|
|
||||||
output_path.mkdir()
|
|
||||||
msg.good(f"Created output directory: {output_path}")
|
|
||||||
|
|
|
@ -102,7 +102,7 @@ def load_vectors_into_model(
|
||||||
"with the packaged vectors. Make sure that the vectors package you're "
|
"with the packaged vectors. Make sure that the vectors package you're "
|
||||||
"loading is compatible with the current version of spaCy."
|
"loading is compatible with the current version of spaCy."
|
||||||
)
|
)
|
||||||
err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
|
err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
|
||||||
raise err from None
|
raise err from None
|
||||||
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
||||||
if add_strings:
|
if add_strings:
|
||||||
|
|
|
@ -1,18 +1,24 @@
|
||||||
from typing import Dict, Any, Tuple, Callable, List
|
from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
|
||||||
|
import wasabi
|
||||||
|
import tqdm
|
||||||
|
import sys
|
||||||
|
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
|
|
||||||
@registry.loggers("spacy.ConsoleLogger.v1")
|
@registry.loggers("spacy.ConsoleLogger.v1")
|
||||||
def console_logger():
|
def console_logger(progress_bar: bool=False):
|
||||||
def setup_printer(
|
def setup_printer(
|
||||||
nlp: "Language",
|
nlp: "Language",
|
||||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
stdout: IO=sys.stdout,
|
||||||
|
stderr: IO=sys.stderr
|
||||||
|
) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
|
||||||
|
msg = wasabi.Printer(no_print=True)
|
||||||
# we assume here that only components are enabled that should be trained & logged
|
# we assume here that only components are enabled that should be trained & logged
|
||||||
logged_pipes = nlp.pipe_names
|
logged_pipes = nlp.pipe_names
|
||||||
|
eval_frequency = nlp.config["training"]["eval_frequency"]
|
||||||
score_weights = nlp.config["training"]["score_weights"]
|
score_weights = nlp.config["training"]["score_weights"]
|
||||||
score_cols = [col for col, value in score_weights.items() if value is not None]
|
score_cols = [col for col, value in score_weights.items() if value is not None]
|
||||||
score_widths = [max(len(col), 6) for col in score_cols]
|
score_widths = [max(len(col), 6) for col in score_cols]
|
||||||
|
@ -22,10 +28,18 @@ def console_logger():
|
||||||
table_header = [col.upper() for col in table_header]
|
table_header = [col.upper() for col in table_header]
|
||||||
table_widths = [3, 6] + loss_widths + score_widths + [6]
|
table_widths = [3, 6] + loss_widths + score_widths + [6]
|
||||||
table_aligns = ["r" for _ in table_widths]
|
table_aligns = ["r" for _ in table_widths]
|
||||||
msg.row(table_header, widths=table_widths)
|
stdout.write(msg.row(table_header, widths=table_widths))
|
||||||
msg.row(["-" * width for width in table_widths])
|
stdout.write(msg.row(["-" * width for width in table_widths]))
|
||||||
|
progress = None
|
||||||
|
|
||||||
def log_step(info: Dict[str, Any]):
|
def log_step(info: Optional[Dict[str, Any]]):
|
||||||
|
nonlocal progress
|
||||||
|
|
||||||
|
if info is None:
|
||||||
|
# If we don't have a new checkpoint, just return.
|
||||||
|
if progress is not None:
|
||||||
|
progress.update(1)
|
||||||
|
return
|
||||||
try:
|
try:
|
||||||
losses = [
|
losses = [
|
||||||
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||||
|
@ -39,24 +53,37 @@ def console_logger():
|
||||||
keys=list(info["losses"].keys()),
|
keys=list(info["losses"].keys()),
|
||||||
)
|
)
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
scores = []
|
scores = []
|
||||||
for col in score_cols:
|
for col in score_cols:
|
||||||
score = info["other_scores"].get(col, 0.0)
|
score = info["other_scores"].get(col, 0.0)
|
||||||
try:
|
try:
|
||||||
score = float(score)
|
score = float(score)
|
||||||
if col != "speed":
|
|
||||||
score *= 100
|
|
||||||
scores.append("{0:.2f}".format(score))
|
|
||||||
except TypeError:
|
except TypeError:
|
||||||
err = Errors.E916.format(name=col, score_type=type(score))
|
err = Errors.E916.format(name=col, score_type=type(score))
|
||||||
raise ValueError(err) from None
|
raise ValueError(err) from None
|
||||||
|
if col != "speed":
|
||||||
|
score *= 100
|
||||||
|
scores.append("{0:.2f}".format(score))
|
||||||
|
|
||||||
data = (
|
data = (
|
||||||
[info["epoch"], info["step"]]
|
[info["epoch"], info["step"]]
|
||||||
+ losses
|
+ losses
|
||||||
+ scores
|
+ scores
|
||||||
+ ["{0:.2f}".format(float(info["score"]))]
|
+ ["{0:.2f}".format(float(info["score"]))]
|
||||||
)
|
)
|
||||||
msg.row(data, widths=table_widths, aligns=table_aligns)
|
if progress is not None:
|
||||||
|
progress.close()
|
||||||
|
stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
|
||||||
|
if progress_bar:
|
||||||
|
# Set disable=None, so that it disables on non-TTY
|
||||||
|
progress = tqdm.tqdm(
|
||||||
|
total=eval_frequency,
|
||||||
|
disable=None,
|
||||||
|
leave=False,
|
||||||
|
file=stderr
|
||||||
|
)
|
||||||
|
progress.set_description(f"Epoch {info['epoch']+1}")
|
||||||
|
|
||||||
def finalize():
|
def finalize():
|
||||||
pass
|
pass
|
||||||
|
@ -70,10 +97,12 @@ def console_logger():
|
||||||
def wandb_logger(project_name: str, remove_config_values: List[str] = []):
|
def wandb_logger(project_name: str, remove_config_values: List[str] = []):
|
||||||
import wandb
|
import wandb
|
||||||
|
|
||||||
console = console_logger()
|
console = console_logger(progress_bar=False)
|
||||||
|
|
||||||
def setup_logger(
|
def setup_logger(
|
||||||
nlp: "Language",
|
nlp: "Language",
|
||||||
|
stdout: IO=sys.stdout,
|
||||||
|
stderr: IO=sys.stderr
|
||||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
||||||
config = nlp.config.interpolate()
|
config = nlp.config.interpolate()
|
||||||
config_dot = util.dict_to_dot(config)
|
config_dot = util.dict_to_dot(config)
|
||||||
|
@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
|
||||||
del config_dot[field]
|
del config_dot[field]
|
||||||
config = util.dot_to_dict(config_dot)
|
config = util.dot_to_dict(config_dot)
|
||||||
wandb.init(project=project_name, config=config, reinit=True)
|
wandb.init(project=project_name, config=config, reinit=True)
|
||||||
console_log_step, console_finalize = console(nlp)
|
console_log_step, console_finalize = console(nlp, stdout, stderr)
|
||||||
|
|
||||||
def log_step(info: Dict[str, Any]):
|
def log_step(info: Optional[Dict[str, Any]]):
|
||||||
console_log_step(info)
|
console_log_step(info)
|
||||||
score = info["score"]
|
if info is not None:
|
||||||
other_scores = info["other_scores"]
|
score = info["score"]
|
||||||
losses = info["losses"]
|
other_scores = info["other_scores"]
|
||||||
wandb.log({"score": score})
|
losses = info["losses"]
|
||||||
if losses:
|
wandb.log({"score": score})
|
||||||
wandb.log({f"loss_{k}": v for k, v in losses.items()})
|
if losses:
|
||||||
if isinstance(other_scores, dict):
|
wandb.log({f"loss_{k}": v for k, v in losses.items()})
|
||||||
wandb.log(other_scores)
|
if isinstance(other_scores, dict):
|
||||||
|
wandb.log(other_scores)
|
||||||
|
|
||||||
def finalize():
|
def finalize():
|
||||||
console_finalize()
|
console_finalize()
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
|
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
|
||||||
from typing import Optional, TYPE_CHECKING
|
from typing import Optional, TYPE_CHECKING
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
|
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
|
||||||
import random
|
import random
|
||||||
import tqdm
|
import wasabi
|
||||||
from wasabi import Printer
|
import sys
|
||||||
|
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
|
@ -21,7 +21,8 @@ def train(
|
||||||
output_path: Optional[Path] = None,
|
output_path: Optional[Path] = None,
|
||||||
*,
|
*,
|
||||||
use_gpu: int = -1,
|
use_gpu: int = -1,
|
||||||
silent: bool = False,
|
stdout: IO=sys.stdout,
|
||||||
|
stderr: IO=sys.stderr
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Train a pipeline.
|
"""Train a pipeline.
|
||||||
|
|
||||||
|
@ -29,10 +30,15 @@ def train(
|
||||||
output_path (Path): Optional output path to save trained model to.
|
output_path (Path): Optional output path to save trained model to.
|
||||||
use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
|
use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
|
||||||
before calling this function.
|
before calling this function.
|
||||||
silent (bool): Whether to pretty-print outputs.
|
stdout (file): A file-like object to write output messages. To disable
|
||||||
|
printing, set to io.StringIO.
|
||||||
|
stderr (file): A second file-like object to write output messages. To disable
|
||||||
|
printing, set to io.StringIO.
|
||||||
|
|
||||||
RETURNS (Path / None): The path to the final exported model.
|
RETURNS (Path / None): The path to the final exported model.
|
||||||
"""
|
"""
|
||||||
msg = Printer(no_print=silent)
|
# We use no_print here so we can respect the stdout/stderr options.
|
||||||
|
msg = wasabi.Printer(no_print=True)
|
||||||
# Create iterator, which yields out info after each optimization step.
|
# Create iterator, which yields out info after each optimization step.
|
||||||
config = nlp.config.interpolate()
|
config = nlp.config.interpolate()
|
||||||
if config["training"]["seed"] is not None:
|
if config["training"]["seed"] is not None:
|
||||||
|
@ -63,50 +69,44 @@ def train(
|
||||||
eval_frequency=T["eval_frequency"],
|
eval_frequency=T["eval_frequency"],
|
||||||
exclude=frozen_components,
|
exclude=frozen_components,
|
||||||
)
|
)
|
||||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
|
||||||
if frozen_components:
|
if frozen_components:
|
||||||
msg.info(f"Frozen components: {frozen_components}")
|
stdout.write(msg.info(f"Frozen components: {frozen_components}"))
|
||||||
msg.info(f"Initial learn rate: {optimizer.learn_rate}")
|
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
with nlp.select_pipes(disable=frozen_components):
|
||||||
print_row, finalize_logger = train_logger(nlp)
|
log_step, finalize_logger = train_logger(nlp, stdout, stderr)
|
||||||
try:
|
try:
|
||||||
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
|
|
||||||
progress.set_description(f"Epoch 1")
|
|
||||||
for batch, info, is_best_checkpoint in training_step_iterator:
|
for batch, info, is_best_checkpoint in training_step_iterator:
|
||||||
progress.update(1)
|
log_step(info if is_best_checkpoint else None)
|
||||||
if is_best_checkpoint is not None:
|
if is_best_checkpoint is not None and output_path is not None:
|
||||||
progress.close()
|
with nlp.select_pipes(disable=frozen_components):
|
||||||
print_row(info)
|
update_meta(T, nlp, info)
|
||||||
if is_best_checkpoint and output_path is not None:
|
with nlp.use_params(optimizer.averages):
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
nlp = before_to_disk(nlp)
|
||||||
update_meta(T, nlp, info)
|
nlp.to_disk(output_path / "model-best")
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
nlp = before_to_disk(nlp)
|
|
||||||
nlp.to_disk(output_path / "model-best")
|
|
||||||
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
|
|
||||||
progress.set_description(f"Epoch {info['epoch']}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
finalize_logger()
|
|
||||||
if output_path is not None:
|
if output_path is not None:
|
||||||
# We don't want to swallow the traceback if we don't have a
|
# We don't want to swallow the traceback if we don't have a
|
||||||
# specific error.
|
# specific error, but we do want to warn that we're trying
|
||||||
msg.warn(
|
# to do something here.
|
||||||
f"Aborting and saving the final best model. "
|
stdout.write(
|
||||||
f"Encountered exception: {str(e)}"
|
msg.warn(
|
||||||
|
f"Aborting and saving the final best model. "
|
||||||
|
f"Encountered exception: {str(e)}"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
nlp = before_to_disk(nlp)
|
|
||||||
nlp.to_disk(output_path / "model-final")
|
|
||||||
raise e
|
raise e
|
||||||
finally:
|
finally:
|
||||||
finalize_logger()
|
finalize_logger()
|
||||||
if output_path is not None:
|
if output_path is not None:
|
||||||
final_model_path = output_path / "model-final"
|
final_model_path = output_path / "model-last"
|
||||||
if optimizer.averages:
|
if optimizer.averages:
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
else:
|
else:
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
msg.good(f"Saved pipeline to output directory", final_model_path)
|
# This will only run if we don't hit an error
|
||||||
|
stdout.write(msg.good("Saved pipeline to output directory", final_model_path))
|
||||||
|
|
||||||
|
|
||||||
def train_while_improving(
|
def train_while_improving(
|
||||||
|
|
|
@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By
|
||||||
default, these results are written to the console with the
|
default, these results are written to the console with the
|
||||||
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
|
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
|
||||||
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
|
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
|
||||||
[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
|
[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
|
||||||
**dictionary** with the following keys:
|
receives a **dictionary** with the following keys:
|
||||||
|
|
||||||
| Key | Value |
|
| Key | Value |
|
||||||
| -------------- | ----------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------------------------------------------------------------------------------------- |
|
||||||
|
@ -715,30 +715,37 @@ tabular results to a file:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### functions.py
|
### functions.py
|
||||||
from typing import Tuple, Callable, Dict, Any
|
import sys
|
||||||
|
from typing import IO, Tuple, Callable, Dict, Any
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy import Language
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@spacy.registry.loggers("my_custom_logger.v1")
|
@spacy.registry.loggers("my_custom_logger.v1")
|
||||||
def custom_logger(log_path):
|
def custom_logger(log_path):
|
||||||
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
|
def setup_logger(
|
||||||
with Path(log_path).open("w", encoding="utf8") as file_:
|
nlp: Language,
|
||||||
file_.write("step\\t")
|
stdout: IO=sys.stdout,
|
||||||
file_.write("score\\t")
|
stderr: IO=sys.stderr
|
||||||
for pipe in nlp.pipe_names:
|
) -> Tuple[Callable, Callable]:
|
||||||
file_.write(f"loss_{pipe}\\t")
|
stdout.write(f"Logging to {log_path}\n")
|
||||||
file_.write("\\n")
|
log_file = Path(log_path).open("w", encoding="utf8")
|
||||||
|
log_file.write("step\\t")
|
||||||
|
log_file.write("score\\t")
|
||||||
|
for pipe in nlp.pipe_names:
|
||||||
|
log_file.write(f"loss_{pipe}\\t")
|
||||||
|
log_file.write("\\n")
|
||||||
|
|
||||||
def log_step(info: Dict[str, Any]):
|
def log_step(info: Optional[Dict[str, Any]]):
|
||||||
with Path(log_path).open("a") as file_:
|
if info:
|
||||||
file_.write(f"{info['step']}\\t")
|
log_file.write(f"{info['step']}\\t")
|
||||||
file_.write(f"{info['score']}\\t")
|
log_file.write(f"{info['score']}\\t")
|
||||||
for pipe in nlp.pipe_names:
|
for pipe in nlp.pipe_names:
|
||||||
file_.write(f"{info['losses'][pipe]}\\t")
|
log_file.write(f"{info['losses'][pipe]}\\t")
|
||||||
file_.write("\\n")
|
log_file.write("\\n")
|
||||||
|
|
||||||
def finalize():
|
def finalize():
|
||||||
pass
|
log_file.close()
|
||||||
|
|
||||||
return log_step, finalize
|
return log_step, finalize
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user