mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-14 18:40:33 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
812c15c213
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
|
|
||||||
ifndef SPACY_EXTRAS
|
ifndef SPACY_EXTRAS
|
||||||
override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
|
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef PYVER
|
ifndef PYVER
|
||||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a35,<8.0.0a40",
|
"thinc>=8.0.0a42,<8.0.0a50",
|
||||||
"blis>=0.4.0,<0.5.0",
|
"blis>=0.4.0,<0.5.0",
|
||||||
"pytokenizations",
|
"pytokenizations",
|
||||||
"pathy"
|
"pathy"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a35,<8.0.0a40
|
thinc>=8.0.0a42,<8.0.0a50
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets==0.2.0a0
|
ml_datasets==0.2.0a0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
@ -14,7 +14,7 @@ pathy
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.5.0,<2.0.0
|
||||||
pytokenizations
|
pytokenizations
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
|
|
10
setup.cfg
10
setup.cfg
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0a35,<8.0.0a40
|
thinc>=8.0.0a42,<8.0.0a50
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a35,<8.0.0a40
|
thinc>=8.0.0a42,<8.0.0a50
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
@ -51,7 +51,7 @@ install_requires =
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.5.0,<2.0.0
|
||||||
pytokenizations
|
pytokenizations
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
|
@ -65,7 +65,7 @@ console_scripts =
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data==0.4.0.dev0
|
spacy_lookups_data==1.0.0rc0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4,<9.0.0
|
cupy>=5.0.0b4,<9.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
|
@ -98,7 +98,7 @@ universal = false
|
||||||
formats = gztar
|
formats = gztar
|
||||||
|
|
||||||
[flake8]
|
[flake8]
|
||||||
ignore = E203, E266, E501, E731, W503
|
ignore = E203, E266, E501, E731, W503, E741
|
||||||
max-line-length = 80
|
max-line-length = 80
|
||||||
select = B,C,E,F,W,T4,B9
|
select = B,C,E,F,W,T4,B9
|
||||||
exclude =
|
exclude =
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a26"
|
__version__ = "3.0.0a28"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -15,7 +15,7 @@ from .debug_config import debug_config # noqa: F401
|
||||||
from .debug_model import debug_model # noqa: F401
|
from .debug_model import debug_model # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
from .init_config import init_config, fill_config # noqa: F401
|
from .init_config import init_config, fill_config # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project.clone import project_clone # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
|
|
|
@ -10,12 +10,13 @@ from click import NoSuchOption
|
||||||
from click.parser import split_arg_string
|
from click.parser import split_arg_string
|
||||||
from typer.main import get_command
|
from typer.main import get_command
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from thinc.config import Config, ConfigValidationError
|
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||||
from configparser import InterpolationError
|
from configparser import InterpolationError
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
|
from ..util import ENV_VARS
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import Pathy # noqa: F401
|
||||||
|
@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
|
||||||
and custom model implementations.
|
and custom model implementations.
|
||||||
"""
|
"""
|
||||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||||
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
|
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
@ -65,7 +65,7 @@ def setup_cli() -> None:
|
||||||
|
|
||||||
|
|
||||||
def parse_config_overrides(
|
def parse_config_overrides(
|
||||||
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
|
args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Generate a dictionary of config overrides based on the extra arguments
|
"""Generate a dictionary of config overrides based on the extra arguments
|
||||||
provided on the CLI, e.g. --training.batch_size to override
|
provided on the CLI, e.g. --training.batch_size to override
|
||||||
|
@ -226,24 +226,30 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
def show_validation_error(
|
def show_validation_error(
|
||||||
file_path: Optional[Union[str, Path]] = None,
|
file_path: Optional[Union[str, Path]] = None,
|
||||||
*,
|
*,
|
||||||
title: str = "Config validation error",
|
title: Optional[str] = None,
|
||||||
|
desc: str = "",
|
||||||
|
show_config: Optional[bool] = None,
|
||||||
hint_fill: bool = True,
|
hint_fill: bool = True,
|
||||||
):
|
):
|
||||||
"""Helper to show custom config validation errors on the CLI.
|
"""Helper to show custom config validation errors on the CLI.
|
||||||
|
|
||||||
file_path (str / Path): Optional file path of config file, used in hints.
|
file_path (str / Path): Optional file path of config file, used in hints.
|
||||||
title (str): Title of the custom formatted error.
|
title (str): Override title of custom formatted error.
|
||||||
|
desc (str): Override description of custom formatted error.
|
||||||
|
show_config (bool): Whether to output the config the error refers to.
|
||||||
hint_fill (bool): Show hint about filling config.
|
hint_fill (bool): Show hint about filling config.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
yield
|
yield
|
||||||
except (ConfigValidationError, InterpolationError) as e:
|
except ConfigValidationError as e:
|
||||||
msg.fail(title, spaced=True)
|
title = title if title is not None else e.title
|
||||||
# TODO: This is kinda hacky and we should probably provide a better
|
if e.desc:
|
||||||
# helper for this in Thinc
|
desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
|
||||||
err_text = str(e).replace("Config validation error", "").strip()
|
# Re-generate a new error object with overrides
|
||||||
print(err_text)
|
err = e.from_error(e, title="", desc=desc, show_config=show_config)
|
||||||
if hint_fill and "field required" in err_text:
|
msg.fail(title)
|
||||||
|
print(err.text.strip())
|
||||||
|
if hint_fill and "value_error.missing" in err.error_types:
|
||||||
config_path = file_path if file_path is not None else "config.cfg"
|
config_path = file_path if file_path is not None else "config.cfg"
|
||||||
msg.text(
|
msg.text(
|
||||||
"If your config contains missing values, you can run the 'init "
|
"If your config contains missing values, you can run the 'init "
|
||||||
|
@ -252,6 +258,8 @@ def show_validation_error(
|
||||||
)
|
)
|
||||||
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
|
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
except InterpolationError as e:
|
||||||
|
msg.fail("Config validation error", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||||
|
@ -267,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
    """Collect the names of components that are sourced from another pipeline.

    config (Union[Dict[str, Any], Config]): The original config.
    RETURNS (List[str]): Names listed under "components" whose settings have
        a "source" key, e.g. {"source": "en_core_web_sm"}, and no "factory"
        key. A block containing "factory" is assumed to refer to a component
        factory and is left out.
    """
    sourced = []
    for name, cfg in config.get("components", {}).items():
        # A "factory" key takes precedence: such a block is never "sourced"
        if "factory" in cfg:
            continue
        if "source" in cfg:
            sourced.append(name)
    return sourced
|
|
||||||
|
|
||||||
|
|
||||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||||
"""Upload a file.
|
"""Upload a file.
|
||||||
|
|
||||||
|
@ -450,3 +446,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
||||||
p = int(p)
|
p = int(p)
|
||||||
result.append(p)
|
result.append(p)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def setup_gpu(use_gpu: int) -> None:
    """Select the compute device for a CLI command and report the choice.

    use_gpu (int): GPU ID to activate. For any negative value, no GPU is
        requested and only a "Using CPU" message is shown.
    """
    if use_gpu < 0:
        msg.info("Using CPU")
        return
    # Announce the device first so the ID is visible even if activation fails
    msg.info(f"Using GPU: {use_gpu}")
    require_gpu(use_gpu)
|
||||||
|
|
|
@ -9,7 +9,8 @@ import sys
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
from ..tokens import DocBin
|
from ..tokens import DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
|
from ..training.converters import conllu_to_docs
|
||||||
|
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
|
|
|
@ -2,11 +2,13 @@ from typing import Optional, Dict, Any, Union, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, table
|
from wasabi import msg, table
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
from thinc.config import VARIABLE_RE, ConfigValidationError
|
from thinc.config import VARIABLE_RE
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
||||||
from ._util import import_code, debug_cli
|
from ._util import import_code, debug_cli
|
||||||
|
from ..schemas import ConfigSchemaTraining
|
||||||
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -51,10 +53,11 @@ def debug_config(
|
||||||
msg.divider("Config validation")
|
msg.divider("Config validation")
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(config_path, overrides=overrides)
|
config = util.load_config(config_path, overrides=overrides)
|
||||||
nlp, resolved = util.load_model_from_config(config)
|
nlp = util.load_model_from_config(config)
|
||||||
# Use the resolved config here in case user has one function returning
|
config = nlp.config.interpolate()
|
||||||
# a dict of corpora etc.
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
|
util.resolve_dot_names(config, dot_names)
|
||||||
msg.good("Config is valid")
|
msg.good("Config is valid")
|
||||||
if show_vars:
|
if show_vars:
|
||||||
variables = get_variables(config)
|
variables = get_variables(config)
|
||||||
|
@ -96,23 +99,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
|
||||||
value = util.dot_to_object(config, path)
|
value = util.dot_to_object(config, path)
|
||||||
result[variable] = repr(value)
|
result[variable] = repr(value)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def check_section_refs(config: Config, fields: List[str]) -> None:
    """Check that config fields referring to other sections or values
    (e.g. in the corpora) point at something that exists in the config.

    config (Config): The config to check.
    fields (List[str]): Dot paths of the fields whose values are themselves
        looked up as dot paths in the same config.
    RAISES (ConfigValidationError): If at least one reference can't be found.
    """
    problems = []
    for dot_path in fields:
        try:
            target = util.dot_to_object(config, dot_path)
        except KeyError:
            # Fields that are absent from the config are ignored
            continue
        try:
            util.dot_to_object(config, target)
        except KeyError:
            problems.append(
                {
                    "loc": dot_path.split("."),
                    "msg": f"not a valid section reference: {target}",
                }
            )
    if problems:
        raise ConfigValidationError(config, problems)
|
|
||||||
|
|
|
@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||||
from ._util import import_code, debug_cli, get_sourced_components
|
from ._util import import_code, debug_cli
|
||||||
from ..training import Corpus, Example
|
from ..training import Example
|
||||||
|
from ..training.initialize import get_sourced_components
|
||||||
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
|
from ..util import registry, resolve_dot_names
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||||
)
|
)
|
||||||
@app.command(
|
@app.command(
|
||||||
"debug-data",
|
"debug-data",
|
||||||
|
@ -34,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000
|
||||||
def debug_data_cli(
|
def debug_data_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
|
|
||||||
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
|
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||||
|
@ -59,8 +60,6 @@ def debug_data_cli(
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
debug_data(
|
debug_data(
|
||||||
train_path,
|
|
||||||
dev_path,
|
|
||||||
config_path,
|
config_path,
|
||||||
config_overrides=overrides,
|
config_overrides=overrides,
|
||||||
ignore_warnings=ignore_warnings,
|
ignore_warnings=ignore_warnings,
|
||||||
|
@ -71,8 +70,6 @@ def debug_data_cli(
|
||||||
|
|
||||||
|
|
||||||
def debug_data(
|
def debug_data(
|
||||||
train_path: Path,
|
|
||||||
dev_path: Path,
|
|
||||||
config_path: Path,
|
config_path: Path,
|
||||||
*,
|
*,
|
||||||
config_overrides: Dict[str, Any] = {},
|
config_overrides: Dict[str, Any] = {},
|
||||||
|
@ -85,56 +82,29 @@ def debug_data(
|
||||||
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
|
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
|
||||||
)
|
)
|
||||||
# Make sure all files and paths exists if they are needed
|
# Make sure all files and paths exists if they are needed
|
||||||
if not train_path.exists():
|
|
||||||
msg.fail("Training data not found", train_path, exits=1)
|
|
||||||
if not dev_path.exists():
|
|
||||||
msg.fail("Development data not found", dev_path, exits=1)
|
|
||||||
if not config_path.exists():
|
|
||||||
msg.fail("Config file not found", config_path, exists=1)
|
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
cfg = util.load_config(config_path, overrides=config_overrides)
|
cfg = util.load_config(config_path, overrides=config_overrides)
|
||||||
nlp, config = util.load_model_from_config(cfg)
|
nlp = util.load_model_from_config(cfg)
|
||||||
|
config = nlp.config.interpolate()
|
||||||
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
# Use original config here, not resolved version
|
# Use original config here, not resolved version
|
||||||
sourced_components = get_sourced_components(cfg)
|
sourced_components = get_sourced_components(cfg)
|
||||||
frozen_components = config["training"]["frozen_components"]
|
frozen_components = T["frozen_components"]
|
||||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
resume_components = [p for p in sourced_components if p not in frozen_components]
|
||||||
pipeline = nlp.pipe_names
|
pipeline = nlp.pipe_names
|
||||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||||
tag_map_path = util.ensure_path(config["training"]["tag_map"])
|
|
||||||
tag_map = {}
|
|
||||||
if tag_map_path is not None:
|
|
||||||
tag_map = srsly.read_json(tag_map_path)
|
|
||||||
morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
|
|
||||||
morph_rules = {}
|
|
||||||
if morph_rules_path is not None:
|
|
||||||
morph_rules = srsly.read_json(morph_rules_path)
|
|
||||||
# Replace tag map with provided mapping
|
|
||||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
|
||||||
# Load morph rules
|
|
||||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
|
||||||
|
|
||||||
msg.divider("Data file validation")
|
msg.divider("Data file validation")
|
||||||
|
|
||||||
# Create the gold corpus to be able to better analyze data
|
# Create the gold corpus to be able to better analyze data
|
||||||
loading_train_error_message = ""
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
loading_dev_error_message = ""
|
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
||||||
with msg.loading("Loading corpus..."):
|
train_dataset = list(train_corpus(nlp))
|
||||||
try:
|
dev_dataset = list(dev_corpus(nlp))
|
||||||
train_dataset = list(Corpus(train_path)(nlp))
|
|
||||||
except ValueError as e:
|
|
||||||
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
|
||||||
try:
|
|
||||||
dev_dataset = list(Corpus(dev_path)(nlp))
|
|
||||||
except ValueError as e:
|
|
||||||
loading_dev_error_message = f"Development data cannot be loaded: {e}"
|
|
||||||
if loading_train_error_message or loading_dev_error_message:
|
|
||||||
if loading_train_error_message:
|
|
||||||
msg.fail(loading_train_error_message)
|
|
||||||
if loading_dev_error_message:
|
|
||||||
msg.fail(loading_dev_error_message)
|
|
||||||
sys.exit(1)
|
|
||||||
msg.good("Corpus is loadable")
|
msg.good("Corpus is loadable")
|
||||||
|
|
||||||
|
nlp.initialize(lambda: train_dataset)
|
||||||
|
msg.good("Pipeline can be initialized with data")
|
||||||
|
|
||||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||||
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
|
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
|
||||||
gold_train_unpreprocessed_data = _compile_gold(
|
gold_train_unpreprocessed_data = _compile_gold(
|
||||||
|
@ -144,10 +114,10 @@ def debug_data(
|
||||||
|
|
||||||
train_texts = gold_train_data["texts"]
|
train_texts = gold_train_data["texts"]
|
||||||
dev_texts = gold_dev_data["texts"]
|
dev_texts = gold_dev_data["texts"]
|
||||||
frozen_components = config["training"]["frozen_components"]
|
frozen_components = T["frozen_components"]
|
||||||
|
|
||||||
msg.divider("Training stats")
|
msg.divider("Training stats")
|
||||||
msg.text(f"Language: {config['nlp']['lang']}")
|
msg.text(f"Language: {nlp.lang}")
|
||||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||||
if resume_components:
|
if resume_components:
|
||||||
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
|
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
|
||||||
|
@ -354,17 +324,12 @@ def debug_data(
|
||||||
if "tagger" in factory_names:
|
if "tagger" in factory_names:
|
||||||
msg.divider("Part-of-speech Tagging")
|
msg.divider("Part-of-speech Tagging")
|
||||||
labels = [label for label in gold_train_data["tags"]]
|
labels = [label for label in gold_train_data["tags"]]
|
||||||
tag_map = nlp.vocab.morphology.tag_map
|
# TODO: does this need to be updated?
|
||||||
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
|
msg.info(f"{len(labels)} label(s) in data")
|
||||||
labels_with_counts = _format_labels(
|
labels_with_counts = _format_labels(
|
||||||
gold_train_data["tags"].most_common(), counts=True
|
gold_train_data["tags"].most_common(), counts=True
|
||||||
)
|
)
|
||||||
msg.text(labels_with_counts, show=verbose)
|
msg.text(labels_with_counts, show=verbose)
|
||||||
non_tagmap = [l for l in labels if l not in tag_map]
|
|
||||||
if not non_tagmap:
|
|
||||||
msg.good(f"All labels present in tag map for language '{nlp.lang}'")
|
|
||||||
for label in non_tagmap:
|
|
||||||
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
|
|
||||||
|
|
||||||
if "parser" in factory_names:
|
if "parser" in factory_names:
|
||||||
has_low_data_warning = False
|
has_low_data_warning = False
|
||||||
|
|
|
@ -1,20 +1,24 @@
|
||||||
import warnings
|
|
||||||
from typing import Dict, Any, Optional, Iterable
|
from typing import Dict, Any, Optional, Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.util import dot_to_object
|
from spacy.util import resolve_dot_names
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
from thinc.api import fix_random_seed, set_dropout_rate, Adam
|
||||||
from thinc.api import Model, data_validation, set_gpu_allocator
|
from thinc.api import Model, data_validation, set_gpu_allocator
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
from ._util import Arg, Opt, debug_cli, show_validation_error
|
from ._util import Arg, Opt, debug_cli, show_validation_error
|
||||||
from ._util import parse_config_overrides, string_to_list
|
from ._util import parse_config_overrides, string_to_list, setup_gpu
|
||||||
|
from ..schemas import ConfigSchemaTraining
|
||||||
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command("model")
|
@debug_cli.command(
|
||||||
|
"model",
|
||||||
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
)
|
||||||
def debug_model_cli(
|
def debug_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
|
@ -38,11 +42,7 @@ def debug_model_cli(
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#debug-model
|
DOCS: https://nightly.spacy.io/api/cli#debug-model
|
||||||
"""
|
"""
|
||||||
if use_gpu >= 0:
|
setup_gpu(use_gpu)
|
||||||
msg.info("Using GPU")
|
|
||||||
require_gpu(use_gpu)
|
|
||||||
else:
|
|
||||||
msg.info("Using CPU")
|
|
||||||
layers = string_to_list(layers, intify=True)
|
layers = string_to_list(layers, intify=True)
|
||||||
print_settings = {
|
print_settings = {
|
||||||
"dimensions": dimensions,
|
"dimensions": dimensions,
|
||||||
|
@ -57,14 +57,18 @@ def debug_model_cli(
|
||||||
}
|
}
|
||||||
config_overrides = parse_config_overrides(ctx.args)
|
config_overrides = parse_config_overrides(ctx.args)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(
|
raw_config = util.load_config(
|
||||||
config_path, overrides=config_overrides, interpolate=True
|
config_path, overrides=config_overrides, interpolate=False
|
||||||
)
|
)
|
||||||
allocator = config["training"]["gpu_allocator"]
|
config = raw_config.interpolate()
|
||||||
if use_gpu >= 0 and allocator:
|
allocator = config["training"]["gpu_allocator"]
|
||||||
set_gpu_allocator(allocator)
|
if use_gpu >= 0 and allocator:
|
||||||
nlp, config = util.load_model_from_config(config)
|
set_gpu_allocator(allocator)
|
||||||
seed = config["training"]["seed"]
|
with show_validation_error(config_path):
|
||||||
|
nlp = util.load_model_from_config(raw_config)
|
||||||
|
config = nlp.config.interpolate()
|
||||||
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
|
seed = T["seed"]
|
||||||
if seed is not None:
|
if seed is not None:
|
||||||
msg.info(f"Fixing random seed: {seed}")
|
msg.info(f"Fixing random seed: {seed}")
|
||||||
fix_random_seed(seed)
|
fix_random_seed(seed)
|
||||||
|
@ -75,11 +79,16 @@ def debug_model_cli(
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
model = pipe.model
|
model = pipe.model
|
||||||
debug_model(config, nlp, model, print_settings=print_settings)
|
debug_model(config, T, nlp, model, print_settings=print_settings)
|
||||||
|
|
||||||
|
|
||||||
def debug_model(
|
def debug_model(
|
||||||
config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
|
config,
|
||||||
|
resolved_train_config,
|
||||||
|
nlp,
|
||||||
|
model: Model,
|
||||||
|
*,
|
||||||
|
print_settings: Optional[Dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
if not isinstance(model, Model):
|
if not isinstance(model, Model):
|
||||||
msg.fail(
|
msg.fail(
|
||||||
|
@ -100,15 +109,18 @@ def debug_model(
|
||||||
# The output vector might differ from the official type of the output layer
|
# The output vector might differ from the official type of the output layer
|
||||||
with data_validation(False):
|
with data_validation(False):
|
||||||
try:
|
try:
|
||||||
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
|
dot_names = [resolved_train_config["train_corpus"]]
|
||||||
nlp.begin_training(lambda: train_corpus(nlp))
|
with show_validation_error():
|
||||||
|
(train_corpus,) = resolve_dot_names(config, dot_names)
|
||||||
|
nlp.initialize(lambda: train_corpus(nlp))
|
||||||
msg.info("Initialized the model with the training corpus.")
|
msg.info("Initialized the model with the training corpus.")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
try:
|
try:
|
||||||
_set_output_dim(nO=7, model=model)
|
_set_output_dim(nO=7, model=model)
|
||||||
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
|
with show_validation_error():
|
||||||
|
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
|
||||||
msg.info("Initialized the model with dummy data.")
|
msg.info("Initialized the model with dummy data.")
|
||||||
except:
|
except Exception:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
|
"Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
|
||||||
exits=1,
|
exits=1,
|
||||||
|
|
|
@ -3,11 +3,11 @@ from wasabi import Printer
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import require_gpu, fix_random_seed
|
from thinc.api import fix_random_seed
|
||||||
|
|
||||||
from ..training import Corpus
|
from ..training import Corpus
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
|
@ -19,6 +19,7 @@ def evaluate_cli(
|
||||||
model: str = Arg(..., help="Model name or path"),
|
model: str = Arg(..., help="Model name or path"),
|
||||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||||
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
||||||
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||||
|
@ -37,6 +38,7 @@ def evaluate_cli(
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#evaluate
|
DOCS: https://nightly.spacy.io/api/cli#evaluate
|
||||||
"""
|
"""
|
||||||
|
import_code(code_path)
|
||||||
evaluate(
|
evaluate(
|
||||||
model,
|
model,
|
||||||
data_path,
|
data_path,
|
||||||
|
@ -61,8 +63,7 @@ def evaluate(
|
||||||
) -> Scorer:
|
) -> Scorer:
|
||||||
msg = Printer(no_print=silent, pretty=not silent)
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
fix_random_seed()
|
fix_random_seed()
|
||||||
if use_gpu >= 0:
|
setup_gpu(use_gpu)
|
||||||
require_gpu(use_gpu)
|
|
||||||
data_path = util.ensure_path(data_path)
|
data_path = util.ensure_path(data_path)
|
||||||
output_path = util.ensure_path(output)
|
output_path = util.ensure_path(output)
|
||||||
displacy_path = util.ensure_path(displacy_path)
|
displacy_path = util.ensure_path(displacy_path)
|
||||||
|
|
|
@ -88,10 +88,10 @@ def fill_config(
|
||||||
msg = Printer(no_print=no_print)
|
msg = Printer(no_print=no_print)
|
||||||
with show_validation_error(hint_fill=False):
|
with show_validation_error(hint_fill=False):
|
||||||
config = util.load_config(base_path)
|
config = util.load_config(base_path)
|
||||||
nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False)
|
nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
|
||||||
# Load a second time with validation to be extra sure that the produced
|
# Load a second time with validation to be extra sure that the produced
|
||||||
# config result is a valid config
|
# config result is a valid config
|
||||||
nlp, _ = util.load_model_from_config(nlp.config)
|
nlp = util.load_model_from_config(nlp.config)
|
||||||
filled = nlp.config
|
filled = nlp.config
|
||||||
if pretraining:
|
if pretraining:
|
||||||
validate_config_for_pretrain(filled, msg)
|
validate_config_for_pretrain(filled, msg)
|
||||||
|
@ -169,7 +169,7 @@ def init_config(
|
||||||
msg.text(f"- {label}: {value}")
|
msg.text(f"- {label}: {value}")
|
||||||
with show_validation_error(hint_fill=False):
|
with show_validation_error(hint_fill=False):
|
||||||
config = util.load_config_from_str(base_template)
|
config = util.load_config_from_str(base_template)
|
||||||
nlp, _ = util.load_model_from_config(config, auto_fill=True)
|
nlp = util.load_model_from_config(config, auto_fill=True)
|
||||||
config = nlp.config
|
config = nlp.config
|
||||||
if pretraining:
|
if pretraining:
|
||||||
validate_config_for_pretrain(config, msg)
|
validate_config_for_pretrain(config, msg)
|
||||||
|
|
|
@ -1,360 +0,0 @@
|
||||||
from typing import Optional, List, Dict, Any, Union, IO
|
|
||||||
import math
|
|
||||||
from tqdm import tqdm
|
|
||||||
import numpy
|
|
||||||
from ast import literal_eval
|
|
||||||
from pathlib import Path
|
|
||||||
from preshed.counter import PreshCounter
|
|
||||||
import tarfile
|
|
||||||
import gzip
|
|
||||||
import zipfile
|
|
||||||
import srsly
|
|
||||||
import warnings
|
|
||||||
from wasabi import msg, Printer
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from ._util import app, init_cli, Arg, Opt
|
|
||||||
from ..vectors import Vectors
|
|
||||||
from ..errors import Errors, Warnings
|
|
||||||
from ..language import Language
|
|
||||||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
|
||||||
|
|
||||||
try:
|
|
||||||
import ftfy
|
|
||||||
except ImportError:
|
|
||||||
ftfy = None
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_OOV_PROB = -20
|
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("vocab")
@app.command(
    "init-model",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,  # hide this from main CLI help but still allow it to work with warning
)
def init_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    lang: str = Arg(..., help="Pipeline language"),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
    # fmt: on
):
    """
    Create a new blank pipeline directory with vocab and vectors from raw data.
    If vectors are provided in Word2Vec format, they can be either a .txt or
    zipped as a .zip or .tar.gz.

    DOCS: https://nightly.spacy.io/api/cli#init-vocab
    """
    # This function is registered twice: as "init vocab" and as the hidden
    # "init-model" command. Only the latter spelling prints the rename notice.
    called_by_old_name = ctx.command.name == "init-model"
    if called_by_old_name:
        rename_notice = (
            "The init-model command is now called 'init vocab'. You can run "
            "'python -m spacy init --help' for an overview of the other "
            "available initialization commands."
        )
        msg.warn(rename_notice)
    # Forward every CLI option unchanged; the CLI always runs non-silently.
    forwarded = {
        "freqs_loc": freqs_loc,
        "clusters_loc": clusters_loc,
        "jsonl_loc": jsonl_loc,
        "vectors_loc": vectors_loc,
        "prune_vectors": prune_vectors,
        "truncate_vectors": truncate_vectors,
        "vectors_name": vectors_name,
        "model_name": model_name,
        "base_model": base_model,
        "silent": False,
    }
    init_model(lang, output_dir, **forwarded)
|
|
||||||
|
|
||||||
|
|
||||||
def init_model(
    lang: str,
    output_dir: Path,
    freqs_loc: Optional[Path] = None,
    clusters_loc: Optional[Path] = None,
    jsonl_loc: Optional[Path] = None,
    vectors_loc: Optional[Path] = None,
    prune_vectors: int = -1,
    truncate_vectors: int = 0,
    vectors_name: Optional[str] = None,
    model_name: Optional[str] = None,
    base_model: Optional[str] = None,
    silent: bool = True,
) -> Language:
    """Build a blank pipeline with lexical attributes and (optionally) vectors
    and write it to disk.

    lang (str): Language code passed on to create_model.
    output_dir (Path): Directory the pipeline is saved to. Missing parent
        directories are created.
    freqs_loc (Optional[Path]): Deprecated word frequencies file. Only read
        when jsonl_loc is not given.
    clusters_loc (Optional[Path]): Deprecated clusters file. Only read when
        jsonl_loc is not given.
    jsonl_loc (Optional[Path]): JSONL file of lexical attributes. Takes
        precedence over freqs_loc / clusters_loc.
    vectors_loc (Optional[Path]): Vectors file handed to add_vectors.
    prune_vectors (int): Forwarded to add_vectors.
    truncate_vectors (int): Forwarded to add_vectors.
    vectors_name (Optional[str]): Forwarded to add_vectors as the vectors name.
    model_name (Optional[str]): Forwarded to create_model as the pipeline name.
    base_model (Optional[str]): Forwarded to create_model.
    silent (bool): Suppress printed output.
    RETURNS (Language): The pipeline that was written to output_dir.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)

    with msg.loading("Creating blank pipeline..."):
        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

    msg.good("Successfully created blank pipeline")
    if vectors_loc is not None:
        add_vectors(
            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
        )
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
    )
    # Normalise like every other path argument so that callers of the Python
    # API can pass a plain string, and create intermediate directories so a
    # nested output location does not fail.
    output_dir = ensure_path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def open_file(loc: Union[str, Path]) -> IO:
    """Open a possibly compressed text file.

    loc (Union[str, Path]): Location of the file.
    RETURNS (IO): For names ending in "gz" or "zip", a generator of
        utf8-decoded lines (for zip archives: of the first member only). For
        anything else that is not a tar archive, a text file handle opened
        with utf8 encoding.
    """
    path = ensure_path(loc)
    path_str = str(path)
    # NOTE(review): this branch returns the TarFile archive object itself, not
    # an iterator over decoded lines like the other branches - confirm that
    # callers which call next() on the result can cope with it.
    if tarfile.is_tarfile(path_str):
        return tarfile.open(path_str, "r:gz")
    filename = path.parts[-1]
    if filename.endswith("gz"):
        return (raw.decode("utf8") for raw in gzip.open(path_str, "r"))
    if filename.endswith("zip"):
        archive = zipfile.ZipFile(path_str)
        first_member = archive.open(archive.namelist()[0])
        return (raw.decode("utf8") for raw in first_member)
    return path.open("r", encoding="utf8")
|
|
||||||
|
|
||||||
|
|
||||||
def read_attrs_from_deprecated(
    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
) -> List[Dict[str, Any]]:
    """Build lexical attribute dicts from the deprecated frequencies and
    clusters files.

    msg (Printer): Printer used for progress messages.
    freqs_loc (Optional[Path]): Frequencies file, read with read_freqs.
    clusters_loc (Optional[Path]): Clusters file, read with read_clusters.
    RETURNS (List[Dict[str, Any]]): One dict per word with the keys "orth",
        "id", "prob" and "cluster", ordered by descending probability. Empty
        if no frequencies were read.
    """
    probs = {}
    if freqs_loc is not None:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)
        msg.good("Counted frequencies")
    clusters = {}
    if clusters_loc:
        with msg.loading("Reading clusters..."):
            clusters = read_clusters(clusters_loc)
        msg.good("Read clusters")
    by_prob = sorted(probs.items(), key=lambda item: item[1], reverse=True)
    lex_attrs = []
    if not by_prob:
        return lex_attrs
    for rank, (word, prob) in tqdm(enumerate(by_prob)):
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        cluster_id = int(clusters[word][::-1], 2) if word in clusters else 0
        lex_attrs.append(
            {"orth": word, "id": rank, "prob": prob, "cluster": cluster_id}
        )
    return lex_attrs
|
|
||||||
|
|
||||||
|
|
||||||
def create_model(
    lang: str,
    lex_attrs: List[Dict[str, Any]],
    name: Optional[str] = None,
    base_model: Optional[Union[str, Path]] = None,
) -> Language:
    """Create the nlp object and populate its vocab with lexical attributes.

    lang (str): Language code, used when no base model is given.
    lex_attrs (List[Dict[str, Any]]): Attribute dicts, each with at least an
        "orth" key. Entries containing a "settings" key are skipped.
    name (Optional[str]): If set, stored as the pipeline's meta name.
    base_model (Optional[Union[str, Path]]): Pipeline to load instead of
        creating a blank one; all of its components are removed.
    RETURNS (Language): The nlp object with the updated vocab.
    """
    if not base_model:
        nlp = get_lang_class(lang)()
    else:
        nlp = load_model(base_model)
        # keep the tokenizer but remove any existing pipeline components due to
        # potentially conflicting vectors
        for pipe_name in nlp.pipe_names:
            nlp.remove_pipe(pipe_name)
    vocab = nlp.vocab
    # Everything already in the vocab starts out as out-of-vocabulary.
    for lexeme in vocab:
        lexeme.rank = OOV_RANK
    for entry in lex_attrs:
        if "settings" not in entry:
            vocab[entry["orth"]].set_attrs(**entry)
    # OOV words get a probability just below the rarest known word.
    oov_prob = min(lex.prob for lex in vocab) - 1 if len(vocab) else DEFAULT_OOV_PROB
    vocab.cfg.update({"oov_prob": oov_prob})
    if name:
        nlp.meta["name"] = name
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(
    msg: Printer,
    nlp: Language,
    vectors_loc: Optional[Path],
    truncate_vectors: int,
    prune_vectors: int,
    name: Optional[str] = None,
) -> None:
    """Load vectors from disk into nlp.vocab and name them.

    msg (Printer): Printer used for progress messages.
    nlp (Language): The nlp object whose vocab receives the vectors.
    vectors_loc (Optional[Path]): Vectors file. A name ending in ".npz" is
        loaded with numpy.load, anything else is parsed with read_vectors.
    truncate_vectors (int): Forwarded to read_vectors.
    prune_vectors (int): If >= 1, passed to nlp.vocab.prune_vectors.
    name (Optional[str]): Name for the vectors. Defaults to
        "{lang}_{name}.vectors" built from nlp.meta.
    """
    vocab = nlp.vocab
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        # Map every ranked, in-vocabulary lexeme onto its vector row.
        for lex in vocab:
            if lex.rank and lex.rank != OOV_RANK:
                vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        vectors_data, vector_keys = None, None
        if vectors_loc:
            with msg.loading(f"Reading vectors from {vectors_loc}"):
                vectors_data, vector_keys = read_vectors(
                    msg, vectors_loc, truncate_vectors
                )
            msg.good(f"Loaded vectors from {vectors_loc}")
        if vector_keys is not None:
            for word in vector_keys:
                if word not in vocab:
                    # The lookup alone is intentional; presumably it creates
                    # the missing lexeme entry - verify against Vocab.
                    vocab[word]
        if vectors_data is not None:
            vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    # TODO: Is this correct? Does this matter?
    default_name = None
    if name is None:
        default_name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    vocab.vectors.name = default_name if name is None else name
    nlp.meta["vectors"]["name"] = vocab.vectors.name
    if prune_vectors >= 1:
        vocab.prune_vectors(prune_vectors)
|
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
    """Parse a text vectors file into a data table and a list of keys.

    msg (Printer): Printer used to report a malformed line (exits with 1).
    vectors_loc (Path): Location of the vectors file.
    truncate_vectors (int): If >= 1, the table is allocated with this many
        rows and reading stops after that many lines.
    RETURNS (tuple): The float array of vectors and the list of words, in
        file order.
    """
    lines = ensure_shape(open_file(vectors_loc))
    header = next(lines)
    shape = tuple(int(size) for size in header.split())
    if truncate_vectors >= 1:
        shape = (truncate_vectors, shape[1])
    table = numpy.zeros(shape=shape, dtype="f")
    width = table.shape[1]
    keys = []
    last_row = truncate_vectors - 1
    for row, raw_line in enumerate(tqdm(lines)):
        # Split from the right so that the word keeps any spaces it contains.
        word, *values = raw_line.rstrip().rsplit(" ", width)
        if len(values) != width:
            msg.fail(Errors.E094.format(line_num=row, loc=vectors_loc), exits=1)
        table[row] = numpy.asarray(values, dtype="f")
        keys.append(word)
        if row == last_row:
            break
    return table, keys
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_shape(lines):
    """Ensure that the first line of the data is the vectors shape.

    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.

    lines (Iterable[str]): Lines of a vectors file. Any iterable is accepted,
        not only iterators.
    YIELDS (str): The shape line ("<rows> <width>"), followed by the data
        lines unchanged. Nothing is yielded for empty input.
    """
    lines = iter(lines)
    try:
        first_line = next(lines)
    except StopIteration:
        # A StopIteration escaping a generator body is converted into a
        # RuntimeError (PEP 479), so empty input is handled explicitly.
        return
    try:
        shape = tuple(int(size) for size in first_line.split())
    except ValueError:
        shape = None
    if shape is not None:
        # All good, give the data
        yield first_line
        yield from lines
    else:
        # Figure out the shape, make it the first value, and then give the
        # rest of the data. This has to buffer the whole input to count rows.
        width = len(first_line.split()) - 1
        captured = [first_line, *lines]
        yield f"{len(captured)} {width}"
        yield from captured
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_freq_key(key: str):
    """Decode the word column of a frequencies file.

    The column is first read as a Python literal. If that fails it is retried
    wrapped in single quotes, and as a last resort the raw text is returned.
    """
    try:
        return literal_eval(key)
    except (SyntaxError, ValueError):
        # literal_eval raises ValueError (not SyntaxError) for bare words.
        pass
    try:
        # Take odd strings literally.
        return literal_eval(f"'{key}'")
    except (SyntaxError, ValueError):
        # E.g. the key itself contains a quote character.
        return key


def read_freqs(
    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
):
    """Read a frequencies file and convert the counts to log probabilities.

    Each line holds three tab-separated columns: frequency, document
    frequency and the word. The file is read twice: once to collect the
    counts for smoothing, once to compute the probabilities.

    freqs_loc (Path): Location of the frequencies file (read as utf8).
    max_length (int): Words whose key column is this long or longer are
        skipped.
    min_doc_freq (int): Minimum document frequency for a word to be kept.
    min_freq (int): Minimum frequency for a word to be kept.
    RETURNS (tuple): A dict mapping words to smoothed log probabilities, and
        the log probability for out-of-vocabulary words. For a file without
        any counts this is ({}, DEFAULT_OOV_PROB).
    """
    counts = PreshCounter()
    total = 0
    with freqs_loc.open(encoding="utf8") as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    if total <= 0:
        # Nothing to smooth and math.log(0) is undefined.
        return {}, DEFAULT_OOV_PROB
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open(encoding="utf8") as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                word = _parse_freq_key(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
|
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc: Path) -> dict:
    """Read a clusters file into a word -> cluster mapping.

    Each usable line holds three whitespace-separated fields: cluster, word
    and frequency. Lines with a different number of fields are skipped.

    clusters_loc (Path): Location of the clusters file (read as utf8).
    RETURNS (dict): Maps each word to its cluster string, or to "0" if the
        word's frequency is below 3. Lower-, title- and upper-cased variants
        of every word are added where they are not already present.
    """
    clusters = {}
    if ftfy is None:
        warnings.warn(Warnings.W004)
    # Explicit encoding: the platform default is not utf8 everywhere, and the
    # vectors reader in this module decodes as utf8 as well.
    with clusters_loc.open(encoding="utf8") as f:
        for line in tqdm(f):
            try:
                cluster, word, freq = line.split()
                if ftfy is not None:
                    word = ftfy.fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = "0"
    # Expand clusters with re-casing. Iterate over a snapshot because the
    # dict grows inside the loop.
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters
|
|
117
spacy/cli/init_pipeline.py
Normal file
117
spacy/cli/init_pipeline.py
Normal file
|
@ -0,0 +1,117 @@
|
||||||
|
from typing import Optional
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import typer
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from .. import util
|
||||||
|
from ..training.initialize import init_nlp, convert_vectors
|
||||||
|
from ..language import Language
|
||||||
|
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
|
from ._util import import_code, setup_gpu
|
||||||
|
|
||||||
|
|
||||||
|
@init_cli.command("vectors")
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    lang_class = util.get_lang_class(lang)
    nlp = lang_class()
    # Lexeme attributes are optional and applied before the vectors are added.
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
    n_vectors = len(nlp.vocab.vectors)
    msg.good(f"Successfully converted {n_vectors} vectors")
    nlp.to_disk(output_dir)
    saved_notice = (
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize.vocab]."
    )
    msg.good(saved_notice, output_dir.resolve())
|
||||||
|
|
||||||
|
|
||||||
|
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Apply lexeme attributes from a JSONL file to the vocab of nlp.

    nlp (Language): The nlp object whose vocab is updated in place.
    jsonl_loc (Path): JSONL file with one attribute dict per line. Every dict
        needs an "orth" key; dicts containing a "settings" key are ignored.
    """
    # Mostly used for backwards-compatibility and may be removed in the future
    for entry in srsly.read_jsonl(jsonl_loc):
        if "settings" not in entry:
            nlp.vocab[entry["orth"]].set_attrs(**entry)
|
||||||
|
|
||||||
|
|
||||||
|
@init_cli.command(
    "nlp",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    # Deliberately no docstring: the original command has none, and adding one
    # would presumably surface as CLI help text.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # Extra command-line arguments are interpreted as config overrides.
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=config_overrides)
    with show_validation_error(hint_fill=False):
        initialized_nlp = init_nlp(config, use_gpu=use_gpu)
    initialized_nlp.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    if not output_path.exists():
        # parents=True so that a nested output directory can be created in
        # one go instead of failing on the missing parent.
        output_path.mkdir(parents=True)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    for name, component in nlp.pipeline:
        # Read the attribute once; components without it (or with None) have
        # no labels to export.
        label_data = getattr(component, "label_data", None)
        if label_data is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, label_data)
            msg.good(f"Saving {name} labels to {output_file}")
        else:
            msg.info(f"No labels found for {name}")
|
|
@ -1,25 +1,13 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import numpy
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
from collections import Counter
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import require_gpu, set_gpu_allocator
|
|
||||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
|
||||||
from thinc.api import Config, CosineDistance, L2Distance
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import srsly
|
|
||||||
from functools import partial
|
|
||||||
import typer
|
import typer
|
||||||
|
import re
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code
|
from ._util import import_code, setup_gpu
|
||||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
from ..training.pretrain import pretrain
|
||||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
from ..util import load_config
|
||||||
from ..tokens import Doc
|
|
||||||
from ..attrs import ID
|
|
||||||
from .. import util
|
|
||||||
from ..util import dot_to_object
|
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -61,25 +49,22 @@ def pretrain_cli(
|
||||||
config_overrides = parse_config_overrides(ctx.args)
|
config_overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
||||||
if use_gpu >= 0:
|
setup_gpu(use_gpu)
|
||||||
msg.info("Using GPU")
|
|
||||||
require_gpu(use_gpu)
|
|
||||||
else:
|
|
||||||
msg.info("Using CPU")
|
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
|
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(
|
raw_config = load_config(
|
||||||
config_path, overrides=config_overrides, interpolate=True
|
config_path, overrides=config_overrides, interpolate=False
|
||||||
)
|
)
|
||||||
|
config = raw_config.interpolate()
|
||||||
if not config.get("pretraining"):
|
if not config.get("pretraining"):
|
||||||
# TODO: What's the solution here? How do we handle optional blocks?
|
# TODO: What's the solution here? How do we handle optional blocks?
|
||||||
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
msg.good(f"Created output directory: {output_dir}")
|
msg.good(f"Created output directory: {output_dir}")
|
||||||
|
# Save non-interpolated config
|
||||||
config.to_disk(output_dir / "config.cfg")
|
raw_config.to_disk(output_dir / "config.cfg")
|
||||||
msg.good("Saved config file in the output directory")
|
msg.good("Saved config file in the output directory")
|
||||||
|
|
||||||
pretrain(
|
pretrain(
|
||||||
|
@ -88,251 +73,11 @@ def pretrain_cli(
|
||||||
resume_path=resume_path,
|
resume_path=resume_path,
|
||||||
epoch_resume=epoch_resume,
|
epoch_resume=epoch_resume,
|
||||||
use_gpu=use_gpu,
|
use_gpu=use_gpu,
|
||||||
|
silent=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def pretrain(
|
|
||||||
config: Config,
|
|
||||||
output_dir: Path,
|
|
||||||
resume_path: Optional[Path] = None,
|
|
||||||
epoch_resume: Optional[int] = None,
|
|
||||||
use_gpu: int = -1,
|
|
||||||
):
|
|
||||||
if config["training"]["seed"] is not None:
|
|
||||||
fix_random_seed(config["training"]["seed"])
|
|
||||||
allocator = config["training"]["gpu_allocator"]
|
|
||||||
if use_gpu >= 0 and allocator:
|
|
||||||
set_gpu_allocator(allocator)
|
|
||||||
|
|
||||||
nlp, config = util.load_model_from_config(config)
|
|
||||||
P_cfg = config["pretraining"]
|
|
||||||
corpus = dot_to_object(config, P_cfg["corpus"])
|
|
||||||
batcher = P_cfg["batcher"]
|
|
||||||
model = create_pretraining_model(nlp, config["pretraining"])
|
|
||||||
optimizer = config["pretraining"]["optimizer"]
|
|
||||||
|
|
||||||
# Load in pretrained weights to resume from
|
|
||||||
if resume_path is not None:
|
|
||||||
_resume_model(model, resume_path, epoch_resume)
|
|
||||||
else:
|
|
||||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
|
||||||
epoch_resume = 0
|
|
||||||
|
|
||||||
tracker = ProgressTracker(frequency=10000)
|
|
||||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
|
||||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
|
||||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
|
||||||
|
|
||||||
def _save_model(epoch, is_temp=False):
|
|
||||||
is_temp_str = ".temp" if is_temp else ""
|
|
||||||
with model.use_params(optimizer.averages):
|
|
||||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
|
||||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
|
||||||
log = {
|
|
||||||
"nr_word": tracker.nr_word,
|
|
||||||
"loss": tracker.loss,
|
|
||||||
"epoch_loss": tracker.epoch_loss,
|
|
||||||
"epoch": epoch,
|
|
||||||
}
|
|
||||||
with (output_dir / "log.jsonl").open("a") as file_:
|
|
||||||
file_.write(srsly.json_dumps(log) + "\n")
|
|
||||||
|
|
||||||
objective = create_objective(P_cfg["objective"])
|
|
||||||
# TODO: I think we probably want this to look more like the
|
|
||||||
# 'create_train_batches' function?
|
|
||||||
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
|
|
||||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
|
||||||
docs = ensure_docs(batch)
|
|
||||||
loss = make_update(model, docs, optimizer, objective)
|
|
||||||
progress = tracker.update(epoch, loss, docs)
|
|
||||||
if progress:
|
|
||||||
msg.row(progress, **row_settings)
|
|
||||||
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
|
|
||||||
_save_model(epoch, is_temp=True)
|
|
||||||
_save_model(epoch)
|
|
||||||
tracker.epoch_loss = 0.0
|
|
||||||
msg.good("Successfully finished pretrain")
|
msg.good("Successfully finished pretrain")
|
||||||
|
|
||||||
|
|
||||||
def ensure_docs(examples_or_docs):
|
|
||||||
docs = []
|
|
||||||
for eg_or_doc in examples_or_docs:
|
|
||||||
if isinstance(eg_or_doc, Doc):
|
|
||||||
docs.append(eg_or_doc)
|
|
||||||
else:
|
|
||||||
docs.append(eg_or_doc.reference)
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
def _resume_model(model, resume_path, epoch_resume):
    """Load saved tok2vec weights into the model and determine the resume epoch.

    model: Model exposing a "tok2vec" reference; its weights are overwritten
        with the bytes read from `resume_path`.
    resume_path (Path): Weights file to read.
    epoch_resume (int): Epoch to resume from when the file name does not
        match the default `model<N>.bin` pattern.
    RETURNS (int): The epoch to resume from: `N + 1` when the file name
        matches `model<N>.bin`, otherwise the `epoch_resume` passed in.
    """
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
    model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model(\d+)\.bin", str(resume_path))
    if model_name:
        # Default weight file name: the captured digits are the last finished
        # epoch, so training continues with the following one.
        epoch_resume = int(model_name.group(1)) + 1
    msg.info(f"Resuming from epoch: {epoch_resume}")
    # Hand the value back: rebinding the parameter alone is invisible to the
    # caller, which would otherwise keep using its own stale value.
    return epoch_resume
|
|
||||||
|
|
||||||
|
|
||||||
def make_update(model, docs, optimizer, objective_func):
    """Run one forward/backward pass and optimizer step for a batch.

    model: The model to update; must provide `begin_update`, `ops` and
        `finish_update`.
    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer, passed to `model.finish_update`.
    objective_func (callable): Called as `(ops, docs, predictions)` and
        expected to return `(loss, gradients)`.
    RETURNS loss: A float for the loss.
    """
    outputs, run_backprop = model.begin_update(docs)
    batch_loss, d_outputs = objective_func(model.ops, docs, outputs)
    run_backprop(d_outputs)
    model.finish_update(optimizer)
    # Convert to a built-in float so no cupy object leaks out. The gradients
    # are modified in-place by the BERT MLM, so the loss is accurate.
    return float(batch_loss)
|
|
||||||
|
|
||||||
|
|
||||||
def create_objective(config):
    """Build the loss callable used for pretraining from its config block.

    Two objective types are hard-coded: "characters" (configured through
    `n_characters`) and "vectors" (configured through `loss`, either
    "cosine" or "L2"). This is not a registry function because the objective
    type also drives a model choice elsewhere.

    config (dict): The objective settings; must contain "type".
    RETURNS (callable): A partial of `get_characters_loss` or
        `get_vectors_loss`.
    RAISES ValueError: For an unknown objective type or loss name.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    if objective_type != "vectors":
        raise ValueError("Unexpected objective_type", objective_type)

    loss_name = config["loss"]
    if loss_name == "cosine":
        distance = CosineDistance(normalize=True, ignore_zeros=True)
    elif loss_name == "L2":
        distance = L2Distance(normalize=True, ignore_zeros=True)
    else:
        raise ValueError("Unexpected loss type", loss_name)
    return partial(get_vectors_loss, distance=distance)
|
|
||||||
|
|
||||||
|
|
||||||
def get_vectors_loss(ops, docs, prediction, distance):
    """Score the prediction against the docs' rows in the vectors table.

    ops: Backend ops object; its `flatten` joins the per-doc ID arrays.
    docs (list): The batch; the vectors table is taken from `docs[0].vocab`.
    prediction: Model output to compare with the target vectors.
    distance (callable): Called as `(prediction, target)` and expected to
        return `(gradient, loss)`.
    RETURNS (tuple): `(loss, d_target)`.
    """
    # Stacking token.vector values would be simplest but copies data, which
    # is costly on GPU. Instead, gather every token's index into the vectors
    # table and do a single lookup.
    per_doc_ids = [doc.to_array(ID).ravel() for doc in docs]
    ids = ops.flatten(per_doc_ids)
    vectors_table = docs[0].vocab.vectors.data
    target = vectors_table[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target
|
|
||||||
|
|
||||||
|
|
||||||
def get_characters_loss(ops, docs, prediction, nr_char):
    """Squared-error loss between the prediction and one-hot character targets.

    ops: Backend ops object used to convert the target array.
    docs (iterable): The batch; each doc supplies `to_utf8_array(nr_char=...)`.
    prediction: Model output, reshaped targets are subtracted from it.
    nr_char (int): Number of characters predicted per token.
    RETURNS (tuple): `(loss, d_target)` where `loss` is the summed squared
        difference and `d_target` is the difference divided by the number of
        prediction rows.
    """
    per_doc_bytes = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
    flat_ids = numpy.vstack(per_doc_bytes).reshape((-1,))
    one_hot = to_categorical(flat_ids, n_classes=256)
    target = ops.asarray(one_hot, dtype="f").reshape((-1, 256 * nr_char))
    diff = prediction - target
    n_rows = float(prediction.shape[0])
    return (diff ** 2).sum(), diff / n_rows
|
|
||||||
|
|
||||||
|
|
||||||
def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.

    nlp: The pipeline; provides the component, the vocab and `make_doc`.
    pretrain_config (dict): Pretraining settings. Reads "component", the
        optional "layer", "objective" (with "type" and, for characters,
        "n_characters") and "dropout".
    RETURNS: The initialized pretraining model.
    RAISES ValueError: If the objective type is neither "vectors" nor
        "characters".
    """
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model

    # TODO
    maxout_pieces = 3
    hidden_size = 300
    objective_type = pretrain_config["objective"]["type"]
    if objective_type == "vectors":
        model = build_cloze_multi_task_model(
            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
    elif objective_type == "characters":
        model = build_cloze_characters_multi_task_model(
            nlp.vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=pretrain_config["objective"]["n_characters"],
        )
    else:
        # Without this branch `model` would be unbound below and the failure
        # would surface as an UnboundLocalError with no hint about the cause.
        raise ValueError("Unexpected objective_type", objective_type)
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model
|
|
||||||
|
|
||||||
|
|
||||||
class ProgressTracker:
    """Accumulate loss and word counts and emit a status row every
    `frequency` words."""

    def __init__(self, frequency=1000000):
        """Set all counters to zero and start the throughput clock.

        frequency (int): Minimum number of words that must be processed
            between two status reports.
        """
        self.loss = 0.0  # running total over all updates
        self.prev_loss = 0.0  # value of `loss` at the previous report
        self.nr_word = 0
        self.words_per_epoch = Counter()
        self.frequency = frequency
        self.last_time = time.time()
        self.last_update = 0  # value of `nr_word` at the previous report
        self.epoch_loss = 0.0

    def update(self, epoch, loss, docs):
        """Record one batch and return a status row when a report is due.

        epoch (int): Epoch the batch belongs to.
        loss (float): Loss of the batch.
        docs (iterable): The batch; `len(doc)` is counted as its word count.
        RETURNS (tuple or None): `(epoch, total words, total loss, loss since
            the previous report, words per second)` once at least `frequency`
            words were seen since the previous report, otherwise None.
        """
        self.loss += loss
        self.epoch_loss += loss
        words_in_batch = sum(len(doc) for doc in docs)
        self.words_per_epoch[epoch] += words_in_batch
        self.nr_word += words_in_batch
        words_since_update = self.nr_word - self.last_update
        if words_since_update < self.frequency:
            return None
        # Read the clock once so the rate and the new reference point agree.
        now = time.time()
        elapsed = now - self.last_time
        # A coarse clock can report no elapsed time between two reports;
        # report a rate of 0 rather than dividing by zero.
        wps = words_since_update / elapsed if elapsed > 0 else 0.0
        self.last_update = self.nr_word
        self.last_time = now
        loss_since_report = self.loss - self.prev_loss
        status = (
            epoch,
            self.nr_word,
            _smart_round(self.loss, width=10),
            _smart_round(loss_since_report, width=6),
            int(wps),
        )
        self.prev_loss = float(self.loss)
        return status
|
|
||||||
|
|
||||||
|
|
||||||
def _smart_round(figure, width=10, max_decimal=4):
|
|
||||||
"""Round large numbers as integers, smaller numbers as decimals."""
|
|
||||||
n_digits = len(str(int(figure)))
|
|
||||||
n_decimal = width - (n_digits + 1)
|
|
||||||
if n_decimal <= 1:
|
|
||||||
return str(int(figure))
|
|
||||||
else:
|
|
||||||
n_decimal = min(n_decimal, max_decimal)
|
|
||||||
format_str = "%." + str(n_decimal) + "f"
|
|
||||||
return format_str % figure
|
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
|
|
|
@ -114,6 +114,6 @@ def project_document(
|
||||||
content = f"{before}{content}{after}"
|
content = f"{before}{content}{after}"
|
||||||
else:
|
else:
|
||||||
msg.warn("Replacing existing file")
|
msg.warn("Replacing existing file")
|
||||||
with output_file.open("w") as f:
|
with output_file.open("w", encoding="utf8") as f:
|
||||||
f.write(content)
|
f.write(content)
|
||||||
msg.good("Saved project documentation", output_file)
|
msg.good("Saved project documentation", output_file)
|
||||||
|
|
|
@ -134,7 +134,7 @@ def update_dvc_config(
|
||||||
|
|
||||||
|
|
||||||
def run_dvc_commands(
|
def run_dvc_commands(
|
||||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
|
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
|
||||||
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
|
@ -277,11 +277,6 @@ path = ${paths.dev}
|
||||||
max_length = 0
|
max_length = 0
|
||||||
|
|
||||||
[training]
|
[training]
|
||||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
|
||||||
vectors = null
|
|
||||||
{% else -%}
|
|
||||||
vectors = "{{ word_vectors }}"
|
|
||||||
{% endif -%}
|
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||||
{% endif -%}
|
{% endif -%}
|
||||||
|
@ -317,3 +312,10 @@ start = 100
|
||||||
stop = 1000
|
stop = 1000
|
||||||
compound = 1.001
|
compound = 1.001
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||||
|
vectors = null
|
||||||
|
{% else -%}
|
||||||
|
vectors = "{{ word_vectors }}"
|
||||||
|
{% endif -%}
|
||||||
|
|
|
@ -1,23 +1,14 @@
|
||||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
|
from typing import Optional
|
||||||
from timeit import default_timer as timer
|
|
||||||
import srsly
|
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import thinc
|
|
||||||
import thinc.schedules
|
|
||||||
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
|
|
||||||
import random
|
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code, get_sourced_components
|
from ._util import import_code, setup_gpu
|
||||||
from ..language import Language
|
from ..training.loop import train
|
||||||
|
from ..training.initialize import init_nlp
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.example import Example
|
|
||||||
from ..errors import Errors
|
|
||||||
from ..util import dot_to_object
|
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -30,8 +21,7 @@ def train_cli(
|
||||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||||
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -48,375 +38,19 @@ def train_cli(
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#train
|
DOCS: https://nightly.spacy.io/api/cli#train
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
verify_cli_args(config_path, output_path)
|
verify_cli_args(config_path, output_path)
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
train(
|
setup_gpu(use_gpu)
|
||||||
config_path,
|
|
||||||
output_path=output_path,
|
|
||||||
config_overrides=overrides,
|
|
||||||
use_gpu=use_gpu,
|
|
||||||
resume_training=resume,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def train(
|
|
||||||
config_path: Path,
|
|
||||||
output_path: Optional[Path] = None,
|
|
||||||
config_overrides: Dict[str, Any] = {},
|
|
||||||
use_gpu: int = -1,
|
|
||||||
resume_training: bool = False,
|
|
||||||
) -> None:
|
|
||||||
if use_gpu >= 0:
|
|
||||||
msg.info(f"Using GPU: {use_gpu}")
|
|
||||||
require_gpu(use_gpu)
|
|
||||||
else:
|
|
||||||
msg.info("Using CPU")
|
|
||||||
msg.info(f"Loading config and nlp from: {config_path}")
|
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(
|
config = util.load_config(config_path, overrides=overrides, interpolate=False)
|
||||||
config_path, overrides=config_overrides, interpolate=True
|
msg.divider("Initializing pipeline")
|
||||||
)
|
with show_validation_error(config_path, hint_fill=False):
|
||||||
if config["training"]["seed"] is not None:
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
fix_random_seed(config["training"]["seed"])
|
msg.good("Initialized pipeline")
|
||||||
allocator = config["training"]["gpu_allocator"]
|
msg.divider("Training pipeline")
|
||||||
if use_gpu >= 0 and allocator:
|
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
||||||
set_gpu_allocator(allocator)
|
|
||||||
# Use original config here before it's resolved to functions
|
|
||||||
sourced_components = get_sourced_components(config)
|
|
||||||
with show_validation_error(config_path):
|
|
||||||
nlp, config = util.load_model_from_config(config)
|
|
||||||
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
|
|
||||||
if config["training"]["vectors"] is not None:
|
|
||||||
util.load_vectors_into_model(nlp, config["training"]["vectors"])
|
|
||||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
|
||||||
T_cfg = config["training"]
|
|
||||||
optimizer = T_cfg["optimizer"]
|
|
||||||
train_corpus = dot_to_object(config, T_cfg["train_corpus"])
|
|
||||||
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
|
|
||||||
batcher = T_cfg["batcher"]
|
|
||||||
train_logger = T_cfg["logger"]
|
|
||||||
before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
|
|
||||||
# Components that shouldn't be updated during training
|
|
||||||
frozen_components = T_cfg["frozen_components"]
|
|
||||||
# Sourced components that require resume_training
|
|
||||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
|
||||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
|
||||||
if resume_components:
|
|
||||||
with nlp.select_pipes(enable=resume_components):
|
|
||||||
msg.info(f"Resuming training for: {resume_components}")
|
|
||||||
nlp.resume_training(sgd=optimizer)
|
|
||||||
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
|
||||||
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
|
||||||
# Verify the config after calling 'begin_training' to ensure labels are properly initialized
|
|
||||||
verify_config(nlp)
|
|
||||||
|
|
||||||
if tag_map:
|
|
||||||
# Replace tag map with provided mapping
|
|
||||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
|
||||||
if morph_rules:
|
|
||||||
# Load morph rules
|
|
||||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
|
||||||
|
|
||||||
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
|
|
||||||
if weights_data is not None:
|
|
||||||
tok2vec_component = config["pretraining"]["component"]
|
|
||||||
if tok2vec_component is None:
|
|
||||||
msg.fail(
|
|
||||||
f"To use pretrained tok2vec weights, [pretraining.component] "
|
|
||||||
f"needs to specify the component that should load them.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
layer = nlp.get_pipe(tok2vec_component).model
|
|
||||||
tok2vec_layer = config["pretraining"]["layer"]
|
|
||||||
if tok2vec_layer:
|
|
||||||
layer = layer.get_ref(tok2vec_layer)
|
|
||||||
layer.from_bytes(weights_data)
|
|
||||||
msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
|
|
||||||
|
|
||||||
# Create iterator, which yields out info after each optimization step.
|
|
||||||
msg.info("Start training")
|
|
||||||
score_weights = T_cfg["score_weights"]
|
|
||||||
training_step_iterator = train_while_improving(
|
|
||||||
nlp,
|
|
||||||
optimizer,
|
|
||||||
create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]),
|
|
||||||
create_evaluation_callback(nlp, dev_corpus, score_weights),
|
|
||||||
dropout=T_cfg["dropout"],
|
|
||||||
accumulate_gradient=T_cfg["accumulate_gradient"],
|
|
||||||
patience=T_cfg["patience"],
|
|
||||||
max_steps=T_cfg["max_steps"],
|
|
||||||
eval_frequency=T_cfg["eval_frequency"],
|
|
||||||
raw_text=None,
|
|
||||||
exclude=frozen_components,
|
|
||||||
)
|
|
||||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
|
||||||
print_row, finalize_logger = train_logger(nlp)
|
|
||||||
|
|
||||||
try:
|
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
|
||||||
progress.set_description(f"Epoch 1")
|
|
||||||
for batch, info, is_best_checkpoint in training_step_iterator:
|
|
||||||
progress.update(1)
|
|
||||||
if is_best_checkpoint is not None:
|
|
||||||
progress.close()
|
|
||||||
print_row(info)
|
|
||||||
if is_best_checkpoint and output_path is not None:
|
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
|
||||||
update_meta(T_cfg, nlp, info)
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
nlp = before_to_disk(nlp)
|
|
||||||
nlp.to_disk(output_path / "model-best")
|
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
|
||||||
progress.set_description(f"Epoch {info['epoch']}")
|
|
||||||
except Exception as e:
|
|
||||||
finalize_logger()
|
|
||||||
if output_path is not None:
|
|
||||||
# We don't want to swallow the traceback if we don't have a
|
|
||||||
# specific error.
|
|
||||||
msg.warn(
|
|
||||||
f"Aborting and saving the final best model. "
|
|
||||||
f"Encountered exception: {str(e)}"
|
|
||||||
)
|
|
||||||
nlp = before_to_disk(nlp)
|
|
||||||
nlp.to_disk(output_path / "model-final")
|
|
||||||
raise e
|
|
||||||
finally:
|
|
||||||
finalize_logger()
|
|
||||||
if output_path is not None:
|
|
||||||
final_model_path = output_path / "model-final"
|
|
||||||
if optimizer.averages:
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
nlp.to_disk(final_model_path)
|
|
||||||
else:
|
|
||||||
nlp.to_disk(final_model_path)
|
|
||||||
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(iterator, batcher, max_epochs: int):
    """Yield `(epoch, batch)` pairs, reshuffling the examples before each epoch.

    iterator (iterable): The training examples; consumed once into a list.
    batcher (callable): Takes the list of examples and returns an iterable
        of batches.
    max_epochs (int): Number of epochs to run. Values below 1 mean the
        generator never stops on its own.
    YIELDS (tuple): The zero-based epoch number and one batch.
    RAISES ValueError: If there are no examples.
    """
    examples = list(iterator)
    if not examples:
        # Raise error if no data
        raise ValueError(Errors.E986)
    epoch = 0
    while True:
        if max_epochs >= 1 and epoch == max_epochs:
            return
        random.shuffle(examples)
        for batch in batcher(examples):
            yield epoch, batch
        epoch += 1
|
|
||||||
|
|
||||||
|
|
||||||
def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    """Build a zero-argument callback that scores the pipeline on the dev data.

    nlp (Language): The pipeline to evaluate.
    dev_corpus (Callable): Called with `nlp` to produce the dev examples.
    weights (Dict[str, float]): Score weights; entries set to None are
        dropped.
    RETURNS (Callable): Function returning `(weighted_score, scores)`.
    """
    active_weights = {
        name: weight for name, weight in weights.items() if weight is not None
    }

    def evaluate() -> Tuple[float, Dict[str, float]]:
        scores = nlp.evaluate(list(dev_corpus(nlp)))
        # Only plain numbers can enter the weighted sum, so a weighted score
        # that is e.g. a per-type dict is rejected up front.
        for key, value in scores.items():
            if key in active_weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(name, 0.0) * active_weights.get(name, 0.0)
                for name in active_weights
            )
        except KeyError as e:
            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err) from None
        return weighted_score, scores

    return evaluate
|
|
||||||
|
|
||||||
|
|
||||||
def create_before_to_disk_callback(
    callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
    """Wrap an optional user callback so it can always be called before saving.

    callback (Optional[Callable]): Function that takes and returns the
        pipeline, or a falsy value for no modification.
    RETURNS (Callable): Function that returns the pipeline unchanged when
        there is no callback, otherwise the callback's result.
    """

    def before_to_disk(nlp: Language) -> Language:
        if callback:
            modified_nlp = callback(nlp)
            # The callback must hand back a pipeline object.
            if isinstance(modified_nlp, Language):
                return modified_nlp
            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
            raise ValueError(err)
        return nlp

    return before_to_disk
|
|
||||||
|
|
||||||
|
|
||||||
def train_while_improving(
    nlp: Language,
    optimizer: Optimizer,
    train_data,
    evaluate,
    *,
    dropout: float,
    eval_frequency: int,
    accumulate_gradient: int,
    patience: int,
    max_steps: int,
    raw_text: List[Dict[str, str]],
    exclude: List[str],
):
    """Train until the evaluation score stops improving. This is a generator:
    every step yields a tuple `(batch, info, is_best_checkpoint)`.

    Positional arguments:
        nlp: The spaCy pipeline to train and evaluate.
        optimizer: The optimizer callable.
        train_data (Iterable): Yields `(epoch, batch)` pairs. The iterable is
            responsible for iterating over the epochs and for shuffling.
        evaluate (Callable[[], Tuple[float, Any]]): Takes no arguments and
            returns `(main_score, other_scores)`. The main score should be a
            float where higher is better.

    Keyword arguments:
        dropout: A float (used as a constant rate) or an iterator of rates.
        eval_frequency: Evaluate whenever the step number is a multiple of this.
        accumulate_gradient: Passed to `subdivide_batch` to split each batch.
        patience: Stop once this many steps passed since the best score
            (disabled if falsy).
        max_steps: Stop once the step number reaches this value (disabled if
            falsy).
        raw_text: Optional records with a "text" key, used for rehearsal
            updates.
        exclude: Names of components that are not updated.

    Yielded values:
        * batch: The batch that was just processed.
        * info: A dict describing the last update (see below).
        * is_best_checkpoint: None if no evaluation ran on this step, True if
          the evaluation gave the best score so far, False if an earlier
          evaluation was better.

    The info dict contains:
        epoch (int): The epoch supplied by `train_data` for this batch.
        step (int): The zero-based step number.
        score: The main score of this step's evaluation, or None.
        other_scores: The other scores of this step's evaluation, or None.
        losses: The losses accumulated since the last evaluation.
        checkpoints: A list of earlier results, each a `(score, step)` tuple.
        seconds (int): Seconds elapsed since the loop started.
        words (int): Sum of `len(eg)` over all examples seen so far.
    """
    dropouts = (
        thinc.schedules.constant(dropout) if isinstance(dropout, float) else dropout
    )
    checkpoints = []
    losses = {}
    if raw_text:
        random.shuffle(raw_text)
        raw_examples = [
            Example.from_dict(nlp.make_doc(entry["text"]), {}) for entry in raw_text
        ]
        raw_batches = util.minibatch(raw_examples, size=8)

    words_seen = 0
    start_time = timer()
    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        # Gradients are accumulated over the sub-batches (sgd=False); the
        # optimizer is applied once per component further down.
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
            )
        if raw_text:
            # If raw text is available, perform 'rehearsal' updates,
            # which use unlabelled data to reduce overfitting.
            rehearsal_batch = list(next(raw_batches))
            nlp.rehearse(rehearsal_batch, sgd=optimizer, losses=losses, exclude=exclude)
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
            if name in exclude or not hasattr(proc, "model"):
                continue
            if proc.model not in (True, False, None):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if step % eval_frequency == 0:
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    score, other_scores = evaluate()
            else:
                score, other_scores = evaluate()
            checkpoints.append((score, step))
            is_best_checkpoint = score == max(checkpoints)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        words_seen += sum(len(eg) for eg in batch)
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": checkpoints,
            "seconds": int(timer() - start_time),
            "words": words_seen,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            # An evaluation ran on this step: start a fresh loss accumulator.
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(checkpoints)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break
|
|
||||||
|
|
||||||
|
|
||||||
def subdivide_batch(batch, accumulate_gradient):
    """Split a batch into length-sorted sub-batches.

    batch (iterable): Examples; each must expose `predicted` with a length.
    accumulate_gradient (int): Number of equally sized sub-batches to cut.
    YIELDS (list): Up to `accumulate_gradient` sub-batches of equal size in
        order of increasing `len(eg.predicted)`, followed by one sub-batch
        holding whatever is left over. Empty sub-batches are skipped.
    """
    ordered = sorted(batch, key=lambda eg: len(eg.predicted))
    chunk_size = len(ordered) // accumulate_gradient
    offset = 0
    for _ in range(accumulate_gradient):
        chunk = ordered[offset : offset + chunk_size]
        if chunk:
            yield chunk
        offset += len(chunk)
    # Anything the integer division did not cover goes into a final chunk.
    leftover = ordered[offset:]
    if leftover:
        yield leftover
|
|
||||||
|
|
||||||
|
|
||||||
def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
    """Write the latest scores and losses into `nlp.meta["performance"]`.

    training (Union[Dict[str, Any], Config]): Training settings; its
        "score_weights" keys name the metrics to record.
    nlp (Language): The pipeline whose meta is updated in place.
    info (Dict[str, Any]): Step info with "other_scores" and "losses".
    """
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        # Not every pipe has an entry in the losses dict; indexing blindly
        # would raise a KeyError for the ones that don't.
        if pipe_name in info["losses"]:
            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
|
||||||
|
|
||||||
|
|
||||||
def load_from_paths(
    config: Config,
) -> Tuple[
    Optional[List[Dict[str, str]]], Dict[str, dict], Dict[str, dict], Optional[bytes]
]:
    """Load the optional resources that the training config points to.

    config (Config): Reads `config["training"]["raw_text"]` and
        `config["training"]["init_tok2vec"]`.
    RETURNS (tuple): `(raw_text, tag_map, morph_rules, weights_data)`.
        `raw_text` is the list of JSONL records or None if no path is set,
        `tag_map` and `morph_rules` are always empty dicts here, and
        `weights_data` holds the bytes of the tok2vec weights file or None.
        Exits via `msg.fail` if a configured path does not exist.
    """
    # TODO: separate checks from loading
    raw_text = util.ensure_path(config["training"]["raw_text"])
    if raw_text is not None:
        if not raw_text.exists():
            msg.fail("Can't find raw text", raw_text, exits=1)
        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
    tag_map = {}
    morph_rules = {}
    weights_data = None
    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
    if init_tok2vec is not None:
        if not init_tok2vec.exists():
            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()
    return raw_text, tag_map, morph_rules, weights_data
|
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
|
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
|
||||||
|
@ -427,30 +61,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
msg.good(f"Created output directory: {output_path}")
|
msg.good(f"Created output directory: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
def verify_config(nlp: Language) -> None:
    """Run extra per-component checks against the pipeline's config.

    nlp (Language): The loaded pipeline; `nlp.config["components"]` is
        scanned and every block whose factory is "textcat" is verified.
    """
    # TODO: maybe we should validate based on the actual components, the list
    # in config["nlp"]["pipeline"] instead?
    component_configs = nlp.config["components"].values()
    for pipe_config in component_configs:
        # Dispatch on the factory: the component name may differ from it.
        if pipe_config["factory"] == "textcat":
            verify_textcat_config(nlp, pipe_config)
|
|
||||||
|
|
||||||
|
|
||||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
    """Check a configured `positive_label` against the textcat labels.

    nlp (Language): The pipeline; labels are read from its "textcat" pipe.
    pipe_config (Dict[str, Any]): The component's config block.
    RAISES ValueError: If `positive_label` is set but is not among the
        labels, or if there are not exactly two labels.
    """
    pos_label = pipe_config.get("positive_label")
    if not pos_label:
        # Nothing to verify without a positive label.
        return
    textcat_labels = nlp.get_pipe("textcat").labels
    if pos_label not in textcat_labels:
        raise ValueError(
            Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
        )
    # A positive label only makes sense for a binary task.
    n_labels = len(list(textcat_labels))
    if n_labels != 2:
        raise ValueError(
            Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
        )
|
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
raw = null
|
vectors = null
|
||||||
|
vocab_data = null
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
|
@ -35,6 +36,11 @@ gold_preproc = false
|
||||||
max_length = 0
|
max_length = 0
|
||||||
# Limitation on number of training examples
|
# Limitation on number of training examples
|
||||||
limit = 0
|
limit = 0
|
||||||
|
# Apply some simple data augmentation, where we replace tokens with variations.
|
||||||
|
# This is especially useful for punctuation and case replacement, to help
|
||||||
|
# generalize beyond corpora that don't have smart-quotes, or only have smart
|
||||||
|
# quotes, etc.
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
[corpora.dev]
|
[corpora.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
|
@ -47,6 +53,7 @@ gold_preproc = false
|
||||||
max_length = 0
|
max_length = 0
|
||||||
# Limitation on number of training examples
|
# Limitation on number of training examples
|
||||||
limit = 0
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
# Training hyper-parameters and additional features.
|
# Training hyper-parameters and additional features.
|
||||||
[training]
|
[training]
|
||||||
|
@ -54,11 +61,6 @@ seed = ${system.seed}
|
||||||
gpu_allocator = ${system.gpu_allocator}
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
dropout = 0.1
|
dropout = 0.1
|
||||||
accumulate_gradient = 1
|
accumulate_gradient = 1
|
||||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
|
||||||
init_tok2vec = ${paths.init_tok2vec}
|
|
||||||
raw_text = ${paths.raw}
|
|
||||||
vectors = null
|
|
||||||
lookups = null
|
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 1600
|
||||||
max_epochs = 0
|
max_epochs = 0
|
||||||
|
@ -99,3 +101,18 @@ grad_clip = 1.0
|
||||||
use_averages = false
|
use_averages = false
|
||||||
eps = 1e-8
|
eps = 1e-8
|
||||||
learn_rate = 0.001
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
# The 'initialize' step is run before training or pretraining. Components and
|
||||||
|
# the tokenizer can each define their own arguments via their .initialize
|
||||||
|
# methods that are populated by the config. This lets them gather resources like
|
||||||
|
# lookup tables and build label sets, construct vocabularies, etc.
|
||||||
|
[initialize]
|
||||||
|
vocab_data = ${paths.vocab_data}
|
||||||
|
lookups = null
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
# Arguments passed to the tokenizer's initialize method
|
||||||
|
tokenizer = {}
|
||||||
|
# Arguments passed to the initialize methods of the components (keyed by component name)
|
||||||
|
components = {}
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
[paths]
|
||||||
|
raw_text = null
|
||||||
|
|
||||||
[pretraining]
|
[pretraining]
|
||||||
max_epochs = 1000
|
max_epochs = 1000
|
||||||
dropout = 0.2
|
dropout = 0.2
|
||||||
|
@ -32,7 +35,7 @@ learn_rate = 0.001
|
||||||
|
|
||||||
[corpora.pretrain]
|
[corpora.pretrain]
|
||||||
@readers = "spacy.JsonlReader.v1"
|
@readers = "spacy.JsonlReader.v1"
|
||||||
path = ${paths.raw}
|
path = ${paths.raw_text}
|
||||||
min_length = 5
|
min_length = 5
|
||||||
max_length = 500
|
max_length = 500
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
|
@ -85,6 +85,7 @@ class Warnings:
|
||||||
"attribute or operator.")
|
"attribute or operator.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
|
||||||
W090 = ("Could not locate any {format} files in path '{path}'.")
|
W090 = ("Could not locate any {format} files in path '{path}'.")
|
||||||
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
||||||
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||||
|
@ -306,7 +307,7 @@ class Errors:
|
||||||
"settings: {opts}")
|
"settings: {opts}")
|
||||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"call begin_training()?")
|
"call initialize()?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||||
"of the parent Doc and can't exist on their own. A pickled token "
|
"of the parent Doc and can't exist on their own. A pickled token "
|
||||||
|
@ -376,7 +377,7 @@ class Errors:
|
||||||
"provided {found}.")
|
"provided {found}.")
|
||||||
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
||||||
"by calling add_label, or by providing a representative batch of "
|
"by calling add_label, or by providing a representative batch of "
|
||||||
"examples to the component's begin_training method.")
|
"examples to the component's initialize method.")
|
||||||
E145 = ("Error reading `{param}` from input file.")
|
E145 = ("Error reading `{param}` from input file.")
|
||||||
E146 = ("Could not access `{path}`.")
|
E146 = ("Could not access `{path}`.")
|
||||||
E147 = ("Unexpected error in the {method} functionality of the "
|
E147 = ("Unexpected error in the {method} functionality of the "
|
||||||
|
@ -476,6 +477,14 @@ class Errors:
|
||||||
E201 = ("Span index out of range.")
|
E201 = ("Span index out of range.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E912 = ("No orth_variants lookups table for data augmentation available for "
|
||||||
|
"language '{lang}'. If orth_variants are available in "
|
||||||
|
"spacy-lookups-data, make sure the package is installed and the "
|
||||||
|
"table is loaded in the [initialize.lookups] block of your config. "
|
||||||
|
"Alternatively, you can provide your own Lookups object with a "
|
||||||
|
"table orth_variants as the argument 'lookups' of the augmenter.")
|
||||||
|
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||||
|
"config.cfg or override it on the CLI?")
|
||||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
"return the nlp object but got: {value}. Maybe you forgot to return "
|
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||||
"the modified object in your function?")
|
"the modified object in your function?")
|
||||||
|
@ -517,7 +526,7 @@ class Errors:
|
||||||
"but the provided argument {loc} points to a file.")
|
"but the provided argument {loc} points to a file.")
|
||||||
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
||||||
"not seem to exist.")
|
"not seem to exist.")
|
||||||
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
|
E930 = ("Received invalid get_examples callback in {name}.initialize. "
|
||||||
"Expected function that returns an iterable of Example objects but "
|
"Expected function that returns an iterable of Example objects but "
|
||||||
"got: {obj}")
|
"got: {obj}")
|
||||||
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
|
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
|
||||||
|
@ -553,7 +562,10 @@ class Errors:
|
||||||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||||
"component.")
|
"component.")
|
||||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
|
E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
|
||||||
|
"spacy-lookups-data. If you want to initialize a blank nlp object, "
|
||||||
|
"make sure you have the spacy-lookups-data package installed or "
|
||||||
|
"remove the [initialize.lookups] block from your config.")
|
||||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||||
"Available components: {opts}")
|
"Available components: {opts}")
|
||||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||||
|
@ -670,10 +682,10 @@ class Errors:
|
||||||
"'{token_attrs}'.")
|
"'{token_attrs}'.")
|
||||||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||||
"the same `Vocab`.")
|
"the same `Vocab`.")
|
||||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
|
||||||
"initializing the pipeline:\n"
|
"loaded. Provide the name of a pretrained model or the path to "
|
||||||
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
|
"a model and initialize the pipeline:\n\n"
|
||||||
'nlp = Chinese(config=cfg)')
|
'nlp.tokenizer.initialize(pkuseg_model="default")')
|
||||||
E1001 = ("Target token outside of matched span for match with tokens "
|
E1001 = ("Target token outside of matched span for match with tokens "
|
||||||
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
||||||
E1002 = ("Span index out of range.")
|
E1002 = ("Span index out of range.")
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
|
@ -3,8 +3,7 @@ from ...tokens import Token
|
||||||
|
|
||||||
|
|
||||||
class EnglishLemmatizer(Lemmatizer):
|
class EnglishLemmatizer(Lemmatizer):
|
||||||
"""English lemmatizer. Only overrides is_base_form.
|
"""English lemmatizer. Only overrides is_base_form."""
|
||||||
"""
|
|
||||||
|
|
||||||
def is_base_form(self, token: Token) -> bool:
|
def is_base_form(self, token: Token) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -58,7 +58,7 @@ def noun_bounds(
|
||||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
)
|
)
|
||||||
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
||||||
if list(filter(filter_func, doc[left_bound.i : right.i],)):
|
if list(filter(filter_func, doc[left_bound.i : right.i])):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
right_bound = right
|
right_bound = right
|
||||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
@ -12,9 +11,11 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||||
from ...compat import copy_reg
|
from ...compat import copy_reg
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
from ...scorer import Scorer
|
||||||
from ...symbols import POS
|
from ...symbols import POS
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import DummyTokenizer, registry
|
from ...training import validate_examples
|
||||||
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
from ... import util
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -130,6 +131,10 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
)
|
)
|
||||||
return sub_tokens_list
|
return sub_tokens_list
|
||||||
|
|
||||||
|
def score(self, examples):
|
||||||
|
validate_examples(examples, "JapaneseTokenizer.score")
|
||||||
|
return Scorer.score_tokenization(examples)
|
||||||
|
|
||||||
def _get_config(self) -> Dict[str, Any]:
|
def _get_config(self) -> Dict[str, Any]:
|
||||||
return {"split_mode": self.split_mode}
|
return {"split_mode": self.split_mode}
|
||||||
|
|
||||||
|
@ -160,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
|
|
||||||
|
|
||||||
class JapaneseDefaults(Language.Defaults):
|
class JapaneseDefaults(Language.Defaults):
|
||||||
config = Config().from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from typing import Optional, Any, Dict
|
from typing import Optional, Any, Dict
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
|
@ -7,8 +6,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...compat import copy_reg
|
from ...compat import copy_reg
|
||||||
|
from ...scorer import Scorer
|
||||||
from ...symbols import POS
|
from ...symbols import POS
|
||||||
from ...util import DummyTokenizer, registry
|
from ...training import validate_examples
|
||||||
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -62,9 +63,13 @@ class KoreanTokenizer(DummyTokenizer):
|
||||||
lemma = surface
|
lemma = surface
|
||||||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||||
|
|
||||||
|
def score(self, examples):
|
||||||
|
validate_examples(examples, "KoreanTokenizer.score")
|
||||||
|
return Scorer.score_tokenization(examples)
|
||||||
|
|
||||||
|
|
||||||
class KoreanDefaults(Language.Defaults):
|
class KoreanDefaults(Language.Defaults):
|
||||||
config = Config().from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
|
|
@ -108,8 +108,8 @@ _num_words = [
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
"""
|
"""
|
||||||
Check if text resembles a number
|
Check if text resembles a number
|
||||||
"""
|
"""
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(",", "").replace(".", "")
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
|
|
@ -1,10 +1,8 @@
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import DummyTokenizer, registry
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -42,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer):
|
||||||
|
|
||||||
|
|
||||||
class ThaiDefaults(Language.Defaults):
|
class ThaiDefaults(Language.Defaults):
|
||||||
config = Config().from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,8 @@
|
||||||
from thinc.api import Config
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from .stop_words import STOP_WORDS
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
from ...util import DummyTokenizer, registry
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -17,7 +15,7 @@ use_pyvi = true
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||||
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
|
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||||
def vietnamese_tokenizer_factory(nlp):
|
def vietnamese_tokenizer_factory(nlp):
|
||||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
||||||
|
|
||||||
|
@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
|
||||||
|
|
||||||
|
|
||||||
class VietnameseDefaults(Language.Defaults):
|
class VietnameseDefaults(Language.Defaults):
|
||||||
config = Config().from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -1,21 +1,25 @@
|
||||||
from typing import Optional, List, Dict, Any
|
from typing import Optional, List, Dict, Any, Callable, Iterable
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import tempfile
|
import tempfile
|
||||||
import srsly
|
import srsly
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from ...errors import Warnings, Errors
|
from ...errors import Warnings, Errors
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
from ...scorer import Scorer
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import DummyTokenizer, registry
|
from ...training import validate_examples, Example
|
||||||
|
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ... import util
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
|
# fmt: off
|
||||||
|
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
|
||||||
|
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
[nlp]
|
[nlp]
|
||||||
|
@ -23,6 +27,10 @@ DEFAULT_CONFIG = """
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.zh.ChineseTokenizer"
|
@tokenizers = "spacy.zh.ChineseTokenizer"
|
||||||
segmenter = "char"
|
segmenter = "char"
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
pkuseg_model = null
|
pkuseg_model = null
|
||||||
pkuseg_user_dict = "default"
|
pkuseg_user_dict = "default"
|
||||||
"""
|
"""
|
||||||
|
@ -39,41 +47,23 @@ class Segmenter(str, Enum):
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||||
def create_chinese_tokenizer(
|
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
|
||||||
segmenter: Segmenter = Segmenter.char,
|
|
||||||
pkuseg_model: Optional[str] = None,
|
|
||||||
pkuseg_user_dict: Optional[str] = "default",
|
|
||||||
):
|
|
||||||
def chinese_tokenizer_factory(nlp):
|
def chinese_tokenizer_factory(nlp):
|
||||||
return ChineseTokenizer(
|
return ChineseTokenizer(nlp, segmenter=segmenter)
|
||||||
nlp,
|
|
||||||
segmenter=segmenter,
|
|
||||||
pkuseg_model=pkuseg_model,
|
|
||||||
pkuseg_user_dict=pkuseg_user_dict,
|
|
||||||
)
|
|
||||||
|
|
||||||
return chinese_tokenizer_factory
|
return chinese_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class ChineseTokenizer(DummyTokenizer):
|
class ChineseTokenizer(DummyTokenizer):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self, nlp: Language, segmenter: Segmenter = Segmenter.char,
|
||||||
nlp: Language,
|
|
||||||
segmenter: Segmenter = Segmenter.char,
|
|
||||||
pkuseg_model: Optional[str] = None,
|
|
||||||
pkuseg_user_dict: Optional[str] = None,
|
|
||||||
):
|
):
|
||||||
self.vocab = nlp.vocab
|
self.vocab = nlp.vocab
|
||||||
if isinstance(segmenter, Segmenter): # we might have the Enum here
|
if isinstance(segmenter, Segmenter):
|
||||||
segmenter = segmenter.value
|
segmenter = segmenter.value
|
||||||
self.segmenter = segmenter
|
self.segmenter = segmenter
|
||||||
self.pkuseg_model = pkuseg_model
|
|
||||||
self.pkuseg_user_dict = pkuseg_user_dict
|
|
||||||
self.pkuseg_seg = None
|
self.pkuseg_seg = None
|
||||||
self.jieba_seg = None
|
self.jieba_seg = None
|
||||||
self.configure_segmenter(segmenter)
|
|
||||||
|
|
||||||
def configure_segmenter(self, segmenter: str):
|
|
||||||
if segmenter not in Segmenter.values():
|
if segmenter not in Segmenter.values():
|
||||||
warn_msg = Warnings.W103.format(
|
warn_msg = Warnings.W103.format(
|
||||||
lang="Chinese",
|
lang="Chinese",
|
||||||
|
@ -83,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
)
|
)
|
||||||
warnings.warn(warn_msg)
|
warnings.warn(warn_msg)
|
||||||
self.segmenter = Segmenter.char
|
self.segmenter = Segmenter.char
|
||||||
self.jieba_seg = try_jieba_import(self.segmenter)
|
if segmenter == Segmenter.jieba:
|
||||||
self.pkuseg_seg = try_pkuseg_import(
|
self.jieba_seg = try_jieba_import()
|
||||||
self.segmenter,
|
|
||||||
pkuseg_model=self.pkuseg_model,
|
def initialize(
|
||||||
pkuseg_user_dict=self.pkuseg_user_dict,
|
self,
|
||||||
)
|
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||||
|
*,
|
||||||
|
nlp: Optional[Language] = None,
|
||||||
|
pkuseg_model: Optional[str] = None,
|
||||||
|
pkuseg_user_dict: str = "default",
|
||||||
|
):
|
||||||
|
if self.segmenter == Segmenter.pkuseg:
|
||||||
|
self.pkuseg_seg = try_pkuseg_import(
|
||||||
|
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
|
||||||
|
)
|
||||||
|
|
||||||
def __call__(self, text: str) -> Doc:
|
def __call__(self, text: str) -> Doc:
|
||||||
if self.segmenter == Segmenter.jieba:
|
if self.segmenter == Segmenter.jieba:
|
||||||
|
@ -136,17 +135,17 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
|
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
|
||||||
warnings.warn(warn_msg)
|
warnings.warn(warn_msg)
|
||||||
|
|
||||||
|
def score(self, examples):
|
||||||
|
validate_examples(examples, "ChineseTokenizer.score")
|
||||||
|
return Scorer.score_tokenization(examples)
|
||||||
|
|
||||||
def _get_config(self) -> Dict[str, Any]:
|
def _get_config(self) -> Dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"segmenter": self.segmenter,
|
"segmenter": self.segmenter,
|
||||||
"pkuseg_model": self.pkuseg_model,
|
|
||||||
"pkuseg_user_dict": self.pkuseg_user_dict,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def _set_config(self, config: Dict[str, Any] = {}) -> None:
|
def _set_config(self, config: Dict[str, Any] = {}) -> None:
|
||||||
self.segmenter = config.get("segmenter", Segmenter.char)
|
self.segmenter = config.get("segmenter", Segmenter.char)
|
||||||
self.pkuseg_model = config.get("pkuseg_model", None)
|
|
||||||
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
|
|
||||||
|
|
||||||
def to_bytes(self, **kwargs):
|
def to_bytes(self, **kwargs):
|
||||||
pkuseg_features_b = b""
|
pkuseg_features_b = b""
|
||||||
|
@ -157,6 +156,22 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
self.pkuseg_seg.feature_extractor.save(tempdir)
|
self.pkuseg_seg.feature_extractor.save(tempdir)
|
||||||
self.pkuseg_seg.model.save(tempdir)
|
self.pkuseg_seg.model.save(tempdir)
|
||||||
tempdir = Path(tempdir)
|
tempdir = Path(tempdir)
|
||||||
|
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
|
||||||
|
# means that it will be saved with pickle protocol 5 with
|
||||||
|
# python 3.8, which can't be reloaded with python 3.6-3.7.
|
||||||
|
# To try to make the model compatible with python 3.6+, reload
|
||||||
|
# the data with pickle5 and convert it back to protocol 4.
|
||||||
|
try:
|
||||||
|
import pickle5
|
||||||
|
|
||||||
|
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||||
|
features = pickle5.load(fileh)
|
||||||
|
with open(tempdir / "features.pkl", "wb") as fileh:
|
||||||
|
pickle5.dump(features, fileh, protocol=4)
|
||||||
|
except ImportError as e:
|
||||||
|
raise e
|
||||||
|
except Exception:
|
||||||
|
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
||||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||||
pkuseg_features_b = fileh.read()
|
pkuseg_features_b = fileh.read()
|
||||||
with open(tempdir / "weights.npz", "rb") as fileh:
|
with open(tempdir / "weights.npz", "rb") as fileh:
|
||||||
|
@ -229,6 +244,18 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
path.mkdir(parents=True)
|
path.mkdir(parents=True)
|
||||||
self.pkuseg_seg.model.save(path)
|
self.pkuseg_seg.model.save(path)
|
||||||
self.pkuseg_seg.feature_extractor.save(path)
|
self.pkuseg_seg.feature_extractor.save(path)
|
||||||
|
# try to convert features.pkl to pickle protocol 4
|
||||||
|
try:
|
||||||
|
import pickle5
|
||||||
|
|
||||||
|
with open(path / "features.pkl", "rb") as fileh:
|
||||||
|
features = pickle5.load(fileh)
|
||||||
|
with open(path / "features.pkl", "wb") as fileh:
|
||||||
|
pickle5.dump(features, fileh, protocol=4)
|
||||||
|
except ImportError as e:
|
||||||
|
raise e
|
||||||
|
except Exception:
|
||||||
|
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
||||||
|
|
||||||
def save_pkuseg_processors(path):
|
def save_pkuseg_processors(path):
|
||||||
if self.pkuseg_seg:
|
if self.pkuseg_seg:
|
||||||
|
@ -285,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
|
|
||||||
|
|
||||||
class ChineseDefaults(Language.Defaults):
|
class ChineseDefaults(Language.Defaults):
|
||||||
config = Config().from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
|
@ -296,47 +323,33 @@ class Chinese(Language):
|
||||||
Defaults = ChineseDefaults
|
Defaults = ChineseDefaults
|
||||||
|
|
||||||
|
|
||||||
def try_jieba_import(segmenter: str) -> None:
|
def try_jieba_import() -> None:
|
||||||
try:
|
try:
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
if segmenter == Segmenter.jieba:
|
# segment a short text to have jieba initialize its cache in advance
|
||||||
# segment a short text to have jieba initialize its cache in advance
|
list(jieba.cut("作为", cut_all=False))
|
||||||
list(jieba.cut("作为", cut_all=False))
|
|
||||||
|
|
||||||
return jieba
|
return jieba
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if segmenter == Segmenter.jieba:
|
msg = (
|
||||||
msg = (
|
"Jieba not installed. To use jieba, install it with `pip "
|
||||||
"Jieba not installed. To use jieba, install it with `pip "
|
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
)
|
||||||
)
|
raise ImportError(msg) from None
|
||||||
raise ImportError(msg) from None
|
|
||||||
|
|
||||||
|
|
||||||
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import pkuseg
|
||||||
|
|
||||||
if pkuseg_model:
|
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
|
||||||
elif segmenter == Segmenter.pkuseg:
|
|
||||||
msg = (
|
|
||||||
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
|
|
||||||
"was specified. Please provide the name of a pretrained model "
|
|
||||||
"or the path to a model with:\n"
|
|
||||||
'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
|
|
||||||
"nlp = Chinese.from_config(cfg)"
|
|
||||||
)
|
|
||||||
raise ValueError(msg)
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if segmenter == Segmenter.pkuseg:
|
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
raise ImportError(msg) from None
|
||||||
raise ImportError(msg) from None
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
if segmenter == Segmenter.pkuseg:
|
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
raise FileNotFoundError(msg) from None
|
||||||
raise FileNotFoundError(msg) from None
|
|
||||||
|
|
||||||
|
|
||||||
def _get_pkuseg_trie_data(node, path=""):
|
def _get_pkuseg_trie_data(node, path=""):
|
||||||
|
|
|
@ -8,7 +8,7 @@ from contextlib import contextmanager
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import warnings
|
import warnings
|
||||||
from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
|
from thinc.api import Model, get_current_ops, Config, Optimizer
|
||||||
import srsly
|
import srsly
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from itertools import chain, cycle
|
from itertools import chain, cycle
|
||||||
|
@ -18,8 +18,9 @@ from .tokens.underscore import Underscore
|
||||||
from .vocab import Vocab, create_vocab
|
from .vocab import Vocab, create_vocab
|
||||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||||
from .training import Example, validate_examples
|
from .training import Example, validate_examples
|
||||||
|
from .training.initialize import init_vocab, init_tok2vec
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from .util import create_default_optimizer, registry, SimpleFrozenList
|
from .util import registry, SimpleFrozenList
|
||||||
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors, Warnings
|
||||||
from .schemas import ConfigSchema
|
from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
|
||||||
|
from .schemas import ConfigSchemaPretrain, validate_init_settings
|
||||||
from .git_info import GIT_VERSION
|
from .git_info import GIT_VERSION
|
||||||
from . import util
|
from . import util
|
||||||
from . import about
|
from . import about
|
||||||
|
@ -166,11 +168,10 @@ class Language:
|
||||||
self._components = []
|
self._components = []
|
||||||
self._disabled = set()
|
self._disabled = set()
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
self.resolved = {}
|
|
||||||
# Create the default tokenizer from the default config
|
# Create the default tokenizer from the default config
|
||||||
if not create_tokenizer:
|
if not create_tokenizer:
|
||||||
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
|
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
|
||||||
create_tokenizer = registry.make_from_config(tokenizer_cfg)["tokenizer"]
|
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
|
||||||
self.tokenizer = create_tokenizer(self)
|
self.tokenizer = create_tokenizer(self)
|
||||||
|
|
||||||
def __init_subclass__(cls, **kwargs):
|
def __init_subclass__(cls, **kwargs):
|
||||||
|
@ -467,7 +468,7 @@ class Language:
|
||||||
if "nlp" not in arg_names or "name" not in arg_names:
|
if "nlp" not in arg_names or "name" not in arg_names:
|
||||||
raise ValueError(Errors.E964.format(name=name))
|
raise ValueError(Errors.E964.format(name=name))
|
||||||
# Officially register the factory so we can later call
|
# Officially register the factory so we can later call
|
||||||
# registry.make_from_config and refer to it in the config as
|
# registry.resolve and refer to it in the config as
|
||||||
# @factories = "spacy.Language.xyz". We use the class name here so
|
# @factories = "spacy.Language.xyz". We use the class name here so
|
||||||
# different classes can have different factories.
|
# different classes can have different factories.
|
||||||
registry.factories.register(internal_name, func=factory_func)
|
registry.factories.register(internal_name, func=factory_func)
|
||||||
|
@ -650,8 +651,9 @@ class Language:
|
||||||
cfg = {factory_name: config}
|
cfg = {factory_name: config}
|
||||||
# We're calling the internal _fill here to avoid constructing the
|
# We're calling the internal _fill here to avoid constructing the
|
||||||
# registered functions twice
|
# registered functions twice
|
||||||
resolved, filled = registry.resolve(cfg, validate=validate)
|
resolved = registry.resolve(cfg, validate=validate)
|
||||||
filled = Config(filled[factory_name])
|
filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
|
||||||
|
filled = Config(filled)
|
||||||
filled["factory"] = factory_name
|
filled["factory"] = factory_name
|
||||||
filled.pop("@factories", None)
|
filled.pop("@factories", None)
|
||||||
# Remove the extra values we added because we don't want to keep passing
|
# Remove the extra values we added because we don't want to keep passing
|
||||||
|
@ -1065,7 +1067,7 @@ class Language:
|
||||||
validate_examples(examples, "Language.update")
|
validate_examples(examples, "Language.update")
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
if self._optimizer is None:
|
if self._optimizer is None:
|
||||||
self._optimizer = create_default_optimizer()
|
self._optimizer = self.create_optimizer()
|
||||||
sgd = self._optimizer
|
sgd = self._optimizer
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
|
@ -1123,7 +1125,7 @@ class Language:
|
||||||
validate_examples(examples, "Language.rehearse")
|
validate_examples(examples, "Language.rehearse")
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
if self._optimizer is None:
|
if self._optimizer is None:
|
||||||
self._optimizer = create_default_optimizer()
|
self._optimizer = self.create_optimizer()
|
||||||
sgd = self._optimizer
|
sgd = self._optimizer
|
||||||
pipes = list(self.pipeline)
|
pipes = list(self.pipeline)
|
||||||
random.shuffle(pipes)
|
random.shuffle(pipes)
|
||||||
|
@ -1153,61 +1155,73 @@ class Language:
|
||||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||||
*,
|
*,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
device: int = -1,
|
) -> Optimizer:
|
||||||
|
warnings.warn(Warnings.W089, DeprecationWarning)
|
||||||
|
return self.initialize(get_examples, sgd=sgd)
|
||||||
|
|
||||||
|
def initialize(
|
||||||
|
self,
|
||||||
|
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||||
|
*,
|
||||||
|
sgd: Optional[Optimizer] = None,
|
||||||
) -> Optimizer:
|
) -> Optimizer:
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
returns gold-standard Example objects.
|
returns gold-standard Example objects.
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
|
||||||
create_optimizer if it doesn't exist.
|
provided, will be created using the .create_optimizer() method.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#begin_training
|
DOCS: https://nightly.spacy.io/api/language#initialize
|
||||||
"""
|
"""
|
||||||
if get_examples is None:
|
if get_examples is None:
|
||||||
util.logger.debug(
|
util.logger.debug(
|
||||||
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
|
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
|
||||||
)
|
)
|
||||||
doc = Doc(self.vocab, words=["x", "y", "z"])
|
doc = Doc(self.vocab, words=["x", "y", "z"])
|
||||||
get_examples = lambda: [Example.from_dict(doc, {})]
|
get_examples = lambda: [Example.from_dict(doc, {})]
|
||||||
# Populate vocab
|
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="Language", obj=type(get_examples))
|
err = Errors.E930.format(name="Language", obj=type(get_examples))
|
||||||
raise ValueError(err)
|
raise ValueError(err)
|
||||||
valid_examples = False
|
# Make sure the config is interpolated so we can resolve subsections
|
||||||
for example in get_examples():
|
config = self.config.interpolate()
|
||||||
if not isinstance(example, Example):
|
# These are the settings provided in the [initialize] block in the config
|
||||||
err = Errors.E978.format(
|
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||||
name="Language.begin_training", types=type(example)
|
init_vocab(
|
||||||
)
|
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||||
raise ValueError(err)
|
)
|
||||||
else:
|
pretrain_cfg = config.get("pretraining")
|
||||||
valid_examples = True
|
if pretrain_cfg:
|
||||||
for word in [t.text for t in example.reference]:
|
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
|
||||||
_ = self.vocab[word] # noqa: F841
|
init_tok2vec(self, P, I)
|
||||||
if not valid_examples:
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
err = Errors.E930.format(name="Language", obj="empty list")
|
ops = get_current_ops()
|
||||||
raise ValueError(err)
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
if device >= 0: # TODO: do we need this here?
|
if hasattr(self.tokenizer, "initialize"):
|
||||||
require_gpu(device)
|
tok_settings = validate_init_settings(
|
||||||
if self.vocab.vectors.data.shape[1] >= 1:
|
self.tokenizer.initialize,
|
||||||
ops = get_current_ops()
|
I["tokenizer"],
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
section="tokenizer",
|
||||||
if sgd is None:
|
name="tokenizer",
|
||||||
sgd = create_default_optimizer()
|
)
|
||||||
self._optimizer = sgd
|
self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if hasattr(proc, "begin_training"):
|
if hasattr(proc, "initialize"):
|
||||||
proc.begin_training(
|
p_settings = I["components"].get(name, {})
|
||||||
get_examples, pipeline=self.pipeline, sgd=self._optimizer
|
p_settings = validate_init_settings(
|
||||||
|
proc.initialize, p_settings, section="components", name=name
|
||||||
)
|
)
|
||||||
|
proc.initialize(get_examples, nlp=self, **p_settings)
|
||||||
self._link_components()
|
self._link_components()
|
||||||
|
self._optimizer = sgd
|
||||||
|
if sgd is not None:
|
||||||
|
self._optimizer = sgd
|
||||||
|
elif self._optimizer is None:
|
||||||
|
self._optimizer = self.create_optimizer()
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def resume_training(
|
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||||||
self, *, sgd: Optional[Optimizer] = None, device: int = -1
|
|
||||||
) -> Optimizer:
|
|
||||||
"""Continue training a pretrained model.
|
"""Continue training a pretrained model.
|
||||||
|
|
||||||
Create and return an optimizer, and initialize "rehearsal" for any pipeline
|
Create and return an optimizer, and initialize "rehearsal" for any pipeline
|
||||||
|
@ -1216,22 +1230,20 @@ class Language:
|
||||||
rehearsal, collect samples of text you want the models to retain performance
|
rehearsal, collect samples of text you want the models to retain performance
|
||||||
on, and call nlp.rehearse() with a batch of Example objects.
|
on, and call nlp.rehearse() with a batch of Example objects.
|
||||||
|
|
||||||
sgd (Optional[Optimizer]): An optimizer.
|
|
||||||
RETURNS (Optimizer): The optimizer.
|
RETURNS (Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#resume_training
|
DOCS: https://nightly.spacy.io/api/language#resume_training
|
||||||
"""
|
"""
|
||||||
if device >= 0: # TODO: do we need this here?
|
ops = get_current_ops()
|
||||||
require_gpu(device)
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
ops = get_current_ops()
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
if self.vocab.vectors.data.shape[1] >= 1:
|
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
|
||||||
if sgd is None:
|
|
||||||
sgd = create_default_optimizer()
|
|
||||||
self._optimizer = sgd
|
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if hasattr(proc, "_rehearsal_model"):
|
if hasattr(proc, "_rehearsal_model"):
|
||||||
proc._rehearsal_model = deepcopy(proc.model)
|
proc._rehearsal_model = deepcopy(proc.model)
|
||||||
|
if sgd is not None:
|
||||||
|
self._optimizer = sgd
|
||||||
|
elif self._optimizer is None:
|
||||||
|
self._optimizer = self.create_optimizer()
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def evaluate(
|
def evaluate(
|
||||||
|
@ -1293,6 +1305,11 @@ class Language:
|
||||||
results["speed"] = n_words / (end_time - start_time)
|
results["speed"] = n_words / (end_time - start_time)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def create_optimizer(self):
|
||||||
|
"""Create an optimizer, usually using the [training.optimizer] config."""
|
||||||
|
subconfig = {"optimizer": self.config["training"]["optimizer"]}
|
||||||
|
return registry.resolve(subconfig)["optimizer"]
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_params(self, params: Optional[dict]):
|
def use_params(self, params: Optional[dict]):
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
"""Replace weights of models in the pipeline with those provided in the
|
||||||
|
@ -1501,7 +1518,7 @@ class Language:
|
||||||
).merge(config)
|
).merge(config)
|
||||||
if "nlp" not in config:
|
if "nlp" not in config:
|
||||||
raise ValueError(Errors.E985.format(config=config))
|
raise ValueError(Errors.E985.format(config=config))
|
||||||
config_lang = config["nlp"]["lang"]
|
config_lang = config["nlp"].get("lang")
|
||||||
if config_lang is not None and config_lang != cls.lang:
|
if config_lang is not None and config_lang != cls.lang:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
Errors.E958.format(
|
Errors.E958.format(
|
||||||
|
@ -1518,15 +1535,19 @@ class Language:
|
||||||
config = util.copy_config(config)
|
config = util.copy_config(config)
|
||||||
orig_pipeline = config.pop("components", {})
|
orig_pipeline = config.pop("components", {})
|
||||||
config["components"] = {}
|
config["components"] = {}
|
||||||
resolved, filled = registry.resolve(
|
if auto_fill:
|
||||||
config, validate=validate, schema=ConfigSchema
|
filled = registry.fill(config, validate=validate, schema=ConfigSchema)
|
||||||
)
|
else:
|
||||||
|
filled = config
|
||||||
filled["components"] = orig_pipeline
|
filled["components"] = orig_pipeline
|
||||||
config["components"] = orig_pipeline
|
config["components"] = orig_pipeline
|
||||||
create_tokenizer = resolved["nlp"]["tokenizer"]
|
resolved_nlp = registry.resolve(
|
||||||
before_creation = resolved["nlp"]["before_creation"]
|
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
|
||||||
after_creation = resolved["nlp"]["after_creation"]
|
)
|
||||||
after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]
|
create_tokenizer = resolved_nlp["tokenizer"]
|
||||||
|
before_creation = resolved_nlp["before_creation"]
|
||||||
|
after_creation = resolved_nlp["after_creation"]
|
||||||
|
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
|
||||||
lang_cls = cls
|
lang_cls = cls
|
||||||
if before_creation is not None:
|
if before_creation is not None:
|
||||||
lang_cls = before_creation(cls)
|
lang_cls = before_creation(cls)
|
||||||
|
@ -1587,7 +1608,6 @@ class Language:
|
||||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||||
nlp.config = filled if auto_fill else config
|
nlp.config = filled if auto_fill else config
|
||||||
nlp.resolved = resolved
|
|
||||||
if after_pipeline_creation is not None:
|
if after_pipeline_creation is not None:
|
||||||
nlp = after_pipeline_creation(nlp)
|
nlp = after_pipeline_creation(nlp)
|
||||||
if not isinstance(nlp, cls):
|
if not isinstance(nlp, cls):
|
||||||
|
|
25
spacy/ml/featureextractor.py
Normal file
25
spacy/ml/featureextractor.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
from typing import List, Union, Callable, Tuple
|
||||||
|
from thinc.types import Ints2d, Doc
|
||||||
|
from thinc.api import Model, registry
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@registry.layers("spacy.FeatureExtractor.v1")
|
||||||
|
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
|
||||||
|
return Model("extract_features", forward, attrs={"columns": columns})
|
||||||
|
|
||||||
|
|
||||||
|
def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
|
||||||
|
columns = model.attrs["columns"]
|
||||||
|
features: List[Ints2d] = []
|
||||||
|
for doc in docs:
|
||||||
|
if hasattr(doc, "to_array"):
|
||||||
|
attrs = doc.to_array(columns)
|
||||||
|
else:
|
||||||
|
attrs = doc.doc.to_array(columns)[doc.start : doc.end]
|
||||||
|
if attrs.ndim == 1:
|
||||||
|
attrs = attrs.reshape((attrs.shape[0], 1))
|
||||||
|
features.append(model.ops.asarray2i(attrs, dtype="uint64"))
|
||||||
|
|
||||||
|
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
|
||||||
|
return features, backprop
|
|
@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||||
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
||||||
from thinc.api import Relu, residual, expand_window, FeatureExtractor
|
from thinc.api import Relu, residual, expand_window
|
||||||
|
|
||||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
|
from ..featureextractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatCNN.v1")
|
@registry.architectures.register("spacy.TextCatCNN.v1")
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
|
||||||
from thinc.api import Model, noop, list2ragged, ragged2list
|
|
||||||
from thinc.api import FeatureExtractor, HashEmbed
|
|
||||||
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
|
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
||||||
|
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
|
||||||
|
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
||||||
|
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ...ml import _character_embed
|
from ...ml import _character_embed
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
|
from ..featureextractor import FeatureExtractor
|
||||||
from ...pipeline.tok2vec import Tok2VecListener
|
from ...pipeline.tok2vec import Tok2VecListener
|
||||||
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,8 @@ cdef class Morphology:
|
||||||
FEATURE_SEP = "|"
|
FEATURE_SEP = "|"
|
||||||
FIELD_SEP = "="
|
FIELD_SEP = "="
|
||||||
VALUE_SEP = ","
|
VALUE_SEP = ","
|
||||||
EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0
|
# not an empty string so that the PreshMap key is not 0
|
||||||
|
EMPTY_MORPH = symbols.NAMES[symbols._]
|
||||||
|
|
||||||
def __init__(self, StringStore strings):
|
def __init__(self, StringStore strings):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
|
|
@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(
|
def analyze_pipes(
|
||||||
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
|
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
|
||||||
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
||||||
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
||||||
a table with the pipeline components and why they assign and require, as
|
a table with the pipeline components and why they assign and require, as
|
||||||
|
|
|
@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
|
||||||
matches = self.matcher(doc, allow_missing=True)
|
matches = self.matcher(doc, allow_missing=True)
|
||||||
# Sort by the attribute ID, so that later rules have precendence
|
# Sort by the attribute ID, so that later rules have precendence
|
||||||
matches = [
|
matches = [
|
||||||
(_parse_key(self.vocab.strings[m_id]), m_id, s, e)
|
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
|
||||||
for m_id, s, e in matches
|
|
||||||
]
|
]
|
||||||
matches.sort()
|
matches.sort()
|
||||||
for attr_id, match_id, start, end in matches:
|
for attr_id, match_id, start, end in matches:
|
||||||
|
@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
|
||||||
try:
|
try:
|
||||||
# The index can be negative, which makes it annoying to do
|
# The index can be negative, which makes it annoying to do
|
||||||
# the boundscheck. Let Span do it instead.
|
# the boundscheck. Let Span do it instead.
|
||||||
token = span[index]
|
token = span[index] # noqa: F841
|
||||||
except IndexError:
|
except IndexError:
|
||||||
# The original exception is just our conditional logic, so we
|
# The original exception is just our conditional logic, so we
|
||||||
# raise from.
|
# raise from.
|
||||||
|
@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
|
||||||
span=[t.text for t in span],
|
span=[t.text for t in span],
|
||||||
index=index,
|
index=index,
|
||||||
)
|
)
|
||||||
) from None
|
) from None
|
||||||
set_token_attrs(span[index], attrs)
|
set_token_attrs(span[index], attrs)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
@ -184,7 +183,7 @@ class AttributeRuler(Pipe):
|
||||||
"""
|
"""
|
||||||
# We need to make a string here, because otherwise the ID we pass back
|
# We need to make a string here, because otherwise the ID we pass back
|
||||||
# will be interpreted as the hash of a string, rather than an ordinal.
|
# will be interpreted as the hash of a string, rather than an ordinal.
|
||||||
key = _make_key(len(self.attrs))
|
key = str(len(self.attrs))
|
||||||
self.matcher.add(self.vocab.strings.add(key), patterns)
|
self.matcher.add(self.vocab.strings.add(key), patterns)
|
||||||
self._attrs_unnormed.append(attrs)
|
self._attrs_unnormed.append(attrs)
|
||||||
attrs = normalize_token_attrs(self.vocab, attrs)
|
attrs = normalize_token_attrs(self.vocab, attrs)
|
||||||
|
@ -209,7 +208,7 @@ class AttributeRuler(Pipe):
|
||||||
all_patterns = []
|
all_patterns = []
|
||||||
for i in range(len(self.attrs)):
|
for i in range(len(self.attrs)):
|
||||||
p = {}
|
p = {}
|
||||||
p["patterns"] = self.matcher.get(_make_key(i))[1]
|
p["patterns"] = self.matcher.get(str(i))[1]
|
||||||
p["attrs"] = self._attrs_unnormed[i]
|
p["attrs"] = self._attrs_unnormed[i]
|
||||||
p["index"] = self.indices[i]
|
p["index"] = self.indices[i]
|
||||||
all_patterns.append(p)
|
all_patterns.append(p)
|
||||||
|
@ -313,12 +312,6 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _make_key(n_attr):
|
|
||||||
return f"attr_rule_{n_attr}"
|
|
||||||
|
|
||||||
def _parse_key(key):
|
|
||||||
return int(key.rsplit("_", 1)[1])
|
|
||||||
|
|
||||||
|
|
||||||
def _split_morph_attrs(attrs):
|
def _split_morph_attrs(attrs):
|
||||||
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
||||||
|
|
|
@ -126,13 +126,13 @@ cdef class DependencyParser(Parser):
|
||||||
def add_multitask_objective(self, mt_component):
|
def add_multitask_objective(self, mt_component):
|
||||||
self._multitasks.append(mt_component)
|
self._multitasks.append(mt_component)
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
|
||||||
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
|
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
labeller.model.set_dim("nO", len(self.labels))
|
labeller.model.set_dim("nO", len(self.labels))
|
||||||
if labeller.model.has_ref("output_layer"):
|
if labeller.model.has_ref("output_layer"):
|
||||||
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
|
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
|
||||||
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
|
labeller.initialize(get_examples, nlp=nlp)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
|
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
import random
|
import random
|
||||||
|
@ -140,26 +140,20 @@ class EntityLinker(Pipe):
|
||||||
if len(self.kb) == 0:
|
if len(self.kb) == 0:
|
||||||
raise ValueError(Errors.E139.format(name=self.name))
|
raise ValueError(Errors.E139.format(name=self.name))
|
||||||
|
|
||||||
def begin_training(
|
def initialize(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
nlp: Optional[Language] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
):
|
||||||
) -> Optimizer:
|
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
|
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
|
||||||
"""
|
"""
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
self._require_kb()
|
self._require_kb()
|
||||||
|
@ -174,9 +168,6 @@ class EntityLinker(Pipe):
|
||||||
self.model.initialize(
|
self.model.initialize(
|
||||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||||
)
|
)
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
|
|
|
@ -67,7 +67,7 @@ class Lemmatizer(Pipe):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
|
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
|
||||||
"""Load and validate lookups tables. If the provided lookups is None,
|
"""Load and validate lookups tables. If the provided lookups is None,
|
||||||
load the default lookups tables according to the language and mode
|
load the default lookups tables according to the language and mode
|
||||||
settings. Confirm that all required tables for the language and mode
|
settings. Confirm that all required tables for the language and mode
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from typing import Optional
|
from typing import Optional, Union, Dict
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
|
||||||
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
||||||
return tuple(self.cfg["labels_morph"].keys())
|
return tuple(self.cfg["labels_morph"].keys())
|
||||||
|
|
||||||
|
@property
|
||||||
|
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
|
||||||
|
"""A dictionary with all labels data."""
|
||||||
|
return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
"""Add a new label to the pipe.
|
"""Add a new label to the pipe.
|
||||||
|
|
||||||
|
@ -129,20 +134,15 @@ class Morphologizer(Tagger):
|
||||||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
|
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
|
||||||
"""
|
"""
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
# First, fetch all labels from the data
|
# First, fetch all labels from the data
|
||||||
|
@ -178,9 +178,6 @@ class Morphologizer(Tagger):
|
||||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
"""Modify a batch of documents, using pre-computed scores.
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
|
@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
|
||||||
def set_annotations(self, docs, dep_ids):
    """Do nothing: this component does not write annotations to the docs.

    docs: The documents that would be modified (ignored).
    dep_ids: The predictions for the documents (ignored).
    """
    pass
|
||||||
|
|
||||||
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
def initialize(self, get_examples, nlp=None):
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||||
raise ValueError(err)
|
raise ValueError(err)
|
||||||
|
@ -91,9 +91,6 @@ class MultitaskObjective(Tagger):
|
||||||
if label is not None and label not in self.labels:
|
if label is not None and label not in self.labels:
|
||||||
self.labels[label] = len(self.labels)
|
self.labels[label] = len(self.labels)
|
||||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||||
|
@ -177,13 +174,10 @@ class ClozeMultitask(Pipe):
|
||||||
def set_annotations(self, docs, dep_ids):
    """Do nothing: this component does not write annotations to the docs.

    docs: The documents that would be modified (ignored).
    dep_ids: The predictions for the documents (ignored).
    """
    pass
|
||||||
|
|
||||||
def initialize(self, get_examples, nlp=None):
    """Initialize the model and its output layer.

    get_examples: Accepted for signature compatibility; not used here.
    nlp: Accepted for signature compatibility; not used here.
    """
    self.model.initialize()  # TODO: fix initialization by defining X and Y
    # Initialize the output layer with a dummy array of 5 rows whose width
    # is the output dimension ("nO") of the tok2vec layer.
    X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
    self.model.output_layer.initialize(X)
|
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||||
|
|
|
@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser):
|
||||||
"""Register another component as a multi-task objective. Experimental."""
|
"""Register another component as a multi-task objective. Experimental."""
|
||||||
self._multitasks.append(mt_component)
|
self._multitasks.append(mt_component)
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
    """Setup multi-task objective components. Experimental and internal.

    get_examples: Passed on unchanged to each multitask component's
        `initialize` method.
    nlp: Passed on unchanged as the `nlp` keyword argument.
    **cfg: Accepted but not used here.
    """
    # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
    for labeller in self._multitasks:
        # Every multitask model gets the same output width as this component
        # has labels.
        n_labels = len(self.labels)
        labeller.model.set_dim("nO", n_labels)
        if labeller.model.has_ref("output_layer"):
            labeller.model.get_ref("output_layer").set_dim("nO", n_labels)
        labeller.initialize(get_examples, nlp=nlp)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
|
from typing import Optional, Tuple
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import set_dropout_rate, Model
|
from thinc.api import set_dropout_rate, Model
|
||||||
|
|
||||||
|
@ -32,6 +33,17 @@ cdef class Pipe:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@property
def labels(self) -> Tuple[str, ...]:
    """The labels of the component.

    This implementation has no labels of its own and always returns an
    empty tuple, so the value matches the tuple type it is annotated with.

    RETURNS (Tuple[str, ...]): An empty tuple.
    """
    return tuple()
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """Optional JSON-serializable data that would be sufficient to recreate
    the label set if provided to the `pipe.initialize()` method.

    RETURNS (None): This implementation provides no label data.
    """
    return None
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc):
|
||||||
"""Apply the pipe to one document. The document is modified in place,
|
"""Apply the pipe to one document. The document is modified in place,
|
||||||
and returned. This usually happens under the hood when the nlp object
|
and returned. This usually happens under the hood when the nlp object
|
||||||
|
@ -183,7 +195,7 @@ cdef class Pipe:
|
||||||
"""
|
"""
|
||||||
return util.create_default_optimizer()
|
return util.create_default_optimizer()
|
||||||
|
|
||||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
This method needs to be implemented by each Pipe component,
|
This method needs to be implemented by each Pipe component,
|
||||||
ensuring the internal model (if available) is initialized properly
|
ensuring the internal model (if available) is initialized properly
|
||||||
|
@ -191,16 +203,11 @@ cdef class Pipe:
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/pipe#begin_training
|
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
pass
|
||||||
|
|
||||||
def _ensure_examples(self, get_examples):
|
def _ensure_examples(self, get_examples):
|
||||||
if get_examples is None or not hasattr(get_examples, "__call__"):
|
if get_examples is None or not hasattr(get_examples, "__call__"):
|
||||||
|
|
|
@ -58,7 +58,7 @@ class Sentencizer(Pipe):
|
||||||
else:
|
else:
|
||||||
self.punct_chars = set(self.default_punct_chars)
|
self.punct_chars = set(self.default_punct_chars)
|
||||||
|
|
||||||
def initialize(self, get_examples, nlp=None):
    """Do nothing: this component performs no initialization.

    get_examples: Accepted for signature compatibility; not used.
    nlp: Accepted for signature compatibility; not used.
    """
    pass
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
|
|
@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
|
||||||
# are 0
|
# are 0
|
||||||
return tuple(["I", "S"])
|
return tuple(["I", "S"])
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """Data about the component's labels.

    RETURNS: The value of `self.labels`, returned unchanged.
    """
    return self.labels
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
"""Modify a batch of documents, using pre-computed scores.
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
@ -124,20 +128,15 @@ class SentenceRecognizer(Tagger):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
|
||||||
"""
|
"""
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
|
@ -151,9 +150,6 @@ class SentenceRecognizer(Tagger):
|
||||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
def add_label(self, label, values=None):
|
def add_label(self, label, values=None):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
|
@ -90,6 +90,11 @@ class Tagger(Pipe):
|
||||||
"""
|
"""
|
||||||
return tuple(self.cfg["labels"])
|
return tuple(self.cfg["labels"])
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """Data about the labels currently added to the component.

    RETURNS (tuple): The entries of cfg["labels"], converted to a tuple.
    """
    return tuple(self.cfg["labels"])
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Apply the pipe to a Doc.
|
"""Apply the pipe to a Doc.
|
||||||
|
|
||||||
|
@ -256,31 +261,33 @@ class Tagger(Pipe):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
def initialize(self, get_examples, *, nlp=None, labels=None):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects..
|
returns a representative sample of gold-standard Example objects..
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
labels: The labels to add to the component, typically generated by the
|
||||||
nlp.pipeline.
|
`init labels` command. If no labels are provided, the get_examples
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
callback is used to extract the labels from the data.
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/tagger#begin_training
|
DOCS: https://nightly.spacy.io/api/tagger#initialize
|
||||||
"""
|
"""
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
|
if labels is not None:
|
||||||
|
for tag in labels:
|
||||||
|
self.add_label(tag)
|
||||||
|
else:
|
||||||
|
tags = set()
|
||||||
|
for example in get_examples():
|
||||||
|
for token in example.y:
|
||||||
|
if token.tag_:
|
||||||
|
tags.add(token.tag_)
|
||||||
|
for tag in sorted(tags):
|
||||||
|
self.add_label(tag)
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
label_sample = []
|
label_sample = []
|
||||||
tags = set()
|
|
||||||
for example in get_examples():
|
|
||||||
for token in example.y:
|
|
||||||
if token.tag_:
|
|
||||||
tags.add(token.tag_)
|
|
||||||
for tag in sorted(tags):
|
|
||||||
self.add_label(tag)
|
|
||||||
for example in islice(get_examples(), 10):
|
for example in islice(get_examples(), 10):
|
||||||
doc_sample.append(example.x)
|
doc_sample.append(example.x)
|
||||||
gold_tags = example.get_aligned("TAG", as_string=True)
|
gold_tags = example.get_aligned("TAG", as_string=True)
|
||||||
|
@ -289,9 +296,6 @@ class Tagger(Pipe):
|
||||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
"""Add a new label to the pipe.
|
"""Add a new label to the pipe.
|
||||||
|
|
|
@ -154,8 +154,16 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
@labels.setter
def labels(self, value: List[str]) -> None:
    """Replace the component's labels.

    value (List[str]): The new labels; stored as a tuple in cfg["labels"].
    """
    # TODO: This really shouldn't be here. I had a look and I added it when
    # I added the labels property, but it's pretty nasty to have this, and
    # will lead to problems.
    self.cfg["labels"] = tuple(value)
|
||||||
|
|
||||||
|
@property
def label_data(self) -> List[str]:
    """RETURNS (List[str]): Information about the component's labels."""
    # Returns `self.labels` unchanged.
    # NOTE(review): the labels setter stores a tuple, so the value returned
    # here is presumably a tuple rather than a list — confirm against the
    # `labels` getter.
    return self.labels
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
|
@ -334,43 +342,40 @@ class TextCategorizer(Pipe):
|
||||||
self.labels = tuple(list(self.labels) + [label])
|
self.labels = tuple(list(self.labels) + [label])
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Optional[Language] = None,
    labels: Optional[Iterable[str]] = None,
) -> None:
    """Initialize the pipe for training, using a representative set
    of data examples.

    get_examples (Callable[[], Iterable[Example]]): Function that
        returns a representative sample of gold-standard Example objects.
    nlp (Language): The current nlp object the component is part of.
    labels (Optional[Iterable[str]]): The labels to add to the component,
        typically generated by the `init labels` command. If no labels are
        provided, the get_examples callback is used to extract the labels
        from the data.

    DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
    """
    self._ensure_examples(get_examples)
    if labels is None:
        # No labels given: collect every category found in the examples.
        for example in get_examples():
            for cat in example.y.cats:
                self.add_label(cat)
    else:
        for label in labels:
            self.add_label(label)
    # Only the first 10 examples are used as the sample for initializing
    # the model.
    subbatch = list(islice(get_examples(), 10))
    doc_sample = [eg.reference for eg in subbatch]
    label_sample, _ = self._examples_to_truth(subbatch)
    self._require_labels()
    assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
    assert len(label_sample) > 0, Errors.E923.format(name=self.name)
    self.model.initialize(X=doc_sample, Y=label_sample)
|
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
"""Score a batch of examples.
|
"""Score a batch of examples.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
|
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
|
||||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
|
||||||
|
@ -203,26 +203,20 @@ class Tok2Vec(Pipe):
|
||||||
def get_loss(self, examples, scores) -> None:
|
def get_loss(self, examples, scores) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(
|
def initialize(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
nlp: Optional[Language] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
|
||||||
):
|
):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
|
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
|
||||||
"""
|
"""
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# cython: infer_types=True, cdivision=True, boundscheck=False
|
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
@ -7,6 +7,7 @@ from libcpp.vector cimport vector
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
import random
|
import random
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import set_dropout_rate
|
from thinc.api import set_dropout_rate
|
||||||
|
@ -95,6 +96,10 @@ cdef class Parser(Pipe):
|
||||||
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
||||||
return class_names
|
return class_names
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """Data about the component's labels.

    RETURNS: The `labels` attribute of `self.moves`, returned unchanged.
    """
    return self.moves.labels
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tok2vec(self):
|
def tok2vec(self):
|
||||||
"""Return the embedding and convolutional layer of the model."""
|
"""Return the embedding and convolutional layer of the model."""
|
||||||
|
@ -354,7 +359,7 @@ cdef class Parser(Pipe):
|
||||||
# If all weights for an output are 0 in the original model, don't
|
# If all weights for an output are 0 in the original model, don't
|
||||||
# supervise that output. This allows us to add classes.
|
# supervise that output. This allows us to add classes.
|
||||||
loss += (d_scores**2).sum()
|
loss += (d_scores**2).sum()
|
||||||
backprop(d_scores, sgd=sgd)
|
backprop(d_scores)
|
||||||
# Follow the predicted action
|
# Follow the predicted action
|
||||||
self.transition_states(states, guesses)
|
self.transition_states(states, guesses)
|
||||||
states = [state for state in states if not state.is_final()]
|
states = [state for state in states if not state.is_final()]
|
||||||
|
@ -405,18 +410,20 @@ cdef class Parser(Pipe):
|
||||||
def set_output(self, nO):
|
def set_output(self, nO):
|
||||||
self.model.attrs["resize_output"](self.model, nO)
|
self.model.attrs["resize_output"](self.model, nO)
|
||||||
|
|
||||||
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
def initialize(self, get_examples, nlp=None, labels=None):
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
self.cfg.update(kwargs)
|
|
||||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||||
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
||||||
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
|
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
|
||||||
actions = self.moves.get_actions(
|
if labels is not None:
|
||||||
examples=get_examples(),
|
actions = dict(labels)
|
||||||
min_freq=self.cfg['min_action_freq'],
|
else:
|
||||||
learn_tokens=self.cfg["learn_tokens"]
|
actions = self.moves.get_actions(
|
||||||
)
|
examples=get_examples(),
|
||||||
|
min_freq=self.cfg['min_action_freq'],
|
||||||
|
learn_tokens=self.cfg["learn_tokens"]
|
||||||
|
)
|
||||||
for action, labels in self.moves.labels.items():
|
for action, labels in self.moves.labels.items():
|
||||||
actions.setdefault(action, {})
|
actions.setdefault(action, {})
|
||||||
for label, freq in labels.items():
|
for label, freq in labels.items():
|
||||||
|
@ -425,11 +432,9 @@ cdef class Parser(Pipe):
|
||||||
self.moves.initialize_actions(actions)
|
self.moves.initialize_actions(actions)
|
||||||
# make sure we resize so we have an appropriate upper layer
|
# make sure we resize so we have an appropriate upper layer
|
||||||
self._resize()
|
self._resize()
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
if pipeline is not None:
|
if nlp is not None:
|
||||||
for name, component in pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if component is self:
|
if component is self:
|
||||||
break
|
break
|
||||||
if hasattr(component, "pipe"):
|
if hasattr(component, "pipe"):
|
||||||
|
@ -441,9 +446,8 @@ cdef class Parser(Pipe):
|
||||||
doc_sample.append(example.predicted)
|
doc_sample.append(example.predicted)
|
||||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(doc_sample)
|
self.model.initialize(doc_sample)
|
||||||
if pipeline is not None:
|
if nlp is not None:
|
||||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||||
return sgd
|
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple()):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
serializers = {
|
serializers = {
|
||||||
|
|
145
spacy/schemas.py
145
spacy/schemas.py
|
@ -1,14 +1,17 @@
|
||||||
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
||||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field, ValidationError, validator
|
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||||
from pydantic import root_validator
|
from pydantic.main import ModelMetaclass
|
||||||
|
from thinc.api import Optimizer, ConfigValidationError
|
||||||
|
from thinc.config import Promise
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from thinc.api import Optimizer
|
import inspect
|
||||||
|
|
||||||
from .attrs import NAMES
|
from .attrs import NAMES
|
||||||
from .lookups import Lookups
|
from .lookups import Lookups
|
||||||
|
from .util import is_cython_func
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
# This lets us add type hints for mypy etc. without causing circular imports
|
# This lets us add type hints for mypy etc. without causing circular imports
|
||||||
|
@ -16,10 +19,12 @@ if TYPE_CHECKING:
|
||||||
from .training import Example # noqa: F401
|
from .training import Example # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
ItemT = TypeVar("ItemT")
|
ItemT = TypeVar("ItemT")
|
||||||
Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
|
Batcher = Union[Callable[[Iterable[ItemT]], Iterable[List[ItemT]]], Promise]
|
||||||
Reader = Callable[["Language", str], Iterable["Example"]]
|
Reader = Union[Callable[["Language", str], Iterable["Example"]], Promise]
|
||||||
Logger = Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]]
|
Logger = Union[Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]], Promise]
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
|
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
|
||||||
|
@ -41,6 +46,96 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
|
||||||
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
|
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
|
||||||
|
|
||||||
|
|
||||||
|
# Initialization
|
||||||
|
|
||||||
|
|
||||||
|
class ArgSchemaConfig:
    """Pydantic config for strict argument models: values that are not
    declared on the model are rejected, and arbitrary types are allowed as
    annotations.
    """

    extra = "forbid"
    arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
class ArgSchemaConfigExtra:
    """Pydantic config for non-strict argument models: values that are not
    declared on the model are accepted and passed through, and arbitrary
    types are allowed as annotations.
    """

    # Must differ from the strict config: this config is selected exactly
    # when undeclared values have to be let through (functions taking
    # **kwargs, or functions whose signature cannot be inspected).
    extra = "allow"
    arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
def get_arg_model(
    func: Callable,
    *,
    exclude: Iterable[str] = tuple(),
    name: str = "ArgModel",
    strict: bool = True,
) -> ModelMetaclass:
    """Generate a pydantic model for function arguments.

    func (Callable): The function to generate the schema for.
    exclude (Iterable[str]): Parameter names to ignore.
    name (str): Name of created model class.
    strict (bool): Don't allow extra arguments if no variable keyword arguments
        are allowed on the function.
    RETURNS (ModelMetaclass): A pydantic model.
    """
    sig_args = {}
    try:
        sig = inspect.signature(func)
    except ValueError:
        # Typically happens if the method is part of a Cython module without
        # binding=True. Here we just use an empty model configured with
        # ArgSchemaConfigExtra.
        return create_model(name, __config__=ArgSchemaConfigExtra)
    has_variable = False
    for param in sig.parameters.values():
        if param.name in exclude:
            continue
        if param.kind == param.VAR_KEYWORD:
            # The function allows variable keyword arguments so we shouldn't
            # include **kwargs etc. in the schema and switch to non-strict
            # mode and pass through all other values
            has_variable = True
            continue
        # `param.empty` is a sentinel, so compare by identity: `!=` would
        # call the `__ne__` of whatever object is used as annotation/default.
        # If no annotation is specified assume it's anything
        annotation = param.annotation if param.annotation is not param.empty else Any
        # If no default value is specified assume that it's required. Cython
        # functions/methods will have param.empty for default value None so we
        # need to treat them differently
        default_empty = None if is_cython_func(func) else ...
        default = param.default if param.default is not param.empty else default_empty
        sig_args[param.name] = (annotation, default)
    is_strict = strict and not has_variable
    sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
    return create_model(name, **sig_args)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_init_settings(
    func: Callable,
    settings: Dict[str, Any],
    *,
    section: Optional[str] = None,
    name: str = "",
    exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
    """Check initialization settings against the signature of an initialize
    method and return them in validated form. Values are coerced by the
    generated pydantic model where possible. A ConfigValidationError is raised
    if types don't match or required values are missing.

    func (Callable): The initialize method of a given component etc.
    settings (Dict[str, Any]): The settings from the respective [initialize] block.
    section (str): Initialize section, used in the error message title.
    name (str): Name of the block in the section, passed on as the error's parent.
    exclude (Iterable[str]): Parameter names to leave out of the schema.
    RETURNS (Dict[str, Any]): The validated settings.
    """
    arg_model = get_arg_model(func, exclude=exclude, name="InitArgModel")
    try:
        validated = arg_model(**settings).dict()
    except ValidationError as err:
        # Report the error relative to the [initialize] block (or sub-block)
        # and drop the pydantic traceback so only the config error is shown.
        block_name = f"initialize.{section}" if section else "initialize"
        raise ConfigValidationError(
            title=f"Error validating initialization settings in [{block_name}]",
            errors=err.errors(),
            config=settings,
            parent=name,
        ) from None
    return validated
|
||||||
|
|
||||||
|
|
||||||
# Matcher token patterns
|
# Matcher token patterns
|
||||||
|
|
||||||
|
|
||||||
|
@ -202,8 +297,6 @@ class ModelMetaSchema(BaseModel):
|
||||||
|
|
||||||
class ConfigSchemaTraining(BaseModel):
|
class ConfigSchemaTraining(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
|
||||||
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
|
|
||||||
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
||||||
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
|
@ -216,8 +309,6 @@ class ConfigSchemaTraining(BaseModel):
|
||||||
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
|
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
|
||||||
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
|
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
|
||||||
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
|
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
|
||||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
|
||||||
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
|
|
||||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||||
logger: Logger = Field(..., title="The logger to track training progress")
|
logger: Logger = Field(..., title="The logger to track training progress")
|
||||||
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
||||||
|
@ -270,28 +361,42 @@ class ConfigSchemaPretrain(BaseModel):
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigSchemaInit(BaseModel):
    # Schema for the settings of the [initialize] config block. Every field is
    # required (default `...`); the Optional ones may be explicitly set to None.
    # fmt: off
    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
    lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    # `title` (not `help`) so these fields are described the same way as all
    # other schema fields; `help` is not a Field parameter and would only be
    # stored as an arbitrary extra.
    tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize")
    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
    # fmt: on

    class Config:
        # Reject keys that are not declared above.
        extra = "forbid"
        # NOTE(review): presumably required because Lookups is not a pydantic
        # type - confirm.
        arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
class ConfigSchema(BaseModel):
|
class ConfigSchema(BaseModel):
|
||||||
training: ConfigSchemaTraining
|
training: ConfigSchemaTraining
|
||||||
nlp: ConfigSchemaNlp
|
nlp: ConfigSchemaNlp
|
||||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||||
components: Dict[str, Dict[str, Any]]
|
components: Dict[str, Dict[str, Any]]
|
||||||
corpora: Dict[str, Reader]
|
corpora: Dict[str, Reader]
|
||||||
|
initialize: ConfigSchemaInit
|
||||||
@root_validator(allow_reuse=True)
|
|
||||||
def validate_config(cls, values):
|
|
||||||
"""Perform additional validation for settings with dependencies."""
|
|
||||||
pt = values.get("pretraining")
|
|
||||||
if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
|
|
||||||
if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
|
|
||||||
err = "Need nlp.vectors if pretraining.objective.type is vectors"
|
|
||||||
raise ValueError(err)
|
|
||||||
return values
|
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "allow"
|
extra = "allow"
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
|
# Top-level config section name -> pydantic schema class for that section.
# NOTE(review): presumably used to look up the schema when validating a single
# section - confirm against callers.
CONFIG_SCHEMAS = {
    "nlp": ConfigSchemaNlp,
    "training": ConfigSchemaTraining,
    "pretraining": ConfigSchemaPretrain,
    "initialize": ConfigSchemaInit,
}
|
||||||
|
|
||||||
|
|
||||||
# Project config Schema
|
# Project config Schema
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -32,9 +32,7 @@ class PRFScore:
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
return PRFScore(
|
return PRFScore(
|
||||||
tp=self.tp+other.tp,
|
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
|
||||||
fp=self.fp+other.fp,
|
|
||||||
fn=self.fn+other.fn
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def score_set(self, cand: set, gold: set) -> None:
|
def score_set(self, cand: set, gold: set) -> None:
|
||||||
|
@ -485,7 +483,7 @@ class Scorer:
|
||||||
(pred_ent.start_char, pred_ent.end_char), None
|
(pred_ent.start_char, pred_ent.end_char), None
|
||||||
)
|
)
|
||||||
label = gold_span.label_
|
label = gold_span.label_
|
||||||
if not label in f_per_type:
|
if label not in f_per_type:
|
||||||
f_per_type[label] = PRFScore()
|
f_per_type[label] = PRFScore()
|
||||||
gold = gold_span.kb_id_
|
gold = gold_span.kb_id_
|
||||||
# only evaluating entities that overlap between gold and pred,
|
# only evaluating entities that overlap between gold and pred,
|
||||||
|
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
||||||
continue
|
continue
|
||||||
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
||||||
align_x2y = eg.alignment.x2y
|
align_x2y = eg.alignment.x2y
|
||||||
preds = set()
|
|
||||||
for pred_ent in eg.x.ents:
|
for pred_ent in eg.x.ents:
|
||||||
if pred_ent.label_ not in scores:
|
if pred_ent.label_ not in scores:
|
||||||
scores[pred_ent.label_] = PRFScore()
|
scores[pred_ent.label_] = PRFScore()
|
||||||
|
|
|
@ -466,3 +466,4 @@ cdef enum symbol_t:
|
||||||
ENT_ID
|
ENT_ID
|
||||||
|
|
||||||
IDX
|
IDX
|
||||||
|
_
|
||||||
|
|
|
@ -465,6 +465,7 @@ IDS = {
|
||||||
"acl": acl,
|
"acl": acl,
|
||||||
"LAW": LAW,
|
"LAW": LAW,
|
||||||
"MORPH": MORPH,
|
"MORPH": MORPH,
|
||||||
|
"_": _,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -272,22 +272,35 @@ def zh_tokenizer_char():
|
||||||
def zh_tokenizer_jieba():
|
def zh_tokenizer_jieba():
|
||||||
pytest.importorskip("jieba")
|
pytest.importorskip("jieba")
|
||||||
config = {
|
config = {
|
||||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
"nlp": {
|
||||||
"segmenter": "jieba",
|
"tokenizer": {
|
||||||
|
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||||
|
"segmenter": "jieba",
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
nlp = get_lang_class("zh").from_config(config)
|
||||||
return nlp.tokenizer
|
return nlp.tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer_pkuseg():
|
def zh_tokenizer_pkuseg():
|
||||||
pytest.importorskip("pkuseg")
|
pytest.importorskip("pkuseg")
|
||||||
|
pytest.importorskip("pickle5")
|
||||||
config = {
|
config = {
|
||||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
"nlp": {
|
||||||
"segmenter": "pkuseg",
|
"tokenizer": {
|
||||||
"pkuseg_model": "default",
|
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||||
|
"segmenter": "pkuseg",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"initialize": {"tokenizer": {
|
||||||
|
"pkuseg_model": "default",
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
nlp = get_lang_class("zh").from_config(config)
|
||||||
|
nlp.initialize()
|
||||||
return nlp.tokenizer
|
return nlp.tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,9 +24,9 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_NER_MODEL}
|
cfg = {"model": DEFAULT_NER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
ner = EntityRecognizer(en_vocab, model, **config)
|
ner = EntityRecognizer(en_vocab, model, **config)
|
||||||
ner.begin_training(lambda: [_ner_example(ner)])
|
ner.initialize(lambda: [_ner_example(ner)])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
|
|
||||||
doc.ents = [("ANIMAL", 3, 4)]
|
doc.ents = [("ANIMAL", 3, 4)]
|
||||||
|
@ -46,9 +46,9 @@ def test_ents_reset(en_vocab):
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_NER_MODEL}
|
cfg = {"model": DEFAULT_NER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
ner = EntityRecognizer(en_vocab, model, **config)
|
ner = EntityRecognizer(en_vocab, model, **config)
|
||||||
ner.begin_training(lambda: [_ner_example(ner)])
|
ner.initialize(lambda: [_ner_example(ner)])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
orig_iobs = [t.ent_iob_ for t in doc]
|
orig_iobs = [t.ent_iob_ for t in doc]
|
||||||
doc.ents = list(doc.ents)
|
doc.ents = list(doc.ents)
|
||||||
|
|
|
@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
|
||||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
# heads override sent_starts
|
# heads override sent_starts
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
|
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
|
||||||
)
|
)
|
||||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
|
|
||||||
|
@ -533,5 +533,52 @@ def test_doc_ents_setter():
|
||||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
||||||
|
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
|
||||||
doc = Doc(vocab, words=words, ents=ents)
|
doc = Doc(vocab, words=words, ents=ents)
|
||||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_init_iob():
    """Test ents validation/normalization in Doc.__init__"""
    words = ["a", "b", "c", "d", "e"]

    # All-"O" tags produce no entities at all.
    doc = Doc(Vocab(), words=words, ents=["O"] * len(words))
    assert doc.ents == ()

    # (IOB tags, expected number of entities)
    valid_cases = [
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"], 2),
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"], 3),
        # None is missing
        (["B-PERSON", "I-PERSON", "O", None, "I-GPE"], 2),
        # empty tag is missing
        (["", "B-PERSON", "O", "B-PERSON", "I-PERSON"], 2),
    ]
    for tags, n_ents in valid_cases:
        doc = Doc(Vocab(), words=words, ents=tags)
        assert len(doc.ents) == n_ents

    invalid_cases = [
        # invalid IOB
        ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no dash
        ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no ent type
        ["O", "B-", "O", "I-PERSON", "I-GPE"],
        # not strings or None
        [0, "B-", "O", "I-PERSON", "I-GPE"],
    ]
    for tags in invalid_cases:
        with pytest.raises(ValueError):
            Doc(Vocab(), words=words, ents=tags)
|
||||||
|
|
|
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
||||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||||
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
||||||
|
ents = ["O"] * len(heads)
|
||||||
|
ents[0] = "B-PERSON"
|
||||||
|
ents[1] = "I-PERSON"
|
||||||
|
ents[10] = "B-GPE"
|
||||||
|
ents[13] = "B-PERSON"
|
||||||
|
ents[14] = "I-PERSON"
|
||||||
# fmt: on
|
# fmt: on
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
|
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
# if there is a parse, span.root provides default values
|
# if there is a parse, span.root provides default values
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
||||||
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
|
ents = ["O"] * len(words)
|
||||||
|
ents[3] = "B-ent-de"
|
||||||
|
ents[4] = "I-ent-de"
|
||||||
|
ents[5] = "B-ent-fg"
|
||||||
|
ents[6] = "I-ent-fg"
|
||||||
deps = ["dep"] * len(words)
|
deps = ["dep"] * len(words)
|
||||||
en_vocab.strings.add("ent-de")
|
en_vocab.strings.add("ent-de")
|
||||||
en_vocab.strings.add("ent-fg")
|
en_vocab.strings.add("ent-fg")
|
||||||
|
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
# check that B is preserved if span[start] is B
|
# check that B is preserved if span[start] is B
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
||||||
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
|
ents = ["O"] * len(words)
|
||||||
|
ents[3] = "B-ent-de"
|
||||||
|
ents[4] = "I-ent-de"
|
||||||
|
ents[5] = "B-ent-de"
|
||||||
|
ents[6] = "I-ent-de"
|
||||||
deps = ["dep"] * len(words)
|
deps = ["dep"] * len(words)
|
||||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
|
|
@ -9,7 +9,7 @@ def doc(en_vocab):
|
||||||
tags = ["VBP", "NN", "NN"]
|
tags = ["VBP", "NN", "NN"]
|
||||||
heads = [0, 0, 0]
|
heads = [0, 0, 0]
|
||||||
deps = ["ROOT", "dobj", "dobj"]
|
deps = ["ROOT", "dobj", "dobj"]
|
||||||
ents = [("ORG", 1, 2)]
|
ents = ["O", "B-ORG", "O"]
|
||||||
return Doc(
|
return Doc(
|
||||||
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = de_tokenizer("Er lag auf seinem")
|
doc = de_tokenizer("Er lag auf seinem")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -7,8 +7,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = en_tokenizer("This is a sentence")
|
doc = en_tokenizer("This is a sentence")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = es_tokenizer("en Oxford este verano")
|
doc = es_tokenizer("en Oxford este verano")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
|
|
||||||
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
|
|
@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
|
||||||
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
|
|
||||||
)
|
|
||||||
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
||||||
tokens = fr_tokenizer(text)
|
tokens = fr_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = id_tokenizer("sebelas")
|
doc = id_tokenizer("sebelas")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
|
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
||||||
)
|
)
|
||||||
def test_ja_tokenizer_sub_tokens(
|
def test_ja_tokenizer_sub_tokens(
|
||||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
|
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
|
||||||
)
|
)
|
||||||
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
||||||
tokens = ne_tokenizer(text)
|
tokens = ne_tokenizer(text)
|
||||||
|
|
|
@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length",
|
"text,length",
|
||||||
[
|
[
|
||||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
|
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
|
||||||
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -3,8 +3,7 @@ from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = sv_tokenizer("Studenten läste den bästa boken")
|
doc = sv_tokenizer("Studenten läste den bästa boken")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -27,9 +27,18 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||||
nlp = Chinese(
|
config = {
|
||||||
meta={
|
"nlp": {
|
||||||
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
|
"tokenizer": {
|
||||||
}
|
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||||
)
|
"segmenter": "pkuseg",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"initialize": {"tokenizer": {
|
||||||
|
"pkuseg_model": "medicine",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
nlp = Chinese.from_config(config)
|
||||||
|
nlp.initialize()
|
||||||
zh_tokenizer_serialize(nlp.tokenizer)
|
zh_tokenizer_serialize(nlp.tokenizer)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
|
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
|
||||||
from thinc.config import ConfigValidationError
|
from thinc.api import ConfigValidationError
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
|
|
@ -23,7 +23,7 @@ def parser(vocab):
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_PARSER_MODEL}
|
cfg = {"model": DEFAULT_PARSER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
parser = DependencyParser(vocab, model, **config)
|
parser = DependencyParser(vocab, model, **config)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@ def test_init_parser(parser):
|
||||||
def _train_parser(parser):
|
def _train_parser(parser):
|
||||||
fix_random_seed(1)
|
fix_random_seed(1)
|
||||||
parser.add_label("left")
|
parser.add_label("left")
|
||||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
parser.initialize(lambda: [_parser_example(parser)])
|
||||||
sgd = Adam(0.001)
|
sgd = Adam(0.001)
|
||||||
|
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
|
@ -82,12 +82,12 @@ def test_add_label_deserializes_correctly():
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_NER_MODEL}
|
cfg = {"model": DEFAULT_NER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
ner1 = EntityRecognizer(Vocab(), model, **config)
|
ner1 = EntityRecognizer(Vocab(), model, **config)
|
||||||
ner1.add_label("C")
|
ner1.add_label("C")
|
||||||
ner1.add_label("B")
|
ner1.add_label("B")
|
||||||
ner1.add_label("A")
|
ner1.add_label("A")
|
||||||
ner1.begin_training(lambda: [_ner_example(ner1)])
|
ner1.initialize(lambda: [_ner_example(ner1)])
|
||||||
ner2 = EntityRecognizer(Vocab(), model, **config)
|
ner2 = EntityRecognizer(Vocab(), model, **config)
|
||||||
|
|
||||||
# the second model needs to be resized before we can call from_bytes
|
# the second model needs to be resized before we can call from_bytes
|
||||||
|
@ -111,7 +111,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config):
|
||||||
splitting the move names.
|
splitting the move names.
|
||||||
"""
|
"""
|
||||||
labels = ["A", "B", "C"]
|
labels = ["A", "B", "C"]
|
||||||
model = registry.make_from_config({"model": model_config}, validate=True)["model"]
|
model = registry.resolve({"model": model_config}, validate=True)["model"]
|
||||||
config = {
|
config = {
|
||||||
"learn_tokens": False,
|
"learn_tokens": False,
|
||||||
"min_action_freq": 30,
|
"min_action_freq": 30,
|
||||||
|
|
|
@ -127,7 +127,7 @@ def test_get_oracle_actions():
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_PARSER_MODEL}
|
cfg = {"model": DEFAULT_PARSER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
parser = DependencyParser(doc.vocab, model, **config)
|
parser = DependencyParser(doc.vocab, model, **config)
|
||||||
parser.moves.add_action(0, "")
|
parser.moves.add_action(0, "")
|
||||||
parser.moves.add_action(1, "")
|
parser.moves.add_action(1, "")
|
||||||
|
|
|
@ -202,7 +202,7 @@ def test_train_empty():
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
ner = nlp.add_pipe("ner", last=True)
|
ner = nlp.add_pipe("ner", last=True)
|
||||||
ner.add_label("PERSON")
|
ner.add_label("PERSON")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
for itn in range(2):
|
for itn in range(2):
|
||||||
losses = {}
|
losses = {}
|
||||||
batches = util.minibatch(train_examples, size=8)
|
batches = util.minibatch(train_examples, size=8)
|
||||||
|
@ -213,7 +213,7 @@ def test_train_empty():
|
||||||
def test_overwrite_token():
|
def test_overwrite_token():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("ner")
|
nlp.add_pipe("ner")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
# The untrained NER will predict O for each token
|
# The untrained NER will predict O for each token
|
||||||
doc = nlp("I live in New York")
|
doc = nlp("I live in New York")
|
||||||
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
||||||
|
@ -235,7 +235,7 @@ def test_empty_ner():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("MY_LABEL")
|
ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("John is watching the news about Croatia's elections")
|
doc = nlp("John is watching the news about Croatia's elections")
|
||||||
# if this goes wrong, the initialization of the parser's upper layer is probably broken
|
# if this goes wrong, the initialization of the parser's upper layer is probably broken
|
||||||
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
|
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
|
||||||
|
@ -254,7 +254,7 @@ def test_ruler_before_ner():
|
||||||
# 2: untrained NER - should set everything else to O
|
# 2: untrained NER - should set everything else to O
|
||||||
untrained_ner = nlp.add_pipe("ner")
|
untrained_ner = nlp.add_pipe("ner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||||
expected_types = ["THING", "", "", "", "", "", ""]
|
expected_types = ["THING", "", "", "", "", "", ""]
|
||||||
|
@ -269,7 +269,7 @@ def test_ner_before_ruler():
|
||||||
# 1: untrained NER - should set everything to O
|
# 1: untrained NER - should set everything to O
|
||||||
untrained_ner = nlp.add_pipe("ner", name="uner")
|
untrained_ner = nlp.add_pipe("ner", name="uner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
||||||
patterns = [{"label": "THING", "pattern": "This"}]
|
patterns = [{"label": "THING", "pattern": "This"}]
|
||||||
|
@ -290,7 +290,7 @@ def test_block_ner():
|
||||||
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
|
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
|
||||||
untrained_ner = nlp.add_pipe("ner")
|
untrained_ner = nlp.add_pipe("ner")
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
||||||
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
||||||
expected_types = ["", "", "", "", "", "", "", ""]
|
expected_types = ["", "", "", "", "", "", "", ""]
|
||||||
|
@ -307,7 +307,7 @@ def test_overfitting_IO():
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
for ent in annotations.get("entities"):
|
for ent in annotations.get("entities"):
|
||||||
ner.add_label(ent[2])
|
ner.add_label(ent[2])
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
|
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
|
||||||
assert not len(nlp.vocab.lookups)
|
assert not len(nlp.vocab.lookups)
|
||||||
nlp.add_pipe("ner")
|
nlp.add_pipe("ner")
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert "W033" in caplog.text
|
assert "W033" in caplog.text
|
||||||
caplog.clear()
|
caplog.clear()
|
||||||
nlp.vocab.lookups.add_table("lexeme_norm")
|
nlp.vocab.lookups.add_table("lexeme_norm")
|
||||||
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert "W033" not in caplog.text
|
assert "W033" not in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
@ -358,5 +358,5 @@ class BlockerComponent1:
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
|
doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
|
||||||
return doc
|
return doc
|
||||||
|
|
|
@ -25,7 +25,7 @@ def arc_eager(vocab):
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tok2vec():
|
def tok2vec():
|
||||||
cfg = {"model": DEFAULT_TOK2VEC_MODEL}
|
cfg = {"model": DEFAULT_TOK2VEC_MODEL}
|
||||||
tok2vec = registry.make_from_config(cfg, validate=True)["model"]
|
tok2vec = registry.resolve(cfg, validate=True)["model"]
|
||||||
tok2vec.initialize()
|
tok2vec.initialize()
|
||||||
return tok2vec
|
return tok2vec
|
||||||
|
|
||||||
|
@ -38,14 +38,14 @@ def parser(vocab, arc_eager):
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_PARSER_MODEL}
|
cfg = {"model": DEFAULT_PARSER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
return Parser(vocab, model, moves=arc_eager, **config)
|
return Parser(vocab, model, moves=arc_eager, **config)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def model(arc_eager, tok2vec, vocab):
|
def model(arc_eager, tok2vec, vocab):
|
||||||
cfg = {"model": DEFAULT_PARSER_MODEL}
|
cfg = {"model": DEFAULT_PARSER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
model.attrs["resize_output"](model, arc_eager.n_moves)
|
model.attrs["resize_output"](model, arc_eager.n_moves)
|
||||||
model.initialize()
|
model.initialize()
|
||||||
return model
|
return model
|
||||||
|
@ -72,7 +72,7 @@ def test_build_model(parser, vocab):
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_PARSER_MODEL}
|
cfg = {"model": DEFAULT_PARSER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
parser.model = Parser(vocab, model=model, moves=parser.moves, **config).model
|
parser.model = Parser(vocab, model=model, moves=parser.moves, **config).model
|
||||||
assert parser.model is not None
|
assert parser.model is not None
|
||||||
|
|
||||||
|
|
|
@ -191,7 +191,7 @@ def test_overfitting_IO():
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
for dep in annotations.get("deps", []):
|
for dep in annotations.get("deps", []):
|
||||||
parser.add_label(dep)
|
parser.add_label(dep)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(100):
|
for i in range(100):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
|
|
@ -28,13 +28,13 @@ def parser(vocab):
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_PARSER_MODEL}
|
cfg = {"model": DEFAULT_PARSER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
parser = DependencyParser(vocab, model, **config)
|
parser = DependencyParser(vocab, model, **config)
|
||||||
parser.cfg["token_vector_width"] = 4
|
parser.cfg["token_vector_width"] = 4
|
||||||
parser.cfg["hidden_width"] = 32
|
parser.cfg["hidden_width"] = 32
|
||||||
# parser.add_label('right')
|
# parser.add_label('right')
|
||||||
parser.add_label("left")
|
parser.add_label("left")
|
||||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
parser.initialize(lambda: [_parser_example(parser)])
|
||||||
sgd = Adam(0.001)
|
sgd = Adam(0.001)
|
||||||
|
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
|
|
|
@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
|
||||||
"""Test that the EL can't train without defining a KB"""
|
"""Test that the EL can't train without defining a KB"""
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
entity_linker.begin_training(lambda: [])
|
entity_linker.initialize(lambda: [])
|
||||||
|
|
||||||
|
|
||||||
def test_kb_empty(nlp):
|
def test_kb_empty(nlp):
|
||||||
|
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config=config)
|
entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||||
assert len(entity_linker.kb) == 0
|
assert len(entity_linker.kb) == 0
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
entity_linker.begin_training(lambda: [])
|
entity_linker.initialize(lambda: [])
|
||||||
|
|
||||||
|
|
||||||
def test_kb_serialize(nlp):
|
def test_kb_serialize(nlp):
|
||||||
|
@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||||
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||||
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||||
|
|
||||||
# adding aliases
|
# adding aliases
|
||||||
douglas_hash = mykb.add_alias(
|
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||||
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
|
|
||||||
)
|
|
||||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||||
|
|
||||||
candidates = mykb.get_alias_candidates("adam")
|
candidates = mykb.get_alias_candidates("adam")
|
||||||
|
@ -360,7 +358,7 @@ def test_preserving_links_asdoc(nlp):
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert entity_linker.model.get_dim("nO") == vector_length
|
assert entity_linker.model.get_dim("nO") == vector_length
|
||||||
|
|
||||||
# test whether the entity links are preserved by the `as_doc()` function
|
# test whether the entity links are preserved by the `as_doc()` function
|
||||||
|
@ -463,7 +461,7 @@ def test_overfitting_IO():
|
||||||
)
|
)
|
||||||
|
|
||||||
# train the NEL pipe
|
# train the NEL pipe
|
||||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
assert entity_linker.model.get_dim("nO") == vector_length
|
assert entity_linker.model.get_dim("nO") == vector_length
|
||||||
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
|
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
|
||||||
|
|
||||||
|
|
69
spacy/tests/pipeline/test_initialize.py
Normal file
69
spacy/tests/pipeline/test_initialize.py
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.training import Example
|
||||||
|
from thinc.api import ConfigValidationError
|
||||||
|
from pydantic import StrictBool
|
||||||
|
|
||||||
|
|
||||||
|
def test_initialize_arguments():
|
||||||
|
name = "test_initialize_arguments"
|
||||||
|
|
||||||
|
class CustomTokenizer:
|
||||||
|
def __init__(self, tokenizer):
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
self.from_initialize = None
|
||||||
|
|
||||||
|
def __call__(self, text):
|
||||||
|
return self.tokenizer(text)
|
||||||
|
|
||||||
|
def initialize(self, get_examples, nlp, custom: int):
|
||||||
|
self.from_initialize = custom
|
||||||
|
|
||||||
|
class Component:
|
||||||
|
def __init__(self):
|
||||||
|
self.from_initialize = None
|
||||||
|
|
||||||
|
def initialize(
|
||||||
|
self, get_examples, nlp, custom1: str, custom2: StrictBool = False
|
||||||
|
):
|
||||||
|
self.from_initialize = (custom1, custom2)
|
||||||
|
|
||||||
|
Language.factory(name, func=lambda nlp, name: Component())
|
||||||
|
|
||||||
|
nlp = English()
|
||||||
|
nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
|
||||||
|
example = Example.from_dict(nlp("x"), {})
|
||||||
|
get_examples = lambda: [example]
|
||||||
|
nlp.add_pipe(name)
|
||||||
|
# The settings here will typically come from the [initialize] block
|
||||||
|
init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
|
||||||
|
nlp.config["initialize"].update(init_cfg)
|
||||||
|
with pytest.raises(ConfigValidationError) as e:
|
||||||
|
# Empty config for component, no required custom1 argument
|
||||||
|
nlp.initialize(get_examples)
|
||||||
|
errors = e.value.errors
|
||||||
|
assert len(errors) == 1
|
||||||
|
assert errors[0]["loc"] == ("custom1",)
|
||||||
|
assert errors[0]["type"] == "value_error.missing"
|
||||||
|
init_cfg = {
|
||||||
|
"tokenizer": {"custom": 1},
|
||||||
|
"components": {name: {"custom1": "x", "custom2": 1}},
|
||||||
|
}
|
||||||
|
nlp.config["initialize"].update(init_cfg)
|
||||||
|
with pytest.raises(ConfigValidationError) as e:
|
||||||
|
# Wrong type of custom 2
|
||||||
|
nlp.initialize(get_examples)
|
||||||
|
errors = e.value.errors
|
||||||
|
assert len(errors) == 1
|
||||||
|
assert errors[0]["loc"] == ("custom2",)
|
||||||
|
assert errors[0]["type"] == "value_error.strictbool"
|
||||||
|
init_cfg = {
|
||||||
|
"tokenizer": {"custom": 1},
|
||||||
|
"components": {name: {"custom1": "x"}},
|
||||||
|
}
|
||||||
|
nlp.config["initialize"].update(init_cfg)
|
||||||
|
nlp.initialize(get_examples)
|
||||||
|
assert nlp.tokenizer.from_initialize == 1
|
||||||
|
pipe = nlp.get_pipe(name)
|
||||||
|
assert pipe.from_initialize == ("x", False)
|
|
@ -33,7 +33,7 @@ def test_no_label():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("morphologizer")
|
nlp.add_pipe("morphologizer")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
|
|
||||||
def test_implicit_label():
|
def test_implicit_label():
|
||||||
|
@ -42,7 +42,7 @@ def test_implicit_label():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_no_resize():
|
def test_no_resize():
|
||||||
|
@ -50,13 +50,13 @@ def test_no_resize():
|
||||||
morphologizer = nlp.add_pipe("morphologizer")
|
morphologizer = nlp.add_pipe("morphologizer")
|
||||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
||||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
|
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
# this throws an error because the morphologizer can't be resized after initialization
|
# this throws an error because the morphologizer can't be resized after initialization
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
|
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
|
||||||
|
|
||||||
|
|
||||||
def test_begin_training_examples():
|
def test_initialize_examples():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
morphologizer = nlp.add_pipe("morphologizer")
|
morphologizer = nlp.add_pipe("morphologizer")
|
||||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
||||||
|
@ -64,12 +64,12 @@ def test_begin_training_examples():
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
# you shouldn't really call this more than once, but for testing it should be fine
|
# you shouldn't really call this more than once, but for testing it should be fine
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
with pytest.raises(TypeError):
|
|
||||||
nlp.begin_training(get_examples=lambda: None)
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training(get_examples=train_examples)
|
nlp.initialize(get_examples=lambda: None)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.initialize(get_examples=train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
|
@ -79,7 +79,7 @@ def test_overfitting_IO():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for inst in TRAIN_DATA:
|
for inst in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
|
||||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
losses = {}
|
losses = {}
|
||||||
|
|
|
@ -4,8 +4,7 @@ from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
||||||
from thinc.api import Model, Linear
|
from thinc.api import Model, Linear, ConfigValidationError
|
||||||
from thinc.config import ConfigValidationError
|
|
||||||
from pydantic import StrictInt, StrictStr
|
from pydantic import StrictInt, StrictStr
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
|
@ -31,19 +31,19 @@ TRAIN_DATA = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_begin_training_examples():
|
def test_initialize_examples():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("senter")
|
nlp.add_pipe("senter")
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
# you shouldn't really call this more than once, but for testing it should be fine
|
# you shouldn't really call this more than once, but for testing it should be fine
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
with pytest.raises(TypeError):
|
|
||||||
nlp.begin_training(get_examples=lambda: None)
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training(get_examples=train_examples)
|
nlp.initialize(get_examples=lambda: None)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.initialize(get_examples=train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
|
@ -58,7 +58,7 @@ def test_overfitting_IO():
|
||||||
train_examples[1].reference[11].is_sent_start = False
|
train_examples[1].reference[11].is_sent_start = False
|
||||||
|
|
||||||
nlp.add_pipe("senter")
|
nlp.add_pipe("senter")
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
|
|
||||||
for i in range(200):
|
for i in range(200):
|
||||||
losses = {}
|
losses = {}
|
||||||
|
|
|
@ -15,14 +15,14 @@ def test_label_types():
|
||||||
tagger.add_label(9)
|
tagger.add_label(9)
|
||||||
|
|
||||||
|
|
||||||
def test_tagger_begin_training_tag_map():
|
def test_tagger_initialize_tag_map():
|
||||||
"""Test that Tagger.begin_training() without gold tuples does not clobber
|
"""Test that Tagger.initialize() without gold tuples does not clobber
|
||||||
the tag map."""
|
the tag map."""
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
tagger = nlp.add_pipe("tagger")
|
tagger = nlp.add_pipe("tagger")
|
||||||
orig_tag_count = len(tagger.labels)
|
orig_tag_count = len(tagger.labels)
|
||||||
tagger.add_label("A")
|
tagger.add_label("A")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
|
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
|
||||||
|
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ def test_no_label():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("tagger")
|
nlp.add_pipe("tagger")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
|
|
||||||
def test_no_resize():
|
def test_no_resize():
|
||||||
|
@ -47,7 +47,7 @@ def test_no_resize():
|
||||||
tagger.add_label("N")
|
tagger.add_label("N")
|
||||||
tagger.add_label("V")
|
tagger.add_label("V")
|
||||||
assert tagger.labels == ("N", "V")
|
assert tagger.labels == ("N", "V")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert tagger.model.get_dim("nO") == 2
|
assert tagger.model.get_dim("nO") == 2
|
||||||
# this throws an error because the tagger can't be resized after initialization
|
# this throws an error because the tagger can't be resized after initialization
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
@ -60,10 +60,10 @@ def test_implicit_label():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_begin_training_examples():
|
def test_initialize_examples():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
tagger = nlp.add_pipe("tagger")
|
tagger = nlp.add_pipe("tagger")
|
||||||
train_examples = []
|
train_examples = []
|
||||||
|
@ -72,16 +72,16 @@ def test_begin_training_examples():
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
# you shouldn't really call this more than once, but for testing it should be fine
|
# you shouldn't really call this more than once, but for testing it should be fine
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
with pytest.raises(TypeError):
|
|
||||||
nlp.begin_training(get_examples=lambda: None)
|
|
||||||
with pytest.raises(TypeError):
|
|
||||||
nlp.begin_training(get_examples=lambda: train_examples[0])
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training(get_examples=lambda: [])
|
nlp.initialize(get_examples=lambda: None)
|
||||||
|
with pytest.raises(TypeError):
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples[0])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training(get_examples=train_examples)
|
nlp.initialize(get_examples=lambda: [])
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.initialize(get_examples=train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
|
@ -91,7 +91,7 @@ def test_overfitting_IO():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
assert tagger.model.get_dim("nO") == len(TAGS)
|
assert tagger.model.get_dim("nO") == len(TAGS)
|
||||||
|
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
|
@ -122,4 +122,4 @@ def test_tagger_requires_labels():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("tagger")
|
nlp.add_pipe("tagger")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
|
@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||||
from spacy.scorer import Scorer
|
from spacy.scorer import Scorer
|
||||||
|
from spacy.training import Example
|
||||||
|
from spacy.training.initialize import verify_textcat_config
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
from ...cli.train import verify_textcat_config
|
|
||||||
from ...training import Example
|
|
||||||
|
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
|
@ -26,7 +26,7 @@ def test_simple_train():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
textcat = nlp.add_pipe("textcat")
|
textcat = nlp.add_pipe("textcat")
|
||||||
textcat.add_label("answer")
|
textcat.add_label("answer")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
for text, answer in [
|
for text, answer in [
|
||||||
("aaaa", 1.0),
|
("aaaa", 1.0),
|
||||||
|
@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
|
||||||
textcat = TextCategorizer(nlp.vocab, width=8)
|
textcat = TextCategorizer(nlp.vocab, width=8)
|
||||||
for letter in letters:
|
for letter in letters:
|
||||||
textcat.add_label(letter)
|
textcat.add_label(letter)
|
||||||
optimizer = textcat.begin_training(lambda: [])
|
optimizer = textcat.initialize(lambda: [])
|
||||||
for i in range(30):
|
for i in range(30):
|
||||||
losses = {}
|
losses = {}
|
||||||
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
|
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
|
||||||
|
@ -86,7 +86,7 @@ def test_no_label():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("textcat")
|
nlp.add_pipe("textcat")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
|
|
||||||
def test_implicit_label():
|
def test_implicit_label():
|
||||||
|
@ -95,7 +95,7 @@ def test_implicit_label():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_no_resize():
|
def test_no_resize():
|
||||||
|
@ -103,14 +103,14 @@ def test_no_resize():
|
||||||
textcat = nlp.add_pipe("textcat")
|
textcat = nlp.add_pipe("textcat")
|
||||||
textcat.add_label("POSITIVE")
|
textcat.add_label("POSITIVE")
|
||||||
textcat.add_label("NEGATIVE")
|
textcat.add_label("NEGATIVE")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert textcat.model.get_dim("nO") == 2
|
assert textcat.model.get_dim("nO") == 2
|
||||||
# this throws an error because the textcat can't be resized after initialization
|
# this throws an error because the textcat can't be resized after initialization
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
textcat.add_label("NEUTRAL")
|
textcat.add_label("NEUTRAL")
|
||||||
|
|
||||||
|
|
||||||
def test_begin_training_examples():
|
def test_initialize_examples():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
textcat = nlp.add_pipe("textcat")
|
textcat = nlp.add_pipe("textcat")
|
||||||
train_examples = []
|
train_examples = []
|
||||||
|
@ -119,12 +119,12 @@ def test_begin_training_examples():
|
||||||
for label, value in annotations.get("cats").items():
|
for label, value in annotations.get("cats").items():
|
||||||
textcat.add_label(label)
|
textcat.add_label(label)
|
||||||
# you shouldn't really call this more than once, but for testing it should be fine
|
# you shouldn't really call this more than once, but for testing it should be fine
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
nlp.begin_training(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
with pytest.raises(TypeError):
|
|
||||||
nlp.begin_training(get_examples=lambda: None)
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training(get_examples=train_examples)
|
nlp.initialize(get_examples=lambda: None)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.initialize(get_examples=train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
|
@ -139,7 +139,7 @@ def test_overfitting_IO():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
assert textcat.model.get_dim("nO") == 2
|
assert textcat.model.get_dim("nO") == 2
|
||||||
|
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
|
@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
for label, value in annotations.get("cats").items():
|
for label, value in annotations.get("cats").items():
|
||||||
textcat.add_label(label)
|
textcat.add_label(label)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
@ -226,6 +226,7 @@ def test_positive_class_not_binary():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
verify_textcat_config(nlp, pipe_config)
|
verify_textcat_config(nlp, pipe_config)
|
||||||
|
|
||||||
|
|
||||||
def test_textcat_evaluation():
|
def test_textcat_evaluation():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
@ -241,15 +242,17 @@ def test_textcat_evaluation():
|
||||||
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
|
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
|
||||||
train_examples.append(Example(pred2, ref2))
|
train_examples.append(Example(pred2, ref2))
|
||||||
|
|
||||||
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
|
scores = Scorer().score_cats(
|
||||||
assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
|
train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
|
||||||
assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
|
)
|
||||||
|
assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
|
||||||
|
assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
|
||||||
assert scores["cats_f_per_type"]["summer"]["p"] == 0
|
assert scores["cats_f_per_type"]["summer"]["p"] == 0
|
||||||
assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
|
assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
|
||||||
assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
|
assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
|
||||||
assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
|
assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
|
||||||
assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
|
assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
|
||||||
assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
|
assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2
|
||||||
|
|
||||||
assert scores["cats_micro_p"] == 4/5
|
assert scores["cats_micro_p"] == 4 / 5
|
||||||
assert scores["cats_micro_r"] == 4/6
|
assert scores["cats_micro_r"] == 4 / 6
|
||||||
|
|
|
@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
|
||||||
encode_config["width"] = width
|
encode_config["width"] = width
|
||||||
docs = get_batch(3)
|
docs = get_batch(3)
|
||||||
tok2vec = build_Tok2Vec_model(
|
tok2vec = build_Tok2Vec_model(
|
||||||
embed_arch(**embed_config),
|
embed_arch(**embed_config), encode_arch(**encode_config)
|
||||||
encode_arch(**encode_config)
|
|
||||||
)
|
)
|
||||||
tok2vec.initialize(docs)
|
tok2vec.initialize(docs)
|
||||||
vectors, backprop = tok2vec.begin_update(docs)
|
vectors, backprop = tok2vec.begin_update(docs)
|
||||||
|
@ -88,7 +87,7 @@ def test_init_tok2vec():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
tok2vec = nlp.add_pipe("tok2vec")
|
tok2vec = nlp.add_pipe("tok2vec")
|
||||||
assert tok2vec.listeners == []
|
assert tok2vec.listeners == []
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert tok2vec.model.get_dim("nO")
|
assert tok2vec.model.get_dim("nO")
|
||||||
|
|
||||||
|
|
||||||
|
@ -139,7 +138,7 @@ TRAIN_DATA = [
|
||||||
|
|
||||||
def test_tok2vec_listener():
|
def test_tok2vec_listener():
|
||||||
orig_config = Config().from_str(cfg_string)
|
orig_config = Config().from_str(cfg_string)
|
||||||
nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||||
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
||||||
tagger = nlp.get_pipe("tagger")
|
tagger = nlp.get_pipe("tagger")
|
||||||
tok2vec = nlp.get_pipe("tok2vec")
|
tok2vec = nlp.get_pipe("tok2vec")
|
||||||
|
@ -154,7 +153,7 @@ def test_tok2vec_listener():
|
||||||
|
|
||||||
# Check that the Tok2Vec component finds it listeners
|
# Check that the Tok2Vec component finds it listeners
|
||||||
assert tok2vec.listeners == []
|
assert tok2vec.listeners == []
|
||||||
optimizer = nlp.begin_training(lambda: train_examples)
|
optimizer = nlp.initialize(lambda: train_examples)
|
||||||
assert tok2vec.listeners == [tagger_tok2vec]
|
assert tok2vec.listeners == [tagger_tok2vec]
|
||||||
|
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
|
@ -173,7 +172,7 @@ def test_tok2vec_listener():
|
||||||
|
|
||||||
def test_tok2vec_listener_callback():
|
def test_tok2vec_listener_callback():
|
||||||
orig_config = Config().from_str(cfg_string)
|
orig_config = Config().from_str(cfg_string)
|
||||||
nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||||
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
||||||
tagger = nlp.get_pipe("tagger")
|
tagger = nlp.get_pipe("tagger")
|
||||||
tok2vec = nlp.get_pipe("tok2vec")
|
tok2vec = nlp.get_pipe("tok2vec")
|
||||||
|
|
|
@ -428,7 +428,7 @@ def test_issue999():
|
||||||
for _, offsets in TRAIN_DATA:
|
for _, offsets in TRAIN_DATA:
|
||||||
for start, end, label in offsets:
|
for start, end, label in offsets:
|
||||||
ner.add_label(label)
|
ner.add_label(label)
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
for itn in range(20):
|
for itn in range(20):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
for raw_text, entity_offsets in TRAIN_DATA:
|
for raw_text, entity_offsets in TRAIN_DATA:
|
||||||
|
|
|
@ -250,7 +250,7 @@ def test_issue1915():
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("answer")
|
ner.add_label("answer")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.begin_training(**cfg)
|
nlp.initialize(**cfg)
|
||||||
|
|
||||||
|
|
||||||
def test_issue1945():
|
def test_issue1945():
|
||||||
|
|
|
@ -30,7 +30,7 @@ def test_issue2179():
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("CITIZENSHIP")
|
ner.add_label("CITIZENSHIP")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
nlp2 = Italian()
|
nlp2 = Italian()
|
||||||
nlp2.add_pipe("ner")
|
nlp2.add_pipe("ner")
|
||||||
assert len(nlp2.get_pipe("ner").labels) == 0
|
assert len(nlp2.get_pipe("ner").labels) == 0
|
||||||
|
|
|
@ -18,7 +18,7 @@ def test_issue2564():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
tagger = nlp.add_pipe("tagger")
|
tagger = nlp.add_pipe("tagger")
|
||||||
tagger.add_label("A")
|
tagger.add_label("A")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
doc = nlp("hello world")
|
doc = nlp("hello world")
|
||||||
assert doc.has_annotation("TAG")
|
assert doc.has_annotation("TAG")
|
||||||
docs = nlp.pipe(["hello", "world"])
|
docs = nlp.pipe(["hello", "world"])
|
||||||
|
@ -149,7 +149,7 @@ def test_issue2800():
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
for entity_type in list(entity_types):
|
for entity_type in list(entity_types):
|
||||||
ner.add_label(entity_type)
|
ner.add_label(entity_type)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(20):
|
for i in range(20):
|
||||||
losses = {}
|
losses = {}
|
||||||
random.shuffle(train_data)
|
random.shuffle(train_data)
|
||||||
|
|
|
@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
|
||||||
words = ["This", "is", "10", "%", "."]
|
words = ["This", "is", "10", "%", "."]
|
||||||
tags = ["DT", "VBZ", "CD", "NN", "."]
|
tags = ["DT", "VBZ", "CD", "NN", "."]
|
||||||
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
|
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
|
||||||
ents = [("PERCENT", 2, 4)]
|
ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
|
||||||
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
|
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
|
||||||
assert doc.has_annotation("TAG")
|
assert doc.has_annotation("TAG")
|
||||||
expected = ("10", "NUM", "CD", "PERCENT")
|
expected = ("10", "NUM", "CD", "PERCENT")
|
||||||
|
@ -92,7 +92,7 @@ def test_issue3209():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("ANIMAL")
|
ner.add_label("ANIMAL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
|
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
|
||||||
assert ner.move_names == move_names
|
assert ner.move_names == move_names
|
||||||
nlp2 = English()
|
nlp2 = English()
|
||||||
|
@ -195,7 +195,7 @@ def test_issue3345():
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
cfg = {"model": DEFAULT_NER_MODEL}
|
cfg = {"model": DEFAULT_NER_MODEL}
|
||||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
model = registry.resolve(cfg, validate=True)["model"]
|
||||||
ner = EntityRecognizer(doc.vocab, model, **config)
|
ner = EntityRecognizer(doc.vocab, model, **config)
|
||||||
# Add the OUT action. I wouldn't have thought this would be necessary...
|
# Add the OUT action. I wouldn't have thought this would be necessary...
|
||||||
ner.moves.add_action(5, "")
|
ner.moves.add_action(5, "")
|
||||||
|
@ -239,7 +239,7 @@ def test_issue3456():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
tagger = nlp.add_pipe("tagger")
|
tagger = nlp.add_pipe("tagger")
|
||||||
tagger.add_label("A")
|
tagger.add_label("A")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
list(nlp.pipe(["hi", ""]))
|
list(nlp.pipe(["hi", ""]))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -223,15 +223,13 @@ def test_issue3611():
|
||||||
textcat.add_label(label)
|
textcat.add_label(label)
|
||||||
# training the network
|
# training the network
|
||||||
with nlp.select_pipes(enable="textcat"):
|
with nlp.select_pipes(enable="textcat"):
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
losses = {}
|
losses = {}
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
nlp.update(
|
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3625():
|
def test_issue3625():
|
||||||
|
@ -264,13 +262,11 @@ def test_issue3830_no_subtok():
|
||||||
"min_action_freq": 30,
|
"min_action_freq": 30,
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
|
model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
|
||||||
"model"
|
|
||||||
]
|
|
||||||
parser = DependencyParser(Vocab(), model, **config)
|
parser = DependencyParser(Vocab(), model, **config)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
parser.begin_training(lambda: [_parser_example(parser)])
|
parser.initialize(lambda: [_parser_example(parser)])
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
@ -281,13 +277,11 @@ def test_issue3830_with_subtok():
|
||||||
"min_action_freq": 30,
|
"min_action_freq": 30,
|
||||||
"update_with_oracle_cut_size": 100,
|
"update_with_oracle_cut_size": 100,
|
||||||
}
|
}
|
||||||
model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
|
model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
|
||||||
"model"
|
|
||||||
]
|
|
||||||
parser = DependencyParser(Vocab(), model, **config)
|
parser = DependencyParser(Vocab(), model, **config)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
parser.begin_training(lambda: [_parser_example(parser)])
|
parser.initialize(lambda: [_parser_example(parser)])
|
||||||
assert "subtok" in parser.labels
|
assert "subtok" in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
@ -346,7 +340,7 @@ def test_issue3880():
|
||||||
nlp.add_pipe("parser").add_label("dep")
|
nlp.add_pipe("parser").add_label("dep")
|
||||||
nlp.add_pipe("ner").add_label("PERSON")
|
nlp.add_pipe("ner").add_label("PERSON")
|
||||||
nlp.add_pipe("tagger").add_label("NN")
|
nlp.add_pipe("tagger").add_label("NN")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
for doc in nlp.pipe(texts):
|
for doc in nlp.pipe(texts):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -394,7 +388,7 @@ def test_issue3959():
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962(en_vocab):
|
def test_issue3962(en_vocab):
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
"""Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||||
|
@ -432,7 +426,7 @@ def test_issue3962(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962_long(en_vocab):
|
def test_issue3962_long(en_vocab):
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
"""Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||||
|
@ -467,8 +461,7 @@ def test_issue3962_long(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_issue3972(en_vocab):
|
def test_issue3972(en_vocab):
|
||||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
|
|
@ -19,8 +19,7 @@ from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
def test_issue4002(en_vocab):
|
def test_issue4002(en_vocab):
|
||||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||||
|
@ -66,15 +65,13 @@ def test_issue4030():
|
||||||
textcat.add_label(label)
|
textcat.add_label(label)
|
||||||
# training the network
|
# training the network
|
||||||
with nlp.select_pipes(enable="textcat"):
|
with nlp.select_pipes(enable="textcat"):
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
losses = {}
|
losses = {}
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
nlp.update(
|
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
||||||
# processing of an empty doc should result in 0.0 for all categories
|
# processing of an empty doc should result in 0.0 for all categories
|
||||||
doc = nlp("")
|
doc = nlp("")
|
||||||
assert doc.cats["offensive"] == 0.0
|
assert doc.cats["offensive"] == 0.0
|
||||||
|
@ -87,7 +84,7 @@ def test_issue4042():
|
||||||
# add ner pipe
|
# add ner pipe
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("SOME_LABEL")
|
ner.add_label("SOME_LABEL")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
# Add entity ruler
|
# Add entity ruler
|
||||||
patterns = [
|
patterns = [
|
||||||
{"label": "MY_ORG", "pattern": "Apple"},
|
{"label": "MY_ORG", "pattern": "Apple"},
|
||||||
|
@ -118,7 +115,7 @@ def test_issue4042_bug2():
|
||||||
# add ner pipe
|
# add ner pipe
|
||||||
ner1 = nlp1.add_pipe("ner")
|
ner1 = nlp1.add_pipe("ner")
|
||||||
ner1.add_label("SOME_LABEL")
|
ner1.add_label("SOME_LABEL")
|
||||||
nlp1.begin_training()
|
nlp1.initialize()
|
||||||
# add a new label to the doc
|
# add a new label to the doc
|
||||||
doc1 = nlp1("What do you think about Apple ?")
|
doc1 = nlp1("What do you think about Apple ?")
|
||||||
assert len(ner1.labels) == 1
|
assert len(ner1.labels) == 1
|
||||||
|
@ -244,7 +241,7 @@ def test_issue4267():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = nlp.add_pipe("ner")
|
ner = nlp.add_pipe("ner")
|
||||||
ner.add_label("PEOPLE")
|
ner.add_label("PEOPLE")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert "ner" in nlp.pipe_names
|
assert "ner" in nlp.pipe_names
|
||||||
# assert that we have correct IOB annotations
|
# assert that we have correct IOB annotations
|
||||||
doc1 = nlp("hi")
|
doc1 = nlp("hi")
|
||||||
|
@ -299,7 +296,7 @@ def test_issue4313():
|
||||||
config = {}
|
config = {}
|
||||||
ner = nlp.create_pipe("ner", config=config)
|
ner = nlp.create_pipe("ner", config=config)
|
||||||
ner.add_label("SOME_LABEL")
|
ner.add_label("SOME_LABEL")
|
||||||
ner.begin_training(lambda: [])
|
ner.initialize(lambda: [])
|
||||||
# add a new label to the doc
|
# add a new label to the doc
|
||||||
doc = nlp("What do you think about Apple ?")
|
doc = nlp("What do you think about Apple ?")
|
||||||
assert len(ner.labels) == 1
|
assert len(ner.labels) == 1
|
||||||
|
@ -327,7 +324,7 @@ def test_issue4348():
|
||||||
TRAIN_DATA = [example, example]
|
TRAIN_DATA = [example, example]
|
||||||
tagger = nlp.add_pipe("tagger")
|
tagger = nlp.add_pipe("tagger")
|
||||||
tagger.add_label("A")
|
tagger.add_label("A")
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.initialize()
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
losses = {}
|
losses = {}
|
||||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
|
@ -180,7 +180,7 @@ def test_issue4725_2():
|
||||||
vocab.set_vector("dog", data[1])
|
vocab.set_vector("dog", data[1])
|
||||||
nlp = English(vocab=vocab)
|
nlp = English(vocab=vocab)
|
||||||
nlp.add_pipe("ner")
|
nlp.add_pipe("ner")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
docs = ["Kurt is in London."] * 10
|
docs = ["Kurt is in London."] * 10
|
||||||
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -64,7 +64,7 @@ def tagger():
|
||||||
# 1. no model leads to error in serialization,
|
# 1. no model leads to error in serialization,
|
||||||
# 2. the affected line is the one for model serialization
|
# 2. the affected line is the one for model serialization
|
||||||
tagger.add_label("A")
|
tagger.add_label("A")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
return tagger
|
return tagger
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ def entity_linker():
|
||||||
# need to add model for two reasons:
|
# need to add model for two reasons:
|
||||||
# 1. no model leads to error in serialization,
|
# 1. no model leads to error in serialization,
|
||||||
# 2. the affected line is the one for model serialization
|
# 2. the affected line is the one for model serialization
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
return entity_linker
|
return entity_linker
|
||||||
|
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user