mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge remote-tracking branch 'upstream/develop' into feature/small-fixes
This commit is contained in:
commit
02247cccaf
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
SHELL := /bin/bash
|
||||
|
||||
ifndef SPACY_EXTRAS
|
||||
override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
|
||||
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
|
||||
endif
|
||||
|
||||
ifndef PYVER
|
||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0a41,<8.0.0a50",
|
||||
"thinc>=8.0.0a43,<8.0.0a50",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations",
|
||||
"pathy"
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a41,<8.0.0a50
|
||||
thinc>=8.0.0a43,<8.0.0a50
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets==0.2.0a0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.8.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
srsly>=2.3.0,<3.0.0
|
||||
catalogue>=2.0.1,<2.1.0
|
||||
typer>=0.3.0,<0.4.0
|
||||
pathy
|
||||
|
|
12
setup.cfg
12
setup.cfg
|
@ -34,16 +34,16 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.0a41,<8.0.0a50
|
||||
thinc>=8.0.0a43,<8.0.0a50
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a41,<8.0.0a50
|
||||
thinc>=8.0.0a43,<8.0.0a50
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.8.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
srsly>=2.3.0,<3.0.0
|
||||
catalogue>=2.0.1,<2.1.0
|
||||
typer>=0.3.0,<0.4.0
|
||||
pathy
|
||||
|
@ -65,7 +65,7 @@ console_scripts =
|
|||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
spacy_lookups_data==0.4.0.dev0
|
||||
spacy_lookups_data==1.0.0rc0
|
||||
cuda =
|
||||
cupy>=5.0.0b4,<9.0.0
|
||||
cuda80 =
|
||||
|
@ -84,7 +84,7 @@ cuda102 =
|
|||
cupy-cuda102>=5.0.0b4,<9.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
sudachipy>=0.4.5
|
||||
sudachipy>=0.4.9
|
||||
sudachidict_core>=20200330
|
||||
ko =
|
||||
natto-py==0.9.0
|
||||
|
@ -98,7 +98,7 @@ universal = false
|
|||
formats = gztar
|
||||
|
||||
[flake8]
|
||||
ignore = E203, E266, E501, E731, W503
|
||||
ignore = E203, E266, E501, E731, W503, E741
|
||||
max-line-length = 80
|
||||
select = B,C,E,F,W,T4,B9
|
||||
exclude =
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a26"
|
||||
__version__ = "3.0.0a29"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -15,7 +15,7 @@ from .debug_config import debug_config # noqa: F401
|
|||
from .debug_model import debug_model # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_model import init_model # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .init_config import init_config, fill_config # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
from .project.clone import project_clone # noqa: F401
|
||||
|
|
|
@ -10,12 +10,13 @@ from click import NoSuchOption
|
|||
from click.parser import split_arg_string
|
||||
from typer.main import get_command
|
||||
from contextlib import contextmanager
|
||||
from thinc.api import Config, ConfigValidationError
|
||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||
from configparser import InterpolationError
|
||||
import os
|
||||
|
||||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||
from ..util import ENV_VARS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
|
@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
|
|||
and custom model implementations.
|
||||
"""
|
||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
# keep the names short, but not needed at the moment.
|
||||
|
@ -65,7 +65,7 @@ def setup_cli() -> None:
|
|||
|
||||
|
||||
def parse_config_overrides(
|
||||
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
|
||||
args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate a dictionary of config overrides based on the extra arguments
|
||||
provided on the CLI, e.g. --training.batch_size to override
|
||||
|
@ -275,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
|||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||
|
||||
|
||||
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
|
||||
"""RETURNS (List[str]): All sourced components in the original config,
|
||||
e.g. {"source": "en_core_web_sm"}. If the config contains a key
|
||||
"factory", we assume it refers to a component factory.
|
||||
"""
|
||||
return [
|
||||
name
|
||||
for name, cfg in config.get("components", {}).items()
|
||||
if "factory" not in cfg and "source" in cfg
|
||||
]
|
||||
|
||||
|
||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||
"""Upload a file.
|
||||
|
||||
|
@ -458,3 +446,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
|||
p = int(p)
|
||||
result.append(p)
|
||||
return result
|
||||
|
||||
|
||||
def setup_gpu(use_gpu: int) -> None:
|
||||
"""Configure the GPU and log info."""
|
||||
if use_gpu >= 0:
|
||||
msg.info(f"Using GPU: {use_gpu}")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
|
|
|
@ -9,7 +9,8 @@ import sys
|
|||
from ._util import app, Arg, Opt
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
from ..training.converters import conllu_to_docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
from pathlib import Path
|
||||
from wasabi import msg, table
|
||||
from thinc.api import Config, ConfigValidationError
|
||||
from thinc.api import Config
|
||||
from thinc.config import VARIABLE_RE
|
||||
import typer
|
||||
|
||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -52,10 +54,10 @@ def debug_config(
|
|||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
nlp = util.load_model_from_config(config)
|
||||
# Use the resolved config here in case user has one function returning
|
||||
# a dict of corpora etc.
|
||||
resolved = util.resolve_training_config(nlp.config)
|
||||
check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
|
||||
config = nlp.config.interpolate()
|
||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||
util.resolve_dot_names(config, dot_names)
|
||||
msg.good("Config is valid")
|
||||
if show_vars:
|
||||
variables = get_variables(config)
|
||||
|
@ -97,23 +99,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
|
|||
value = util.dot_to_object(config, path)
|
||||
result[variable] = repr(value)
|
||||
return result
|
||||
|
||||
|
||||
def check_section_refs(config: Config, fields: List[str]) -> None:
|
||||
"""Validate fields in the config that refer to other sections or values
|
||||
(e.g. in the corpora) and make sure that those references exist.
|
||||
"""
|
||||
errors = []
|
||||
for field in fields:
|
||||
# If the field doesn't exist in the config, we ignore it
|
||||
try:
|
||||
value = util.dot_to_object(config, field)
|
||||
except KeyError:
|
||||
continue
|
||||
try:
|
||||
util.dot_to_object(config, value)
|
||||
except KeyError:
|
||||
msg = f"not a valid section reference: {value}"
|
||||
errors.append({"loc": field.split("."), "msg": msg})
|
||||
if errors:
|
||||
raise ConfigValidationError(config=config, errors=errors)
|
||||
|
|
|
@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
|
|||
import typer
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli, get_sourced_components
|
||||
from ..training import Corpus, Example
|
||||
from ._util import import_code, debug_cli
|
||||
from ..training import Example
|
||||
from ..training.initialize import get_sourced_components
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
from ..language import Language
|
||||
from ..util import registry, resolve_dot_names
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -24,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
|||
|
||||
|
||||
@debug_cli.command(
|
||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||
)
|
||||
@app.command(
|
||||
"debug-data",
|
||||
|
@ -34,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000
|
|||
def debug_data_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
|
||||
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||
|
@ -59,8 +60,6 @@ def debug_data_cli(
|
|||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
debug_data(
|
||||
train_path,
|
||||
dev_path,
|
||||
config_path,
|
||||
config_overrides=overrides,
|
||||
ignore_warnings=ignore_warnings,
|
||||
|
@ -71,8 +70,6 @@ def debug_data_cli(
|
|||
|
||||
|
||||
def debug_data(
|
||||
train_path: Path,
|
||||
dev_path: Path,
|
||||
config_path: Path,
|
||||
*,
|
||||
config_overrides: Dict[str, Any] = {},
|
||||
|
@ -85,57 +82,29 @@ def debug_data(
|
|||
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
|
||||
)
|
||||
# Make sure all files and paths exists if they are needed
|
||||
if not train_path.exists():
|
||||
msg.fail("Training data not found", train_path, exits=1)
|
||||
if not dev_path.exists():
|
||||
msg.fail("Development data not found", dev_path, exits=1)
|
||||
if not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exists=1)
|
||||
with show_validation_error(config_path):
|
||||
cfg = util.load_config(config_path, overrides=config_overrides)
|
||||
nlp = util.load_model_from_config(cfg)
|
||||
C = util.resolve_training_config(nlp.config)
|
||||
config = nlp.config.interpolate()
|
||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||
# Use original config here, not resolved version
|
||||
sourced_components = get_sourced_components(cfg)
|
||||
frozen_components = C["training"]["frozen_components"]
|
||||
frozen_components = T["frozen_components"]
|
||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
||||
pipeline = nlp.pipe_names
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
tag_map_path = util.ensure_path(C["training"]["tag_map"])
|
||||
tag_map = {}
|
||||
if tag_map_path is not None:
|
||||
tag_map = srsly.read_json(tag_map_path)
|
||||
morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
|
||||
morph_rules = {}
|
||||
if morph_rules_path is not None:
|
||||
morph_rules = srsly.read_json(morph_rules_path)
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
# Load morph rules
|
||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
msg.divider("Data file validation")
|
||||
|
||||
# Create the gold corpus to be able to better analyze data
|
||||
loading_train_error_message = ""
|
||||
loading_dev_error_message = ""
|
||||
with msg.loading("Loading corpus..."):
|
||||
try:
|
||||
train_dataset = list(Corpus(train_path)(nlp))
|
||||
except ValueError as e:
|
||||
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
||||
try:
|
||||
dev_dataset = list(Corpus(dev_path)(nlp))
|
||||
except ValueError as e:
|
||||
loading_dev_error_message = f"Development data cannot be loaded: {e}"
|
||||
if loading_train_error_message or loading_dev_error_message:
|
||||
if loading_train_error_message:
|
||||
msg.fail(loading_train_error_message)
|
||||
if loading_dev_error_message:
|
||||
msg.fail(loading_dev_error_message)
|
||||
sys.exit(1)
|
||||
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
||||
train_dataset = list(train_corpus(nlp))
|
||||
dev_dataset = list(dev_corpus(nlp))
|
||||
msg.good("Corpus is loadable")
|
||||
|
||||
nlp.initialize(lambda: train_dataset)
|
||||
msg.good("Pipeline can be initialized with data")
|
||||
|
||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
|
||||
gold_train_unpreprocessed_data = _compile_gold(
|
||||
|
@ -145,10 +114,10 @@ def debug_data(
|
|||
|
||||
train_texts = gold_train_data["texts"]
|
||||
dev_texts = gold_dev_data["texts"]
|
||||
frozen_components = C["training"]["frozen_components"]
|
||||
frozen_components = T["frozen_components"]
|
||||
|
||||
msg.divider("Training stats")
|
||||
msg.text(f"Language: {C['nlp']['lang']}")
|
||||
msg.text(f"Language: {nlp.lang}")
|
||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||
if resume_components:
|
||||
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
|
||||
|
@ -355,17 +324,12 @@ def debug_data(
|
|||
if "tagger" in factory_names:
|
||||
msg.divider("Part-of-speech Tagging")
|
||||
labels = [label for label in gold_train_data["tags"]]
|
||||
tag_map = nlp.vocab.morphology.tag_map
|
||||
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
|
||||
# TODO: does this need to be updated?
|
||||
msg.info(f"{len(labels)} label(s) in data")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["tags"].most_common(), counts=True
|
||||
)
|
||||
msg.text(labels_with_counts, show=verbose)
|
||||
non_tagmap = [l for l in labels if l not in tag_map]
|
||||
if not non_tagmap:
|
||||
msg.good(f"All labels present in tag map for language '{nlp.lang}'")
|
||||
for label in non_tagmap:
|
||||
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
|
||||
|
||||
if "parser" in factory_names:
|
||||
has_low_data_warning = False
|
||||
|
|
|
@ -2,18 +2,23 @@ from typing import Dict, Any, Optional, Iterable
|
|||
from pathlib import Path
|
||||
|
||||
from spacy.training import Example
|
||||
from spacy.util import dot_to_object
|
||||
from spacy.util import resolve_dot_names
|
||||
from wasabi import msg
|
||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||
from thinc.api import fix_random_seed, set_dropout_rate, Adam
|
||||
from thinc.api import Model, data_validation, set_gpu_allocator
|
||||
import typer
|
||||
|
||||
from ._util import Arg, Opt, debug_cli, show_validation_error
|
||||
from ._util import parse_config_overrides, string_to_list
|
||||
from ._util import parse_config_overrides, string_to_list, setup_gpu
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
|
||||
@debug_cli.command("model")
|
||||
@debug_cli.command(
|
||||
"model",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def debug_model_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
|
@ -37,11 +42,7 @@ def debug_model_cli(
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/cli#debug-model
|
||||
"""
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
setup_gpu(use_gpu)
|
||||
layers = string_to_list(layers, intify=True)
|
||||
print_settings = {
|
||||
"dimensions": dimensions,
|
||||
|
@ -59,14 +60,15 @@ def debug_model_cli(
|
|||
raw_config = util.load_config(
|
||||
config_path, overrides=config_overrides, interpolate=False
|
||||
)
|
||||
config = raw_config.iterpolate()
|
||||
config = raw_config.interpolate()
|
||||
allocator = config["training"]["gpu_allocator"]
|
||||
if use_gpu >= 0 and allocator:
|
||||
set_gpu_allocator(allocator)
|
||||
with show_validation_error(config_path):
|
||||
nlp = util.load_model_from_config(raw_config)
|
||||
C = util.resolve_training_config(nlp.config)
|
||||
seed = C["training"]["seed"]
|
||||
config = nlp.config.interpolate()
|
||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||
seed = T["seed"]
|
||||
if seed is not None:
|
||||
msg.info(f"Fixing random seed: {seed}")
|
||||
fix_random_seed(seed)
|
||||
|
@ -77,11 +79,16 @@ def debug_model_cli(
|
|||
exits=1,
|
||||
)
|
||||
model = pipe.model
|
||||
debug_model(C, nlp, model, print_settings=print_settings)
|
||||
debug_model(config, T, nlp, model, print_settings=print_settings)
|
||||
|
||||
|
||||
def debug_model(
|
||||
config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
|
||||
config,
|
||||
resolved_train_config,
|
||||
nlp,
|
||||
model: Model,
|
||||
*,
|
||||
print_settings: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
if not isinstance(model, Model):
|
||||
msg.fail(
|
||||
|
@ -102,13 +109,16 @@ def debug_model(
|
|||
# The output vector might differ from the official type of the output layer
|
||||
with data_validation(False):
|
||||
try:
|
||||
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
|
||||
nlp.begin_training(lambda: train_corpus(nlp))
|
||||
dot_names = [resolved_train_config["train_corpus"]]
|
||||
with show_validation_error():
|
||||
(train_corpus,) = resolve_dot_names(config, dot_names)
|
||||
nlp.initialize(lambda: train_corpus(nlp))
|
||||
msg.info("Initialized the model with the training corpus.")
|
||||
except ValueError:
|
||||
try:
|
||||
_set_output_dim(nO=7, model=model)
|
||||
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
|
||||
with show_validation_error():
|
||||
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
|
||||
msg.info("Initialized the model with dummy data.")
|
||||
except Exception:
|
||||
msg.fail(
|
||||
|
|
|
@ -3,11 +3,11 @@ from wasabi import Printer
|
|||
from pathlib import Path
|
||||
import re
|
||||
import srsly
|
||||
from thinc.api import require_gpu, fix_random_seed
|
||||
from thinc.api import fix_random_seed
|
||||
|
||||
from ..training import Corpus
|
||||
from ..tokens import Doc
|
||||
from ._util import app, Arg, Opt
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
@ -19,6 +19,7 @@ def evaluate_cli(
|
|||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||
|
@ -37,6 +38,7 @@ def evaluate_cli(
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/cli#evaluate
|
||||
"""
|
||||
import_code(code_path)
|
||||
evaluate(
|
||||
model,
|
||||
data_path,
|
||||
|
@ -61,8 +63,7 @@ def evaluate(
|
|||
) -> Scorer:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
fix_random_seed()
|
||||
if use_gpu >= 0:
|
||||
require_gpu(use_gpu)
|
||||
setup_gpu(use_gpu)
|
||||
data_path = util.ensure_path(data_path)
|
||||
output_path = util.ensure_path(output)
|
||||
displacy_path = util.ensure_path(displacy_path)
|
||||
|
|
|
@ -1,360 +0,0 @@
|
|||
from typing import Optional, List, Dict, Any, Union, IO
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
import numpy
|
||||
from ast import literal_eval
|
||||
from pathlib import Path
|
||||
from preshed.counter import PreshCounter
|
||||
import tarfile
|
||||
import gzip
|
||||
import zipfile
|
||||
import srsly
|
||||
import warnings
|
||||
from wasabi import msg, Printer
|
||||
import typer
|
||||
|
||||
from ._util import app, init_cli, Arg, Opt
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..language import Language
|
||||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
except ImportError:
|
||||
ftfy = None
|
||||
|
||||
|
||||
DEFAULT_OOV_PROB = -20
|
||||
|
||||
|
||||
@init_cli.command("vocab")
|
||||
@app.command(
|
||||
"init-model",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
hidden=True, # hide this from main CLI help but still allow it to work with warning
|
||||
)
|
||||
def init_model_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
lang: str = Arg(..., help="Pipeline language"),
|
||||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
|
||||
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
|
||||
base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Create a new blank pipeline directory with vocab and vectors from raw data.
|
||||
If vectors are provided in Word2Vec format, they can be either a .txt or
|
||||
zipped as a .zip or .tar.gz.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#init-vocab
|
||||
"""
|
||||
if ctx.command.name == "init-model":
|
||||
msg.warn(
|
||||
"The init-model command is now called 'init vocab'. You can run "
|
||||
"'python -m spacy init --help' for an overview of the other "
|
||||
"available initialization commands."
|
||||
)
|
||||
init_model(
|
||||
lang,
|
||||
output_dir,
|
||||
freqs_loc=freqs_loc,
|
||||
clusters_loc=clusters_loc,
|
||||
jsonl_loc=jsonl_loc,
|
||||
vectors_loc=vectors_loc,
|
||||
prune_vectors=prune_vectors,
|
||||
truncate_vectors=truncate_vectors,
|
||||
vectors_name=vectors_name,
|
||||
model_name=model_name,
|
||||
base_model=base_model,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def init_model(
|
||||
lang: str,
|
||||
output_dir: Path,
|
||||
freqs_loc: Optional[Path] = None,
|
||||
clusters_loc: Optional[Path] = None,
|
||||
jsonl_loc: Optional[Path] = None,
|
||||
vectors_loc: Optional[Path] = None,
|
||||
prune_vectors: int = -1,
|
||||
truncate_vectors: int = 0,
|
||||
vectors_name: Optional[str] = None,
|
||||
model_name: Optional[str] = None,
|
||||
base_model: Optional[str] = None,
|
||||
silent: bool = True,
|
||||
) -> Language:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if jsonl_loc is not None:
|
||||
if freqs_loc is not None or clusters_loc is not None:
|
||||
settings = ["-j"]
|
||||
if freqs_loc:
|
||||
settings.append("-f")
|
||||
if clusters_loc:
|
||||
settings.append("-c")
|
||||
msg.warn(
|
||||
"Incompatible arguments",
|
||||
"The -f and -c arguments are deprecated, and not compatible "
|
||||
"with the -j argument, which should specify the same "
|
||||
"information. Either merge the frequencies and clusters data "
|
||||
"into the JSONL-formatted file (recommended), or use only the "
|
||||
"-f and -c files, without the other lexical attributes.",
|
||||
)
|
||||
jsonl_loc = ensure_path(jsonl_loc)
|
||||
lex_attrs = srsly.read_jsonl(jsonl_loc)
|
||||
else:
|
||||
clusters_loc = ensure_path(clusters_loc)
|
||||
freqs_loc = ensure_path(freqs_loc)
|
||||
if freqs_loc is not None and not freqs_loc.exists():
|
||||
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
||||
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
|
||||
|
||||
with msg.loading("Creating blank pipeline..."):
|
||||
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
||||
|
||||
msg.good("Successfully created blank pipeline")
|
||||
if vectors_loc is not None:
|
||||
add_vectors(
|
||||
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
|
||||
)
|
||||
vec_added = len(nlp.vocab.vectors)
|
||||
lex_added = len(nlp.vocab)
|
||||
msg.good(
|
||||
"Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
return nlp
|
||||
|
||||
|
||||
def open_file(loc: Union[str, Path]) -> IO:
|
||||
"""Handle .gz, .tar.gz or unzipped files"""
|
||||
loc = ensure_path(loc)
|
||||
if tarfile.is_tarfile(str(loc)):
|
||||
return tarfile.open(str(loc), "r:gz")
|
||||
elif loc.parts[-1].endswith("gz"):
|
||||
return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
|
||||
elif loc.parts[-1].endswith("zip"):
|
||||
zip_file = zipfile.ZipFile(str(loc))
|
||||
names = zip_file.namelist()
|
||||
file_ = zip_file.open(names[0])
|
||||
return (line.decode("utf8") for line in file_)
|
||||
else:
|
||||
return loc.open("r", encoding="utf8")
|
||||
|
||||
|
||||
def read_attrs_from_deprecated(
|
||||
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
|
||||
) -> List[Dict[str, Any]]:
|
||||
if freqs_loc is not None:
|
||||
with msg.loading("Counting frequencies..."):
|
||||
probs, _ = read_freqs(freqs_loc)
|
||||
msg.good("Counted frequencies")
|
||||
else:
|
||||
probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841
|
||||
if clusters_loc:
|
||||
with msg.loading("Reading clusters..."):
|
||||
clusters = read_clusters(clusters_loc)
|
||||
msg.good("Read clusters")
|
||||
else:
|
||||
clusters = {}
|
||||
lex_attrs = []
|
||||
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
||||
if len(sorted_probs):
|
||||
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
||||
attrs = {"orth": word, "id": i, "prob": prob}
|
||||
# Decode as a little-endian string, so that we can do & 15 to get
|
||||
# the first 4 bits. See _parse_features.pyx
|
||||
if word in clusters:
|
||||
attrs["cluster"] = int(clusters[word][::-1], 2)
|
||||
else:
|
||||
attrs["cluster"] = 0
|
||||
lex_attrs.append(attrs)
|
||||
return lex_attrs
|
||||
|
||||
|
||||
def create_model(
|
||||
lang: str,
|
||||
lex_attrs: List[Dict[str, Any]],
|
||||
name: Optional[str] = None,
|
||||
base_model: Optional[Union[str, Path]] = None,
|
||||
) -> Language:
|
||||
if base_model:
|
||||
nlp = load_model(base_model)
|
||||
# keep the tokenizer but remove any existing pipeline components due to
|
||||
# potentially conflicting vectors
|
||||
for pipe in nlp.pipe_names:
|
||||
nlp.remove_pipe(pipe)
|
||||
else:
|
||||
lang_class = get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
for lexeme in nlp.vocab:
|
||||
lexeme.rank = OOV_RANK
|
||||
for attrs in lex_attrs:
|
||||
if "settings" in attrs:
|
||||
continue
|
||||
lexeme = nlp.vocab[attrs["orth"]]
|
||||
lexeme.set_attrs(**attrs)
|
||||
if len(nlp.vocab):
|
||||
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
|
||||
else:
|
||||
oov_prob = DEFAULT_OOV_PROB
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||
if name:
|
||||
nlp.meta["name"] = name
|
||||
return nlp
|
||||
|
||||
|
||||
def add_vectors(
|
||||
msg: Printer,
|
||||
nlp: Language,
|
||||
vectors_loc: Optional[Path],
|
||||
truncate_vectors: int,
|
||||
prune_vectors: int,
|
||||
name: Optional[str] = None,
|
||||
) -> None:
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||
for lex in nlp.vocab:
|
||||
if lex.rank and lex.rank != OOV_RANK:
|
||||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||
else:
|
||||
if vectors_loc:
|
||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||
vectors_data, vector_keys = read_vectors(
|
||||
msg, vectors_loc, truncate_vectors
|
||||
)
|
||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None:
|
||||
for word in vector_keys:
|
||||
if word not in nlp.vocab:
|
||||
nlp.vocab[word]
|
||||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
if name is None:
|
||||
# TODO: Is this correct? Does this matter?
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
if prune_vectors >= 1:
|
||||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
|
||||
|
||||
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
|
||||
f = open_file(vectors_loc)
|
||||
f = ensure_shape(f)
|
||||
shape = tuple(int(size) for size in next(f).split())
|
||||
if truncate_vectors >= 1:
|
||||
shape = (truncate_vectors, shape[1])
|
||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||
vectors_keys = []
|
||||
for i, line in enumerate(tqdm(f)):
|
||||
line = line.rstrip()
|
||||
pieces = line.rsplit(" ", vectors_data.shape[1])
|
||||
word = pieces.pop(0)
|
||||
if len(pieces) != vectors_data.shape[1]:
|
||||
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||
vectors_keys.append(word)
|
||||
if i == truncate_vectors - 1:
|
||||
break
|
||||
return vectors_data, vectors_keys
|
||||
|
||||
|
||||
def ensure_shape(lines):
|
||||
"""Ensure that the first line of the data is the vectors shape.
|
||||
|
||||
If it's not, we read in the data and output the shape as the first result,
|
||||
so that the reader doesn't have to deal with the problem.
|
||||
"""
|
||||
first_line = next(lines)
|
||||
try:
|
||||
shape = tuple(int(size) for size in first_line.split())
|
||||
except ValueError:
|
||||
shape = None
|
||||
if shape is not None:
|
||||
# All good, give the data
|
||||
yield first_line
|
||||
yield from lines
|
||||
else:
|
||||
# Figure out the shape, make it the first value, and then give the
|
||||
# rest of the data.
|
||||
width = len(first_line.split()) - 1
|
||||
captured = [first_line] + list(lines)
|
||||
length = len(captured)
|
||||
yield f"{length} {width}"
|
||||
yield from captured
|
||||
|
||||
|
||||
def read_freqs(
|
||||
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
|
||||
):
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
with freqs_loc.open() as f:
|
||||
for i, line in enumerate(f):
|
||||
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||
freq = int(freq)
|
||||
counts.inc(i + 1, freq)
|
||||
total += freq
|
||||
counts.smooth()
|
||||
log_total = math.log(total)
|
||||
probs = {}
|
||||
with freqs_loc.open() as f:
|
||||
for line in tqdm(f):
|
||||
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||
try:
|
||||
word = literal_eval(key)
|
||||
except SyntaxError:
|
||||
# Take odd strings literally.
|
||||
word = literal_eval(f"'{key}'")
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
probs[word] = math.log(smooth_count) - log_total
|
||||
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||
return probs, oov_prob
|
||||
|
||||
|
||||
def read_clusters(clusters_loc: Path) -> dict:
|
||||
clusters = {}
|
||||
if ftfy is None:
|
||||
warnings.warn(Warnings.W004)
|
||||
with clusters_loc.open() as f:
|
||||
for line in tqdm(f):
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
if ftfy is not None:
|
||||
word = ftfy.fix_text(word)
|
||||
except ValueError:
|
||||
continue
|
||||
# If the clusterer has only seen the word a few times, its
|
||||
# cluster is unreliable.
|
||||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
else:
|
||||
clusters[word] = "0"
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
clusters[word.lower()] = cluster
|
||||
if word.title() not in clusters:
|
||||
clusters[word.title()] = cluster
|
||||
if word.upper() not in clusters:
|
||||
clusters[word.upper()] = cluster
|
||||
return clusters
|
117
spacy/cli/init_pipeline.py
Normal file
117
spacy/cli/init_pipeline.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
from typing import Optional
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import typer
|
||||
import srsly
|
||||
|
||||
from .. import util
|
||||
from ..training.initialize import init_nlp, convert_vectors
|
||||
from ..language import Language
|
||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, setup_gpu
|
||||
|
||||
|
||||
@init_cli.command("vectors")
|
||||
def init_vectors_cli(
|
||||
# fmt: off
|
||||
lang: str = Arg(..., help="The language of the nlp object to create"),
|
||||
vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
|
||||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||
# fmt: on
|
||||
):
|
||||
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||
you can use in the [initialize] block of your config to initialize
|
||||
a model with vectors.
|
||||
"""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||
nlp = util.get_lang_class(lang)()
|
||||
if jsonl_loc is not None:
|
||||
update_lexemes(nlp, jsonl_loc)
|
||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||
nlp.to_disk(output_dir)
|
||||
msg.good(
|
||||
"Saved nlp object with vectors to output directory. You can now use the "
|
||||
"path to it in your config as the 'vectors' setting in [initialize.vocab].",
|
||||
output_dir.resolve(),
|
||||
)
|
||||
|
||||
|
||||
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
|
||||
# Mostly used for backwards-compatibility and may be removed in the future
|
||||
lex_attrs = srsly.read_jsonl(jsonl_loc)
|
||||
for attrs in lex_attrs:
|
||||
if "settings" in attrs:
|
||||
continue
|
||||
lexeme = nlp.vocab[attrs["orth"]]
|
||||
lexeme.set_attrs(**attrs)
|
||||
|
||||
|
||||
@init_cli.command(
|
||||
"nlp",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
hidden=True,
|
||||
)
|
||||
def init_pipeline_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
output_path: Path = Arg(..., help="Output directory for the prepared data"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||
# fmt: on
|
||||
):
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
with show_validation_error(hint_fill=False):
|
||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||
nlp.to_disk(output_path)
|
||||
msg.good(f"Saved initialized pipeline to {output_path}")
|
||||
|
||||
|
||||
@init_cli.command(
|
||||
"labels",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def init_labels_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
output_path: Path = Arg(..., help="Output directory for the labels"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||
# fmt: on
|
||||
):
|
||||
"""Generate JSON files for the labels in the data. This helps speed up the
|
||||
training process, since spaCy won't have to preprocess the data to
|
||||
extract the labels."""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
with show_validation_error(hint_fill=False):
|
||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||
for name, component in nlp.pipeline:
|
||||
if getattr(component, "label_data", None) is not None:
|
||||
output_file = output_path / f"{name}.json"
|
||||
srsly.write_json(output_file, component.label_data)
|
||||
msg.good(f"Saving {name} labels to {output_file}")
|
||||
else:
|
||||
msg.info(f"No labels found for {name}")
|
|
@ -1,25 +1,13 @@
|
|||
from typing import Optional
|
||||
import numpy
|
||||
import time
|
||||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from thinc.api import require_gpu, set_gpu_allocator
|
||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||
from thinc.api import Config, CosineDistance, L2Distance
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
from functools import partial
|
||||
import typer
|
||||
import re
|
||||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code
|
||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID
|
||||
from .. import util
|
||||
from ..util import dot_to_object
|
||||
from ._util import import_code, setup_gpu
|
||||
from ..training.pretrain import pretrain
|
||||
from ..util import load_config
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -61,15 +49,11 @@ def pretrain_cli(
|
|||
config_overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
setup_gpu(use_gpu)
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
|
||||
with show_validation_error(config_path):
|
||||
raw_config = util.load_config(
|
||||
raw_config = load_config(
|
||||
config_path, overrides=config_overrides, interpolate=False
|
||||
)
|
||||
config = raw_config.interpolate()
|
||||
|
@ -89,250 +73,11 @@ def pretrain_cli(
|
|||
resume_path=resume_path,
|
||||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def pretrain(
|
||||
config: Config,
|
||||
output_dir: Path,
|
||||
resume_path: Optional[Path] = None,
|
||||
epoch_resume: Optional[int] = None,
|
||||
use_gpu: int = -1,
|
||||
):
|
||||
if config["training"]["seed"] is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
allocator = config["training"]["gpu_allocator"]
|
||||
if use_gpu >= 0 and allocator:
|
||||
set_gpu_allocator(allocator)
|
||||
nlp = util.load_model_from_config(config)
|
||||
C = util.resolve_training_config(nlp.config)
|
||||
P_cfg = C["pretraining"]
|
||||
corpus = dot_to_object(C, P_cfg["corpus"])
|
||||
batcher = P_cfg["batcher"]
|
||||
model = create_pretraining_model(nlp, C["pretraining"])
|
||||
optimizer = C["pretraining"]["optimizer"]
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
_resume_model(model, resume_path, epoch_resume)
|
||||
else:
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
|
||||
def _save_model(epoch, is_temp=False):
|
||||
is_temp_str = ".temp" if is_temp else ""
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||
log = {
|
||||
"nr_word": tracker.nr_word,
|
||||
"loss": tracker.loss,
|
||||
"epoch_loss": tracker.epoch_loss,
|
||||
"epoch": epoch,
|
||||
}
|
||||
with (output_dir / "log.jsonl").open("a") as file_:
|
||||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
objective = create_objective(P_cfg["objective"])
|
||||
# TODO: I think we probably want this to look more like the
|
||||
# 'create_train_batches' function?
|
||||
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
|
||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
||||
docs = ensure_docs(batch)
|
||||
loss = make_update(model, docs, optimizer, objective)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
def ensure_docs(examples_or_docs):
|
||||
docs = []
|
||||
for eg_or_doc in examples_or_docs:
|
||||
if isinstance(eg_or_doc, Doc):
|
||||
docs.append(eg_or_doc)
|
||||
else:
|
||||
docs.append(eg_or_doc.reference)
|
||||
return docs
|
||||
|
||||
|
||||
def _resume_model(model, resume_path, epoch_resume):
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, objective_func):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
optimizer (callable): An optimizer.
|
||||
RETURNS loss: A float for the loss.
|
||||
"""
|
||||
predictions, backprop = model.begin_update(docs)
|
||||
loss, gradients = objective_func(model.ops, docs, predictions)
|
||||
backprop(gradients)
|
||||
model.finish_update(optimizer)
|
||||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
return float(loss)
|
||||
|
||||
|
||||
def create_objective(config):
|
||||
"""Create the objective for pretraining.
|
||||
|
||||
We'd like to replace this with a registry function but it's tricky because
|
||||
we're also making a model choice based on this. For now we hard-code support
|
||||
for two types (characters, vectors). For characters you can specify
|
||||
n_characters, for vectors you can specify the loss.
|
||||
|
||||
Bleh.
|
||||
"""
|
||||
objective_type = config["type"]
|
||||
if objective_type == "characters":
|
||||
return partial(get_characters_loss, nr_char=config["n_characters"])
|
||||
elif objective_type == "vectors":
|
||||
if config["loss"] == "cosine":
|
||||
return partial(
|
||||
get_vectors_loss,
|
||||
distance=CosineDistance(normalize=True, ignore_zeros=True),
|
||||
)
|
||||
elif config["loss"] == "L2":
|
||||
return partial(
|
||||
get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
|
||||
)
|
||||
else:
|
||||
raise ValueError("Unexpected loss type", config["loss"])
|
||||
else:
|
||||
raise ValueError("Unexpected objective_type", objective_type)
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
|
||||
"""Compute a loss based on a distance between the documents' vectors and
|
||||
the prediction.
|
||||
"""
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
d_target, loss = distance(prediction, target)
|
||||
return loss, d_target
|
||||
|
||||
|
||||
def get_characters_loss(ops, docs, prediction, nr_char):
|
||||
"""Compute a loss based on a number of characters predicted from the docs."""
|
||||
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
|
||||
target_ids = target_ids.reshape((-1,))
|
||||
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
|
||||
target = target.reshape((-1, 256 * nr_char))
|
||||
diff = prediction - target
|
||||
loss = (diff ** 2).sum()
|
||||
d_target = diff / float(prediction.shape[0])
|
||||
return loss, d_target
|
||||
|
||||
|
||||
def create_pretraining_model(nlp, pretrain_config):
|
||||
"""Define a network for the pretraining. We simply add an output layer onto
|
||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||
Each array in the output needs to have one row per token in the doc.
|
||||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||
serialized to file and read back in when calling the 'train' command.
|
||||
"""
|
||||
component = nlp.get_pipe(pretrain_config["component"])
|
||||
if pretrain_config.get("layer"):
|
||||
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
||||
else:
|
||||
tok2vec = component.model
|
||||
|
||||
# TODO
|
||||
maxout_pieces = 3
|
||||
hidden_size = 300
|
||||
if pretrain_config["objective"]["type"] == "vectors":
|
||||
model = build_cloze_multi_task_model(
|
||||
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
||||
)
|
||||
elif pretrain_config["objective"]["type"] == "characters":
|
||||
model = build_cloze_characters_multi_task_model(
|
||||
nlp.vocab,
|
||||
tok2vec,
|
||||
hidden_size=hidden_size,
|
||||
maxout_pieces=maxout_pieces,
|
||||
nr_char=pretrain_config["objective"]["n_characters"],
|
||||
)
|
||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
set_dropout_rate(model, pretrain_config["dropout"])
|
||||
return model
|
||||
|
||||
|
||||
class ProgressTracker:
|
||||
def __init__(self, frequency=1000000):
|
||||
self.loss = 0.0
|
||||
self.prev_loss = 0.0
|
||||
self.nr_word = 0
|
||||
self.words_per_epoch = Counter()
|
||||
self.frequency = frequency
|
||||
self.last_time = time.time()
|
||||
self.last_update = 0
|
||||
self.epoch_loss = 0.0
|
||||
|
||||
def update(self, epoch, loss, docs):
|
||||
self.loss += loss
|
||||
self.epoch_loss += loss
|
||||
words_in_batch = sum(len(doc) for doc in docs)
|
||||
self.words_per_epoch[epoch] += words_in_batch
|
||||
self.nr_word += words_in_batch
|
||||
words_since_update = self.nr_word - self.last_update
|
||||
if words_since_update >= self.frequency:
|
||||
wps = words_since_update / (time.time() - self.last_time)
|
||||
self.last_update = self.nr_word
|
||||
self.last_time = time.time()
|
||||
loss_per_word = self.loss - self.prev_loss
|
||||
status = (
|
||||
epoch,
|
||||
self.nr_word,
|
||||
_smart_round(self.loss, width=10),
|
||||
_smart_round(loss_per_word, width=6),
|
||||
int(wps),
|
||||
)
|
||||
self.prev_loss = float(self.loss)
|
||||
return status
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def _smart_round(figure, width=10, max_decimal=4):
|
||||
"""Round large numbers as integers, smaller numbers as decimals."""
|
||||
n_digits = len(str(int(figure)))
|
||||
n_decimal = width - (n_digits + 1)
|
||||
if n_decimal <= 1:
|
||||
return str(int(figure))
|
||||
else:
|
||||
n_decimal = min(n_decimal, max_decimal)
|
||||
format_str = "%." + str(n_decimal) + "f"
|
||||
return format_str % figure
|
||||
|
||||
|
||||
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
|
|
|
@ -134,7 +134,7 @@ def update_dvc_config(
|
|||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
|
|
|
@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
|
|||
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
[paths]
|
||||
train = ""
|
||||
dev = ""
|
||||
train = null
|
||||
dev = null
|
||||
|
||||
[system]
|
||||
{% if use_transformer -%}
|
||||
|
@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
|
|||
window = 128
|
||||
stride = 96
|
||||
|
||||
{% if "morphologizer" in components %}
|
||||
[components.morphologizer]
|
||||
factory = "morphologizer"
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
nO = null
|
||||
|
||||
[components.morphologizer.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.morphologizer.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
|
||||
{% if "tagger" in components %}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
|
|||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
|
||||
{% if "morphologizer" in components %}
|
||||
[components.morphologizer]
|
||||
factory = "morphologizer"
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
nO = null
|
||||
|
||||
[components.morphologizer.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{%- endif %}
|
||||
|
||||
{% if "tagger" in components %}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
@ -257,7 +286,7 @@ no_output_layer = false
|
|||
{% endif %}
|
||||
|
||||
{% for pipe in components %}
|
||||
{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
|
||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
|
||||
{# Other components defined by the user: we just assume they're factories #}
|
||||
[components.{{ pipe }}]
|
||||
factory = "{{ pipe }}"
|
||||
|
@ -270,7 +299,6 @@ factory = "{{ pipe }}"
|
|||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
|
@ -278,11 +306,6 @@ path = ${paths.dev}
|
|||
max_length = 0
|
||||
|
||||
[training]
|
||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||
vectors = null
|
||||
{% else -%}
|
||||
vectors = "{{ word_vectors }}"
|
||||
{% endif -%}
|
||||
{% if use_transformer -%}
|
||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||
{% endif -%}
|
||||
|
@ -318,3 +341,10 @@ start = 100
|
|||
stop = 1000
|
||||
compound = 1.001
|
||||
{% endif %}
|
||||
|
||||
[initialize]
|
||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||
vectors = null
|
||||
{% else -%}
|
||||
vectors = "{{ word_vectors }}"
|
||||
{% endif -%}
|
||||
|
|
|
@ -1,23 +1,14 @@
|
|||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
|
||||
from timeit import default_timer as timer
|
||||
import srsly
|
||||
import tqdm
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import thinc
|
||||
import thinc.schedules
|
||||
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
|
||||
import random
|
||||
import typer
|
||||
import logging
|
||||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, get_sourced_components
|
||||
from ..language import Language
|
||||
from ._util import import_code, setup_gpu
|
||||
from ..training.loop import train
|
||||
from ..training.initialize import init_nlp
|
||||
from .. import util
|
||||
from ..training.example import Example
|
||||
from ..errors import Errors
|
||||
from ..util import dot_to_object
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -30,8 +21,7 @@ def train_cli(
|
|||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -48,393 +38,19 @@ def train_cli(
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/cli#train
|
||||
"""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
verify_cli_args(config_path, output_path)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
train(
|
||||
config_path,
|
||||
output_path=output_path,
|
||||
config_overrides=overrides,
|
||||
use_gpu=use_gpu,
|
||||
resume_training=resume,
|
||||
)
|
||||
|
||||
|
||||
def train(
|
||||
config_path: Path,
|
||||
output_path: Optional[Path] = None,
|
||||
config_overrides: Dict[str, Any] = {},
|
||||
use_gpu: int = -1,
|
||||
resume_training: bool = False,
|
||||
) -> None:
|
||||
if use_gpu >= 0:
|
||||
msg.info(f"Using GPU: {use_gpu}")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config and nlp from: {config_path}")
|
||||
setup_gpu(use_gpu)
|
||||
with show_validation_error(config_path):
|
||||
# Keep an un-interpolated config so we can preserve variables in
|
||||
# the final nlp object we train and serialize
|
||||
raw_config = util.load_config(
|
||||
config_path, overrides=config_overrides, interpolate=False
|
||||
)
|
||||
config = raw_config.interpolate()
|
||||
if config["training"]["seed"] is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
allocator = config["training"]["gpu_allocator"]
|
||||
if use_gpu >= 0 and allocator:
|
||||
set_gpu_allocator(allocator)
|
||||
# Use original config here before it's resolved to functions
|
||||
sourced_components = get_sourced_components(config)
|
||||
with show_validation_error(config_path):
|
||||
nlp = util.load_model_from_config(raw_config)
|
||||
# Resolve all training-relevant sections using the filled nlp config
|
||||
C = util.resolve_training_config(nlp.config)
|
||||
util.load_vocab_data_into_model(nlp, lookups=C["training"]["lookups"])
|
||||
if C["training"]["vectors"] is not None:
|
||||
add_vectors(nlp, C["training"]["vectors"])
|
||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(C)
|
||||
T_cfg = C["training"]
|
||||
optimizer = T_cfg["optimizer"]
|
||||
train_corpus = dot_to_object(C, T_cfg["train_corpus"])
|
||||
dev_corpus = dot_to_object(C, T_cfg["dev_corpus"])
|
||||
batcher = T_cfg["batcher"]
|
||||
train_logger = T_cfg["logger"]
|
||||
before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
|
||||
# Components that shouldn't be updated during training
|
||||
frozen_components = T_cfg["frozen_components"]
|
||||
# Sourced components that require resume_training
|
||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
||||
if resume_components:
|
||||
with nlp.select_pipes(enable=resume_components):
|
||||
msg.info(f"Resuming training for: {resume_components}")
|
||||
nlp.resume_training(sgd=optimizer)
|
||||
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
||||
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
||||
# Verify the config after calling 'begin_training' to ensure labels are properly initialized
|
||||
verify_config(nlp)
|
||||
|
||||
if tag_map:
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
if morph_rules:
|
||||
# Load morph rules
|
||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
|
||||
if weights_data is not None:
|
||||
tok2vec_component = C["pretraining"]["component"]
|
||||
if tok2vec_component is None:
|
||||
msg.fail(
|
||||
f"To use pretrained tok2vec weights, [pretraining.component] "
|
||||
f"needs to specify the component that should load them.",
|
||||
exits=1,
|
||||
)
|
||||
layer = nlp.get_pipe(tok2vec_component).model
|
||||
tok2vec_layer = C["pretraining"]["layer"]
|
||||
if tok2vec_layer:
|
||||
layer = layer.get_ref(tok2vec_layer)
|
||||
layer.from_bytes(weights_data)
|
||||
msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
|
||||
|
||||
# Create iterator, which yields out info after each optimization step.
|
||||
msg.info("Start training")
|
||||
score_weights = T_cfg["score_weights"]
|
||||
training_step_iterator = train_while_improving(
|
||||
nlp,
|
||||
optimizer,
|
||||
create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]),
|
||||
create_evaluation_callback(nlp, dev_corpus, score_weights),
|
||||
dropout=T_cfg["dropout"],
|
||||
accumulate_gradient=T_cfg["accumulate_gradient"],
|
||||
patience=T_cfg["patience"],
|
||||
max_steps=T_cfg["max_steps"],
|
||||
eval_frequency=T_cfg["eval_frequency"],
|
||||
raw_text=None,
|
||||
exclude=frozen_components,
|
||||
)
|
||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
print_row, finalize_logger = train_logger(nlp)
|
||||
|
||||
try:
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
progress.set_description(f"Epoch 1")
|
||||
for batch, info, is_best_checkpoint in training_step_iterator:
|
||||
progress.update(1)
|
||||
if is_best_checkpoint is not None:
|
||||
progress.close()
|
||||
print_row(info)
|
||||
if is_best_checkpoint and output_path is not None:
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
update_meta(T_cfg, nlp, info)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp = before_to_disk(nlp)
|
||||
nlp.to_disk(output_path / "model-best")
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
progress.set_description(f"Epoch {info['epoch']}")
|
||||
except Exception as e:
|
||||
finalize_logger()
|
||||
if output_path is not None:
|
||||
# We don't want to swallow the traceback if we don't have a
|
||||
# specific error.
|
||||
msg.warn(
|
||||
f"Aborting and saving the final best model. "
|
||||
f"Encountered exception: {str(e)}"
|
||||
)
|
||||
nlp = before_to_disk(nlp)
|
||||
nlp.to_disk(output_path / "model-final")
|
||||
raise e
|
||||
finally:
|
||||
finalize_logger()
|
||||
if output_path is not None:
|
||||
final_model_path = output_path / "model-final"
|
||||
if optimizer.averages:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk(final_model_path)
|
||||
else:
|
||||
nlp.to_disk(final_model_path)
|
||||
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
||||
|
||||
|
||||
def add_vectors(nlp: Language, vectors: str) -> None:
|
||||
title = f"Config validation error for vectors {vectors}"
|
||||
desc = (
|
||||
"This typically means that there's a problem in the config.cfg included "
|
||||
"with the packaged vectors. Make sure that the vectors package you're "
|
||||
"loading is compatible with the current version of spaCy."
|
||||
)
|
||||
with show_validation_error(
|
||||
title=title, desc=desc, hint_fill=False, show_config=False
|
||||
):
|
||||
util.load_vectors_into_model(nlp, vectors)
|
||||
|
||||
|
||||
def create_train_batches(iterator, batcher, max_epochs: int):
|
||||
epoch = 0
|
||||
examples = list(iterator)
|
||||
if not examples:
|
||||
# Raise error if no data
|
||||
raise ValueError(Errors.E986)
|
||||
while max_epochs < 1 or epoch != max_epochs:
|
||||
random.shuffle(examples)
|
||||
for batch in batcher(examples):
|
||||
yield epoch, batch
|
||||
epoch += 1
|
||||
|
||||
|
||||
def create_evaluation_callback(
|
||||
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
|
||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
||||
weights = {key: value for key, value in weights.items() if value is not None}
|
||||
|
||||
def evaluate() -> Tuple[float, Dict[str, float]]:
|
||||
dev_examples = list(dev_corpus(nlp))
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
# Calculate a weighted sum based on score_weights for the main score.
|
||||
# We can only consider scores that are ints/floats, not dicts like
|
||||
# entity scores per type etc.
|
||||
for key, value in scores.items():
|
||||
if key in weights and not isinstance(value, (int, float)):
|
||||
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
|
||||
try:
|
||||
weighted_score = sum(
|
||||
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
|
||||
)
|
||||
except KeyError as e:
|
||||
keys = list(scores.keys())
|
||||
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
|
||||
raise KeyError(err) from None
|
||||
return weighted_score, scores
|
||||
|
||||
return evaluate
|
||||
|
||||
|
||||
def create_before_to_disk_callback(
|
||||
callback: Optional[Callable[[Language], Language]]
|
||||
) -> Callable[[Language], Language]:
|
||||
def before_to_disk(nlp: Language) -> Language:
|
||||
if not callback:
|
||||
return nlp
|
||||
modified_nlp = callback(nlp)
|
||||
if not isinstance(modified_nlp, Language):
|
||||
err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
|
||||
raise ValueError(err)
|
||||
return modified_nlp
|
||||
|
||||
return before_to_disk
|
||||
|
||||
|
||||
def train_while_improving(
|
||||
nlp: Language,
|
||||
optimizer: Optimizer,
|
||||
train_data,
|
||||
evaluate,
|
||||
*,
|
||||
dropout: float,
|
||||
eval_frequency: int,
|
||||
accumulate_gradient: int,
|
||||
patience: int,
|
||||
max_steps: int,
|
||||
raw_text: List[Dict[str, str]],
|
||||
exclude: List[str],
|
||||
):
|
||||
"""Train until an evaluation stops improving. Works as a generator,
|
||||
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
||||
where info is a dict, and is_best_checkpoint is in [True, False, None] --
|
||||
None indicating that the iteration was not evaluated as a checkpoint.
|
||||
The evaluation is conducted by calling the evaluate callback.
|
||||
|
||||
Positional arguments:
|
||||
nlp: The spaCy pipeline to evaluate.
|
||||
optimizer: The optimizer callable.
|
||||
train_data (Iterable[Batch]): A generator of batches, with the training
|
||||
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
|
||||
data iterable needs to take care of iterating over the epochs and
|
||||
shuffling.
|
||||
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
|
||||
The callback should take no arguments and return a tuple
|
||||
`(main_score, other_scores)`. The main_score should be a float where
|
||||
higher is better. other_scores can be any object.
|
||||
|
||||
Every iteration, the function yields out a tuple with:
|
||||
|
||||
* batch: A list of Example objects.
|
||||
* info: A dict with various information about the last update (see below).
|
||||
* is_best_checkpoint: A value in None, False, True, indicating whether this
|
||||
was the best evaluation so far. You should use this to save the model
|
||||
checkpoints during training. If None, evaluation was not conducted on
|
||||
that iteration. False means evaluation was conducted, but a previous
|
||||
evaluation was better.
|
||||
|
||||
The info dict provides the following information:
|
||||
|
||||
epoch (int): How many passes over the data have been completed.
|
||||
step (int): How many steps have been completed.
|
||||
score (float): The main score from the last evaluation.
|
||||
other_scores: : The other scores from the last evaluation.
|
||||
losses: The accumulated losses throughout training.
|
||||
checkpoints: A list of previous results, where each result is a
|
||||
(score, step, epoch) tuple.
|
||||
"""
|
||||
if isinstance(dropout, float):
|
||||
dropouts = thinc.schedules.constant(dropout)
|
||||
else:
|
||||
dropouts = dropout
|
||||
results = []
|
||||
losses = {}
|
||||
if raw_text:
|
||||
random.shuffle(raw_text)
|
||||
raw_examples = [
|
||||
Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
|
||||
]
|
||||
raw_batches = util.minibatch(raw_examples, size=8)
|
||||
|
||||
words_seen = 0
|
||||
start_time = timer()
|
||||
for step, (epoch, batch) in enumerate(train_data):
|
||||
dropout = next(dropouts)
|
||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||
|
||||
nlp.update(
|
||||
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
|
||||
)
|
||||
if raw_text:
|
||||
# If raw text is available, perform 'rehearsal' updates,
|
||||
# which use unlabelled data to reduce overfitting.
|
||||
raw_batch = list(next(raw_batches))
|
||||
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
|
||||
# TODO: refactor this so we don't have to run it separately in here
|
||||
for name, proc in nlp.pipeline:
|
||||
if (
|
||||
name not in exclude
|
||||
and hasattr(proc, "model")
|
||||
and proc.model not in (True, False, None)
|
||||
):
|
||||
proc.model.finish_update(optimizer)
|
||||
optimizer.step_schedules()
|
||||
if not (step % eval_frequency):
|
||||
if optimizer.averages:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
score, other_scores = evaluate()
|
||||
else:
|
||||
score, other_scores = evaluate()
|
||||
results.append((score, step))
|
||||
is_best_checkpoint = score == max(results)[0]
|
||||
else:
|
||||
score, other_scores = (None, None)
|
||||
is_best_checkpoint = None
|
||||
words_seen += sum(len(eg) for eg in batch)
|
||||
info = {
|
||||
"epoch": epoch,
|
||||
"step": step,
|
||||
"score": score,
|
||||
"other_scores": other_scores,
|
||||
"losses": losses,
|
||||
"checkpoints": results,
|
||||
"seconds": int(timer() - start_time),
|
||||
"words": words_seen,
|
||||
}
|
||||
yield batch, info, is_best_checkpoint
|
||||
if is_best_checkpoint is not None:
|
||||
losses = {}
|
||||
# Stop if no improvement in `patience` updates (if specified)
|
||||
best_score, best_step = max(results)
|
||||
if patience and (step - best_step) >= patience:
|
||||
break
|
||||
# Stop if we've exhausted our max steps (if specified)
|
||||
if max_steps and step >= max_steps:
|
||||
break
|
||||
|
||||
|
||||
def subdivide_batch(batch, accumulate_gradient):
|
||||
batch = list(batch)
|
||||
batch.sort(key=lambda eg: len(eg.predicted))
|
||||
sub_len = len(batch) // accumulate_gradient
|
||||
start = 0
|
||||
for i in range(accumulate_gradient):
|
||||
subbatch = batch[start : start + sub_len]
|
||||
if subbatch:
|
||||
yield subbatch
|
||||
start += len(subbatch)
|
||||
subbatch = batch[start:]
|
||||
if subbatch:
|
||||
yield subbatch
|
||||
|
||||
|
||||
def update_meta(
|
||||
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
|
||||
) -> None:
|
||||
nlp.meta["performance"] = {}
|
||||
for metric in training["score_weights"]:
|
||||
if metric is not None:
|
||||
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
|
||||
for pipe_name in nlp.pipe_names:
|
||||
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
||||
|
||||
|
||||
def load_from_paths(
|
||||
config: Config,
|
||||
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
|
||||
# TODO: separate checks from loading
|
||||
raw_text = util.ensure_path(config["training"]["raw_text"])
|
||||
if raw_text is not None:
|
||||
if not raw_text.exists():
|
||||
msg.fail("Can't find raw text", raw_text, exits=1)
|
||||
raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
|
||||
tag_map = {}
|
||||
morph_rules = {}
|
||||
weights_data = None
|
||||
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
|
||||
if init_tok2vec is not None:
|
||||
if not init_tok2vec.exists():
|
||||
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
|
||||
with init_tok2vec.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
return raw_text, tag_map, morph_rules, weights_data
|
||||
config = util.load_config(config_path, overrides=overrides, interpolate=False)
|
||||
msg.divider("Initializing pipeline")
|
||||
with show_validation_error(config_path, hint_fill=False):
|
||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||
msg.good("Initialized pipeline")
|
||||
msg.divider("Training pipeline")
|
||||
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
||||
|
||||
|
||||
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
|
||||
|
@ -445,30 +61,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
|
|||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
msg.good(f"Created output directory: {output_path}")
|
||||
|
||||
|
||||
def verify_config(nlp: Language) -> None:
|
||||
"""Perform additional checks based on the config, loaded nlp object and training data."""
|
||||
# TODO: maybe we should validate based on the actual components, the list
|
||||
# in config["nlp"]["pipeline"] instead?
|
||||
for pipe_config in nlp.config["components"].values():
|
||||
# We can't assume that the component name == the factory
|
||||
factory = pipe_config["factory"]
|
||||
if factory == "textcat":
|
||||
verify_textcat_config(nlp, pipe_config)
|
||||
|
||||
|
||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
|
||||
# if 'positive_label' is provided: double check whether it's in the data and
|
||||
# the task is binary
|
||||
if pipe_config.get("positive_label"):
|
||||
textcat_labels = nlp.get_pipe("textcat").labels
|
||||
pos_label = pipe_config.get("positive_label")
|
||||
if pos_label not in textcat_labels:
|
||||
raise ValueError(
|
||||
Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
|
||||
)
|
||||
if len(list(textcat_labels)) != 2:
|
||||
raise ValueError(
|
||||
Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
|
||||
)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
[paths]
|
||||
train = ""
|
||||
dev = ""
|
||||
raw = null
|
||||
train = null
|
||||
dev = null
|
||||
vectors = null
|
||||
init_tok2vec = null
|
||||
|
||||
[system]
|
||||
|
@ -10,8 +10,13 @@ gpu_allocator = null
|
|||
|
||||
[nlp]
|
||||
lang = null
|
||||
# List of pipeline component names, in order. The names should correspond to
|
||||
# components defined in the [components block]
|
||||
pipeline = []
|
||||
# Components that are loaded but disabled by default
|
||||
disabled = []
|
||||
# Optional callbacks to modify the nlp object before it's initialized, after
|
||||
# it's created and after the pipeline has been set up
|
||||
before_creation = null
|
||||
after_creation = null
|
||||
after_pipeline_creation = null
|
||||
|
@ -19,6 +24,7 @@ after_pipeline_creation = null
|
|||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
|
||||
# The pipeline components and their models
|
||||
[components]
|
||||
|
||||
# Readers for corpora like dev and train.
|
||||
|
@ -37,9 +43,8 @@ max_length = 0
|
|||
limit = 0
|
||||
# Apply some simply data augmentation, where we replace tokens with variations.
|
||||
# This is especially useful for punctuation and case replacement, to help
|
||||
# generalize beyond corpora that don't have smart-quotes, or only have smart
|
||||
# quotes, etc.
|
||||
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
|
||||
# generalize beyond corpora that don't/only have smart quotes etc.
|
||||
augmenter = null
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
|
@ -52,6 +57,8 @@ gold_preproc = false
|
|||
max_length = 0
|
||||
# Limitation on number of training examples
|
||||
limit = 0
|
||||
# Optional callback for data augmentation
|
||||
augmenter = null
|
||||
|
||||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
|
@ -59,11 +66,6 @@ seed = ${system.seed}
|
|||
gpu_allocator = ${system.gpu_allocator}
|
||||
dropout = 0.1
|
||||
accumulate_gradient = 1
|
||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
raw_text = ${paths.raw}
|
||||
vectors = null
|
||||
lookups = null
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
|
@ -104,3 +106,19 @@ grad_clip = 1.0
|
|||
use_averages = false
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
# These settings are used when nlp.initialize() is called (typically before
|
||||
# training or pretraining). Components and the tokenizer can each define their
|
||||
# own arguments via their initialize methods that are populated by the config.
|
||||
# This lets them gather data resources, build label sets etc.
|
||||
[initialize]
|
||||
vectors = ${paths.vectors}
|
||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
# Data and lookups for vocabulary
|
||||
vocab_data = null
|
||||
lookups = null
|
||||
# Arguments passed to the tokenizer's initialize method
|
||||
tokenizer = {}
|
||||
# Arguments for initialize methods of the components (keyed by component)
|
||||
components = {}
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
[paths]
|
||||
raw_text = null
|
||||
|
||||
[pretraining]
|
||||
max_epochs = 1000
|
||||
dropout = 0.2
|
||||
|
@ -31,8 +34,8 @@ learn_rate = 0.001
|
|||
[corpora]
|
||||
|
||||
[corpora.pretrain]
|
||||
@readers = "spacy.JsonlReader.v1"
|
||||
path = ${paths.raw}
|
||||
@readers = "spacy.JsonlCorpus.v1"
|
||||
path = ${paths.raw_text}
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
limit = 0
|
||||
|
|
|
@ -85,6 +85,7 @@ class Warnings:
|
|||
"attribute or operator.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
|
||||
W090 = ("Could not locate any {format} files in path '{path}'.")
|
||||
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
||||
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||
|
@ -306,7 +307,7 @@ class Errors:
|
|||
"settings: {opts}")
|
||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||
"call begin_training()?")
|
||||
"call initialize()?")
|
||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||
"of the parent Doc and can't exist on their own. A pickled token "
|
||||
|
@ -376,7 +377,7 @@ class Errors:
|
|||
"provided {found}.")
|
||||
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
||||
"by calling add_label, or by providing a representative batch of "
|
||||
"examples to the component's begin_training method.")
|
||||
"examples to the component's initialize method.")
|
||||
E145 = ("Error reading `{param}` from input file.")
|
||||
E146 = ("Could not access `{path}`.")
|
||||
E147 = ("Unexpected error in the {method} functionality of the "
|
||||
|
@ -418,7 +419,7 @@ class Errors:
|
|||
E164 = ("x is neither increasing nor decreasing: {}.")
|
||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
||||
"that case.")
|
||||
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
||||
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
|
||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||
E169 = ("Can't find module: {module}")
|
||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||
|
@ -476,6 +477,10 @@ class Errors:
|
|||
E201 = ("Span index out of range.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
|
||||
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
|
||||
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||
"config.cfg or override it on the CLI?")
|
||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||
"the modified object in your function?")
|
||||
|
@ -517,7 +522,7 @@ class Errors:
|
|||
"but the provided argument {loc} points to a file.")
|
||||
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
||||
"not seem to exist.")
|
||||
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
|
||||
E930 = ("Received invalid get_examples callback in {name}.initialize. "
|
||||
"Expected function that returns an iterable of Example objects but "
|
||||
"got: {obj}")
|
||||
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
|
||||
|
@ -553,7 +558,10 @@ class Errors:
|
|||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||
"component.")
|
||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
|
||||
E955 = ("Can't find table(s) {table} for language '{lang}' in "
|
||||
"spacy-lookups-data. Make sure you have the package installed or "
|
||||
"provide your own lookup tables if no default lookups are available "
|
||||
"for your language.")
|
||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||
"Available components: {opts}")
|
||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||
|
@ -670,18 +678,17 @@ class Errors:
|
|||
"'{token_attrs}'.")
|
||||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||
"the same `Vocab`.")
|
||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
||||
"initializing the pipeline:\n"
|
||||
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
|
||||
'nlp = Chinese(config=cfg)')
|
||||
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
|
||||
"loaded. Provide the name of a pretrained model or the path to "
|
||||
"a model and initialize the pipeline:\n\n"
|
||||
'nlp.tokenizer.initialize(pkuseg_model="default")')
|
||||
E1001 = ("Target token outside of matched span for match with tokens "
|
||||
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
|
||||
E1002 = ("Span index out of range.")
|
||||
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
|
||||
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
|
||||
"Required tables '{tables}', found '{found}'. If you are not "
|
||||
"providing custom lookups, make sure you have the package "
|
||||
"spacy-lookups-data installed.")
|
||||
"Required tables: {tables}. Found: {found}. Maybe you forgot to "
|
||||
"call nlp.initialize() to load in the data?")
|
||||
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
|
||||
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
||||
"`ORTH` and `NORM`.")
|
||||
|
@ -698,6 +705,9 @@ class Errors:
|
|||
"options: {modes}")
|
||||
E1012 = ("Entity spans and blocked/missing/outside spans should be "
|
||||
"provided to doc.set_ents as lists of `Span` objects.")
|
||||
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
|
||||
"token itself. To set the morph from this MorphAnalysis, set from "
|
||||
"the string value with: `token.set_morph(str(other_morph))`.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -24,18 +23,11 @@ class Bengali(Language):
|
|||
@Bengali.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
|
@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .lemmatizer import GreekLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -29,18 +28,11 @@ class Greek(Language):
|
|||
@Greek.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Greek"]
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
@ -9,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
|||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .lemmatizer import EnglishLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
|
@ -28,18 +26,11 @@ class English(Language):
|
|||
@English.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["English"]
|
||||
|
|
|
@ -3,8 +3,7 @@ from ...tokens import Token
|
|||
|
||||
|
||||
class EnglishLemmatizer(Lemmatizer):
|
||||
"""English lemmatizer. Only overrides is_base_form.
|
||||
"""
|
||||
"""English lemmatizer. Only overrides is_base_form."""
|
||||
|
||||
def is_base_form(self, token: Token) -> bool:
|
||||
"""
|
||||
|
|
|
@ -58,7 +58,7 @@ def noun_bounds(
|
|||
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||
)
|
||||
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
||||
if list(filter(filter_func, doc[left_bound.i : right.i],)):
|
||||
if list(filter(filter_func, doc[left_bound.i : right.i])):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
|
|
|
@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -27,18 +26,11 @@ class Persian(Language):
|
|||
@Persian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .lemmatizer import FrenchLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -32,18 +31,11 @@ class French(Language):
|
|||
@French.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["French"]
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
|
|||
from pathlib import Path
|
||||
import srsly
|
||||
from collections import namedtuple
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
@ -16,7 +15,7 @@ from ...scorer import Scorer
|
|||
from ...symbols import POS
|
||||
from ...tokens import Doc
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ... import util
|
||||
|
||||
|
||||
|
@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional, Any, Dict
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
|
@ -10,7 +9,7 @@ from ...compat import copy_reg
|
|||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class KoreanDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
|
|
@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
|
|||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -27,18 +26,11 @@ class Norwegian(Language):
|
|||
@Norwegian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .lemmatizer import DutchLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -29,18 +27,11 @@ class Dutch(Language):
|
|||
@Dutch.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Dutch"]
|
||||
|
|
|
@ -34,18 +34,11 @@ class Polish(Language):
|
|||
@Polish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pos_lookup", "lookups": None},
|
||||
default_config={"model": None, "mode": "pos_lookup"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Polish"]
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -7,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
class RussianDefaults(Language.Defaults):
|
||||
|
@ -24,17 +22,11 @@ class Russian(Language):
|
|||
@Russian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
|
||||
default_config={"model": None, "mode": "pymorphy2"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Russian"]
|
||||
|
|
|
@ -108,8 +108,8 @@ _num_words = [
|
|||
|
||||
def like_num(text):
|
||||
"""
|
||||
Check if text resembles a number
|
||||
"""
|
||||
Check if text resembles a number
|
||||
"""
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
|
|
|
@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -30,18 +29,11 @@ class Swedish(Language):
|
|||
@Swedish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -42,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class ThaiDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
|
@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import UkrainianLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
|
@ -24,17 +23,11 @@ class Ukrainian(Language):
|
|||
@Ukrainian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
|
||||
default_config={"model": None, "mode": "pymorphy2"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Ukrainian"]
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...util import DummyTokenizer, registry
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -17,7 +15,7 @@ use_pyvi = true
|
|||
|
||||
|
||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
|
||||
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||
def vietnamese_tokenizer_factory(nlp):
|
||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
||||
|
||||
|
@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class VietnameseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
|
@ -1,23 +1,25 @@
|
|||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List, Dict, Any, Callable, Iterable
|
||||
from enum import Enum
|
||||
import tempfile
|
||||
import srsly
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
|
||||
from ...errors import Warnings, Errors
|
||||
from ...language import Language
|
||||
from ...scorer import Scorer
|
||||
from ...tokens import Doc
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ...training import validate_examples, Example
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ... import util
|
||||
|
||||
|
||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
|
||||
# fmt: off
|
||||
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
|
||||
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
|
||||
# fmt: on
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
|
@ -25,6 +27,10 @@ DEFAULT_CONFIG = """
|
|||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.zh.ChineseTokenizer"
|
||||
segmenter = "char"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.tokenizer]
|
||||
pkuseg_model = null
|
||||
pkuseg_user_dict = "default"
|
||||
"""
|
||||
|
@ -41,41 +47,23 @@ class Segmenter(str, Enum):
|
|||
|
||||
|
||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||
def create_chinese_tokenizer(
|
||||
segmenter: Segmenter = Segmenter.char,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: Optional[str] = "default",
|
||||
):
|
||||
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
|
||||
def chinese_tokenizer_factory(nlp):
|
||||
return ChineseTokenizer(
|
||||
nlp,
|
||||
segmenter=segmenter,
|
||||
pkuseg_model=pkuseg_model,
|
||||
pkuseg_user_dict=pkuseg_user_dict,
|
||||
)
|
||||
return ChineseTokenizer(nlp, segmenter=segmenter)
|
||||
|
||||
return chinese_tokenizer_factory
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(
|
||||
self,
|
||||
nlp: Language,
|
||||
segmenter: Segmenter = Segmenter.char,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: Optional[str] = None,
|
||||
self, nlp: Language, segmenter: Segmenter = Segmenter.char,
|
||||
):
|
||||
self.vocab = nlp.vocab
|
||||
if isinstance(segmenter, Segmenter): # we might have the Enum here
|
||||
if isinstance(segmenter, Segmenter):
|
||||
segmenter = segmenter.value
|
||||
self.segmenter = segmenter
|
||||
self.pkuseg_model = pkuseg_model
|
||||
self.pkuseg_user_dict = pkuseg_user_dict
|
||||
self.pkuseg_seg = None
|
||||
self.jieba_seg = None
|
||||
self.configure_segmenter(segmenter)
|
||||
|
||||
def configure_segmenter(self, segmenter: str):
|
||||
if segmenter not in Segmenter.values():
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
|
@ -85,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
)
|
||||
warnings.warn(warn_msg)
|
||||
self.segmenter = Segmenter.char
|
||||
self.jieba_seg = try_jieba_import(self.segmenter)
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
self.segmenter,
|
||||
pkuseg_model=self.pkuseg_model,
|
||||
pkuseg_user_dict=self.pkuseg_user_dict,
|
||||
)
|
||||
if segmenter == Segmenter.jieba:
|
||||
self.jieba_seg = try_jieba_import()
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: str = "default",
|
||||
):
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
|
||||
)
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
if self.segmenter == Segmenter.jieba:
|
||||
|
@ -145,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
def _get_config(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"segmenter": self.segmenter,
|
||||
"pkuseg_model": self.pkuseg_model,
|
||||
"pkuseg_user_dict": self.pkuseg_user_dict,
|
||||
}
|
||||
|
||||
def _set_config(self, config: Dict[str, Any] = {}) -> None:
|
||||
self.segmenter = config.get("segmenter", Segmenter.char)
|
||||
self.pkuseg_model = config.get("pkuseg_model", None)
|
||||
self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
pkuseg_features_b = b""
|
||||
|
@ -163,6 +156,22 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
self.pkuseg_seg.feature_extractor.save(tempdir)
|
||||
self.pkuseg_seg.model.save(tempdir)
|
||||
tempdir = Path(tempdir)
|
||||
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
|
||||
# means that it will be saved with pickle protocol 5 with
|
||||
# python 3.8, which can't be reloaded with python 3.6-3.7.
|
||||
# To try to make the model compatible with python 3.6+, reload
|
||||
# the data with pickle5 and convert it back to protocol 4.
|
||||
try:
|
||||
import pickle5
|
||||
|
||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||
features = pickle5.load(fileh)
|
||||
with open(tempdir / "features.pkl", "wb") as fileh:
|
||||
pickle5.dump(features, fileh, protocol=4)
|
||||
except ImportError as e:
|
||||
raise e
|
||||
except Exception:
|
||||
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||
pkuseg_features_b = fileh.read()
|
||||
with open(tempdir / "weights.npz", "rb") as fileh:
|
||||
|
@ -235,6 +244,18 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
path.mkdir(parents=True)
|
||||
self.pkuseg_seg.model.save(path)
|
||||
self.pkuseg_seg.feature_extractor.save(path)
|
||||
# try to convert features.pkl to pickle protocol 4
|
||||
try:
|
||||
import pickle5
|
||||
|
||||
with open(path / "features.pkl", "rb") as fileh:
|
||||
features = pickle5.load(fileh)
|
||||
with open(path / "features.pkl", "wb") as fileh:
|
||||
pickle5.dump(features, fileh, protocol=4)
|
||||
except ImportError as e:
|
||||
raise e
|
||||
except Exception:
|
||||
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
||||
|
||||
def save_pkuseg_processors(path):
|
||||
if self.pkuseg_seg:
|
||||
|
@ -291,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class ChineseDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
@ -302,47 +323,33 @@ class Chinese(Language):
|
|||
Defaults = ChineseDefaults
|
||||
|
||||
|
||||
def try_jieba_import(segmenter: str) -> None:
|
||||
def try_jieba_import() -> None:
|
||||
try:
|
||||
import jieba
|
||||
|
||||
if segmenter == Segmenter.jieba:
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
|
||||
return jieba
|
||||
except ImportError:
|
||||
if segmenter == Segmenter.jieba:
|
||||
msg = (
|
||||
"Jieba not installed. To use jieba, install it with `pip "
|
||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg) from None
|
||||
msg = (
|
||||
"Jieba not installed. To use jieba, install it with `pip "
|
||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg) from None
|
||||
|
||||
|
||||
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||
try:
|
||||
import pkuseg
|
||||
|
||||
if pkuseg_model:
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
elif segmenter == Segmenter.pkuseg:
|
||||
msg = (
|
||||
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
|
||||
"was specified. Please provide the name of a pretrained model "
|
||||
"or the path to a model with:\n"
|
||||
'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
|
||||
"nlp = Chinese.from_config(cfg)"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
except ImportError:
|
||||
if segmenter == Segmenter.pkuseg:
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg) from None
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg) from None
|
||||
except FileNotFoundError:
|
||||
if segmenter == Segmenter.pkuseg:
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg) from None
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg) from None
|
||||
|
||||
|
||||
def _get_pkuseg_trie_data(node, path=""):
|
||||
|
|
|
@ -8,7 +8,7 @@ from contextlib import contextmanager
|
|||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
|
||||
from thinc.api import Model, get_current_ops, Config, Optimizer
|
||||
import srsly
|
||||
import multiprocessing as mp
|
||||
from itertools import chain, cycle
|
||||
|
@ -18,8 +18,9 @@ from .tokens.underscore import Underscore
|
|||
from .vocab import Vocab, create_vocab
|
||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||
from .training import Example, validate_examples
|
||||
from .training.initialize import init_vocab, init_tok2vec
|
||||
from .scorer import Scorer
|
||||
from .util import create_default_optimizer, registry, SimpleFrozenList
|
||||
from .util import registry, SimpleFrozenList
|
||||
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
|
|||
from .tokens import Doc
|
||||
from .tokenizer import Tokenizer
|
||||
from .errors import Errors, Warnings
|
||||
from .schemas import ConfigSchema, ConfigSchemaNlp
|
||||
from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
|
||||
from .schemas import ConfigSchemaPretrain, validate_init_settings
|
||||
from .git_info import GIT_VERSION
|
||||
from . import util
|
||||
from . import about
|
||||
|
@ -1066,7 +1068,7 @@ class Language:
|
|||
validate_examples(examples, "Language.update")
|
||||
if sgd is None:
|
||||
if self._optimizer is None:
|
||||
self._optimizer = create_default_optimizer()
|
||||
self._optimizer = self.create_optimizer()
|
||||
sgd = self._optimizer
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
|
@ -1124,7 +1126,7 @@ class Language:
|
|||
validate_examples(examples, "Language.rehearse")
|
||||
if sgd is None:
|
||||
if self._optimizer is None:
|
||||
self._optimizer = create_default_optimizer()
|
||||
self._optimizer = self.create_optimizer()
|
||||
sgd = self._optimizer
|
||||
pipes = list(self.pipeline)
|
||||
random.shuffle(pipes)
|
||||
|
@ -1154,61 +1156,73 @@ class Language:
|
|||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
device: int = -1,
|
||||
) -> Optimizer:
|
||||
warnings.warn(Warnings.W089, DeprecationWarning)
|
||||
return self.initialize(get_examples, sgd=sgd)
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
|
||||
provided, will be created using the .create_optimizer() method.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/language#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/language#initialize
|
||||
"""
|
||||
if get_examples is None:
|
||||
util.logger.debug(
|
||||
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
|
||||
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
|
||||
)
|
||||
doc = Doc(self.vocab, words=["x", "y", "z"])
|
||||
get_examples = lambda: [Example.from_dict(doc, {})]
|
||||
# Populate vocab
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="Language", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
valid_examples = False
|
||||
for example in get_examples():
|
||||
if not isinstance(example, Example):
|
||||
err = Errors.E978.format(
|
||||
name="Language.begin_training", types=type(example)
|
||||
)
|
||||
raise ValueError(err)
|
||||
else:
|
||||
valid_examples = True
|
||||
for word in [t.text for t in example.reference]:
|
||||
_ = self.vocab[word] # noqa: F841
|
||||
if not valid_examples:
|
||||
err = Errors.E930.format(name="Language", obj="empty list")
|
||||
raise ValueError(err)
|
||||
if device >= 0: # TODO: do we need this here?
|
||||
require_gpu(device)
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
ops = get_current_ops()
|
||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||
if sgd is None:
|
||||
sgd = create_default_optimizer()
|
||||
self._optimizer = sgd
|
||||
# Make sure the config is interpolated so we can resolve subsections
|
||||
config = self.config.interpolate()
|
||||
# These are the settings provided in the [initialize] block in the config
|
||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||
init_vocab(
|
||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||
)
|
||||
pretrain_cfg = config.get("pretraining")
|
||||
if pretrain_cfg:
|
||||
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
|
||||
init_tok2vec(self, P, I)
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
ops = get_current_ops()
|
||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||
if hasattr(self.tokenizer, "initialize"):
|
||||
tok_settings = validate_init_settings(
|
||||
self.tokenizer.initialize,
|
||||
I["tokenizer"],
|
||||
section="tokenizer",
|
||||
name="tokenizer",
|
||||
)
|
||||
self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
|
||||
for name, proc in self.pipeline:
|
||||
if hasattr(proc, "begin_training"):
|
||||
proc.begin_training(
|
||||
get_examples, pipeline=self.pipeline, sgd=self._optimizer
|
||||
if hasattr(proc, "initialize"):
|
||||
p_settings = I["components"].get(name, {})
|
||||
p_settings = validate_init_settings(
|
||||
proc.initialize, p_settings, section="components", name=name
|
||||
)
|
||||
proc.initialize(get_examples, nlp=self, **p_settings)
|
||||
self._link_components()
|
||||
self._optimizer = sgd
|
||||
if sgd is not None:
|
||||
self._optimizer = sgd
|
||||
elif self._optimizer is None:
|
||||
self._optimizer = self.create_optimizer()
|
||||
return self._optimizer
|
||||
|
||||
def resume_training(
|
||||
self, *, sgd: Optional[Optimizer] = None, device: int = -1
|
||||
) -> Optimizer:
|
||||
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||||
"""Continue training a pretrained model.
|
||||
|
||||
Create and return an optimizer, and initialize "rehearsal" for any pipeline
|
||||
|
@ -1217,22 +1231,20 @@ class Language:
|
|||
rehearsal, collect samples of text you want the models to retain performance
|
||||
on, and call nlp.rehearse() with a batch of Example objects.
|
||||
|
||||
sgd (Optional[Optimizer]): An optimizer.
|
||||
RETURNS (Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/language#resume_training
|
||||
"""
|
||||
if device >= 0: # TODO: do we need this here?
|
||||
require_gpu(device)
|
||||
ops = get_current_ops()
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||
if sgd is None:
|
||||
sgd = create_default_optimizer()
|
||||
self._optimizer = sgd
|
||||
ops = get_current_ops()
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||
for name, proc in self.pipeline:
|
||||
if hasattr(proc, "_rehearsal_model"):
|
||||
proc._rehearsal_model = deepcopy(proc.model)
|
||||
if sgd is not None:
|
||||
self._optimizer = sgd
|
||||
elif self._optimizer is None:
|
||||
self._optimizer = self.create_optimizer()
|
||||
return self._optimizer
|
||||
|
||||
def evaluate(
|
||||
|
@ -1294,6 +1306,11 @@ class Language:
|
|||
results["speed"] = n_words / (end_time - start_time)
|
||||
return results
|
||||
|
||||
def create_optimizer(self):
|
||||
"""Create an optimizer, usually using the [training.optimizer] config."""
|
||||
subconfig = {"optimizer": self.config["training"]["optimizer"]}
|
||||
return registry.resolve(subconfig)["optimizer"]
|
||||
|
||||
@contextmanager
|
||||
def use_params(self, params: Optional[dict]):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
|
@ -1502,7 +1519,7 @@ class Language:
|
|||
).merge(config)
|
||||
if "nlp" not in config:
|
||||
raise ValueError(Errors.E985.format(config=config))
|
||||
config_lang = config["nlp"]["lang"]
|
||||
config_lang = config["nlp"].get("lang")
|
||||
if config_lang is not None and config_lang != cls.lang:
|
||||
raise ValueError(
|
||||
Errors.E958.format(
|
||||
|
|
28
spacy/ml/featureextractor.py
Normal file
28
spacy/ml/featureextractor.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
from typing import List, Union, Callable, Tuple
|
||||
from thinc.types import Ints2d
|
||||
from thinc.api import Model, registry
|
||||
|
||||
from ..tokens import Doc
|
||||
|
||||
|
||||
@registry.layers("spacy.FeatureExtractor.v1")
|
||||
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
|
||||
return Model("extract_features", forward, attrs={"columns": columns})
|
||||
|
||||
|
||||
def forward(
|
||||
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
|
||||
) -> Tuple[List[Ints2d], Callable]:
|
||||
columns = model.attrs["columns"]
|
||||
features: List[Ints2d] = []
|
||||
for doc in docs:
|
||||
if hasattr(doc, "to_array"):
|
||||
attrs = doc.to_array(columns)
|
||||
else:
|
||||
attrs = doc.doc.to_array(columns)[doc.start : doc.end]
|
||||
if attrs.ndim == 1:
|
||||
attrs = attrs.reshape((attrs.shape[0], 1))
|
||||
features.append(model.ops.asarray2i(attrs, dtype="uint64"))
|
||||
|
||||
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
|
||||
return features, backprop
|
|
@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
|||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
||||
from thinc.api import Relu, residual, expand_window, FeatureExtractor
|
||||
from thinc.api import Relu, residual, expand_window
|
||||
|
||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
||||
from ...util import registry
|
||||
from ..extract_ngrams import extract_ngrams
|
||||
from ..staticvectors import StaticVectors
|
||||
from ..featureextractor import FeatureExtractor
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TextCatCNN.v1")
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
from typing import Optional, List
|
||||
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
||||
from thinc.api import Model, noop, list2ragged, ragged2list
|
||||
from thinc.api import FeatureExtractor, HashEmbed
|
||||
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
||||
from typing import Optional, List, Union
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
||||
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
|
||||
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
||||
|
||||
from ...tokens import Doc
|
||||
from ...util import registry
|
||||
from ...ml import _character_embed
|
||||
from ..staticvectors import StaticVectors
|
||||
from ..featureextractor import FeatureExtractor
|
||||
from ...pipeline.tok2vec import Tok2VecListener
|
||||
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
||||
from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.Tok2VecListener.v1")
|
||||
|
@ -98,7 +98,7 @@ def MultiHashEmbed(
|
|||
attributes using hash embedding, concatenates the results, and passes it
|
||||
through a feed-forward subnetwork to build a mixed representations.
|
||||
|
||||
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
|
||||
The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
|
||||
varying definitions depending on the Vocab of the Doc object passed in.
|
||||
Vectors from pretrained static vectors can also be incorporated into the
|
||||
concatenated representation.
|
||||
|
@ -115,7 +115,7 @@ def MultiHashEmbed(
|
|||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||
"""
|
||||
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
seed = 7
|
||||
|
||||
def make_hash_embed(feature):
|
||||
|
@ -123,7 +123,7 @@ def MultiHashEmbed(
|
|||
seed += 1
|
||||
return HashEmbed(
|
||||
width,
|
||||
rows if feature == NORM else rows // 2,
|
||||
rows if feature == LOWER else rows // 2,
|
||||
column=cols.index(feature),
|
||||
seed=seed,
|
||||
dropout=0.0,
|
||||
|
@ -131,13 +131,13 @@ def MultiHashEmbed(
|
|||
|
||||
if also_embed_subwords:
|
||||
embeddings = [
|
||||
make_hash_embed(NORM),
|
||||
make_hash_embed(LOWER),
|
||||
make_hash_embed(PREFIX),
|
||||
make_hash_embed(SUFFIX),
|
||||
make_hash_embed(SHAPE),
|
||||
]
|
||||
else:
|
||||
embeddings = [make_hash_embed(NORM)]
|
||||
embeddings = [make_hash_embed(LOWER)]
|
||||
concat_size = width * (len(embeddings) + also_use_static_vectors)
|
||||
if also_use_static_vectors:
|
||||
model = chain(
|
||||
|
@ -165,7 +165,8 @@ def MultiHashEmbed(
|
|||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(
|
||||
width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
|
||||
width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
|
||||
feature: Union[int, str]="LOWER"
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct an embedded representation based on character embeddings, using
|
||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||
|
@ -179,12 +180,13 @@ def CharacterEmbed(
|
|||
of being in an arbitrary position depending on the word length.
|
||||
|
||||
The characters are embedded in a embedding table with a given number of rows,
|
||||
and the vectors concatenated. A hash-embedded vector of the NORM of the word is
|
||||
and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
|
||||
also concatenated on, and the result is then passed through a feed-forward
|
||||
network to construct a single vector to represent the information.
|
||||
|
||||
width (int): The width of the output vector and the NORM hash embedding.
|
||||
rows (int): The number of rows in the NORM hash embedding table.
|
||||
feature (int or str): An attribute to embed, to concatenate with the characters.
|
||||
width (int): The width of the output vector and the feature embedding.
|
||||
rows (int): The number of rows in the LOWER hash embedding table.
|
||||
nM (int): The dimensionality of the character embeddings. Recommended values
|
||||
are between 16 and 64.
|
||||
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||
|
@ -193,12 +195,15 @@ def CharacterEmbed(
|
|||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||
"""
|
||||
feature = intify_attr(feature)
|
||||
if feature is None:
|
||||
raise ValueError("Invalid feature: Must be a token attribute.")
|
||||
if also_use_static_vectors:
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
FeatureExtractor([feature]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
),
|
||||
|
@ -214,7 +219,7 @@ def CharacterEmbed(
|
|||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
FeatureExtractor([feature]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
),
|
||||
|
|
|
@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
|
|||
|
||||
|
||||
def analyze_pipes(
|
||||
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
|
||||
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
|
||||
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
||||
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
||||
a table with the pipeline components and why they assign and require, as
|
||||
|
|
|
@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
|
|||
matches = self.matcher(doc, allow_missing=True)
|
||||
# Sort by the attribute ID, so that later rules have precendence
|
||||
matches = [
|
||||
(int(self.vocab.strings[m_id]), m_id, s, e)
|
||||
for m_id, s, e in matches
|
||||
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
|
||||
]
|
||||
matches.sort()
|
||||
for attr_id, match_id, start, end in matches:
|
||||
|
@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
|
|||
try:
|
||||
# The index can be negative, which makes it annoying to do
|
||||
# the boundscheck. Let Span do it instead.
|
||||
token = span[index]
|
||||
token = span[index] # noqa: F841
|
||||
except IndexError:
|
||||
# The original exception is just our conditional logic, so we
|
||||
# raise from.
|
||||
|
@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
|
|||
span=[t.text for t in span],
|
||||
index=index,
|
||||
)
|
||||
) from None
|
||||
) from None
|
||||
set_token_attrs(span[index], attrs)
|
||||
return doc
|
||||
|
||||
|
|
|
@ -126,13 +126,13 @@ cdef class DependencyParser(Parser):
|
|||
def add_multitask_objective(self, mt_component):
|
||||
self._multitasks.append(mt_component)
|
||||
|
||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||
def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
|
||||
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
|
||||
for labeller in self._multitasks:
|
||||
labeller.model.set_dim("nO", len(self.labels))
|
||||
if labeller.model.has_ref("output_layer"):
|
||||
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
|
||||
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
|
||||
labeller.initialize(get_examples, nlp=nlp)
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from itertools import islice
|
||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
|
||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
import random
|
||||
|
@ -140,26 +140,20 @@ class EntityLinker(Pipe):
|
|||
if len(self.kb) == 0:
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def begin_training(
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
nlp: Optional[Language] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
|
||||
"""
|
||||
self._ensure_examples(get_examples)
|
||||
self._require_kb()
|
||||
|
@ -174,9 +168,6 @@ class EntityLinker(Pipe):
|
|||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def update(
|
||||
self,
|
||||
|
|
|
@ -1,26 +1,25 @@
|
|||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
|
||||
from typing import Tuple
|
||||
from thinc.api import Model
|
||||
from pathlib import Path
|
||||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import Example
|
||||
from ..lookups import Lookups, load_lookups
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Token
|
||||
from ..vocab import Vocab
|
||||
from ..training import validate_examples
|
||||
from ..util import logger, SimpleFrozenList
|
||||
from .. import util
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "lookup",
|
||||
"lookups": None,
|
||||
"overwrite": False,
|
||||
},
|
||||
default_config={"model": None, "mode": "lookup", "overwrite": False},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
@ -28,13 +27,9 @@ def make_lemmatizer(
|
|||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
overwrite: bool = False,
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
|
||||
)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
|
||||
|
||||
class Lemmatizer(Pipe):
|
||||
|
@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
|
||||
@classmethod
|
||||
def get_lookups_config(cls, mode: str) -> Dict:
|
||||
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
|
||||
"""Returns the lookups configuration settings for a given mode for use
|
||||
in Lemmatizer.load_lookups.
|
||||
|
||||
mode (str): The lemmatizer mode.
|
||||
RETURNS (dict): The lookups configuration settings for this mode.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||
RETURNS (Tuple[List[str], List[str]]): The required and optional
|
||||
lookup tables for this mode.
|
||||
"""
|
||||
if mode == "lookup":
|
||||
return {
|
||||
"required_tables": ["lemma_lookup"],
|
||||
}
|
||||
return (["lemma_lookup"], [])
|
||||
elif mode == "rule":
|
||||
return {
|
||||
"required_tables": ["lemma_rules"],
|
||||
"optional_tables": ["lemma_exc", "lemma_index"],
|
||||
}
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
|
||||
"""Load and validate lookups tables. If the provided lookups is None,
|
||||
load the default lookups tables according to the language and mode
|
||||
settings. Confirm that all required tables for the language and mode
|
||||
are present.
|
||||
|
||||
lang (str): The language code.
|
||||
mode (str): The lemmatizer mode.
|
||||
lookups (Lookups): The provided lookups, may be None if the default
|
||||
lookups should be loaded.
|
||||
RETURNS (Lookups): The Lookups object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||
"""
|
||||
config = cls.get_lookups_config(mode)
|
||||
required_tables = config.get("required_tables", [])
|
||||
optional_tables = config.get("optional_tables", [])
|
||||
if lookups is None:
|
||||
lookups = load_lookups(lang=lang, tables=required_tables)
|
||||
optional_lookups = load_lookups(
|
||||
lang=lang, tables=optional_tables, strict=False
|
||||
)
|
||||
for table in optional_lookups.tables:
|
||||
lookups.set_table(table, optional_lookups.get_table(table))
|
||||
for table in required_tables:
|
||||
if table not in lookups:
|
||||
raise ValueError(
|
||||
Errors.E1004.format(
|
||||
mode=mode, tables=required_tables, found=lookups.tables
|
||||
)
|
||||
)
|
||||
return lookups
|
||||
return (["lemma_rules"], ["lemma_exc", "lemma_index"])
|
||||
return ([], [])
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
|
|||
name: str = "lemmatizer",
|
||||
*,
|
||||
mode: str = "lookup",
|
||||
lookups: Optional[Lookups] = None,
|
||||
overwrite: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
|
|||
model (Model): A model (not yet implemented).
|
||||
name (str): The component name. Defaults to "lemmatizer".
|
||||
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
such as "lemma_rules", "lemma_index", "lemma_exc" and
|
||||
"lemma_lookup". Defaults to None
|
||||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||
`False`.
|
||||
|
||||
|
@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._mode = mode
|
||||
self.lookups = lookups if lookups is not None else Lookups()
|
||||
self.lookups = Lookups()
|
||||
self.overwrite = overwrite
|
||||
self._validated = False
|
||||
if self.mode == "lookup":
|
||||
self.lemmatize = self.lookup_lemmatize
|
||||
elif self.mode == "rule":
|
||||
|
@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#call
|
||||
"""
|
||||
if not self._validated:
|
||||
self._validate_tables(Errors.E1004)
|
||||
for token in doc:
|
||||
if self.overwrite or token.lemma == 0:
|
||||
token.lemma_ = self.lemmatize(token)[0]
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, *, batch_size=128):
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
lookups: Optional[Lookups] = None,
|
||||
):
|
||||
"""Initialize the lemmatizer and load in data.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
such as "lemma_rules", "lemma_index", "lemma_exc" and
|
||||
"lemma_lookup". Defaults to None.
|
||||
"""
|
||||
required_tables, optional_tables = self.get_lookups_config(self.mode)
|
||||
if lookups is None:
|
||||
logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
|
||||
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
|
||||
optional_lookups = load_lookups(
|
||||
lang=self.vocab.lang, tables=optional_tables, strict=False
|
||||
)
|
||||
for table in optional_lookups.tables:
|
||||
lookups.set_table(table, optional_lookups.get_table(table))
|
||||
self.lookups = lookups
|
||||
self._validate_tables(Errors.E1004)
|
||||
|
||||
def _validate_tables(self, error_message: str = Errors.E912) -> None:
|
||||
"""Check that the lookups are correct for the current mode."""
|
||||
required_tables, optional_tables = self.get_lookups_config(self.mode)
|
||||
for table in required_tables:
|
||||
if table not in self.lookups:
|
||||
raise ValueError(
|
||||
error_message.format(
|
||||
mode=self.mode,
|
||||
tables=required_tables,
|
||||
found=self.lookups.tables,
|
||||
)
|
||||
)
|
||||
self._validated = True
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
return False
|
||||
|
||||
def score(self, examples, **kwargs) -> Dict[str, Any]:
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
|
@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
|
|||
validate_examples(examples, "Lemmatizer.score")
|
||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
|
||||
"""Save the current state to a directory.
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
):
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "Lemmatizer":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The modified `Vocab` object.
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Lemmatizer): The modified Lemmatizer object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
|
||||
"""
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
self._validate_tables()
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()) -> bytes:
|
||||
"""Serialize the current state to a binary string.
|
||||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
serialize["lookups"] = self.lookups.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
|
||||
"""Load state from a binary string.
|
||||
def from_bytes(
|
||||
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "Lemmatizer":
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
bytes_data (bytes): The serialized pipe.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Lemmatizer): The loaded Lemmatizer.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
|
||||
"""
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
self._validate_tables()
|
||||
return self
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional
|
||||
from typing import Optional, Union, Dict
|
||||
import srsly
|
||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||
from itertools import islice
|
||||
|
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
|
|||
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
||||
return tuple(self.cfg["labels_morph"].keys())
|
||||
|
||||
@property
|
||||
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
|
||||
"""A dictionary with all labels data."""
|
||||
return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
|
||||
|
||||
def add_label(self, label):
|
||||
"""Add a new label to the pipe.
|
||||
|
||||
|
@ -129,27 +134,22 @@ class Morphologizer(Tagger):
|
|||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||
return 1
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
|
||||
"""
|
||||
self._ensure_examples(get_examples)
|
||||
# First, fetch all labels from the data
|
||||
for example in get_examples():
|
||||
for i, token in enumerate(example.reference):
|
||||
pos = token.pos_
|
||||
morph = token.morph_
|
||||
morph = str(token.morph)
|
||||
# create and add the combined morph+POS label
|
||||
morph_dict = Morphology.feats_to_dict(morph)
|
||||
if pos:
|
||||
|
@ -167,7 +167,7 @@ class Morphologizer(Tagger):
|
|||
gold_array = []
|
||||
for i, token in enumerate(example.reference):
|
||||
pos = token.pos_
|
||||
morph = token.morph_
|
||||
morph = str(token.morph)
|
||||
morph_dict = Morphology.feats_to_dict(morph)
|
||||
if pos:
|
||||
morph_dict[self.POS_FEAT] = pos
|
||||
|
@ -178,9 +178,6 @@ class Morphologizer(Tagger):
|
|||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
|
|
@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
|
|||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
|
@ -91,9 +91,6 @@ class MultitaskObjective(Tagger):
|
|||
if label is not None and label not in self.labels:
|
||||
self.labels[label] = len(self.labels)
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def predict(self, docs):
|
||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||
|
@ -177,13 +174,10 @@ class ClozeMultitask(Pipe):
|
|||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||
self.model.output_layer.begin_training(X)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
self.model.output_layer.initialize(X)
|
||||
|
||||
def predict(self, docs):
|
||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||
|
|
|
@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser):
|
|||
"""Register another component as a multi-task objective. Experimental."""
|
||||
self._multitasks.append(mt_component)
|
||||
|
||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||
def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
|
||||
"""Setup multi-task objective components. Experimental and internal."""
|
||||
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
|
||||
for labeller in self._multitasks:
|
||||
labeller.model.set_dim("nO", len(self.labels))
|
||||
if labeller.model.has_ref("output_layer"):
|
||||
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
|
||||
labeller.begin_training(get_examples, pipeline=pipeline)
|
||||
labeller.initialize(get_examples, nlp=nlp)
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
from typing import Optional, Tuple
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate, Model
|
||||
|
||||
|
@ -32,6 +33,17 @@ cdef class Pipe:
|
|||
self.name = name
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
@property
|
||||
def labels(self) -> Optional[Tuple[str]]:
|
||||
return []
|
||||
|
||||
@property
|
||||
def label_data(self):
|
||||
"""Optional JSON-serializable data that would be sufficient to recreate
|
||||
the label set if provided to the `pipe.initialize()` method.
|
||||
"""
|
||||
return None
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
"""Apply the pipe to one document. The document is modified in place,
|
||||
and returned. This usually happens under the hood when the nlp object
|
||||
|
@ -183,7 +195,7 @@ cdef class Pipe:
|
|||
"""
|
||||
return util.create_default_optimizer()
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
This method needs to be implemented by each Pipe component,
|
||||
ensuring the internal model (if available) is initialized properly
|
||||
|
@ -191,16 +203,11 @@ cdef class Pipe:
|
|||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/pipe#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||
pass
|
||||
|
||||
def _ensure_examples(self, get_examples):
|
||||
if get_examples is None or not hasattr(get_examples, "__call__"):
|
||||
|
|
|
@ -58,7 +58,7 @@ class Sentencizer(Pipe):
|
|||
else:
|
||||
self.punct_chars = set(self.default_punct_chars)
|
||||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
pass
|
||||
|
||||
def __call__(self, doc):
|
||||
|
|
|
@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
|
|||
# are 0
|
||||
return tuple(["I", "S"])
|
||||
|
||||
@property
|
||||
def label_data(self):
|
||||
return self.labels
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
|
@ -124,20 +128,15 @@ class SentenceRecognizer(Tagger):
|
|||
raise ValueError("nan value when computing loss")
|
||||
return float(loss), d_scores
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
|
||||
"""
|
||||
self._ensure_examples(get_examples)
|
||||
doc_sample = []
|
||||
|
@ -151,9 +150,6 @@ class SentenceRecognizer(Tagger):
|
|||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def add_label(self, label, values=None):
|
||||
raise NotImplementedError
|
||||
|
|
|
@ -90,6 +90,11 @@ class Tagger(Pipe):
|
|||
"""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
@property
|
||||
def label_data(self):
|
||||
"""Data about the labels currently added to the component."""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the pipe to a Doc.
|
||||
|
||||
|
@ -256,31 +261,33 @@ class Tagger(Pipe):
|
|||
raise ValueError("nan value when computing loss")
|
||||
return float(loss), d_scores
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
def initialize(self, get_examples, *, nlp=None, labels=None):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects..
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
labels: The labels to add to the component, typically generated by the
|
||||
`init labels` command. If no labels are provided, the get_examples
|
||||
callback is used to extract the labels from the data.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/tagger#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/tagger#initialize
|
||||
"""
|
||||
self._ensure_examples(get_examples)
|
||||
if labels is not None:
|
||||
for tag in labels:
|
||||
self.add_label(tag)
|
||||
else:
|
||||
tags = set()
|
||||
for example in get_examples():
|
||||
for token in example.y:
|
||||
if token.tag_:
|
||||
tags.add(token.tag_)
|
||||
for tag in sorted(tags):
|
||||
self.add_label(tag)
|
||||
doc_sample = []
|
||||
label_sample = []
|
||||
tags = set()
|
||||
for example in get_examples():
|
||||
for token in example.y:
|
||||
if token.tag_:
|
||||
tags.add(token.tag_)
|
||||
for tag in sorted(tags):
|
||||
self.add_label(tag)
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
gold_tags = example.get_aligned("TAG", as_string=True)
|
||||
|
@ -289,9 +296,6 @@ class Tagger(Pipe):
|
|||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def add_label(self, label):
|
||||
"""Add a new label to the pipe.
|
||||
|
|
|
@ -154,8 +154,16 @@ class TextCategorizer(Pipe):
|
|||
|
||||
@labels.setter
|
||||
def labels(self, value: List[str]) -> None:
|
||||
# TODO: This really shouldn't be here. I had a look and I added it when
|
||||
# I added the labels property, but it's pretty nasty to have this, and
|
||||
# will lead to problems.
|
||||
self.cfg["labels"] = tuple(value)
|
||||
|
||||
@property
|
||||
def label_data(self) -> List[str]:
|
||||
"""RETURNS (List[str]): Information about the component's labels."""
|
||||
return self.labels
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
|
@ -334,43 +342,40 @@ class TextCategorizer(Pipe):
|
|||
self.labels = tuple(list(self.labels) + [label])
|
||||
return 1
|
||||
|
||||
def begin_training(
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
nlp: Optional[Language] = None,
|
||||
labels: Optional[Dict] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
labels: The labels to add to the component, typically generated by the
|
||||
`init labels` command. If no labels are provided, the get_examples
|
||||
callback is used to extract the labels from the data.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
|
||||
"""
|
||||
self._ensure_examples(get_examples)
|
||||
subbatch = [] # Select a subbatch of examples to initialize the model
|
||||
for example in islice(get_examples(), 10):
|
||||
if len(subbatch) < 2:
|
||||
subbatch.append(example)
|
||||
for cat in example.y.cats:
|
||||
self.add_label(cat)
|
||||
if labels is None:
|
||||
for example in get_examples():
|
||||
for cat in example.y.cats:
|
||||
self.add_label(cat)
|
||||
else:
|
||||
for label in labels:
|
||||
self.add_label(label)
|
||||
subbatch = list(islice(get_examples(), 10))
|
||||
doc_sample = [eg.reference for eg in subbatch]
|
||||
label_sample, _ = self._examples_to_truth(subbatch)
|
||||
self._require_labels()
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
|
||||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
|
||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||
from itertools import islice
|
||||
|
||||
|
@ -203,26 +203,20 @@ class Tok2Vec(Pipe):
|
|||
def get_loss(self, examples, scores) -> None:
|
||||
pass
|
||||
|
||||
def begin_training(
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
nlp: Optional[Language] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
|
||||
"""
|
||||
self._ensure_examples(get_examples)
|
||||
doc_sample = []
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||||
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
|
||||
from __future__ import print_function
|
||||
from cymem.cymem cimport Pool
|
||||
cimport numpy as np
|
||||
|
@ -7,6 +7,7 @@ from libcpp.vector cimport vector
|
|||
from libc.string cimport memset
|
||||
from libc.stdlib cimport calloc, free
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate
|
||||
|
@ -95,6 +96,10 @@ cdef class Parser(Pipe):
|
|||
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
||||
return class_names
|
||||
|
||||
@property
|
||||
def label_data(self):
|
||||
return self.moves.labels
|
||||
|
||||
@property
|
||||
def tok2vec(self):
|
||||
"""Return the embedding and convolutional layer of the model."""
|
||||
|
@ -354,7 +359,7 @@ cdef class Parser(Pipe):
|
|||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss += (d_scores**2).sum()
|
||||
backprop(d_scores, sgd=sgd)
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, guesses)
|
||||
states = [state for state in states if not state.is_final()]
|
||||
|
@ -405,18 +410,20 @@ cdef class Parser(Pipe):
|
|||
def set_output(self, nO):
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
||||
def initialize(self, get_examples, nlp=None, labels=None):
|
||||
self._ensure_examples(get_examples)
|
||||
self.cfg.update(kwargs)
|
||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
||||
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
|
||||
actions = self.moves.get_actions(
|
||||
examples=get_examples(),
|
||||
min_freq=self.cfg['min_action_freq'],
|
||||
learn_tokens=self.cfg["learn_tokens"]
|
||||
)
|
||||
if labels is not None:
|
||||
actions = dict(labels)
|
||||
else:
|
||||
actions = self.moves.get_actions(
|
||||
examples=get_examples(),
|
||||
min_freq=self.cfg['min_action_freq'],
|
||||
learn_tokens=self.cfg["learn_tokens"]
|
||||
)
|
||||
for action, labels in self.moves.labels.items():
|
||||
actions.setdefault(action, {})
|
||||
for label, freq in labels.items():
|
||||
|
@ -425,11 +432,9 @@ cdef class Parser(Pipe):
|
|||
self.moves.initialize_actions(actions)
|
||||
# make sure we resize so we have an appropriate upper layer
|
||||
self._resize()
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
doc_sample = []
|
||||
if pipeline is not None:
|
||||
for name, component in pipeline:
|
||||
if nlp is not None:
|
||||
for name, component in nlp.pipeline:
|
||||
if component is self:
|
||||
break
|
||||
if hasattr(component, "pipe"):
|
||||
|
@ -441,9 +446,8 @@ cdef class Parser(Pipe):
|
|||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
if pipeline is not None:
|
||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
||||
return sgd
|
||||
if nlp is not None:
|
||||
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||
|
||||
def to_disk(self, path, exclude=tuple()):
|
||||
serializers = {
|
||||
|
|
144
spacy/schemas.py
144
spacy/schemas.py
|
@ -1,15 +1,17 @@
|
|||
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ValidationError, validator
|
||||
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||
from pydantic import root_validator
|
||||
from pydantic.main import ModelMetaclass
|
||||
from thinc.api import Optimizer, ConfigValidationError
|
||||
from thinc.config import Promise
|
||||
from collections import defaultdict
|
||||
from thinc.api import Optimizer
|
||||
import inspect
|
||||
|
||||
from .attrs import NAMES
|
||||
from .lookups import Lookups
|
||||
from .util import is_cython_func
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This lets us add type hints for mypy etc. without causing circular imports
|
||||
|
@ -44,6 +46,96 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
|
|||
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
|
||||
|
||||
|
||||
# Initialization
|
||||
|
||||
|
||||
class ArgSchemaConfig:
|
||||
extra = "forbid"
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ArgSchemaConfigExtra:
|
||||
extra = "forbid"
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
def get_arg_model(
|
||||
func: Callable,
|
||||
*,
|
||||
exclude: Iterable[str] = tuple(),
|
||||
name: str = "ArgModel",
|
||||
strict: bool = True,
|
||||
) -> ModelMetaclass:
|
||||
"""Generate a pydantic model for function arguments.
|
||||
|
||||
func (Callable): The function to generate the schema for.
|
||||
exclude (Iterable[str]): Parameter names to ignore.
|
||||
name (str): Name of created model class.
|
||||
strict (bool): Don't allow extra arguments if no variable keyword arguments
|
||||
are allowed on the function.
|
||||
RETURNS (ModelMetaclass): A pydantic model.
|
||||
"""
|
||||
sig_args = {}
|
||||
try:
|
||||
sig = inspect.signature(func)
|
||||
except ValueError:
|
||||
# Typically happens if the method is part of a Cython module without
|
||||
# binding=True. Here we just use an empty model that allows everything.
|
||||
return create_model(name, __config__=ArgSchemaConfigExtra)
|
||||
has_variable = False
|
||||
for param in sig.parameters.values():
|
||||
if param.name in exclude:
|
||||
continue
|
||||
if param.kind == param.VAR_KEYWORD:
|
||||
# The function allows variable keyword arguments so we shouldn't
|
||||
# include **kwargs etc. in the schema and switch to non-strict
|
||||
# mode and pass through all other values
|
||||
has_variable = True
|
||||
continue
|
||||
# If no annotation is specified assume it's anything
|
||||
annotation = param.annotation if param.annotation != param.empty else Any
|
||||
# If no default value is specified assume that it's required. Cython
|
||||
# functions/methods will have param.empty for default value None so we
|
||||
# need to treat them differently
|
||||
default_empty = None if is_cython_func(func) else ...
|
||||
default = param.default if param.default != param.empty else default_empty
|
||||
sig_args[param.name] = (annotation, default)
|
||||
is_strict = strict and not has_variable
|
||||
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
|
||||
return create_model(name, **sig_args)
|
||||
|
||||
|
||||
def validate_init_settings(
|
||||
func: Callable,
|
||||
settings: Dict[str, Any],
|
||||
*,
|
||||
section: Optional[str] = None,
|
||||
name: str = "",
|
||||
exclude: Iterable[str] = ("get_examples", "nlp"),
|
||||
) -> Dict[str, Any]:
|
||||
"""Validate initialization settings against the expected arguments in
|
||||
the method signature. Will parse values if possible (e.g. int to string)
|
||||
and return the updated settings dict. Will raise a ConfigValidationError
|
||||
if types don't match or required values are missing.
|
||||
|
||||
func (Callable): The initialize method of a given component etc.
|
||||
settings (Dict[str, Any]): The settings from the repsective [initialize] block.
|
||||
section (str): Initialize section, for error message.
|
||||
name (str): Name of the block in the section.
|
||||
exclude (Iterable[str]): Parameter names to exclude from schema.
|
||||
RETURNS (Dict[str, Any]): The validated settings.
|
||||
"""
|
||||
schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
|
||||
try:
|
||||
return schema(**settings).dict()
|
||||
except ValidationError as e:
|
||||
block = "initialize" if not section else f"initialize.{section}"
|
||||
title = f"Error validating initialization settings in [{block}]"
|
||||
raise ConfigValidationError(
|
||||
title=title, errors=e.errors(), config=settings, parent=name
|
||||
) from None
|
||||
|
||||
|
||||
# Matcher token patterns
|
||||
|
||||
|
||||
|
@ -190,7 +282,7 @@ class ModelMetaSchema(BaseModel):
|
|||
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
|
||||
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
|
||||
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
|
||||
performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
|
||||
performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
|
||||
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
|
||||
# fmt: on
|
||||
|
||||
|
@ -205,8 +297,6 @@ class ModelMetaSchema(BaseModel):
|
|||
|
||||
class ConfigSchemaTraining(BaseModel):
|
||||
# fmt: off
|
||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
|
||||
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
||||
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||
|
@ -219,8 +309,6 @@ class ConfigSchemaTraining(BaseModel):
|
|||
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
|
||||
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
|
||||
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
|
||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
|
||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||
logger: Logger = Field(..., title="The logger to track training progress")
|
||||
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
||||
|
@ -273,36 +361,40 @@ class ConfigSchemaPretrain(BaseModel):
|
|||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ConfigSchemaInit(BaseModel):
|
||||
# fmt: off
|
||||
vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
|
||||
lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
|
||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
||||
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ConfigSchema(BaseModel):
|
||||
training: ConfigSchemaTraining
|
||||
nlp: ConfigSchemaNlp
|
||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||
components: Dict[str, Dict[str, Any]]
|
||||
corpora: Dict[str, Reader]
|
||||
|
||||
@root_validator(allow_reuse=True)
|
||||
def validate_config(cls, values):
|
||||
"""Perform additional validation for settings with dependencies."""
|
||||
pt = values.get("pretraining")
|
||||
if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
|
||||
if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
|
||||
err = "Need nlp.vectors if pretraining.objective.type is vectors"
|
||||
raise ValueError(err)
|
||||
return values
|
||||
initialize: ConfigSchemaInit
|
||||
|
||||
class Config:
|
||||
extra = "allow"
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class TrainingSchema(BaseModel):
|
||||
training: ConfigSchemaTraining
|
||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||
corpora: Dict[str, Reader]
|
||||
|
||||
class Config:
|
||||
extra = "allow"
|
||||
arbitrary_types_allowed = True
|
||||
CONFIG_SCHEMAS = {
|
||||
"nlp": ConfigSchemaNlp,
|
||||
"training": ConfigSchemaTraining,
|
||||
"pretraining": ConfigSchemaPretrain,
|
||||
"initialize": ConfigSchemaInit,
|
||||
}
|
||||
|
||||
|
||||
# Project config Schema
|
||||
|
|
|
@ -32,9 +32,7 @@ class PRFScore:
|
|||
|
||||
def __add__(self, other):
|
||||
return PRFScore(
|
||||
tp=self.tp+other.tp,
|
||||
fp=self.fp+other.fp,
|
||||
fn=self.fn+other.fn
|
||||
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
|
||||
)
|
||||
|
||||
def score_set(self, cand: set, gold: set) -> None:
|
||||
|
@ -485,7 +483,7 @@ class Scorer:
|
|||
(pred_ent.start_char, pred_ent.end_char), None
|
||||
)
|
||||
label = gold_span.label_
|
||||
if not label in f_per_type:
|
||||
if label not in f_per_type:
|
||||
f_per_type[label] = PRFScore()
|
||||
gold = gold_span.kb_id_
|
||||
# only evaluating entities that overlap between gold and pred,
|
||||
|
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
|||
continue
|
||||
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
||||
align_x2y = eg.alignment.x2y
|
||||
preds = set()
|
||||
for pred_ent in eg.x.ents:
|
||||
if pred_ent.label_ not in scores:
|
||||
scores[pred_ent.label_] = PRFScore()
|
||||
|
|
|
@ -272,22 +272,35 @@ def zh_tokenizer_char():
|
|||
def zh_tokenizer_jieba():
|
||||
pytest.importorskip("jieba")
|
||||
config = {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "jieba",
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "jieba",
|
||||
}
|
||||
}
|
||||
}
|
||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
||||
nlp = get_lang_class("zh").from_config(config)
|
||||
return nlp.tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def zh_tokenizer_pkuseg():
|
||||
pytest.importorskip("pkuseg")
|
||||
pytest.importorskip("pickle5")
|
||||
config = {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
"pkuseg_model": "default",
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
}
|
||||
},
|
||||
"initialize": {"tokenizer": {
|
||||
"pkuseg_model": "default",
|
||||
}
|
||||
},
|
||||
}
|
||||
nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
|
||||
nlp = get_lang_class("zh").from_config(config)
|
||||
nlp.initialize()
|
||||
return nlp.tokenizer
|
||||
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
cfg = {"model": DEFAULT_NER_MODEL}
|
||||
model = registry.resolve(cfg, validate=True)["model"]
|
||||
ner = EntityRecognizer(en_vocab, model, **config)
|
||||
ner.begin_training(lambda: [_ner_example(ner)])
|
||||
ner.initialize(lambda: [_ner_example(ner)])
|
||||
ner(doc)
|
||||
|
||||
doc.ents = [("ANIMAL", 3, 4)]
|
||||
|
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
|
|||
cfg = {"model": DEFAULT_NER_MODEL}
|
||||
model = registry.resolve(cfg, validate=True)["model"]
|
||||
ner = EntityRecognizer(en_vocab, model, **config)
|
||||
ner.begin_training(lambda: [_ner_example(ner)])
|
||||
ner.initialize(lambda: [_ner_example(ner)])
|
||||
ner(doc)
|
||||
orig_iobs = [t.ent_iob_ for t in doc]
|
||||
doc.ents = list(doc.ents)
|
||||
|
|
|
@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
|
|||
words = ["Eat", "blue", "ham"]
|
||||
morph = ["Feat=V", "Feat=J", "Feat=N"]
|
||||
doc = Doc(en_vocab, words=words, morphs=morph)
|
||||
assert morph[0] == doc[0].morph_
|
||||
assert morph[1] == doc[1].morph_
|
||||
assert morph[2] == doc[2].morph_
|
||||
assert morph[0] == str(doc[0].morph)
|
||||
assert morph[1] == str(doc[1].morph)
|
||||
assert morph[2] == str(doc[2].morph)
|
||||
|
||||
feats_array = doc.to_array((ORTH, MORPH))
|
||||
assert feats_array[0][1] == doc[0].morph.key
|
||||
|
|
|
@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
|
|||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
# heads override sent_starts
|
||||
doc = Doc(
|
||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
|
||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
|
||||
)
|
||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
|
||||
|
@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
|
|||
words = ["I", "live", "in", "New", "York", "."]
|
||||
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
|
||||
# fmt: on
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, morph in enumerate(morphs):
|
||||
doc[i].morph_ = morph
|
||||
doc = Doc(en_vocab, words=words, morphs=morphs)
|
||||
attrs = [MORPH]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
new_doc.from_array(attrs, arr)
|
||||
assert [t.morph_ for t in new_doc] == morphs
|
||||
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
|
||||
assert [str(t.morph) for t in new_doc] == morphs
|
||||
assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
|
||||
|
||||
|
||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||
|
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
|
|||
|
||||
doc[0].tag_ = "A"
|
||||
doc[0].pos_ = "X"
|
||||
doc[0].morph_ = "Feat=Val"
|
||||
doc[0].set_morph("Feat=Val")
|
||||
doc[0].lemma_ = "a"
|
||||
doc[0].dep_ = "dep"
|
||||
doc[0].head = doc[1]
|
||||
|
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
|
|||
|
||||
doc[1].tag_ = "A"
|
||||
doc[1].pos_ = "X"
|
||||
doc[1].morph_ = ""
|
||||
doc[1].set_morph("")
|
||||
doc[1].lemma_ = "a"
|
||||
doc[1].dep_ = "dep"
|
||||
doc.ents = [Span(doc, 0, 2, label="HELLO")]
|
||||
|
@ -533,5 +531,78 @@ def test_doc_ents_setter():
|
|||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||
vocab = Vocab()
|
||||
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
||||
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
|
||||
doc = Doc(vocab, words=words, ents=ents)
|
||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||
|
||||
|
||||
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
|
||||
doc1 = en_tokenizer("a b")
|
||||
doc1b = en_tokenizer("c d")
|
||||
doc2 = de_tokenizer("a b")
|
||||
|
||||
# unset values can be copied
|
||||
doc1[0].morph = doc1[1].morph
|
||||
assert doc1[0].morph.key == 0
|
||||
assert doc1[1].morph.key == 0
|
||||
|
||||
# morph values from the same vocab can be copied
|
||||
doc1[0].set_morph("Feat=Val")
|
||||
doc1[1].morph = doc1[0].morph
|
||||
assert doc1[0].morph == doc1[1].morph
|
||||
|
||||
# ... also across docs
|
||||
doc1b[0].morph = doc1[0].morph
|
||||
assert doc1[0].morph == doc1b[0].morph
|
||||
|
||||
doc2[0].set_morph("Feat2=Val2")
|
||||
|
||||
# the morph value must come from the same vocab
|
||||
with pytest.raises(ValueError):
|
||||
doc1[0].morph = doc2[0].morph
|
||||
|
||||
|
||||
def test_doc_init_iob():
|
||||
"""Test ents validation/normalization in Doc.__init__"""
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
ents = ["O"] * len(words)
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
assert doc.ents == ()
|
||||
|
||||
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
assert len(doc.ents) == 2
|
||||
|
||||
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
assert len(doc.ents) == 3
|
||||
|
||||
# None is missing
|
||||
ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
assert len(doc.ents) == 2
|
||||
|
||||
# empty tag is missing
|
||||
ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
assert len(doc.ents) == 2
|
||||
|
||||
# invalid IOB
|
||||
ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||
with pytest.raises(ValueError):
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
|
||||
# no dash
|
||||
ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
|
||||
with pytest.raises(ValueError):
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
|
||||
# no ent type
|
||||
ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
|
||||
with pytest.raises(ValueError):
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
|
||||
# not strings or None
|
||||
ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
|
||||
with pytest.raises(ValueError):
|
||||
doc = Doc(Vocab(), words=words, ents=ents)
|
||||
|
|
|
@ -4,13 +4,13 @@ import pytest
|
|||
@pytest.fixture
|
||||
def i_has(en_tokenizer):
|
||||
doc = en_tokenizer("I has")
|
||||
doc[0].morph_ = {"PronType": "prs"}
|
||||
doc[1].morph_ = {
|
||||
doc[0].set_morph({"PronType": "prs"})
|
||||
doc[1].set_morph({
|
||||
"VerbForm": "fin",
|
||||
"Tense": "pres",
|
||||
"Number": "sing",
|
||||
"Person": "three",
|
||||
}
|
||||
})
|
||||
|
||||
return doc
|
||||
|
||||
|
@ -47,20 +47,20 @@ def test_morph_get(i_has):
|
|||
def test_morph_set(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
# set by string
|
||||
i_has[0].morph_ = "PronType=unk"
|
||||
i_has[0].set_morph("PronType=unk")
|
||||
assert i_has[0].morph.get("PronType") == ["unk"]
|
||||
# set by string, fields are alphabetized
|
||||
i_has[0].morph_ = "PronType=123|NounType=unk"
|
||||
assert i_has[0].morph_ == "NounType=unk|PronType=123"
|
||||
i_has[0].set_morph("PronType=123|NounType=unk")
|
||||
assert str(i_has[0].morph) == "NounType=unk|PronType=123"
|
||||
# set by dict
|
||||
i_has[0].morph_ = {"AType": "123", "BType": "unk"}
|
||||
assert i_has[0].morph_ == "AType=123|BType=unk"
|
||||
i_has[0].set_morph({"AType": "123", "BType": "unk"})
|
||||
assert str(i_has[0].morph) == "AType=123|BType=unk"
|
||||
# set by string with multiple values, fields and values are alphabetized
|
||||
i_has[0].morph_ = "BType=c|AType=b,a"
|
||||
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
||||
i_has[0].set_morph("BType=c|AType=b,a")
|
||||
assert str(i_has[0].morph) == "AType=a,b|BType=c"
|
||||
# set by dict with multiple values, fields and values are alphabetized
|
||||
i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
|
||||
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
||||
i_has[0].set_morph({"AType": "b,a", "BType": "c"})
|
||||
assert str(i_has[0].morph) == "AType=a,b|BType=c"
|
||||
|
||||
|
||||
def test_morph_str(i_has):
|
||||
|
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
|
|||
doc = tokenizer("a dog")
|
||||
|
||||
# set through token.morph_
|
||||
doc[0].morph_ = "PronType=prs"
|
||||
assert doc[0].morph_ == "PronType=prs"
|
||||
doc[0].set_morph("PronType=prs")
|
||||
assert str(doc[0].morph) == "PronType=prs"
|
||||
assert doc.to_array(["MORPH"])[0] != 0
|
||||
|
||||
# unset with token.morph
|
||||
doc[0].morph = 0
|
||||
doc[0].set_morph(None)
|
||||
assert doc.to_array(["MORPH"])[0] == 0
|
||||
|
||||
# empty morph is equivalent to "_"
|
||||
doc[0].morph_ = ""
|
||||
assert doc[0].morph_ == ""
|
||||
doc[0].set_morph("")
|
||||
assert str(doc[0].morph) == ""
|
||||
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
||||
|
||||
# "_" morph is also equivalent to empty morph
|
||||
doc[0].morph_ = "_"
|
||||
assert doc[0].morph_ == ""
|
||||
doc[0].set_morph("_")
|
||||
assert str(doc[0].morph) == ""
|
||||
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
||||
|
||||
# set through existing hash with token.morph
|
||||
tokenizer.vocab.strings.add("Feat=Val")
|
||||
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
|
||||
assert doc[0].morph_ == "Feat=Val"
|
||||
doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
|
||||
assert str(doc[0].morph) == "Feat=Val"
|
||||
|
|
|
@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
|
|||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].text_with_ws == "the beach boys "
|
||||
assert doc[4].tag_ == "NAMED"
|
||||
assert doc[4].morph_ == "Number=Plur"
|
||||
assert str(doc[4].morph) == "Number=Plur"
|
||||
assert doc[5].text == "all night"
|
||||
assert doc[5].text_with_ws == "all night"
|
||||
assert doc[5].tag_ == "NAMED"
|
||||
assert doc[5].morph_ == "Number=Plur"
|
||||
assert str(doc[5].morph) == "Number=Plur"
|
||||
|
||||
|
||||
def test_doc_retokenize_merge_children(en_tokenizer):
|
||||
|
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
|||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
||||
ents = ["O"] * len(heads)
|
||||
ents[0] = "B-PERSON"
|
||||
ents[1] = "I-PERSON"
|
||||
ents[10] = "B-GPE"
|
||||
ents[13] = "B-PERSON"
|
||||
ents[14] = "I-PERSON"
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = Doc(
|
||||
|
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
# if there is a parse, span.root provides default values
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
||||
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
|
||||
ents = ["O"] * len(words)
|
||||
ents[3] = "B-ent-de"
|
||||
ents[4] = "I-ent-de"
|
||||
ents[5] = "B-ent-fg"
|
||||
ents[6] = "I-ent-fg"
|
||||
deps = ["dep"] * len(words)
|
||||
en_vocab.strings.add("ent-de")
|
||||
en_vocab.strings.add("ent-fg")
|
||||
|
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
# check that B is preserved if span[start] is B
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
||||
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
|
||||
ents = ["O"] * len(words)
|
||||
ents[3] = "B-ent-de"
|
||||
ents[4] = "I-ent-de"
|
||||
ents[5] = "B-ent-de"
|
||||
ents[6] = "I-ent-de"
|
||||
deps = ["dep"] * len(words)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
with doc.retokenize() as retokenizer:
|
||||
|
|
|
@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
|
|||
assert doc[0].text == "Los"
|
||||
assert doc[0].head.text == "Angeles"
|
||||
assert doc[0].idx == 0
|
||||
assert doc[0].morph_ == "Number=Sing"
|
||||
assert str(doc[0].morph) == "Number=Sing"
|
||||
assert doc[1].idx == 3
|
||||
assert doc[1].text == "Angeles"
|
||||
assert doc[1].head.text == "start"
|
||||
assert doc[1].morph_ == "Number=Sing"
|
||||
assert str(doc[1].morph) == "Number=Sing"
|
||||
assert doc[2].text == "start"
|
||||
assert doc[2].head.text == "."
|
||||
assert doc[3].text == "."
|
||||
|
|
|
@ -9,7 +9,7 @@ def doc(en_vocab):
|
|||
tags = ["VBP", "NN", "NN"]
|
||||
heads = [0, 0, 0]
|
||||
deps = ["ROOT", "dobj", "dobj"]
|
||||
ents = [("ORG", 1, 2)]
|
||||
ents = ["O", "B-ORG", "O"]
|
||||
return Doc(
|
||||
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
||||
)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
|
||||
doc = de_tokenizer("Er lag auf seinem")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
|
||||
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -7,8 +7,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||
doc = en_tokenizer("This is a sentence")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
|
||||
doc = es_tokenizer("en Oxford este verano")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
|
||||
|
||||
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
||||
with pytest.raises(ValueError):
|
||||
|
|
|
@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
|||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
|
||||
)
|
||||
@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
|
||||
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
||||
tokens = fr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
|
||||
doc = id_tokenizer("sebelas")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
|
||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
||||
)
|
||||
def test_ja_tokenizer_sub_tokens(
|
||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
|
||||
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
|
||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
|
||||
)
|
||||
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
||||
tokens = ne_tokenizer(text)
|
||||
|
|
|
@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
|
|||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
|
||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
|
||||
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
||||
],
|
||||
)
|
||||
|
|
|
@ -3,8 +3,7 @@ from spacy.tokens import Doc
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
|
||||
doc = sv_tokenizer("Studenten läste den bästa boken")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
|
|||
@registry.misc("lemmatizer_init_lookups")
|
||||
def lemmatizer_init_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
"""Test that languages can be initialized."""
|
||||
# Test that languages can be initialized
|
||||
nlp = get_lang_class(lang)()
|
||||
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
assert not lemmatizer.lookups.tables
|
||||
nlp.config["initialize"]["components"]["lemmatizer"] = {
|
||||
"lookups": {"@misc": "lemmatizer_init_lookups"}
|
||||
}
|
||||
with pytest.raises(ValueError):
|
||||
nlp("x")
|
||||
nlp.initialize()
|
||||
assert lemmatizer.lookups.tables
|
||||
doc = nlp("x")
|
||||
# Check for stray print statements (see #3342)
|
||||
doc = nlp("test") # noqa: F841
|
||||
captured = capfd.readouterr()
|
||||
assert not captured.out
|
||||
assert doc[0].lemma_ == "y"
|
||||
|
||||
# Test initialization by calling .initialize() directly
|
||||
nlp = get_lang_class(lang)()
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
lemmatizer.initialize(lookups=lemmatizer_init_lookups())
|
||||
assert nlp("x")[0].lemma_ == "y"
|
||||
|
|
|
@ -27,9 +27,18 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
|||
|
||||
@pytest.mark.slow
|
||||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||
nlp = Chinese(
|
||||
meta={
|
||||
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
|
||||
}
|
||||
)
|
||||
config = {
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.zh.ChineseTokenizer",
|
||||
"segmenter": "pkuseg",
|
||||
}
|
||||
},
|
||||
"initialize": {"tokenizer": {
|
||||
"pkuseg_model": "medicine",
|
||||
}
|
||||
},
|
||||
}
|
||||
nlp = Chinese.from_config(config)
|
||||
nlp.initialize()
|
||||
zh_tokenizer_serialize(nlp.tokenizer)
|
||||
|
|
|
@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
|
|||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc)) == 3
|
||||
doc[0].morph_ = "Feat=Val"
|
||||
doc[0].set_morph("Feat=Val")
|
||||
assert len(matcher(doc)) == 3
|
||||
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||
assert len(matcher(doc)) == 3
|
||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||
assert len(matcher(doc)) == 2
|
||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||
assert len(matcher(doc)) == 2
|
||||
|
||||
# IS_SUBSET acts like "IN" for attrs other than MORPH
|
||||
|
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
|
|||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc)) == 0
|
||||
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||
assert len(matcher(doc)) == 0
|
||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||
assert len(matcher(doc)) == 1
|
||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
# IS_SUPERSET with more than one value only matches for MORPH
|
||||
|
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
|
|||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
|
||||
doc[0].set_morph("Feat2=Val2|Feat1=Val1")
|
||||
assert len(matcher(doc)) == 2
|
||||
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
|
||||
doc[0].set_morph("Feat1=Val1|Feat2=Val2")
|
||||
assert len(matcher(doc)) == 2
|
||||
|
||||
# multiple values are split
|
||||
|
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
|
|||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
|
||||
doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
|
||||
assert len(matcher(doc)) == 1
|
||||
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
|
||||
doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
|
||||
assert len(matcher(doc)) == 2
|
||||
|
||||
|
||||
|
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
doc2 = Doc(en_vocab, words=["Test"])
|
||||
doc2[0].tag_ = "TAG"
|
||||
doc2[0].pos_ = "X"
|
||||
doc2[0].morph_ = "Feat=Val"
|
||||
doc2[0].set_morph("Feat=Val")
|
||||
doc2[0].lemma_ = "LEMMA"
|
||||
doc3 = Doc(en_vocab, words=["Test"])
|
||||
# DEP requires DEP
|
||||
|
|
|
@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
|
|||
doc2 = Doc(en_vocab, words=["Test"])
|
||||
doc2[0].tag_ = "TAG"
|
||||
doc2[0].pos_ = "X"
|
||||
doc2[0].morph_ = "Feat=Val"
|
||||
doc2[0].set_morph("Feat=Val")
|
||||
doc3 = Doc(en_vocab, words=["Test"])
|
||||
matcher = PhraseMatcher(en_vocab, validate=True)
|
||||
with pytest.warns(UserWarning):
|
||||
|
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
doc2 = Doc(en_vocab, words=["Test"])
|
||||
doc2[0].tag_ = "TAG"
|
||||
doc2[0].pos_ = "X"
|
||||
doc2[0].morph_ = "Feat=Val"
|
||||
doc2[0].set_morph("Feat=Val")
|
||||
doc2[0].lemma_ = "LEMMA"
|
||||
doc3 = Doc(en_vocab, words=["Test"])
|
||||
# DEP requires DEP
|
||||
|
|
|
@ -35,7 +35,7 @@ def test_init_parser(parser):
|
|||
def _train_parser(parser):
|
||||
fix_random_seed(1)
|
||||
parser.add_label("left")
|
||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
||||
parser.initialize(lambda: [_parser_example(parser)])
|
||||
sgd = Adam(0.001)
|
||||
|
||||
for i in range(5):
|
||||
|
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
|
|||
ner1.add_label("C")
|
||||
ner1.add_label("B")
|
||||
ner1.add_label("A")
|
||||
ner1.begin_training(lambda: [_ner_example(ner1)])
|
||||
ner1.initialize(lambda: [_ner_example(ner1)])
|
||||
ner2 = EntityRecognizer(Vocab(), model, **config)
|
||||
|
||||
# the second model needs to be resized before we can call from_bytes
|
||||
|
|
|
@ -202,7 +202,7 @@ def test_train_empty():
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
ner = nlp.add_pipe("ner", last=True)
|
||||
ner.add_label("PERSON")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
for itn in range(2):
|
||||
losses = {}
|
||||
batches = util.minibatch(train_examples, size=8)
|
||||
|
@ -213,7 +213,7 @@ def test_train_empty():
|
|||
def test_overwrite_token():
|
||||
nlp = English()
|
||||
nlp.add_pipe("ner")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
# The untrained NER will predict O for each token
|
||||
doc = nlp("I live in New York")
|
||||
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
||||
|
@ -235,7 +235,7 @@ def test_empty_ner():
|
|||
nlp = English()
|
||||
ner = nlp.add_pipe("ner")
|
||||
ner.add_label("MY_LABEL")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
doc = nlp("John is watching the news about Croatia's elections")
|
||||
# if this goes wrong, the initialization of the parser's upper layer is probably broken
|
||||
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
|
||||
|
@ -254,7 +254,7 @@ def test_ruler_before_ner():
|
|||
# 2: untrained NER - should set everything else to O
|
||||
untrained_ner = nlp.add_pipe("ner")
|
||||
untrained_ner.add_label("MY_LABEL")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||
expected_types = ["THING", "", "", "", "", "", ""]
|
||||
|
@ -269,7 +269,7 @@ def test_ner_before_ruler():
|
|||
# 1: untrained NER - should set everything to O
|
||||
untrained_ner = nlp.add_pipe("ner", name="uner")
|
||||
untrained_ner.add_label("MY_LABEL")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
|
||||
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
||||
patterns = [{"label": "THING", "pattern": "This"}]
|
||||
|
@ -290,7 +290,7 @@ def test_block_ner():
|
|||
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
|
||||
untrained_ner = nlp.add_pipe("ner")
|
||||
untrained_ner.add_label("MY_LABEL")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
||||
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
||||
expected_types = ["", "", "", "", "", "", "", ""]
|
||||
|
@ -307,7 +307,7 @@ def test_overfitting_IO():
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for ent in annotations.get("entities"):
|
||||
ner.add_label(ent[2])
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.initialize()
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
|
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
|
|||
assert not len(nlp.vocab.lookups)
|
||||
nlp.add_pipe("ner")
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert "W033" in caplog.text
|
||||
caplog.clear()
|
||||
nlp.vocab.lookups.add_table("lexeme_norm")
|
||||
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert "W033" not in caplog.text
|
||||
|
||||
|
||||
|
@ -358,5 +358,5 @@ class BlockerComponent1:
|
|||
self.name = name
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
|
||||
doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
|
||||
return doc
|
||||
|
|
|
@ -191,7 +191,7 @@ def test_overfitting_IO():
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for dep in annotations.get("deps", []):
|
||||
parser.add_label(dep)
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.initialize()
|
||||
for i in range(100):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
|
|
|
@ -34,7 +34,7 @@ def parser(vocab):
|
|||
parser.cfg["hidden_width"] = 32
|
||||
# parser.add_label('right')
|
||||
parser.add_label("left")
|
||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
||||
parser.initialize(lambda: [_parser_example(parser)])
|
||||
sgd = Adam(0.001)
|
||||
|
||||
for i in range(10):
|
||||
|
|
|
@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
|||
a.add(**p)
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
|
||||
|
@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
nlp.remove_pipe("attribute_ruler")
|
||||
|
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
)
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
|
||||
|
@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
|
|||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||
|
||||
dev_examples = [
|
||||
Example.from_dict(
|
||||
|
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
|
|||
for i in range(len(doc)):
|
||||
if i == 4:
|
||||
assert doc[i].pos_ == "PUNCT"
|
||||
assert doc[i].morph_ == "PunctType=peri"
|
||||
assert str(doc[i].morph) == "PunctType=peri"
|
||||
else:
|
||||
assert doc[i].pos_ == ""
|
||||
assert doc[i].morph_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
|
||||
|
||||
def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||
|
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
|
|||
for i in range(len(doc)):
|
||||
if i != 2:
|
||||
assert doc[i].pos_ == ""
|
||||
assert doc[i].morph_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
else:
|
||||
assert doc[2].pos_ == "DET"
|
||||
assert doc[2].lemma_ == "a"
|
||||
assert doc[2].morph_ == "Case=Nom"
|
||||
assert str(doc[2].morph) == "Case=Nom"
|
||||
|
||||
|
||||
def test_attributeruler_indices(nlp):
|
||||
|
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
|
|||
for i in range(len(doc)):
|
||||
if i == 1:
|
||||
assert doc[i].lemma_ == "was"
|
||||
assert doc[i].morph_ == "Case=Nom|Number=Sing"
|
||||
assert str(doc[i].morph) == "Case=Nom|Number=Sing"
|
||||
elif i == 2:
|
||||
assert doc[i].lemma_ == "the"
|
||||
assert doc[i].morph_ == "Case=Nom|Number=Plur"
|
||||
assert str(doc[i].morph) == "Case=Nom|Number=Plur"
|
||||
elif i == 3:
|
||||
assert doc[i].lemma_ == "cat"
|
||||
else:
|
||||
assert doc[i].morph_ == ""
|
||||
assert str(doc[i].morph) == ""
|
||||
# raises an error when trying to modify a token outside of the match
|
||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
|
||||
with pytest.raises(ValueError):
|
||||
|
|
|
@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
|
|||
"""Test that the EL can't train without defining a KB"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||
with pytest.raises(ValueError):
|
||||
entity_linker.begin_training(lambda: [])
|
||||
entity_linker.initialize(lambda: [])
|
||||
|
||||
|
||||
def test_kb_empty(nlp):
|
||||
|
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
|
|||
entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError):
|
||||
entity_linker.begin_training(lambda: [])
|
||||
entity_linker.initialize(lambda: [])
|
||||
|
||||
|
||||
def test_kb_serialize(nlp):
|
||||
|
@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||
|
||||
# adding aliases
|
||||
douglas_hash = mykb.add_alias(
|
||||
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
|
||||
)
|
||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
candidates = mykb.get_alias_candidates("adam")
|
||||
|
@ -360,7 +358,7 @@ def test_preserving_links_asdoc(nlp):
|
|||
ruler.add_patterns(patterns)
|
||||
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
||||
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert entity_linker.model.get_dim("nO") == vector_length
|
||||
|
||||
# test whether the entity links are preserved by the `as_doc()` function
|
||||
|
@ -463,7 +461,7 @@ def test_overfitting_IO():
|
|||
)
|
||||
|
||||
# train the NEL pipe
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert entity_linker.model.get_dim("nO") == vector_length
|
||||
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
|
||||
|
||||
|
|
69
spacy/tests/pipeline/test_initialize.py
Normal file
69
spacy/tests/pipeline/test_initialize.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
import pytest
|
||||
from spacy.language import Language
|
||||
from spacy.lang.en import English
|
||||
from spacy.training import Example
|
||||
from thinc.api import ConfigValidationError
|
||||
from pydantic import StrictBool
|
||||
|
||||
|
||||
def test_initialize_arguments():
|
||||
name = "test_initialize_arguments"
|
||||
|
||||
class CustomTokenizer:
|
||||
def __init__(self, tokenizer):
|
||||
self.tokenizer = tokenizer
|
||||
self.from_initialize = None
|
||||
|
||||
def __call__(self, text):
|
||||
return self.tokenizer(text)
|
||||
|
||||
def initialize(self, get_examples, nlp, custom: int):
|
||||
self.from_initialize = custom
|
||||
|
||||
class Component:
|
||||
def __init__(self):
|
||||
self.from_initialize = None
|
||||
|
||||
def initialize(
|
||||
self, get_examples, nlp, custom1: str, custom2: StrictBool = False
|
||||
):
|
||||
self.from_initialize = (custom1, custom2)
|
||||
|
||||
Language.factory(name, func=lambda nlp, name: Component())
|
||||
|
||||
nlp = English()
|
||||
nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
|
||||
example = Example.from_dict(nlp("x"), {})
|
||||
get_examples = lambda: [example]
|
||||
nlp.add_pipe(name)
|
||||
# The settings here will typically come from the [initialize] block
|
||||
init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
|
||||
nlp.config["initialize"].update(init_cfg)
|
||||
with pytest.raises(ConfigValidationError) as e:
|
||||
# Empty config for component, no required custom1 argument
|
||||
nlp.initialize(get_examples)
|
||||
errors = e.value.errors
|
||||
assert len(errors) == 1
|
||||
assert errors[0]["loc"] == ("custom1",)
|
||||
assert errors[0]["type"] == "value_error.missing"
|
||||
init_cfg = {
|
||||
"tokenizer": {"custom": 1},
|
||||
"components": {name: {"custom1": "x", "custom2": 1}},
|
||||
}
|
||||
nlp.config["initialize"].update(init_cfg)
|
||||
with pytest.raises(ConfigValidationError) as e:
|
||||
# Wrong type of custom 2
|
||||
nlp.initialize(get_examples)
|
||||
errors = e.value.errors
|
||||
assert len(errors) == 1
|
||||
assert errors[0]["loc"] == ("custom2",)
|
||||
assert errors[0]["type"] == "value_error.strictbool"
|
||||
init_cfg = {
|
||||
"tokenizer": {"custom": 1},
|
||||
"components": {name: {"custom1": "x"}},
|
||||
}
|
||||
nlp.config["initialize"].update(init_cfg)
|
||||
nlp.initialize(get_examples)
|
||||
assert nlp.tokenizer.from_initialize == 1
|
||||
pipe = nlp.get_pipe(name)
|
||||
assert pipe.from_initialize == ("x", False)
|
|
@ -8,61 +8,52 @@ from ..util import make_tempdir
|
|||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
return English()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer(nlp):
|
||||
@registry.misc("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
lemmatizer = nlp.add_pipe(
|
||||
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
|
||||
)
|
||||
return lemmatizer
|
||||
nlp = English()
|
||||
nlp.config["initialize"]["components"]["lemmatizer"] = {
|
||||
"lookups": {"@misc": "cope_lookups"}
|
||||
}
|
||||
return nlp
|
||||
|
||||
|
||||
def test_lemmatizer_init(nlp):
|
||||
@registry.misc("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
lemmatizer = nlp.add_pipe(
|
||||
"lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
|
||||
)
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
assert isinstance(lemmatizer.lookups, Lookups)
|
||||
assert not lemmatizer.lookups.tables
|
||||
assert lemmatizer.mode == "lookup"
|
||||
with pytest.raises(ValueError):
|
||||
nlp("test")
|
||||
nlp.initialize()
|
||||
assert lemmatizer.lookups.tables
|
||||
assert nlp("cope")[0].lemma_ == "cope"
|
||||
assert nlp("coped")[0].lemma_ == "cope"
|
||||
# replace any tables from spacy-lookups-data
|
||||
lemmatizer.lookups = Lookups()
|
||||
doc = nlp("coping")
|
||||
# lookup with no tables sets text as lemma
|
||||
assert doc[0].lemma_ == "coping"
|
||||
|
||||
assert nlp("cope")[0].lemma_ == "cope"
|
||||
assert nlp("coped")[0].lemma_ == "coped"
|
||||
nlp.remove_pipe("lemmatizer")
|
||||
|
||||
@registry.misc("empty_lookups")
|
||||
def empty_lookups():
|
||||
return Lookups()
|
||||
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(
|
||||
"lemmatizer",
|
||||
config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
|
||||
)
|
||||
# Can't initialize without required tables
|
||||
lemmatizer.initialize(lookups=Lookups())
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {})
|
||||
lemmatizer.initialize(lookups=lookups)
|
||||
|
||||
|
||||
def test_lemmatizer_config(nlp, lemmatizer):
|
||||
def test_lemmatizer_config(nlp):
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
nlp.initialize()
|
||||
|
||||
doc = nlp.make_doc("coping")
|
||||
doc[0].pos_ = "VERB"
|
||||
assert doc[0].lemma_ == ""
|
||||
|
@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
|
|||
assert doc[0].lemma_ == "cope"
|
||||
|
||||
|
||||
def test_lemmatizer_serialize(nlp, lemmatizer):
|
||||
@registry.misc("cope_lookups")
|
||||
def test_lemmatizer_serialize(nlp):
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
nlp.initialize()
|
||||
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
nlp2 = English()
|
||||
lemmatizer2 = nlp2.add_pipe(
|
||||
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
|
||||
)
|
||||
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
lemmatizer2.initialize(lookups=cope_lookups())
|
||||
lemmatizer2.from_bytes(lemmatizer.to_bytes())
|
||||
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
|
||||
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
|
||||
|
@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
|
|||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2.make_doc("coping")
|
||||
doc2[0].pos_ = "VERB"
|
||||
assert doc2[0].lemma_ == ""
|
||||
doc2 = lemmatizer(doc2)
|
||||
assert doc2[0].text == "coping"
|
||||
assert doc2[0].lemma_ == "cope"
|
||||
doc2 = nlp2.make_doc("coping")
|
||||
doc2[0].pos_ = "VERB"
|
||||
assert doc2[0].lemma_ == ""
|
||||
doc2 = lemmatizer(doc2)
|
||||
assert doc2[0].text == "coping"
|
||||
assert doc2[0].lemma_ == "cope"
|
||||
|
|
|
@ -33,7 +33,7 @@ def test_no_label():
|
|||
nlp = Language()
|
||||
nlp.add_pipe("morphologizer")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_implicit_label():
|
||||
|
@ -42,7 +42,7 @@ def test_implicit_label():
|
|||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
|
@ -50,13 +50,13 @@ def test_no_resize():
|
|||
morphologizer = nlp.add_pipe("morphologizer")
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
# this throws an error because the morphologizer can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
def test_initialize_examples():
|
||||
nlp = Language()
|
||||
morphologizer = nlp.add_pipe("morphologizer")
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
||||
|
@ -64,12 +64,12 @@ def test_begin_training_examples():
|
|||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
nlp.initialize()
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
nlp.initialize(get_examples=lambda: None)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
|
@ -79,7 +79,7 @@ def test_overfitting_IO():
|
|||
train_examples = []
|
||||
for inst in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
|
@ -91,7 +91,7 @@ def test_overfitting_IO():
|
|||
doc = nlp(test_text)
|
||||
gold_morphs = ["Feat=N", "Feat=V", "", ""]
|
||||
gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
|
||||
assert [t.morph_ for t in doc] == gold_morphs
|
||||
assert [str(t.morph) for t in doc] == gold_morphs
|
||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
|
@ -99,5 +99,5 @@ def test_overfitting_IO():
|
|||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
assert [t.morph_ for t in doc2] == gold_morphs
|
||||
assert [str(t.morph) for t in doc2] == gold_morphs
|
||||
assert [t.pos_ for t in doc2] == gold_pos_tags
|
||||
|
|
|
@ -31,19 +31,19 @@ TRAIN_DATA = [
|
|||
]
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
def test_initialize_examples():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("senter")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
nlp.initialize()
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
nlp.initialize(get_examples=lambda: None)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
|
@ -58,7 +58,7 @@ def test_overfitting_IO():
|
|||
train_examples[1].reference[11].is_sent_start = False
|
||||
|
||||
nlp.add_pipe("senter")
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.initialize()
|
||||
|
||||
for i in range(200):
|
||||
losses = {}
|
||||
|
|
|
@ -15,14 +15,14 @@ def test_label_types():
|
|||
tagger.add_label(9)
|
||||
|
||||
|
||||
def test_tagger_begin_training_tag_map():
|
||||
"""Test that Tagger.begin_training() without gold tuples does not clobber
|
||||
def test_tagger_initialize_tag_map():
|
||||
"""Test that Tagger.initialize() without gold tuples does not clobber
|
||||
the tag map."""
|
||||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
orig_tag_count = len(tagger.labels)
|
||||
tagger.add_label("A")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
|
||||
|
||||
|
||||
|
@ -38,7 +38,7 @@ def test_no_label():
|
|||
nlp = Language()
|
||||
nlp.add_pipe("tagger")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
|
@ -47,7 +47,7 @@ def test_no_resize():
|
|||
tagger.add_label("N")
|
||||
tagger.add_label("V")
|
||||
assert tagger.labels == ("N", "V")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert tagger.model.get_dim("nO") == 2
|
||||
# this throws an error because the tagger can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -60,10 +60,10 @@ def test_implicit_label():
|
|||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
def test_initialize_examples():
|
||||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
train_examples = []
|
||||
|
@ -72,16 +72,16 @@ def test_begin_training_examples():
|
|||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: train_examples[0])
|
||||
nlp.initialize()
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=lambda: [])
|
||||
nlp.initialize(get_examples=lambda: None)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.initialize(get_examples=lambda: train_examples[0])
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
nlp.initialize(get_examples=lambda: [])
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
|
@ -91,7 +91,7 @@ def test_overfitting_IO():
|
|||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert tagger.model.get_dim("nO") == len(TAGS)
|
||||
|
||||
for i in range(50):
|
||||
|
@ -122,4 +122,4 @@ def test_tagger_requires_labels():
|
|||
nlp = English()
|
||||
nlp.add_pipe("tagger")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
|
|
|
@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
|
|||
from spacy.tokens import Doc
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
from spacy.scorer import Scorer
|
||||
from spacy.training import Example
|
||||
from spacy.training.initialize import verify_textcat_config
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...cli.train import verify_textcat_config
|
||||
from ...training import Example
|
||||
|
||||
|
||||
TRAIN_DATA = [
|
||||
|
@ -26,7 +26,7 @@ def test_simple_train():
|
|||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
textcat.add_label("answer")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
for i in range(5):
|
||||
for text, answer in [
|
||||
("aaaa", 1.0),
|
||||
|
@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
|
|||
textcat = TextCategorizer(nlp.vocab, width=8)
|
||||
for letter in letters:
|
||||
textcat.add_label(letter)
|
||||
optimizer = textcat.begin_training(lambda: [])
|
||||
optimizer = textcat.initialize(lambda: [])
|
||||
for i in range(30):
|
||||
losses = {}
|
||||
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
|
||||
|
@ -86,7 +86,7 @@ def test_no_label():
|
|||
nlp = Language()
|
||||
nlp.add_pipe("textcat")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_implicit_label():
|
||||
|
@ -95,7 +95,7 @@ def test_implicit_label():
|
|||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
|
@ -103,14 +103,14 @@ def test_no_resize():
|
|||
textcat = nlp.add_pipe("textcat")
|
||||
textcat.add_label("POSITIVE")
|
||||
textcat.add_label("NEGATIVE")
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
# this throws an error because the textcat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
textcat.add_label("NEUTRAL")
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
def test_initialize_examples():
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
train_examples = []
|
||||
|
@ -119,12 +119,12 @@ def test_begin_training_examples():
|
|||
for label, value in annotations.get("cats").items():
|
||||
textcat.add_label(label)
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
nlp.initialize()
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
nlp.initialize(get_examples=lambda: None)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
|
@ -139,7 +139,7 @@ def test_overfitting_IO():
|
|||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
|
||||
for i in range(50):
|
||||
|
@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for label, value in annotations.get("cats").items():
|
||||
textcat.add_label(label)
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.initialize()
|
||||
for i in range(5):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
|
@ -226,6 +226,7 @@ def test_positive_class_not_binary():
|
|||
with pytest.raises(ValueError):
|
||||
verify_textcat_config(nlp, pipe_config)
|
||||
|
||||
|
||||
def test_textcat_evaluation():
|
||||
train_examples = []
|
||||
nlp = English()
|
||||
|
@ -241,15 +242,17 @@ def test_textcat_evaluation():
|
|||
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
|
||||
train_examples.append(Example(pred2, ref2))
|
||||
|
||||
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
|
||||
assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
|
||||
assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
|
||||
scores = Scorer().score_cats(
|
||||
train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
|
||||
)
|
||||
assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
|
||||
assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
|
||||
assert scores["cats_f_per_type"]["summer"]["p"] == 0
|
||||
assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
|
||||
assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
|
||||
assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
|
||||
assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
|
||||
assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
|
||||
assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
|
||||
assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
|
||||
assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
|
||||
assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
|
||||
assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2
|
||||
|
||||
assert scores["cats_micro_p"] == 4/5
|
||||
assert scores["cats_micro_r"] == 4/6
|
||||
assert scores["cats_micro_p"] == 4 / 5
|
||||
assert scores["cats_micro_r"] == 4 / 6
|
||||
|
|
|
@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
|
|||
encode_config["width"] = width
|
||||
docs = get_batch(3)
|
||||
tok2vec = build_Tok2Vec_model(
|
||||
embed_arch(**embed_config),
|
||||
encode_arch(**encode_config)
|
||||
embed_arch(**embed_config), encode_arch(**encode_config)
|
||||
)
|
||||
tok2vec.initialize(docs)
|
||||
vectors, backprop = tok2vec.begin_update(docs)
|
||||
|
@ -88,7 +87,7 @@ def test_init_tok2vec():
|
|||
nlp = English()
|
||||
tok2vec = nlp.add_pipe("tok2vec")
|
||||
assert tok2vec.listeners == []
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
assert tok2vec.model.get_dim("nO")
|
||||
|
||||
|
||||
|
@ -154,7 +153,7 @@ def test_tok2vec_listener():
|
|||
|
||||
# Check that the Tok2Vec component finds it listeners
|
||||
assert tok2vec.listeners == []
|
||||
optimizer = nlp.begin_training(lambda: train_examples)
|
||||
optimizer = nlp.initialize(lambda: train_examples)
|
||||
assert tok2vec.listeners == [tagger_tok2vec]
|
||||
|
||||
for i in range(5):
|
||||
|
|
|
@ -428,7 +428,7 @@ def test_issue999():
|
|||
for _, offsets in TRAIN_DATA:
|
||||
for start, end, label in offsets:
|
||||
ner.add_label(label)
|
||||
nlp.begin_training()
|
||||
nlp.initialize()
|
||||
for itn in range(20):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
for raw_text, entity_offsets in TRAIN_DATA:
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user