mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
Merge from develop
This commit is contained in:
commit
706e652820
|
@ -9,6 +9,7 @@ max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
# Data augmentation
|
# Data augmentation
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
|
noise_level = 0.0
|
||||||
dropout = 0.1
|
dropout = 0.1
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 1600
|
||||||
|
@ -24,8 +25,8 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||||
# These settings are invalid for the transformer models.
|
# These settings are invalid for the transformer models.
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
vectors = null
|
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
|
omit_extra_lookups = false
|
||||||
|
|
||||||
[training.batch_size]
|
[training.batch_size]
|
||||||
@schedules = "compounding.v1"
|
@schedules = "compounding.v1"
|
||||||
|
@ -52,7 +53,7 @@ learn_rate = 0.001
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "en"
|
lang = "en"
|
||||||
vectors = ${training:vectors}
|
vectors = null
|
||||||
|
|
||||||
[nlp.pipeline.tok2vec]
|
[nlp.pipeline.tok2vec]
|
||||||
factory = "tok2vec"
|
factory = "tok2vec"
|
||||||
|
@ -62,12 +63,20 @@ factory = "senter"
|
||||||
|
|
||||||
[nlp.pipeline.ner]
|
[nlp.pipeline.ner]
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 1
|
||||||
|
beam_width = 1
|
||||||
|
beam_update_prob = 1.0
|
||||||
|
|
||||||
[nlp.pipeline.tagger]
|
[nlp.pipeline.tagger]
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
|
||||||
[nlp.pipeline.parser]
|
[nlp.pipeline.parser]
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 1
|
||||||
|
beam_width = 1
|
||||||
|
beam_update_prob = 1.0
|
||||||
|
|
||||||
[nlp.pipeline.senter.model]
|
[nlp.pipeline.senter.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
|
@ -9,6 +9,7 @@ max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
# Data augmentation
|
# Data augmentation
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
|
noise_level = 0.0
|
||||||
dropout = 0.1
|
dropout = 0.1
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 1600
|
||||||
|
@ -24,7 +25,6 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||||
# These settings are invalid for the transformer models.
|
# These settings are invalid for the transformer models.
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
vectors = null
|
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
|
|
||||||
[training.batch_size]
|
[training.batch_size]
|
||||||
|
@ -72,7 +72,7 @@ normalize = true
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "en"
|
lang = "en"
|
||||||
vectors = ${training:vectors}
|
vectors = null
|
||||||
|
|
||||||
[nlp.pipeline.tok2vec]
|
[nlp.pipeline.tok2vec]
|
||||||
factory = "tok2vec"
|
factory = "tok2vec"
|
||||||
|
@ -82,12 +82,20 @@ factory = "senter"
|
||||||
|
|
||||||
[nlp.pipeline.ner]
|
[nlp.pipeline.ner]
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 1
|
||||||
|
beam_width = 1
|
||||||
|
beam_update_prob = 1.0
|
||||||
|
|
||||||
[nlp.pipeline.tagger]
|
[nlp.pipeline.tagger]
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
|
||||||
[nlp.pipeline.parser]
|
[nlp.pipeline.parser]
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 1
|
||||||
|
beam_width = 1
|
||||||
|
beam_update_prob = 1.0
|
||||||
|
|
||||||
[nlp.pipeline.senter.model]
|
[nlp.pipeline.senter.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
|
@ -6,6 +6,7 @@ init_tok2vec = null
|
||||||
vectors = null
|
vectors = null
|
||||||
max_epochs = 100
|
max_epochs = 100
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
|
noise_level = 0.0
|
||||||
gold_preproc = true
|
gold_preproc = true
|
||||||
max_length = 0
|
max_length = 0
|
||||||
use_gpu = 0
|
use_gpu = 0
|
||||||
|
@ -40,6 +41,10 @@ factory = "tagger"
|
||||||
|
|
||||||
[nlp.pipeline.parser]
|
[nlp.pipeline.parser]
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 1
|
||||||
|
beam_width = 1
|
||||||
|
beam_update_prob = 1.0
|
||||||
|
|
||||||
[nlp.pipeline.tagger.model]
|
[nlp.pipeline.tagger.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
|
@ -6,6 +6,7 @@ init_tok2vec = null
|
||||||
vectors = null
|
vectors = null
|
||||||
max_epochs = 100
|
max_epochs = 100
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
|
noise_level = 0.0
|
||||||
gold_preproc = true
|
gold_preproc = true
|
||||||
max_length = 0
|
max_length = 0
|
||||||
use_gpu = -1
|
use_gpu = -1
|
||||||
|
@ -40,6 +41,10 @@ factory = "tagger"
|
||||||
|
|
||||||
[nlp.pipeline.parser]
|
[nlp.pipeline.parser]
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 1
|
||||||
|
beam_width = 1
|
||||||
|
beam_update_prob = 1.0
|
||||||
|
|
||||||
[nlp.pipeline.tagger.model]
|
[nlp.pipeline.tagger.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
|
@ -120,13 +120,22 @@ def load_data(dataset, threshold, limit=0, split=0.8):
|
||||||
random.shuffle(train_data)
|
random.shuffle(train_data)
|
||||||
texts, labels = zip(*train_data)
|
texts, labels = zip(*train_data)
|
||||||
|
|
||||||
unique_labels = sorted(set([l for label_set in labels for l in label_set]))
|
unique_labels = set()
|
||||||
|
for label_set in labels:
|
||||||
|
if isinstance(label_set, int) or isinstance(label_set, str):
|
||||||
|
unique_labels.add(label_set)
|
||||||
|
elif isinstance(label_set, list) or isinstance(label_set, set):
|
||||||
|
unique_labels.update(label_set)
|
||||||
|
unique_labels = sorted(unique_labels)
|
||||||
print(f"# of unique_labels: {len(unique_labels)}")
|
print(f"# of unique_labels: {len(unique_labels)}")
|
||||||
|
|
||||||
count_values_train = dict()
|
count_values_train = dict()
|
||||||
for text, annot_list in train_data:
|
for text, annot_list in train_data:
|
||||||
for annot in annot_list:
|
if isinstance(annot_list, int) or isinstance(annot_list, str):
|
||||||
count_values_train[annot] = count_values_train.get(annot, 0) + 1
|
count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
|
||||||
|
else:
|
||||||
|
for annot in annot_list:
|
||||||
|
count_values_train[annot] = count_values_train.get(annot, 0) + 1
|
||||||
for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
|
for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
|
||||||
if count < threshold:
|
if count < threshold:
|
||||||
unique_labels.remove(value)
|
unique_labels.remove(value)
|
||||||
|
@ -138,7 +147,7 @@ def load_data(dataset, threshold, limit=0, split=0.8):
|
||||||
else:
|
else:
|
||||||
cats = []
|
cats = []
|
||||||
for y in labels:
|
for y in labels:
|
||||||
if isinstance(y, str):
|
if isinstance(y, str) or isinstance(y, int):
|
||||||
cats.append({str(label): (label == y) for label in unique_labels})
|
cats.append({str(label): (label == y) for label in unique_labels})
|
||||||
elif isinstance(y, set):
|
elif isinstance(y, set):
|
||||||
cats.append({str(label): (label in y) for label in unique_labels})
|
cats.append({str(label): (label in y) for label in unique_labels})
|
||||||
|
|
|
@ -54,7 +54,8 @@ def evaluate(
|
||||||
"NER P": f"{scorer.ents_p:.2f}",
|
"NER P": f"{scorer.ents_p:.2f}",
|
||||||
"NER R": f"{scorer.ents_r:.2f}",
|
"NER R": f"{scorer.ents_r:.2f}",
|
||||||
"NER F": f"{scorer.ents_f:.2f}",
|
"NER F": f"{scorer.ents_f:.2f}",
|
||||||
"Textcat": f"{scorer.textcat_score:.2f}",
|
"Textcat AUC": f"{scorer.textcat_auc:.2f}",
|
||||||
|
"Textcat F": f"{scorer.textcat_f:.2f}",
|
||||||
"Sent P": f"{scorer.sent_p:.2f}",
|
"Sent P": f"{scorer.sent_p:.2f}",
|
||||||
"Sent R": f"{scorer.sent_r:.2f}",
|
"Sent R": f"{scorer.sent_r:.2f}",
|
||||||
"Sent F": f"{scorer.sent_f:.2f}",
|
"Sent F": f"{scorer.sent_f:.2f}",
|
||||||
|
|
|
@ -266,17 +266,15 @@ def create_pretraining_model(nlp, tok2vec):
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||||
Each array in the output needs to have one row per token in the doc.
|
Each array in the output needs to have one row per token in the doc.
|
||||||
|
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||||
|
serialized to file and read back in when calling the 'train' command.
|
||||||
"""
|
"""
|
||||||
output_size = nlp.vocab.vectors.data.shape[1]
|
output_size = nlp.vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
|
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
|
||||||
)
|
)
|
||||||
# This is annoying, but the parser etc have the flatten step after
|
model = chain(tok2vec, list2array())
|
||||||
# the tok2vec. To load the weights in cleanly, we need to match
|
model = chain(model, output_layer)
|
||||||
# the shape of the models' components exactly. So what we call
|
|
||||||
# "tok2vec" has to be the same set of processes as what the components do.
|
|
||||||
tok2vec = chain(tok2vec, list2array())
|
|
||||||
model = chain(tok2vec, output_layer)
|
|
||||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
mlm_model = build_masked_language_model(nlp.vocab, model)
|
mlm_model = build_masked_language_model(nlp.vocab, model)
|
||||||
mlm_model.set_ref("tok2vec", tok2vec)
|
mlm_model.set_ref("tok2vec", tok2vec)
|
||||||
|
|
|
@ -1,773 +0,0 @@
|
||||||
import os
|
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
|
||||||
from thinc.api import use_ops
|
|
||||||
from timeit import default_timer as timer
|
|
||||||
import shutil
|
|
||||||
import srsly
|
|
||||||
from wasabi import msg
|
|
||||||
import contextlib
|
|
||||||
import random
|
|
||||||
|
|
||||||
from ..util import create_default_optimizer
|
|
||||||
from ..util import use_gpu as set_gpu
|
|
||||||
from ..gold import GoldCorpus
|
|
||||||
from ..lookups import Lookups
|
|
||||||
from .. import util
|
|
||||||
from .. import about
|
|
||||||
|
|
||||||
|
|
||||||
def train(
|
|
||||||
# fmt: off
|
|
||||||
lang: ("Model language", "positional", None, str),
|
|
||||||
output_path: ("Output directory to store model in", "positional", None, Path),
|
|
||||||
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
|
|
||||||
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
|
|
||||||
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
|
|
||||||
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
|
|
||||||
pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
|
|
||||||
vectors: ("Model to load vectors from", "option", "v", str) = None,
|
|
||||||
replace_components: ("Replace components from base model", "flag", "R", bool) = False,
|
|
||||||
n_iter: ("Number of iterations", "option", "n", int) = 30,
|
|
||||||
n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
|
|
||||||
n_examples: ("Number of examples", "option", "ns", int) = 0,
|
|
||||||
use_gpu: ("Use GPU", "option", "g", int) = -1,
|
|
||||||
version: ("Model version", "option", "V", str) = "0.0.0",
|
|
||||||
meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
|
|
||||||
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
|
|
||||||
parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
|
|
||||||
entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
|
|
||||||
noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
|
|
||||||
orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
|
|
||||||
eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
|
|
||||||
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
|
|
||||||
learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
|
|
||||||
textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
|
|
||||||
textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
|
|
||||||
textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
|
|
||||||
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
|
|
||||||
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
|
|
||||||
verbose: ("Display more information for debug", "flag", "VV", bool) = False,
|
|
||||||
debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
|
||||||
JSON format. To convert data from other formats, use the `spacy convert`
|
|
||||||
command.
|
|
||||||
"""
|
|
||||||
util.fix_random_seed()
|
|
||||||
util.set_env_log(verbose)
|
|
||||||
|
|
||||||
# Make sure all files and paths exist if they are needed
|
|
||||||
train_path = util.ensure_path(train_path)
|
|
||||||
dev_path = util.ensure_path(dev_path)
|
|
||||||
meta_path = util.ensure_path(meta_path)
|
|
||||||
output_path = util.ensure_path(output_path)
|
|
||||||
if raw_text is not None:
|
|
||||||
raw_text = list(srsly.read_jsonl(raw_text))
|
|
||||||
if not train_path or not train_path.exists():
|
|
||||||
msg.fail("Training data not found", train_path, exits=1)
|
|
||||||
if not dev_path or not dev_path.exists():
|
|
||||||
msg.fail("Development data not found", dev_path, exits=1)
|
|
||||||
if meta_path is not None and not meta_path.exists():
|
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
|
||||||
meta = srsly.read_json(meta_path) if meta_path else {}
|
|
||||||
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
|
||||||
msg.warn(
|
|
||||||
"Output directory is not empty",
|
|
||||||
"This can lead to unintended side effects when saving the model. "
|
|
||||||
"Please use an empty directory or a different path instead. If "
|
|
||||||
"the specified output path doesn't exist, the directory will be "
|
|
||||||
"created for you.",
|
|
||||||
)
|
|
||||||
if not output_path.exists():
|
|
||||||
output_path.mkdir()
|
|
||||||
msg.good(f"Created output directory: {output_path}")
|
|
||||||
|
|
||||||
tag_map = {}
|
|
||||||
if tag_map_path is not None:
|
|
||||||
tag_map = srsly.read_json(tag_map_path)
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
|
||||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
|
||||||
# at the beginning of training.
|
|
||||||
dropout_rates = util.decaying(
|
|
||||||
util.env_opt("dropout_from", 0.2),
|
|
||||||
util.env_opt("dropout_to", 0.2),
|
|
||||||
util.env_opt("dropout_decay", 0.0),
|
|
||||||
)
|
|
||||||
batch_sizes = util.compounding(
|
|
||||||
util.env_opt("batch_from", 100.0),
|
|
||||||
util.env_opt("batch_to", 1000.0),
|
|
||||||
util.env_opt("batch_compound", 1.001),
|
|
||||||
)
|
|
||||||
|
|
||||||
if not eval_beam_widths:
|
|
||||||
eval_beam_widths = [1]
|
|
||||||
else:
|
|
||||||
eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
|
|
||||||
if 1 not in eval_beam_widths:
|
|
||||||
eval_beam_widths.append(1)
|
|
||||||
eval_beam_widths.sort()
|
|
||||||
has_beam_widths = eval_beam_widths != [1]
|
|
||||||
|
|
||||||
default_dir = Path(__file__).parent.parent / "pipeline" / "defaults"
|
|
||||||
|
|
||||||
# Set up the base model and pipeline. If a base model is specified, load
|
|
||||||
# the model and make sure the pipeline matches the pipeline setting. If
|
|
||||||
# training starts from a blank model, initialize the language class.
|
|
||||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
|
||||||
msg.text(f"Training pipeline: {pipeline}")
|
|
||||||
disabled_pipes = None
|
|
||||||
pipes_added = False
|
|
||||||
if use_gpu >= 0:
|
|
||||||
activated_gpu = None
|
|
||||||
try:
|
|
||||||
activated_gpu = set_gpu(use_gpu)
|
|
||||||
except Exception as e:
|
|
||||||
msg.warn(f"Exception: {e}")
|
|
||||||
if activated_gpu is not None:
|
|
||||||
msg.text(f"Using GPU: {use_gpu}")
|
|
||||||
else:
|
|
||||||
msg.warn(f"Unable to activate GPU: {use_gpu}")
|
|
||||||
msg.text("Using CPU only")
|
|
||||||
use_gpu = -1
|
|
||||||
if base_model:
|
|
||||||
msg.text(f"Starting with base model '{base_model}'")
|
|
||||||
nlp = util.load_model(base_model)
|
|
||||||
if nlp.lang != lang:
|
|
||||||
msg.fail(
|
|
||||||
f"Model language ('{nlp.lang}') doesn't match language "
|
|
||||||
f"specified as `lang` argument ('{lang}') ",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if vectors:
|
|
||||||
msg.text(f"Loading vectors from model '{vectors}'")
|
|
||||||
_load_vectors(nlp, vectors)
|
|
||||||
|
|
||||||
nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline])
|
|
||||||
for pipe in pipeline:
|
|
||||||
# first, create the model.
|
|
||||||
# Bit of a hack after the refactor to get the vectors into a default config
|
|
||||||
# use train-from-config instead :-)
|
|
||||||
if pipe == "parser":
|
|
||||||
config_loc = default_dir / "parser_defaults.cfg"
|
|
||||||
elif pipe == "tagger":
|
|
||||||
config_loc = default_dir / "tagger_defaults.cfg"
|
|
||||||
elif pipe == "ner":
|
|
||||||
config_loc = default_dir / "ner_defaults.cfg"
|
|
||||||
elif pipe == "textcat":
|
|
||||||
config_loc = default_dir / "textcat_defaults.cfg"
|
|
||||||
elif pipe == "senter":
|
|
||||||
config_loc = default_dir / "senter_defaults.cfg"
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Component {pipe} currently not supported.")
|
|
||||||
pipe_cfg = util.load_config(config_loc, create_objects=False)
|
|
||||||
if vectors:
|
|
||||||
pretrained_config = {
|
|
||||||
"@architectures": "spacy.VocabVectors.v1",
|
|
||||||
"name": vectors,
|
|
||||||
}
|
|
||||||
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
|
|
||||||
|
|
||||||
if pipe == "parser":
|
|
||||||
pipe_cfg["learn_tokens"] = learn_tokens
|
|
||||||
elif pipe == "textcat":
|
|
||||||
pipe_cfg["exclusive_classes"] = not textcat_multilabel
|
|
||||||
pipe_cfg["architecture"] = textcat_arch
|
|
||||||
pipe_cfg["positive_label"] = textcat_positive_label
|
|
||||||
|
|
||||||
if pipe not in nlp.pipe_names:
|
|
||||||
msg.text(f"Adding component to base model '{pipe}'")
|
|
||||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
|
||||||
pipes_added = True
|
|
||||||
elif replace_components:
|
|
||||||
msg.text(f"Replacing component from base model '{pipe}'")
|
|
||||||
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
|
|
||||||
pipes_added = True
|
|
||||||
else:
|
|
||||||
if pipe == "textcat":
|
|
||||||
textcat_cfg = nlp.get_pipe("textcat").cfg
|
|
||||||
base_cfg = {
|
|
||||||
"exclusive_classes": textcat_cfg["exclusive_classes"],
|
|
||||||
"architecture": textcat_cfg["architecture"],
|
|
||||||
"positive_label": textcat_cfg["positive_label"],
|
|
||||||
}
|
|
||||||
if base_cfg != pipe_cfg:
|
|
||||||
msg.fail(
|
|
||||||
f"The base textcat model configuration does"
|
|
||||||
f"not match the provided training options. "
|
|
||||||
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
msg.text(f"Extending component from base model '{pipe}'")
|
|
||||||
disabled_pipes = nlp.select_pipes(
|
|
||||||
disable=[p for p in nlp.pipe_names if p not in pipeline]
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
msg.text(f"Starting with blank model '{lang}'")
|
|
||||||
lang_cls = util.get_lang_class(lang)
|
|
||||||
nlp = lang_cls()
|
|
||||||
|
|
||||||
if vectors:
|
|
||||||
msg.text(f"Loading vectors from model '{vectors}'")
|
|
||||||
_load_vectors(nlp, vectors)
|
|
||||||
|
|
||||||
for pipe in pipeline:
|
|
||||||
# first, create the model.
|
|
||||||
# Bit of a hack after the refactor to get the vectors into a default config
|
|
||||||
# use train-from-config instead :-)
|
|
||||||
if pipe == "parser":
|
|
||||||
config_loc = default_dir / "parser_defaults.cfg"
|
|
||||||
elif pipe == "tagger":
|
|
||||||
config_loc = default_dir / "tagger_defaults.cfg"
|
|
||||||
elif pipe == "morphologizer":
|
|
||||||
config_loc = default_dir / "morphologizer_defaults.cfg"
|
|
||||||
elif pipe == "ner":
|
|
||||||
config_loc = default_dir / "ner_defaults.cfg"
|
|
||||||
elif pipe == "textcat":
|
|
||||||
config_loc = default_dir / "textcat_defaults.cfg"
|
|
||||||
elif pipe == "senter":
|
|
||||||
config_loc = default_dir / "senter_defaults.cfg"
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Component {pipe} currently not supported.")
|
|
||||||
pipe_cfg = util.load_config(config_loc, create_objects=False)
|
|
||||||
if vectors:
|
|
||||||
pretrained_config = {
|
|
||||||
"@architectures": "spacy.VocabVectors.v1",
|
|
||||||
"name": vectors,
|
|
||||||
}
|
|
||||||
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
|
|
||||||
|
|
||||||
if pipe == "parser":
|
|
||||||
pipe_cfg["learn_tokens"] = learn_tokens
|
|
||||||
elif pipe == "textcat":
|
|
||||||
pipe_cfg["exclusive_classes"] = not textcat_multilabel
|
|
||||||
pipe_cfg["architecture"] = textcat_arch
|
|
||||||
pipe_cfg["positive_label"] = textcat_positive_label
|
|
||||||
|
|
||||||
pipe = nlp.create_pipe(pipe, config=pipe_cfg)
|
|
||||||
nlp.add_pipe(pipe)
|
|
||||||
|
|
||||||
# Update tag map with provided mapping
|
|
||||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
|
||||||
|
|
||||||
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
|
||||||
# isn't loaded if these features are accessed
|
|
||||||
if omit_extra_lookups:
|
|
||||||
nlp.vocab.lookups_extra = Lookups()
|
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
|
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_prob")
|
|
||||||
nlp.vocab.lookups_extra.add_table("lexeme_settings")
|
|
||||||
|
|
||||||
if vectors:
|
|
||||||
msg.text("Loading vector from model '{}'".format(vectors))
|
|
||||||
_load_vectors(nlp, vectors)
|
|
||||||
|
|
||||||
# Multitask objectives
|
|
||||||
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
|
||||||
for pipe_name, multitasks in multitask_options:
|
|
||||||
if multitasks:
|
|
||||||
if pipe_name not in pipeline:
|
|
||||||
msg.fail(
|
|
||||||
f"Can't use multitask objective without '{pipe_name}' in "
|
|
||||||
f"the pipeline"
|
|
||||||
)
|
|
||||||
pipe = nlp.get_pipe(pipe_name)
|
|
||||||
for objective in multitasks.split(","):
|
|
||||||
pipe.add_multitask_objective(objective)
|
|
||||||
|
|
||||||
# Prepare training corpus
|
|
||||||
msg.text(f"Counting training words (limit={n_examples})")
|
|
||||||
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
|
|
||||||
n_train_words = corpus.count_train()
|
|
||||||
|
|
||||||
if base_model and not pipes_added:
|
|
||||||
# Start with an existing model, use default optimizer
|
|
||||||
optimizer = create_default_optimizer()
|
|
||||||
else:
|
|
||||||
# Start with a blank model, call begin_training
|
|
||||||
cfg = {"device": use_gpu}
|
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
|
|
||||||
nlp._optimizer = None
|
|
||||||
|
|
||||||
# Load in pretrained weights (TODO: this may be broken in the config rewrite)
|
|
||||||
if init_tok2vec is not None:
|
|
||||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
|
||||||
msg.text(f"Loaded pretrained tok2vec for: {components}")
|
|
||||||
|
|
||||||
# Verify textcat config
|
|
||||||
if "textcat" in pipeline:
|
|
||||||
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
|
|
||||||
if textcat_positive_label and textcat_positive_label not in textcat_labels:
|
|
||||||
msg.fail(
|
|
||||||
f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
|
|
||||||
f"does not match any label in the training data.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if textcat_positive_label and len(textcat_labels) != 2:
|
|
||||||
msg.fail(
|
|
||||||
"A textcat_positive_label (tpl) '{textcat_positive_label}' was "
|
|
||||||
"provided for training data that does not appear to be a "
|
|
||||||
"binary classification problem with two labels.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
train_data = corpus.train_data(
|
|
||||||
nlp,
|
|
||||||
noise_level=noise_level,
|
|
||||||
gold_preproc=gold_preproc,
|
|
||||||
max_length=0,
|
|
||||||
ignore_misaligned=True,
|
|
||||||
)
|
|
||||||
train_labels = set()
|
|
||||||
if textcat_multilabel:
|
|
||||||
multilabel_found = False
|
|
||||||
for ex in train_data:
|
|
||||||
train_labels.update(ex.gold.cats.keys())
|
|
||||||
if list(ex.gold.cats.values()).count(1.0) != 1:
|
|
||||||
multilabel_found = True
|
|
||||||
if not multilabel_found and not base_model:
|
|
||||||
msg.warn(
|
|
||||||
"The textcat training instances look like they have "
|
|
||||||
"mutually-exclusive classes. Remove the flag "
|
|
||||||
"'--textcat-multilabel' to train a classifier with "
|
|
||||||
"mutually-exclusive classes."
|
|
||||||
)
|
|
||||||
if not textcat_multilabel:
|
|
||||||
for ex in train_data:
|
|
||||||
train_labels.update(ex.gold.cats.keys())
|
|
||||||
if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model:
|
|
||||||
msg.warn(
|
|
||||||
"Some textcat training instances do not have exactly "
|
|
||||||
"one positive label. Modifying training options to "
|
|
||||||
"include the flag '--textcat-multilabel' for classes "
|
|
||||||
"that are not mutually exclusive."
|
|
||||||
)
|
|
||||||
nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
|
|
||||||
textcat_multilabel = True
|
|
||||||
break
|
|
||||||
if base_model and set(textcat_labels) != train_labels:
|
|
||||||
msg.fail(
|
|
||||||
f"Cannot extend textcat model using data with different "
|
|
||||||
f"labels. Base model labels: {textcat_labels}, training data "
|
|
||||||
f"labels: {list(train_labels)}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if textcat_multilabel:
|
|
||||||
msg.text(
|
|
||||||
f"Textcat evaluation score: ROC AUC score macro-averaged across "
|
|
||||||
f"the labels '{', '.join(textcat_labels)}'"
|
|
||||||
)
|
|
||||||
elif textcat_positive_label and len(textcat_labels) == 2:
|
|
||||||
msg.text(
|
|
||||||
f"Textcat evaluation score: F1-score for the "
|
|
||||||
f"label '{textcat_positive_label}'"
|
|
||||||
)
|
|
||||||
elif len(textcat_labels) > 1:
|
|
||||||
if len(textcat_labels) == 2:
|
|
||||||
msg.warn(
|
|
||||||
"If the textcat component is a binary classifier with "
|
|
||||||
"exclusive classes, provide '--textcat-positive-label' for "
|
|
||||||
"an evaluation on the positive class."
|
|
||||||
)
|
|
||||||
msg.text(
|
|
||||||
f"Textcat evaluation score: F1-score macro-averaged across "
|
|
||||||
f"the labels '{', '.join(textcat_labels)}'"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
msg.fail(
|
|
||||||
"Unsupported textcat configuration. Use `spacy debug-data` "
|
|
||||||
"for more information."
|
|
||||||
)
|
|
||||||
|
|
||||||
# fmt: off
|
|
||||||
row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
|
|
||||||
row_widths = [len(w) for w in row_head]
|
|
||||||
row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
|
|
||||||
# fmt: on
|
|
||||||
print("")
|
|
||||||
msg.row(row_head, **row_settings)
|
|
||||||
msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
|
|
||||||
try:
|
|
||||||
iter_since_best = 0
|
|
||||||
best_score = 0.0
|
|
||||||
for i in range(n_iter):
|
|
||||||
train_data = corpus.train_dataset(
|
|
||||||
nlp,
|
|
||||||
noise_level=noise_level,
|
|
||||||
orth_variant_level=orth_variant_level,
|
|
||||||
gold_preproc=gold_preproc,
|
|
||||||
max_length=0,
|
|
||||||
ignore_misaligned=True,
|
|
||||||
)
|
|
||||||
if raw_text:
|
|
||||||
random.shuffle(raw_text)
|
|
||||||
raw_batches = util.minibatch(
|
|
||||||
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
|
|
||||||
)
|
|
||||||
words_seen = 0
|
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
|
||||||
losses = {}
|
|
||||||
for batch in util.minibatch_by_words(train_data, size=batch_sizes):
|
|
||||||
if not batch:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
nlp.update(
|
|
||||||
batch,
|
|
||||||
sgd=optimizer,
|
|
||||||
drop=next(dropout_rates),
|
|
||||||
losses=losses,
|
|
||||||
)
|
|
||||||
except ValueError as e:
|
|
||||||
err = "Error during training"
|
|
||||||
if init_tok2vec:
|
|
||||||
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
|
|
||||||
msg.fail(err, f"Original error message: {e}", exits=1)
|
|
||||||
if raw_text:
|
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
|
||||||
# which use unlabelled data to reduce overfitting.
|
|
||||||
raw_batch = list(next(raw_batches))
|
|
||||||
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
|
|
||||||
docs = [ex.doc for ex in batch]
|
|
||||||
if not int(os.environ.get("LOG_FRIENDLY", 0)):
|
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
|
||||||
words_seen += sum(len(doc) for doc in docs)
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
util.set_env_log(False)
|
|
||||||
epoch_model_path = output_path / f"model{i}"
|
|
||||||
nlp.to_disk(epoch_model_path)
|
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
|
||||||
for beam_width in eval_beam_widths:
|
|
||||||
for name, component in nlp_loaded.pipeline:
|
|
||||||
if hasattr(component, "cfg"):
|
|
||||||
component.cfg["beam_width"] = beam_width
|
|
||||||
dev_dataset = list(
|
|
||||||
corpus.dev_dataset(
|
|
||||||
nlp_loaded,
|
|
||||||
gold_preproc=gold_preproc,
|
|
||||||
ignore_misaligned=True,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
nwords = sum(len(ex.doc) for ex in dev_dataset)
|
|
||||||
start_time = timer()
|
|
||||||
scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
|
|
||||||
end_time = timer()
|
|
||||||
if use_gpu < 0:
|
|
||||||
gpu_wps = None
|
|
||||||
cpu_wps = nwords / (end_time - start_time)
|
|
||||||
else:
|
|
||||||
gpu_wps = nwords / (end_time - start_time)
|
|
||||||
# Evaluate on CPU in the first iteration only (for
|
|
||||||
# timing) when GPU is enabled
|
|
||||||
if i == 0:
|
|
||||||
with use_ops("numpy"):
|
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
|
||||||
for name, component in nlp_loaded.pipeline:
|
|
||||||
if hasattr(component, "cfg"):
|
|
||||||
component.cfg["beam_width"] = beam_width
|
|
||||||
dev_dataset = list(
|
|
||||||
corpus.dev_dataset(
|
|
||||||
nlp_loaded,
|
|
||||||
gold_preproc=gold_preproc,
|
|
||||||
ignore_misaligned=True,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
start_time = timer()
|
|
||||||
scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
|
|
||||||
end_time = timer()
|
|
||||||
cpu_wps = nwords / (end_time - start_time)
|
|
||||||
acc_loc = output_path / f"model{i}" / "accuracy.json"
|
|
||||||
srsly.write_json(acc_loc, scorer.scores)
|
|
||||||
|
|
||||||
# Update model meta.json
|
|
||||||
meta["lang"] = nlp.lang
|
|
||||||
meta["pipeline"] = nlp.pipe_names
|
|
||||||
if beam_width == 1:
|
|
||||||
meta["speed"] = {
|
|
||||||
"nwords": nwords,
|
|
||||||
"cpu": cpu_wps,
|
|
||||||
"gpu": gpu_wps,
|
|
||||||
}
|
|
||||||
meta.setdefault("accuracy", {})
|
|
||||||
for component in nlp.pipe_names:
|
|
||||||
for metric in _get_metrics(component):
|
|
||||||
meta["accuracy"][metric] = scorer.scores[metric]
|
|
||||||
else:
|
|
||||||
meta.setdefault("beam_accuracy", {})
|
|
||||||
meta.setdefault("beam_speed", {})
|
|
||||||
for component in nlp.pipe_names:
|
|
||||||
for metric in _get_metrics(component):
|
|
||||||
meta["beam_accuracy"][metric] = scorer.scores[metric]
|
|
||||||
meta["beam_speed"][beam_width] = {
|
|
||||||
"nwords": nwords,
|
|
||||||
"cpu": cpu_wps,
|
|
||||||
"gpu": gpu_wps,
|
|
||||||
}
|
|
||||||
meta["vectors"] = {
|
|
||||||
"width": nlp.vocab.vectors_length,
|
|
||||||
"vectors": len(nlp.vocab.vectors),
|
|
||||||
"keys": nlp.vocab.vectors.n_keys,
|
|
||||||
"name": nlp.vocab.vectors.name,
|
|
||||||
}
|
|
||||||
meta.setdefault("name", f"model{i}")
|
|
||||||
meta.setdefault("version", version)
|
|
||||||
meta["labels"] = nlp.meta["labels"]
|
|
||||||
meta_loc = output_path / f"model{i}" / "meta.json"
|
|
||||||
srsly.write_json(meta_loc, meta)
|
|
||||||
util.set_env_log(verbose)
|
|
||||||
|
|
||||||
progress = _get_progress(
|
|
||||||
i,
|
|
||||||
losses,
|
|
||||||
scorer.scores,
|
|
||||||
output_stats,
|
|
||||||
beam_width=beam_width if has_beam_widths else None,
|
|
||||||
cpu_wps=cpu_wps,
|
|
||||||
gpu_wps=gpu_wps,
|
|
||||||
)
|
|
||||||
if i == 0 and "textcat" in pipeline:
|
|
||||||
textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
|
|
||||||
for cat, cat_score in textcats_per_cat.items():
|
|
||||||
if cat_score.get("roc_auc_score", 0) < 0:
|
|
||||||
msg.warn(
|
|
||||||
f"Textcat ROC AUC score is undefined due to "
|
|
||||||
f"only one value in label '{cat}'."
|
|
||||||
)
|
|
||||||
msg.row(progress, **row_settings)
|
|
||||||
# Early stopping
|
|
||||||
if n_early_stopping is not None:
|
|
||||||
current_score = _score_for_model(meta)
|
|
||||||
if current_score < best_score:
|
|
||||||
iter_since_best += 1
|
|
||||||
else:
|
|
||||||
iter_since_best = 0
|
|
||||||
best_score = current_score
|
|
||||||
if iter_since_best >= n_early_stopping:
|
|
||||||
msg.text(
|
|
||||||
f"Early stopping, best iteration is: {i - iter_since_best}"
|
|
||||||
)
|
|
||||||
msg.text(
|
|
||||||
f"Best score = {best_score}; Final iteration score = {current_score}"
|
|
||||||
)
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
msg.warn(f"Aborting and saving final best model. Encountered exception: {e}", exits=1)
|
|
||||||
finally:
|
|
||||||
best_pipes = nlp.pipe_names
|
|
||||||
if disabled_pipes:
|
|
||||||
disabled_pipes.restore()
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
final_model_path = output_path / "model-final"
|
|
||||||
nlp.to_disk(final_model_path)
|
|
||||||
meta_loc = output_path / "model-final" / "meta.json"
|
|
||||||
final_meta = srsly.read_json(meta_loc)
|
|
||||||
final_meta.setdefault("accuracy", {})
|
|
||||||
final_meta["accuracy"].update(meta.get("accuracy", {}))
|
|
||||||
final_meta.setdefault("speed", {})
|
|
||||||
final_meta["speed"].setdefault("cpu", None)
|
|
||||||
final_meta["speed"].setdefault("gpu", None)
|
|
||||||
meta.setdefault("speed", {})
|
|
||||||
meta["speed"].setdefault("cpu", None)
|
|
||||||
meta["speed"].setdefault("gpu", None)
|
|
||||||
# combine cpu and gpu speeds with the base model speeds
|
|
||||||
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
|
||||||
speed = _get_total_speed(
|
|
||||||
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
|
|
||||||
)
|
|
||||||
final_meta["speed"]["cpu"] = speed
|
|
||||||
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
|
||||||
speed = _get_total_speed(
|
|
||||||
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
|
|
||||||
)
|
|
||||||
final_meta["speed"]["gpu"] = speed
|
|
||||||
# if there were no speeds to update, overwrite with meta
|
|
||||||
if (
|
|
||||||
final_meta["speed"]["cpu"] is None
|
|
||||||
and final_meta["speed"]["gpu"] is None
|
|
||||||
):
|
|
||||||
final_meta["speed"].update(meta["speed"])
|
|
||||||
# note: beam speeds are not combined with the base model
|
|
||||||
if has_beam_widths:
|
|
||||||
final_meta.setdefault("beam_accuracy", {})
|
|
||||||
final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
|
|
||||||
final_meta.setdefault("beam_speed", {})
|
|
||||||
final_meta["beam_speed"].update(meta.get("beam_speed", {}))
|
|
||||||
srsly.write_json(meta_loc, final_meta)
|
|
||||||
msg.good("Saved model to output directory", final_model_path)
|
|
||||||
with msg.loading("Creating best model..."):
|
|
||||||
best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
|
|
||||||
msg.good("Created best model", best_model_path)
|
|
||||||
|
|
||||||
|
|
||||||
def _score_for_model(meta):
|
|
||||||
""" Returns mean score between tasks in pipeline that can be used for early stopping. """
|
|
||||||
mean_acc = list()
|
|
||||||
pipes = meta["pipeline"]
|
|
||||||
acc = meta["accuracy"]
|
|
||||||
if "tagger" in pipes:
|
|
||||||
mean_acc.append(acc["tags_acc"])
|
|
||||||
if "morphologizer" in pipes:
|
|
||||||
mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2)
|
|
||||||
if "parser" in pipes:
|
|
||||||
mean_acc.append((acc["uas"] + acc["las"]) / 2)
|
|
||||||
if "ner" in pipes:
|
|
||||||
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
|
|
||||||
if "textcat" in pipes:
|
|
||||||
mean_acc.append(acc["textcat_score"])
|
|
||||||
if "senter" in pipes:
|
|
||||||
mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3)
|
|
||||||
return sum(mean_acc) / len(mean_acc)
|
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
|
||||||
def _create_progress_bar(total):
|
|
||||||
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
|
||||||
yield
|
|
||||||
else:
|
|
||||||
pbar = tqdm.tqdm(total=total, leave=False)
|
|
||||||
yield pbar
|
|
||||||
|
|
||||||
|
|
||||||
def _load_vectors(nlp, vectors):
    """Load the model identified by `vectors`, passing `nlp.vocab` as its vocab.

    The loaded model object itself is discarded; only the effect of loading
    it against `nlp.vocab` is kept.
    NOTE(review): presumably this populates the vectors of `nlp.vocab` --
    confirm against `util.load_model`.
    """
    util.load_model(vectors, vocab=nlp.vocab)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
|
||||||
"""Load pretrained weights for the 'token-to-vector' part of the component
|
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
|
||||||
"""
|
|
||||||
with loc.open("rb") as file_:
|
|
||||||
weights_data = file_.read()
|
|
||||||
loaded = []
|
|
||||||
for name, component in nlp.pipeline:
|
|
||||||
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
|
|
||||||
component.get_ref("tok2vec").from_bytes(weights_data)
|
|
||||||
loaded.append(name)
|
|
||||||
return loaded
|
|
||||||
|
|
||||||
|
|
||||||
def _collate_best_model(meta, output_path, components):
    """Assemble `output_path / "model-best"` from the best epoch per component.

    The directory starts as a copy of `output_path / "model-final"`. For each
    name in `components`, the component's sub-directory is then replaced by
    the one from the directory `_find_best` selected, and that directory's
    metrics (as listed by `_get_metrics`) are copied into `meta["accuracy"]`.
    Finally `meta` is written to "model-best/meta.json".

    `meta` is modified in place. Returns the path of the "model-best"
    directory. A component for which `_find_best` found no candidate keeps
    the files copied from "model-final".
    """
    meta.setdefault("accuracy", {})
    # Resolve all sources before "model-best" exists, so the directory being
    # built is never scanned as a candidate itself.
    bests = {
        component: _find_best(output_path, component) for component in components
    }
    best_dest = output_path / "model-best"
    shutil.copytree(str(output_path / "model-final"), str(best_dest))
    for component, best_component_src in bests.items():
        if best_component_src is None:
            # No scored epoch directory was found (e.g. training aborted before
            # the first save); `None / component` would raise a TypeError.
            continue
        shutil.rmtree(str(best_dest / component))
        shutil.copytree(str(best_component_src / component), str(best_dest / component))
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
|
|
||||||
|
|
||||||
|
|
||||||
def _find_best(experiment_dir, component):
|
|
||||||
accuracies = []
|
|
||||||
for epoch_model in experiment_dir.iterdir():
|
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
|
||||||
accs = srsly.read_json(epoch_model / "accuracy.json")
|
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
|
||||||
# remove per_type dicts from score list for max() comparison
|
|
||||||
scores = [score for score in scores if isinstance(score, float)]
|
|
||||||
accuracies.append((scores, epoch_model))
|
|
||||||
if accuracies:
|
|
||||||
return max(accuracies)[1]
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _get_metrics(component):
|
|
||||||
if component == "parser":
|
|
||||||
return ("las", "uas", "las_per_type", "sent_f", "token_acc")
|
|
||||||
elif component == "tagger":
|
|
||||||
return ("tags_acc", "token_acc")
|
|
||||||
elif component == "morphologizer":
|
|
||||||
return ("morphs_acc", "pos_acc", "token_acc")
|
|
||||||
elif component == "ner":
|
|
||||||
return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
|
|
||||||
elif component == "senter":
|
|
||||||
return ("sent_f", "sent_p", "sent_r", "token_acc")
|
|
||||||
elif component == "textcat":
|
|
||||||
return ("textcat_score", "token_acc")
|
|
||||||
return ("token_acc",)
|
|
||||||
|
|
||||||
|
|
||||||
def _configure_training_output(pipeline, use_gpu, has_beam_widths):
|
|
||||||
row_head = ["Itn"]
|
|
||||||
output_stats = []
|
|
||||||
for pipe in pipeline:
|
|
||||||
if pipe == "tagger":
|
|
||||||
row_head.extend(["Tag Loss ", " Tag % "])
|
|
||||||
output_stats.extend(["tag_loss", "tags_acc"])
|
|
||||||
elif pipe == "morphologizer" or pipe == "morphologizertagger":
|
|
||||||
row_head.extend(["Morph Loss ", " Morph % ", " POS % "])
|
|
||||||
output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"])
|
|
||||||
elif pipe == "parser":
|
|
||||||
row_head.extend(
|
|
||||||
["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"]
|
|
||||||
)
|
|
||||||
output_stats.extend(
|
|
||||||
["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"]
|
|
||||||
)
|
|
||||||
elif pipe == "ner":
|
|
||||||
row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
|
|
||||||
output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
|
|
||||||
elif pipe == "textcat":
|
|
||||||
row_head.extend(["Textcat Loss", "Textcat"])
|
|
||||||
output_stats.extend(["textcat_loss", "textcat_score"])
|
|
||||||
elif pipe == "senter":
|
|
||||||
row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"])
|
|
||||||
output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"])
|
|
||||||
row_head.extend(["Token %", "CPU WPS"])
|
|
||||||
output_stats.extend(["token_acc", "cpu_wps"])
|
|
||||||
|
|
||||||
if use_gpu >= 0:
|
|
||||||
row_head.extend(["GPU WPS"])
|
|
||||||
output_stats.extend(["gpu_wps"])
|
|
||||||
|
|
||||||
if has_beam_widths:
|
|
||||||
row_head.insert(1, "Beam W.")
|
|
||||||
# remove duplicates
|
|
||||||
row_head_dict = {k: 1 for k in row_head}
|
|
||||||
output_stats_dict = {k: 1 for k in output_stats}
|
|
||||||
return row_head_dict.keys(), output_stats_dict.keys()
|
|
||||||
|
|
||||||
|
|
||||||
def _get_progress(
|
|
||||||
itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
|
|
||||||
):
|
|
||||||
scores = {}
|
|
||||||
for stat in output_stats:
|
|
||||||
scores[stat] = 0.0
|
|
||||||
scores["dep_loss"] = losses.get("parser", 0.0)
|
|
||||||
scores["ner_loss"] = losses.get("ner", 0.0)
|
|
||||||
scores["tag_loss"] = losses.get("tagger", 0.0)
|
|
||||||
scores["morph_loss"] = losses.get("morphologizer", 0.0)
|
|
||||||
scores["textcat_loss"] = losses.get("textcat", 0.0)
|
|
||||||
scores["senter_loss"] = losses.get("senter", 0.0)
|
|
||||||
scores["cpu_wps"] = cpu_wps
|
|
||||||
scores["gpu_wps"] = gpu_wps or 0.0
|
|
||||||
scores.update(dev_scores)
|
|
||||||
formatted_scores = []
|
|
||||||
for stat in output_stats:
|
|
||||||
format_spec = "{:.3f}"
|
|
||||||
if stat.endswith("_wps"):
|
|
||||||
format_spec = "{:.0f}"
|
|
||||||
formatted_scores.append(format_spec.format(scores[stat]))
|
|
||||||
result = [itn + 1]
|
|
||||||
result.extend(formatted_scores)
|
|
||||||
if beam_width is not None:
|
|
||||||
result.insert(1, beam_width)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def _get_total_speed(speeds):
|
|
||||||
seconds_per_word = 0.0
|
|
||||||
for words_per_second in speeds:
|
|
||||||
if words_per_second is None:
|
|
||||||
return None
|
|
||||||
seconds_per_word += 1.0 / words_per_second
|
|
||||||
return 1.0 / seconds_per_word
|
|
|
@ -1,5 +1,7 @@
|
||||||
from typing import Optional, Dict, List, Union, Sequence
|
from typing import Optional, Dict, List, Union, Sequence
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
|
import srsly
|
||||||
from pydantic import BaseModel, FilePath
|
from pydantic import BaseModel, FilePath
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
import tqdm
|
||||||
|
@ -11,10 +13,14 @@ from thinc.api import Model, use_pytorch_for_gpu_memory
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
|
<<<<<<< HEAD
|
||||||
from ..gold import Example
|
from ..gold import Example
|
||||||
|
=======
|
||||||
|
from ..lookups import Lookups
|
||||||
|
>>>>>>> origin/develop
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..ml import models # don't remove - required to load the built-in architectures
|
from ..ml import models # don't remove - required to load the built-in architectures
|
||||||
|
|
||||||
registry = util.registry
|
registry = util.registry
|
||||||
|
|
||||||
|
@ -24,7 +30,6 @@ patience = 10
|
||||||
eval_frequency = 10
|
eval_frequency = 10
|
||||||
dropout = 0.2
|
dropout = 0.2
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
vectors = null
|
|
||||||
max_epochs = 100
|
max_epochs = 100
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
gold_preproc = false
|
gold_preproc = false
|
||||||
|
@ -48,7 +53,7 @@ beta2 = 0.999
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "en"
|
lang = "en"
|
||||||
vectors = ${training:vectors}
|
vectors = null
|
||||||
|
|
||||||
[nlp.pipeline.tok2vec]
|
[nlp.pipeline.tok2vec]
|
||||||
factory = "tok2vec"
|
factory = "tok2vec"
|
||||||
|
@ -94,7 +99,6 @@ class ConfigSchema(BaseModel):
|
||||||
eval_frequency: int = 100
|
eval_frequency: int = 100
|
||||||
dropout: float = 0.2
|
dropout: float = 0.2
|
||||||
init_tok2vec: Optional[FilePath] = None
|
init_tok2vec: Optional[FilePath] = None
|
||||||
vectors: Optional[str] = None
|
|
||||||
max_epochs: int = 100
|
max_epochs: int = 100
|
||||||
orth_variant_level: float = 0.0
|
orth_variant_level: float = 0.0
|
||||||
gold_preproc: bool = False
|
gold_preproc: bool = False
|
||||||
|
@ -120,9 +124,14 @@ class ConfigSchema(BaseModel):
|
||||||
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||||
config_path=("Path to config file", "positional", None, Path),
|
config_path=("Path to config file", "positional", None, Path),
|
||||||
output_path=("Output directory to store model in", "option", "o", Path),
|
output_path=("Output directory to store model in", "option", "o", Path),
|
||||||
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
init_tok2vec=(
|
||||||
|
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
|
||||||
|
Path),
|
||||||
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
|
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
|
||||||
|
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
|
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||||
|
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
)
|
)
|
||||||
def train_cli(
|
def train_cli(
|
||||||
|
@ -130,30 +139,54 @@ def train_cli(
|
||||||
dev_path,
|
dev_path,
|
||||||
config_path,
|
config_path,
|
||||||
output_path=None,
|
output_path=None,
|
||||||
meta_path=None,
|
init_tok2vec=None,
|
||||||
raw_text=None,
|
raw_text=None,
|
||||||
debug=False,
|
|
||||||
verbose=False,
|
verbose=False,
|
||||||
use_gpu=-1,
|
use_gpu=-1,
|
||||||
|
tag_map_path=None,
|
||||||
|
omit_extra_lookups=False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||||
JSON format. To convert data from other formats, use the `spacy convert`
|
JSON format. To convert data from other formats, use the `spacy convert`
|
||||||
command.
|
command.
|
||||||
"""
|
"""
|
||||||
|
util.set_env_log(verbose)
|
||||||
|
|
||||||
|
# Make sure all files and paths exists if they are needed
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if not train_path or not train_path.exists():
|
if not train_path or not train_path.exists():
|
||||||
msg.fail("Training data not found", train_path, exits=1)
|
msg.fail("Training data not found", train_path, exits=1)
|
||||||
if not dev_path or not dev_path.exists():
|
if not dev_path or not dev_path.exists():
|
||||||
msg.fail("Development data not found", dev_path, exits=1)
|
msg.fail("Development data not found", dev_path, exits=1)
|
||||||
if meta_path is not None and not meta_path.exists():
|
if output_path is not None:
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
if not output_path.exists():
|
||||||
if output_path is not None and not output_path.exists():
|
output_path.mkdir()
|
||||||
output_path.mkdir()
|
msg.good(f"Created output directory: {output_path}")
|
||||||
|
elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||||
|
msg.warn(
|
||||||
|
"Output directory is not empty.",
|
||||||
|
"This can lead to unintended side effects when saving the model. "
|
||||||
|
"Please use an empty directory or a different path instead. If "
|
||||||
|
"the specified output path doesn't exist, the directory will be "
|
||||||
|
"created for you.",
|
||||||
|
)
|
||||||
|
if raw_text is not None:
|
||||||
|
raw_text = list(srsly.read_jsonl(raw_text))
|
||||||
|
tag_map = {}
|
||||||
|
if tag_map_path is not None:
|
||||||
|
tag_map = srsly.read_json(tag_map_path)
|
||||||
|
|
||||||
|
weights_data = None
|
||||||
|
if init_tok2vec is not None:
|
||||||
|
if not init_tok2vec.exists():
|
||||||
|
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
|
||||||
|
with init_tok2vec.open("rb") as file_:
|
||||||
|
weights_data = file_.read()
|
||||||
|
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
msg.info("Using GPU")
|
msg.info("Using GPU: {use_gpu}")
|
||||||
util.use_gpu(use_gpu)
|
util.use_gpu(use_gpu)
|
||||||
else:
|
else:
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
|
@ -162,31 +195,126 @@ def train_cli(
|
||||||
config_path,
|
config_path,
|
||||||
{"train": train_path, "dev": dev_path},
|
{"train": train_path, "dev": dev_path},
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
meta_path=meta_path,
|
|
||||||
raw_text=raw_text,
|
raw_text=raw_text,
|
||||||
|
tag_map=tag_map,
|
||||||
|
weights_data=weights_data,
|
||||||
|
omit_extra_lookups=omit_extra_lookups,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def train(
|
def train(
|
||||||
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
config_path,
|
||||||
|
data_paths,
|
||||||
|
raw_text=None,
|
||||||
|
output_path=None,
|
||||||
|
tag_map=None,
|
||||||
|
weights_data=None,
|
||||||
|
omit_extra_lookups=False,
|
||||||
):
|
):
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
# Read the config first without creating objects, to get to the original nlp_config
|
# Read the config first without creating objects, to get to the original nlp_config
|
||||||
config = util.load_config(config_path, create_objects=False)
|
config = util.load_config(config_path, create_objects=False)
|
||||||
util.fix_random_seed(config["training"]["seed"])
|
util.fix_random_seed(config["training"]["seed"])
|
||||||
if config["training"]["use_pytorch_for_gpu_memory"]:
|
if config["training"].get("use_pytorch_for_gpu_memory"):
|
||||||
|
# It feels kind of weird to not have a default for this.
|
||||||
use_pytorch_for_gpu_memory()
|
use_pytorch_for_gpu_memory()
|
||||||
nlp_config = config["nlp"]
|
nlp_config = config["nlp"]
|
||||||
config = util.load_config(config_path, create_objects=True)
|
config = util.load_config(config_path, create_objects=True)
|
||||||
|
training = config["training"]
|
||||||
msg.info("Creating nlp from config")
|
msg.info("Creating nlp from config")
|
||||||
nlp = util.load_model_from_config(nlp_config)
|
nlp = util.load_model_from_config(nlp_config)
|
||||||
training = config["training"]
|
|
||||||
optimizer = training["optimizer"]
|
optimizer = training["optimizer"]
|
||||||
limit = training["limit"]
|
limit = training["limit"]
|
||||||
msg.info("Loading training corpus")
|
msg.info("Loading training corpus")
|
||||||
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||||
msg.info("Initializing the nlp pipeline")
|
# verify textcat config
|
||||||
nlp.begin_training(lambda: corpus.train_dataset(nlp))
|
if "textcat" in nlp_config["pipeline"]:
|
||||||
|
textcat_labels = set(nlp.get_pipe("textcat").labels)
|
||||||
|
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"]
|
||||||
|
|
||||||
|
# check whether the setting 'exclusive_classes' corresponds to the provided training data
|
||||||
|
if textcat_multilabel:
|
||||||
|
multilabel_found = False
|
||||||
|
for ex in corpus.train_examples:
|
||||||
|
cats = ex.doc_annotation.cats
|
||||||
|
textcat_labels.update(cats.keys())
|
||||||
|
if list(cats.values()).count(1.0) != 1:
|
||||||
|
multilabel_found = True
|
||||||
|
if not multilabel_found:
|
||||||
|
msg.warn(
|
||||||
|
"The textcat training instances look like they have "
|
||||||
|
"mutually exclusive classes. Set 'exclusive_classes' "
|
||||||
|
"to 'true' in the config to train a classifier with "
|
||||||
|
"mutually exclusive classes more accurately."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
for ex in corpus.train_examples:
|
||||||
|
cats = ex.doc_annotation.cats
|
||||||
|
textcat_labels.update(cats.keys())
|
||||||
|
if list(cats.values()).count(1.0) != 1:
|
||||||
|
msg.fail(
|
||||||
|
"Some textcat training instances do not have exactly "
|
||||||
|
"one positive label. Set 'exclusive_classes' "
|
||||||
|
"to 'false' in the config to train a classifier with classes "
|
||||||
|
"that are not mutually exclusive."
|
||||||
|
)
|
||||||
|
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
|
||||||
|
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
|
||||||
|
|
||||||
|
# if 'positive_label' is provided: double check whether it's in the data and the task is binary
|
||||||
|
if nlp_config["pipeline"]["textcat"].get("positive_label", None):
|
||||||
|
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
|
||||||
|
pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
|
||||||
|
if pos_label not in textcat_labels:
|
||||||
|
msg.fail(
|
||||||
|
f"The textcat's 'positive_label' config setting '{pos_label}' "
|
||||||
|
f"does not match any label in the training data.",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
if len(textcat_labels) != 2:
|
||||||
|
msg.fail(
|
||||||
|
f"A textcat 'positive_label' '{pos_label}' was "
|
||||||
|
f"provided for training data that does not appear to be a "
|
||||||
|
f"binary classification problem with two labels.",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
if training.get("resume", False):
|
||||||
|
msg.info("Resuming training")
|
||||||
|
nlp.resume_training()
|
||||||
|
else:
|
||||||
|
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
||||||
|
nlp.begin_training(lambda: corpus.train_dataset(nlp))
|
||||||
|
|
||||||
|
# Update tag map with provided mapping
|
||||||
|
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||||
|
|
||||||
|
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
||||||
|
# isn't loaded if these features are accessed
|
||||||
|
if omit_extra_lookups:
|
||||||
|
nlp.vocab.lookups_extra = Lookups()
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_prob")
|
||||||
|
nlp.vocab.lookups_extra.add_table("lexeme_settings")
|
||||||
|
|
||||||
|
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
|
||||||
|
if weights_data is not None:
|
||||||
|
tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
|
||||||
|
if tok2vec_path is None:
|
||||||
|
msg.fail(
|
||||||
|
f"To use a pretrained tok2vec model, the config needs to specify which "
|
||||||
|
f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
tok2vec = config
|
||||||
|
for subpath in tok2vec_path.split("."):
|
||||||
|
tok2vec = tok2vec.get(subpath)
|
||||||
|
if not tok2vec:
|
||||||
|
msg.fail(
|
||||||
|
f"Could not locate the tok2vec model at {tok2vec_path}.",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
tok2vec.from_bytes(weights_data)
|
||||||
|
|
||||||
train_batches = create_train_batches(nlp, corpus, training)
|
train_batches = create_train_batches(nlp, corpus, training)
|
||||||
evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
|
evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
|
||||||
|
@ -203,6 +331,7 @@ def train(
|
||||||
patience=training.get("patience", 0),
|
patience=training.get("patience", 0),
|
||||||
max_steps=training.get("max_steps", 0),
|
max_steps=training.get("max_steps", 0),
|
||||||
eval_frequency=training["eval_frequency"],
|
eval_frequency=training["eval_frequency"],
|
||||||
|
raw_text=raw_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||||
|
@ -216,7 +345,8 @@ def train(
|
||||||
progress.close()
|
progress.close()
|
||||||
print_row(info)
|
print_row(info)
|
||||||
if is_best_checkpoint and output_path is not None:
|
if is_best_checkpoint and output_path is not None:
|
||||||
nlp.to_disk(output_path)
|
update_meta(training, nlp, info)
|
||||||
|
nlp.to_disk(output_path / "model-best")
|
||||||
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
|
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
|
||||||
# Clean up the objects to faciliate garbage collection.
|
# Clean up the objects to faciliate garbage collection.
|
||||||
for eg in batch:
|
for eg in batch:
|
||||||
|
@ -224,6 +354,12 @@ def train(
|
||||||
eg.goldparse = None
|
eg.goldparse = None
|
||||||
eg.doc_annotation = None
|
eg.doc_annotation = None
|
||||||
eg.token_annotation = None
|
eg.token_annotation = None
|
||||||
|
except Exception as e:
|
||||||
|
msg.warn(
|
||||||
|
f"Aborting and saving the final best model. "
|
||||||
|
f"Encountered exception: {str(e)}",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
if output_path is not None:
|
if output_path is not None:
|
||||||
final_model_path = output_path / "model-final"
|
final_model_path = output_path / "model-final"
|
||||||
|
@ -232,12 +368,13 @@ def train(
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
else:
|
else:
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
msg.good("Saved model to output directory", final_model_path)
|
msg.good(f"Saved model to output directory {final_model_path}")
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(nlp, corpus, cfg):
|
def create_train_batches(nlp, corpus, cfg):
|
||||||
epochs_todo = cfg.get("max_epochs", 0)
|
epochs_todo = cfg.get("max_epochs", 0)
|
||||||
while True:
|
while True:
|
||||||
|
<<<<<<< HEAD
|
||||||
train_examples = list(corpus.train_dataset(
|
train_examples = list(corpus.train_dataset(
|
||||||
nlp,
|
nlp,
|
||||||
noise_level=0.0,
|
noise_level=0.0,
|
||||||
|
@ -246,10 +383,26 @@ def create_train_batches(nlp, corpus, cfg):
|
||||||
max_length=cfg["max_length"],
|
max_length=cfg["max_length"],
|
||||||
ignore_misaligned=True
|
ignore_misaligned=True
|
||||||
))
|
))
|
||||||
|
=======
|
||||||
|
train_examples = list(
|
||||||
|
corpus.train_dataset(
|
||||||
|
nlp,
|
||||||
|
noise_level=0.0, # I think this is deprecated?
|
||||||
|
orth_variant_level=cfg["orth_variant_level"],
|
||||||
|
gold_preproc=cfg["gold_preproc"],
|
||||||
|
max_length=cfg["max_length"],
|
||||||
|
ignore_misaligned=True,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
>>>>>>> origin/develop
|
||||||
if len(train_examples) == 0:
|
if len(train_examples) == 0:
|
||||||
raise ValueError(Errors.E988)
|
raise ValueError(Errors.E988)
|
||||||
random.shuffle(train_examples)
|
random.shuffle(train_examples)
|
||||||
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"])
|
batches = util.minibatch_by_words(
|
||||||
|
train_examples,
|
||||||
|
size=cfg["batch_size"],
|
||||||
|
discard_oversize=cfg["discard_oversize"],
|
||||||
|
)
|
||||||
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
||||||
try:
|
try:
|
||||||
first = next(batches)
|
first = next(batches)
|
||||||
|
@ -275,7 +428,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
||||||
|
|
||||||
n_words = sum(len(ex.doc) for ex in dev_examples)
|
n_words = sum(len(ex.doc) for ex in dev_examples)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
|
|
||||||
if optimizer.averages:
|
if optimizer.averages:
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
scorer = nlp.evaluate(dev_examples, batch_size=32)
|
scorer = nlp.evaluate(dev_examples, batch_size=32)
|
||||||
|
@ -286,7 +439,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
||||||
scores = scorer.scores
|
scores = scorer.scores
|
||||||
# Calculate a weighted sum based on score_weights for the main score
|
# Calculate a weighted sum based on score_weights for the main score
|
||||||
weights = cfg["score_weights"]
|
weights = cfg["score_weights"]
|
||||||
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
|
try:
|
||||||
|
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
|
||||||
|
except KeyError as e:
|
||||||
|
raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
|
||||||
|
|
||||||
scores["speed"] = wps
|
scores["speed"] = wps
|
||||||
return weighted_score, scores
|
return weighted_score, scores
|
||||||
|
|
||||||
|
@ -294,8 +451,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
||||||
|
|
||||||
|
|
||||||
def train_while_improving(
|
def train_while_improving(
|
||||||
nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency,
|
nlp,
|
||||||
accumulate_gradient=1, patience=0, max_steps=0
|
optimizer,
|
||||||
|
train_data,
|
||||||
|
evaluate,
|
||||||
|
*,
|
||||||
|
dropout,
|
||||||
|
eval_frequency,
|
||||||
|
accumulate_gradient=1,
|
||||||
|
patience=0,
|
||||||
|
max_steps=0,
|
||||||
|
raw_text=None,
|
||||||
):
|
):
|
||||||
"""Train until an evaluation stops improving. Works as a generator,
|
"""Train until an evaluation stops improving. Works as a generator,
|
||||||
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
||||||
|
@ -343,11 +509,22 @@ def train_while_improving(
|
||||||
losses = {}
|
losses = {}
|
||||||
to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
|
to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
|
||||||
|
|
||||||
|
if raw_text:
|
||||||
|
random.shuffle(raw_text)
|
||||||
|
raw_batches = util.minibatch(
|
||||||
|
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
|
||||||
|
)
|
||||||
|
|
||||||
for step, batch in enumerate(train_data):
|
for step, batch in enumerate(train_data):
|
||||||
dropout = next(dropouts)
|
dropout = next(dropouts)
|
||||||
with nlp.select_pipes(enable=to_enable):
|
with nlp.select_pipes(enable=to_enable):
|
||||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||||
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
|
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
|
||||||
|
if raw_text:
|
||||||
|
# If raw text is available, perform 'rehearsal' updates,
|
||||||
|
# which use unlabelled data to reduce overfitting.
|
||||||
|
raw_batch = list(next(raw_batches))
|
||||||
|
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
|
||||||
for name, proc in nlp.pipeline:
|
for name, proc in nlp.pipeline:
|
||||||
if hasattr(proc, "model"):
|
if hasattr(proc, "model"):
|
||||||
proc.model.finish_update(optimizer)
|
proc.model.finish_update(optimizer)
|
||||||
|
@ -388,7 +565,7 @@ def subdivide_batch(batch, accumulate_gradient):
|
||||||
if subbatch:
|
if subbatch:
|
||||||
yield subbatch
|
yield subbatch
|
||||||
start += len(subbatch)
|
start += len(subbatch)
|
||||||
subbatch = batch[start : ]
|
subbatch = batch[start:]
|
||||||
if subbatch:
|
if subbatch:
|
||||||
yield subbatch
|
yield subbatch
|
||||||
|
|
||||||
|
@ -407,14 +584,34 @@ def setup_printer(training, nlp):
|
||||||
msg.row(["-" * width for width in table_widths])
|
msg.row(["-" * width for width in table_widths])
|
||||||
|
|
||||||
def print_row(info):
|
def print_row(info):
|
||||||
losses = [
|
try:
|
||||||
"{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0)))
|
losses = [
|
||||||
for pipe_name in nlp.pipe_names
|
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||||
]
|
for pipe_name in nlp.pipe_names
|
||||||
scores = [
|
]
|
||||||
"{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols
|
except KeyError as e:
|
||||||
]
|
raise KeyError(
|
||||||
data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
|
Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
|
||||||
|
|
||||||
|
try:
|
||||||
|
scores = [
|
||||||
|
"{0:.2f}".format(float(info["other_scores"][col]))
|
||||||
|
for col in score_cols
|
||||||
|
]
|
||||||
|
except KeyError as e:
|
||||||
|
raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
|
||||||
|
data = (
|
||||||
|
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
|
||||||
|
)
|
||||||
msg.row(data, widths=table_widths, aligns=table_aligns)
|
msg.row(data, widths=table_widths, aligns=table_aligns)
|
||||||
|
|
||||||
return print_row
|
return print_row
|
||||||
|
|
||||||
|
|
||||||
|
def update_meta(training, nlp, info):
|
||||||
|
score_cols = training["scores"]
|
||||||
|
nlp.meta["performance"] = {}
|
||||||
|
for metric in score_cols:
|
||||||
|
nlp.meta["performance"][metric] = info["other_scores"][metric]
|
||||||
|
for pipe_name in nlp.pipe_names:
|
||||||
|
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
||||||
|
|
|
@ -587,6 +587,15 @@ class Errors(object):
|
||||||
"to the offsets of the 'entities' annotations.")
|
"to the offsets of the 'entities' annotations.")
|
||||||
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
|
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
|
||||||
"into {values}, but found {value}.")
|
"into {values}, but found {value}.")
|
||||||
|
|
||||||
|
E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
|
||||||
|
"{keys}")
|
||||||
|
E984 = ("Could not parse the {input} - double check the data is written "
|
||||||
|
"in the correct format as expected by spaCy.")
|
||||||
|
E985 = ("The pipeline component '{component}' is already available in the base "
|
||||||
|
"model. The settings in the component block in the config file are "
|
||||||
|
"being ignored. If you want to replace this component instead, set "
|
||||||
|
"'replace' to True in the training configuration.")
|
||||||
E986 = ("Could not create any training batches: check your input. "
|
E986 = ("Could not create any training batches: check your input. "
|
||||||
"Perhaps discard_oversize should be set to False ?")
|
"Perhaps discard_oversize should be set to False ?")
|
||||||
E987 = ("The text of an example training instance is either a Doc or "
|
E987 = ("The text of an example training instance is either a Doc or "
|
||||||
|
|
|
@ -319,14 +319,14 @@ class Language(object):
|
||||||
# transform the model's config to an actual Model
|
# transform the model's config to an actual Model
|
||||||
factory_cfg = dict(config)
|
factory_cfg = dict(config)
|
||||||
|
|
||||||
# check whether we have a proper model config, or load a default one
|
# check whether we have a proper model config, ignore if the type is wrong
|
||||||
if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
|
if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)
|
Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)
|
||||||
)
|
)
|
||||||
|
|
||||||
# refer to the model configuration in the cfg settings for this component
|
# refer to the model configuration in the cfg settings for this component
|
||||||
if "model" in factory_cfg:
|
elif "model" in factory_cfg:
|
||||||
self.config[name] = {"model": factory_cfg["model"]}
|
self.config[name] = {"model": factory_cfg["model"]}
|
||||||
|
|
||||||
# create all objects in the config
|
# create all objects in the config
|
||||||
|
@ -1089,6 +1089,7 @@ class component(object):
|
||||||
requires=tuple(),
|
requires=tuple(),
|
||||||
retokenizes=False,
|
retokenizes=False,
|
||||||
default_model=lambda: None,
|
default_model=lambda: None,
|
||||||
|
default_config=None,
|
||||||
):
|
):
|
||||||
"""Decorate a pipeline component.
|
"""Decorate a pipeline component.
|
||||||
|
|
||||||
|
@ -1102,6 +1103,7 @@ class component(object):
|
||||||
self.requires = validate_attrs(requires)
|
self.requires = validate_attrs(requires)
|
||||||
self.retokenizes = retokenizes
|
self.retokenizes = retokenizes
|
||||||
self.default_model = default_model
|
self.default_model = default_model
|
||||||
|
self.default_config = default_config
|
||||||
|
|
||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
obj = args[0]
|
obj = args[0]
|
||||||
|
@ -1116,9 +1118,10 @@ class component(object):
|
||||||
def factory(nlp, model, **cfg):
|
def factory(nlp, model, **cfg):
|
||||||
if model is None:
|
if model is None:
|
||||||
model = self.default_model()
|
model = self.default_model()
|
||||||
warnings.warn(Warnings.W098.format(name=self.name))
|
if self.default_config:
|
||||||
if model is None:
|
for key, value in self.default_config.items():
|
||||||
warnings.warn(Warnings.W097.format(name=self.name))
|
if key not in cfg:
|
||||||
|
cfg[key] = value
|
||||||
if hasattr(obj, "from_nlp"):
|
if hasattr(obj, "from_nlp"):
|
||||||
return obj.from_nlp(nlp, model, **cfg)
|
return obj.from_nlp(nlp, model, **cfg)
|
||||||
elif isinstance(obj, type):
|
elif isinstance(obj, type):
|
||||||
|
|
|
@ -3,26 +3,31 @@ import numpy
|
||||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||||
|
|
||||||
|
|
||||||
def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96):
|
def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
|
||||||
|
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
|
||||||
model = chain(
|
model = chain(
|
||||||
tok2vec,
|
tok2vec,
|
||||||
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0),
|
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
|
||||||
LayerNorm(token_vector_width * 2),
|
LayerNorm(token_vector_width * 2),
|
||||||
Softmax(nO=n_tags, nI=token_vector_width * 2),
|
softmax,
|
||||||
)
|
)
|
||||||
|
model.set_ref("tok2vec", tok2vec)
|
||||||
|
model.set_ref("output_layer", softmax)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def build_cloze_multi_task_model(vocab, tok2vec):
|
def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
|
||||||
output_size = vocab.vectors.data.shape[1]
|
# nO = vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
Maxout(
|
Maxout(
|
||||||
nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0
|
nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
|
||||||
),
|
),
|
||||||
Linear(nO=output_size, nI=output_size, init_W=zero_init),
|
Linear(nO=nO, nI=nO, init_W=zero_init),
|
||||||
)
|
)
|
||||||
model = chain(tok2vec, output_layer)
|
model = chain(tok2vec, output_layer)
|
||||||
model = build_masked_language_model(vocab, model)
|
model = build_masked_language_model(vocab, model)
|
||||||
|
model.set_ref("tok2vec", tok2vec)
|
||||||
|
model.set_ref("output_layer", output_layer)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
|
||||||
model.set_ref("output_layer", linear_layer)
|
model.set_ref("output_layer", linear_layer)
|
||||||
model.set_ref("tok2vec", tok2vec)
|
model.set_ref("tok2vec", tok2vec)
|
||||||
model.set_dim("nO", nO)
|
model.set_dim("nO", nO)
|
||||||
|
model.attrs["multi_label"] = not exclusive_classes
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,6 +45,7 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
|
||||||
output_layer = softmax_activation() if exclusive_classes else Logistic()
|
output_layer = softmax_activation() if exclusive_classes else Logistic()
|
||||||
model = model >> with_cpu(output_layer, output_layer.ops)
|
model = model >> with_cpu(output_layer, output_layer.ops)
|
||||||
model.set_ref("output_layer", sparse_linear)
|
model.set_ref("output_layer", sparse_linear)
|
||||||
|
model.attrs["multi_label"] = not exclusive_classes
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
@ -110,6 +112,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
|
||||||
if model.has_dim("nO") is not False:
|
if model.has_dim("nO") is not False:
|
||||||
model.set_dim("nO", nO)
|
model.set_dim("nO", nO)
|
||||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||||
|
model.attrs["multi_label"] = not exclusive_classes
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
15
spacy/pipeline/defaults/multitask_defaults.cfg
Normal file
15
spacy/pipeline/defaults/multitask_defaults.cfg
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.MultiTask.v1"
|
||||||
|
maxout_pieces = 3
|
||||||
|
token_vector_width = 96
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 2
|
||||||
|
subword_features = true
|
||||||
|
dropout = null
|
|
@ -619,9 +619,10 @@ class MultitaskObjective(Tagger):
|
||||||
side-objective.
|
side-objective.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, model, target='dep_tag_offset', **cfg):
|
def __init__(self, vocab, model, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
target = cfg["target"] # default: 'dep_tag_offset'
|
||||||
if target == "dep":
|
if target == "dep":
|
||||||
self.make_label = self.make_dep
|
self.make_label = self.make_dep
|
||||||
elif target == "tag":
|
elif target == "tag":
|
||||||
|
@ -639,8 +640,6 @@ class MultitaskObjective(Tagger):
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E016)
|
raise ValueError(Errors.E016)
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
# TODO: remove - put in config
|
|
||||||
self.cfg.setdefault("maxout_pieces", 2)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -653,7 +652,7 @@ class MultitaskObjective(Tagger):
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||||
sgd=None, **kwargs):
|
sgd=None, **kwargs):
|
||||||
gold_examples = nonproj.preprocess_training_data(get_examples())
|
gold_examples = nonproj.preprocess_training_data(get_examples())
|
||||||
# for raw_text, doc_annot in gold_tuples:
|
# for raw_text, doc_annot in gold_tuples:
|
||||||
|
@ -745,13 +744,13 @@ class ClozeMultitask(Pipe):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self.distance = CosineDistance(ignore_zeros=True, normalize=False)
|
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
|
||||||
|
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||||
tok2vec=None, sgd=None, **kwargs):
|
sgd=None, **kwargs):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||||
|
@ -960,28 +959,27 @@ cdef class DependencyParser(Parser):
|
||||||
output.append(merge_subtokens)
|
output.append(merge_subtokens)
|
||||||
return tuple(output)
|
return tuple(output)
|
||||||
|
|
||||||
def add_multitask_objective(self, target):
|
def add_multitask_objective(self, mt_component):
|
||||||
if target == "cloze":
|
self._multitasks.append(mt_component)
|
||||||
cloze = ClozeMultitask(self.vocab)
|
|
||||||
self._multitasks.append(cloze)
|
|
||||||
else:
|
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
|
||||||
self._multitasks.append(labeller)
|
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||||
|
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
tok2vec = self.model.get_ref("tok2vec")
|
labeller.model.set_dim("nO", len(self.labels))
|
||||||
labeller.begin_training(get_examples, pipeline=pipeline,
|
if labeller.model.has_ref("output_layer"):
|
||||||
tok2vec=tok2vec, sgd=sgd)
|
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
|
||||||
|
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (DependencyParser, (self.vocab, self.model), self.moves)
|
return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg))
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
return self.moves
|
return (self.moves, self.cfg)
|
||||||
|
|
||||||
def __setstate__(self, moves):
|
def __setstate__(self, state):
|
||||||
|
moves, config = state
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
|
self.cfg = config
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -1007,28 +1005,27 @@ cdef class EntityRecognizer(Parser):
|
||||||
requires = []
|
requires = []
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
def add_multitask_objective(self, target):
|
def add_multitask_objective(self, mt_component):
|
||||||
if target == "cloze":
|
self._multitasks.append(mt_component)
|
||||||
cloze = ClozeMultitask(self.vocab)
|
|
||||||
self._multitasks.append(cloze)
|
|
||||||
else:
|
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
|
||||||
self._multitasks.append(labeller)
|
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||||
|
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
tok2vec = self.model.get_ref("tok2vec")
|
labeller.model.set_dim("nO", len(self.labels))
|
||||||
labeller.begin_training(get_examples, pipeline=pipeline,
|
if labeller.model.has_ref("output_layer"):
|
||||||
tok2vec=tok2vec)
|
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
|
||||||
|
labeller.begin_training(get_examples, pipeline=pipeline)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (EntityRecognizer, (self.vocab, self.model), self.moves)
|
return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg))
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
return self.moves
|
return self.moves, self.cfg
|
||||||
|
|
||||||
def __setstate__(self, moves):
|
def __setstate__(self, state):
|
||||||
|
moves, config = state
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
|
self.cfg = config
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -1487,15 +1484,23 @@ Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, mod
|
||||||
Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)
|
Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)
|
||||||
|
|
||||||
def parser_factory(nlp, model, **cfg):
|
def parser_factory(nlp, model, **cfg):
|
||||||
|
default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
if model is None:
|
if model is None:
|
||||||
model = default_parser()
|
model = default_parser()
|
||||||
warnings.warn(Warnings.W098.format(name="parser"))
|
warnings.warn(Warnings.W098.format(name="parser"))
|
||||||
|
for key, value in default_config.items():
|
||||||
|
if key not in cfg:
|
||||||
|
cfg[key] = value
|
||||||
return DependencyParser.from_nlp(nlp, model, **cfg)
|
return DependencyParser.from_nlp(nlp, model, **cfg)
|
||||||
|
|
||||||
def ner_factory(nlp, model, **cfg):
|
def ner_factory(nlp, model, **cfg):
|
||||||
|
default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
if model is None:
|
if model is None:
|
||||||
model = default_ner()
|
model = default_ner()
|
||||||
warnings.warn(Warnings.W098.format(name="ner"))
|
warnings.warn(Warnings.W098.format(name="ner"))
|
||||||
|
for key, value in default_config.items():
|
||||||
|
if key not in cfg:
|
||||||
|
cfg[key] = value
|
||||||
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||||
|
|
||||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||||
|
|
|
@ -172,7 +172,7 @@ class Tok2VecListener(Model):
|
||||||
|
|
||||||
def verify_inputs(self, inputs):
|
def verify_inputs(self, inputs):
|
||||||
if self._batch_id is None and self._outputs is None:
|
if self._batch_id is None and self._outputs is None:
|
||||||
raise ValueError
|
raise ValueError("The Tok2Vec listener did not receive valid input.")
|
||||||
else:
|
else:
|
||||||
batch_id = self.get_batch_id(inputs)
|
batch_id = self.get_batch_id(inputs)
|
||||||
if batch_id != self._batch_id:
|
if batch_id != self._batch_id:
|
||||||
|
|
106
spacy/scorer.py
106
spacy/scorer.py
|
@ -88,24 +88,20 @@ class Scorer(object):
|
||||||
self.ner = PRFScore()
|
self.ner = PRFScore()
|
||||||
self.ner_per_ents = dict()
|
self.ner_per_ents = dict()
|
||||||
self.eval_punct = eval_punct
|
self.eval_punct = eval_punct
|
||||||
self.textcat = None
|
self.textcat = PRFScore()
|
||||||
self.textcat_per_cat = dict()
|
self.textcat_f_per_cat = dict()
|
||||||
|
self.textcat_auc_per_cat = dict()
|
||||||
self.textcat_positive_label = None
|
self.textcat_positive_label = None
|
||||||
self.textcat_multilabel = False
|
self.textcat_multilabel = False
|
||||||
|
|
||||||
if pipeline:
|
if pipeline:
|
||||||
for name, model in pipeline:
|
for name, component in pipeline:
|
||||||
if name == "textcat":
|
if name == "textcat":
|
||||||
self.textcat_positive_label = model.cfg.get("positive_label", None)
|
self.textcat_multilabel = component.model.attrs["multi_label"]
|
||||||
if self.textcat_positive_label:
|
self.textcat_positive_label = component.cfg.get("positive_label", None)
|
||||||
self.textcat = PRFScore()
|
for label in component.cfg.get("labels", []):
|
||||||
if not model.cfg.get("exclusive_classes", False):
|
self.textcat_auc_per_cat[label] = ROCAUCScore()
|
||||||
self.textcat_multilabel = True
|
self.textcat_f_per_cat[label] = PRFScore()
|
||||||
for label in model.cfg.get("labels", []):
|
|
||||||
self.textcat_per_cat[label] = ROCAUCScore()
|
|
||||||
else:
|
|
||||||
for label in model.cfg.get("labels", []):
|
|
||||||
self.textcat_per_cat[label] = PRFScore()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tags_acc(self):
|
def tags_acc(self):
|
||||||
|
@ -207,46 +203,52 @@ class Scorer(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def textcat_score(self):
|
def textcat_f(self):
|
||||||
"""RETURNS (float): f-score on positive label for binary exclusive,
|
"""RETURNS (float): f-score on positive label for binary classification,
|
||||||
macro-averaged f-score for 3+ exclusive,
|
macro-averaged f-score for multilabel classification
|
||||||
macro-averaged AUC ROC score for multilabel (-1 if undefined)
|
|
||||||
"""
|
"""
|
||||||
if not self.textcat_multilabel:
|
if not self.textcat_multilabel:
|
||||||
# binary multiclass
|
|
||||||
if self.textcat_positive_label:
|
if self.textcat_positive_label:
|
||||||
|
# binary classification
|
||||||
return self.textcat.fscore * 100
|
return self.textcat.fscore * 100
|
||||||
# other multiclass
|
# multi-class and/or multi-label
|
||||||
return (
|
return (
|
||||||
sum([score.fscore for label, score in self.textcat_per_cat.items()])
|
sum([score.fscore for label, score in self.textcat_f_per_cat.items()])
|
||||||
/ (len(self.textcat_per_cat) + 1e-100)
|
/ (len(self.textcat_f_per_cat) + 1e-100)
|
||||||
* 100
|
* 100
|
||||||
)
|
)
|
||||||
# multilabel
|
|
||||||
|
@property
|
||||||
|
def textcat_auc(self):
|
||||||
|
"""RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined)
|
||||||
|
"""
|
||||||
return max(
|
return max(
|
||||||
sum([score.score for label, score in self.textcat_per_cat.items()])
|
sum([score.score for label, score in self.textcat_auc_per_cat.items()])
|
||||||
/ (len(self.textcat_per_cat) + 1e-100),
|
/ (len(self.textcat_auc_per_cat) + 1e-100),
|
||||||
-1,
|
-1,
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def textcats_per_cat(self):
|
def textcats_auc_per_cat(self):
|
||||||
"""RETURNS (dict): Scores per textcat label.
|
"""RETURNS (dict): AUC ROC Scores per textcat label.
|
||||||
"""
|
"""
|
||||||
if not self.textcat_multilabel:
|
|
||||||
return {
|
|
||||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
|
||||||
for k, v in self.textcat_per_cat.items()
|
|
||||||
}
|
|
||||||
return {
|
return {
|
||||||
k: {"roc_auc_score": max(v.score, -1)}
|
k: {"roc_auc_score": max(v.score, -1)}
|
||||||
for k, v in self.textcat_per_cat.items()
|
for k, v in self.textcat_auc_per_cat.items()
|
||||||
|
}
|
||||||
|
|
||||||
|
@property
|
||||||
|
def textcats_f_per_cat(self):
|
||||||
|
"""RETURNS (dict): F-scores per textcat label.
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||||
|
for k, v in self.textcat_f_per_cat.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def scores(self):
|
def scores(self):
|
||||||
"""RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`,
|
"""RETURNS (dict): All scores mapped by key.
|
||||||
`ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`.
|
|
||||||
"""
|
"""
|
||||||
return {
|
return {
|
||||||
"uas": self.uas,
|
"uas": self.uas,
|
||||||
|
@ -264,8 +266,10 @@ class Scorer(object):
|
||||||
"sent_r": self.sent_r,
|
"sent_r": self.sent_r,
|
||||||
"sent_f": self.sent_f,
|
"sent_f": self.sent_f,
|
||||||
"token_acc": self.token_acc,
|
"token_acc": self.token_acc,
|
||||||
"textcat_score": self.textcat_score,
|
"textcat_f": self.textcat_f,
|
||||||
"textcats_per_cat": self.textcats_per_cat,
|
"textcat_auc": self.textcat_auc,
|
||||||
|
"textcats_f_per_cat": self.textcats_f_per_cat,
|
||||||
|
"textcats_auc_per_cat": self.textcats_auc_per_cat,
|
||||||
}
|
}
|
||||||
|
|
||||||
def score(self, example, verbose=False, punct_labels=("p", "punct")):
|
def score(self, example, verbose=False, punct_labels=("p", "punct")):
|
||||||
|
@ -408,7 +412,7 @@ class Scorer(object):
|
||||||
)
|
)
|
||||||
if (
|
if (
|
||||||
len(gold.cats) > 0
|
len(gold.cats) > 0
|
||||||
and set(self.textcat_per_cat) == set(gold.cats)
|
and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
|
||||||
and set(gold.cats) == set(doc.cats)
|
and set(gold.cats) == set(doc.cats)
|
||||||
):
|
):
|
||||||
goldcat = max(gold.cats, key=gold.cats.get)
|
goldcat = max(gold.cats, key=gold.cats.get)
|
||||||
|
@ -418,17 +422,21 @@ class Scorer(object):
|
||||||
set([self.textcat_positive_label]) & set([candcat]),
|
set([self.textcat_positive_label]) & set([candcat]),
|
||||||
set([self.textcat_positive_label]) & set([goldcat]),
|
set([self.textcat_positive_label]) & set([goldcat]),
|
||||||
)
|
)
|
||||||
for label in self.textcat_per_cat:
|
for label in set(gold.cats):
|
||||||
if self.textcat_multilabel:
|
self.textcat_auc_per_cat[label].score_set(
|
||||||
self.textcat_per_cat[label].score_set(
|
|
||||||
doc.cats[label], gold.cats[label]
|
doc.cats[label], gold.cats[label]
|
||||||
)
|
)
|
||||||
else:
|
self.textcat_f_per_cat[label].score_set(
|
||||||
self.textcat_per_cat[label].score_set(
|
|
||||||
set([label]) & set([candcat]), set([label]) & set([goldcat])
|
set([label]) & set([candcat]), set([label]) & set([goldcat])
|
||||||
)
|
)
|
||||||
elif len(self.textcat_per_cat) > 0:
|
elif len(self.textcat_f_per_cat) > 0:
|
||||||
model_labels = set(self.textcat_per_cat)
|
model_labels = set(self.textcat_f_per_cat)
|
||||||
|
eval_labels = set(gold.cats)
|
||||||
|
raise ValueError(
|
||||||
|
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
|
||||||
|
)
|
||||||
|
elif len(self.textcat_auc_per_cat) > 0:
|
||||||
|
model_labels = set(self.textcat_auc_per_cat)
|
||||||
eval_labels = set(gold.cats)
|
eval_labels = set(gold.cats)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
|
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
|
||||||
|
|
|
@ -64,15 +64,14 @@ cdef class Parser:
|
||||||
# defined by EntityRecognizer as a BiluoPushDown
|
# defined by EntityRecognizer as a BiluoPushDown
|
||||||
moves = self.TransitionSystem(self.vocab.strings)
|
moves = self.TransitionSystem(self.vocab.strings)
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
cfg.setdefault('min_action_freq', 30)
|
|
||||||
cfg.setdefault('learn_tokens', False)
|
|
||||||
cfg.setdefault('beam_width', 1)
|
|
||||||
cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used)
|
|
||||||
self.model = model
|
self.model = model
|
||||||
if self.moves.n_moves != 0:
|
if self.moves.n_moves != 0:
|
||||||
self.set_output(self.moves.n_moves)
|
self.set_output(self.moves.n_moves)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
|
for multitask in cfg.get("multitasks", []):
|
||||||
|
self.add_multitask_objective(multitask)
|
||||||
|
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -80,13 +79,15 @@ cdef class Parser:
|
||||||
return cls(nlp.vocab, model, **cfg)
|
return cls(nlp.vocab, model, **cfg)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.model), self.moves)
|
return (Parser, (self.vocab, self.model), (self.moves, self.cfg))
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
return self.moves
|
return (self.moves, self.cfg)
|
||||||
|
|
||||||
def __setstate__(self, moves):
|
def __setstate__(self, state):
|
||||||
|
moves, config = state
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
|
self.cfg = config
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
|
|
|
@ -9,7 +9,8 @@ from spacy.pipeline.defaults import default_ner
|
||||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
text = ["This", "is", "a", "lion"]
|
text = ["This", "is", "a", "lion"]
|
||||||
doc = get_doc(en_vocab, text)
|
doc = get_doc(en_vocab, text)
|
||||||
ner = EntityRecognizer(en_vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner = EntityRecognizer(en_vocab, default_ner(), **config)
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
assert len(list(doc.ents)) == 0
|
assert len(list(doc.ents)) == 0
|
||||||
|
@ -25,7 +26,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
def test_ents_reset(en_vocab):
|
def test_ents_reset(en_vocab):
|
||||||
text = ["This", "is", "a", "lion"]
|
text = ["This", "is", "a", "lion"]
|
||||||
doc = get_doc(en_vocab, text)
|
doc = get_doc(en_vocab, text)
|
||||||
ner = EntityRecognizer(en_vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner = EntityRecognizer(en_vocab, default_ner(), **config)
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||||
|
|
|
@ -17,7 +17,8 @@ def vocab():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = DependencyParser(vocab, default_parser())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
parser = DependencyParser(vocab, default_parser(), **config)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,12 +62,13 @@ def test_add_label(parser):
|
||||||
|
|
||||||
|
|
||||||
def test_add_label_deserializes_correctly():
|
def test_add_label_deserializes_correctly():
|
||||||
ner1 = EntityRecognizer(Vocab(), default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
|
||||||
ner1.add_label("C")
|
ner1.add_label("C")
|
||||||
ner1.add_label("B")
|
ner1.add_label("B")
|
||||||
ner1.add_label("A")
|
ner1.add_label("A")
|
||||||
ner1.begin_training([])
|
ner1.begin_training([])
|
||||||
ner2 = EntityRecognizer(Vocab(), default_ner())
|
ner2 = EntityRecognizer(Vocab(), default_ner(), **config)
|
||||||
|
|
||||||
# the second model needs to be resized before we can call from_bytes
|
# the second model needs to be resized before we can call from_bytes
|
||||||
ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
|
ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
|
||||||
|
|
|
@ -138,7 +138,8 @@ def test_get_oracle_actions():
|
||||||
deps.append(dep)
|
deps.append(dep)
|
||||||
ents.append(ent)
|
ents.append(ent)
|
||||||
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
||||||
parser = DependencyParser(doc.vocab, default_parser())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
parser = DependencyParser(doc.vocab, default_parser(), **config)
|
||||||
parser.moves.add_action(0, "")
|
parser.moves.add_action(0, "")
|
||||||
parser.moves.add_action(1, "")
|
parser.moves.add_action(1, "")
|
||||||
parser.moves.add_action(1, "")
|
parser.moves.add_action(1, "")
|
||||||
|
|
|
@ -138,7 +138,8 @@ def test_accept_blocked_token():
|
||||||
# 1. test normal behaviour
|
# 1. test normal behaviour
|
||||||
nlp1 = English()
|
nlp1 = English()
|
||||||
doc1 = nlp1("I live in New York")
|
doc1 = nlp1("I live in New York")
|
||||||
ner1 = EntityRecognizer(doc1.vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
|
||||||
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
||||||
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
||||||
|
|
||||||
|
@ -156,7 +157,8 @@ def test_accept_blocked_token():
|
||||||
# 2. test blocking behaviour
|
# 2. test blocking behaviour
|
||||||
nlp2 = English()
|
nlp2 = English()
|
||||||
doc2 = nlp2("I live in New York")
|
doc2 = nlp2("I live in New York")
|
||||||
ner2 = EntityRecognizer(doc2.vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
|
||||||
|
|
||||||
# set "New York" to a blocked entity
|
# set "New York" to a blocked entity
|
||||||
doc2.ents = [(0, 3, 5)]
|
doc2.ents = [(0, 3, 5)]
|
||||||
|
@ -213,7 +215,8 @@ def test_overwrite_token():
|
||||||
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
||||||
|
|
||||||
# Check that a new ner can overwrite O
|
# Check that a new ner can overwrite O
|
||||||
ner2 = EntityRecognizer(doc.vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
|
||||||
ner2.moves.add_action(5, "")
|
ner2.moves.add_action(5, "")
|
||||||
ner2.add_label("GPE")
|
ner2.add_label("GPE")
|
||||||
state = ner2.moves.init_batch([doc])[0]
|
state = ner2.moves.init_batch([doc])[0]
|
||||||
|
|
|
@ -28,7 +28,8 @@ def tok2vec():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab, arc_eager):
|
def parser(vocab, arc_eager):
|
||||||
return Parser(vocab, model=default_parser(), moves=arc_eager)
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
return Parser(vocab, model=default_parser(), moves=arc_eager, **config)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
@ -94,7 +94,8 @@ def test_beam_advance_too_few_scores(beam, scores):
|
||||||
|
|
||||||
def test_beam_parse():
|
def test_beam_parse():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser")
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
|
||||||
nlp.parser.add_label("nsubj")
|
nlp.parser.add_label("nsubj")
|
||||||
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
||||||
doc = nlp.make_doc("Australia is a country")
|
doc = nlp.make_doc("Australia is a country")
|
||||||
|
|
|
@ -15,7 +15,8 @@ def vocab():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = DependencyParser(vocab, default_parser())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
parser = DependencyParser(vocab, default_parser(), **config)
|
||||||
parser.cfg["token_vector_width"] = 4
|
parser.cfg["token_vector_width"] = 4
|
||||||
parser.cfg["hidden_width"] = 32
|
parser.cfg["hidden_width"] = 32
|
||||||
# parser.add_label('right')
|
# parser.add_label('right')
|
||||||
|
|
|
@ -270,7 +270,8 @@ def test_issue1963(en_tokenizer):
|
||||||
|
|
||||||
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
||||||
def test_issue1967(label):
|
def test_issue1967(label):
|
||||||
ner = EntityRecognizer(Vocab(), default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner = EntityRecognizer(Vocab(), default_ner(), **config)
|
||||||
example = Example(
|
example = Example(
|
||||||
doc=Doc(ner.vocab, words=["word"]),
|
doc=Doc(ner.vocab, words=["word"]),
|
||||||
token_annotation=TokenAnnotation(
|
token_annotation=TokenAnnotation(
|
||||||
|
|
|
@ -196,7 +196,8 @@ def test_issue3345():
|
||||||
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
||||||
doc[4].is_sent_start = True
|
doc[4].is_sent_start = True
|
||||||
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
|
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
|
||||||
ner = EntityRecognizer(doc.vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner = EntityRecognizer(doc.vocab, default_ner(), **config)
|
||||||
# Add the OUT action. I wouldn't have thought this would be necessary...
|
# Add the OUT action. I wouldn't have thought this would be necessary...
|
||||||
ner.moves.add_action(5, "")
|
ner.moves.add_action(5, "")
|
||||||
ner.add_label("GPE")
|
ner.add_label("GPE")
|
||||||
|
|
|
@ -6,7 +6,8 @@ from spacy.pipeline.defaults import default_parser
|
||||||
|
|
||||||
def test_issue3830_no_subtok():
|
def test_issue3830_no_subtok():
|
||||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||||
parser = DependencyParser(Vocab(), default_parser())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
parser.begin_training(lambda: [])
|
parser.begin_training(lambda: [])
|
||||||
|
@ -15,7 +16,8 @@ def test_issue3830_no_subtok():
|
||||||
|
|
||||||
def test_issue3830_with_subtok():
|
def test_issue3830_with_subtok():
|
||||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||||
parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True)
|
config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
parser.begin_training(lambda: [])
|
parser.begin_training(lambda: [])
|
||||||
|
|
|
@ -74,6 +74,7 @@ def test_issue4042_bug2():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
ner1.to_disk(output_dir)
|
ner1.to_disk(output_dir)
|
||||||
|
|
||||||
ner2 = EntityRecognizer(vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||||
ner2.from_disk(output_dir)
|
ner2.from_disk(output_dir)
|
||||||
assert len(ner2.labels) == 2
|
assert len(ner2.labels) == 2
|
||||||
|
|
|
@ -12,7 +12,8 @@ def test_issue4313():
|
||||||
beam_width = 16
|
beam_width = 16
|
||||||
beam_density = 0.0001
|
beam_density = 0.0001
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = EntityRecognizer(nlp.vocab, default_ner())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
||||||
ner.add_label("SOME_LABEL")
|
ner.add_label("SOME_LABEL")
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
nlp.add_pipe(ner)
|
nlp.add_pipe(ner)
|
||||||
|
|
|
@ -1,12 +1,30 @@
|
||||||
import pytest
|
import pickle
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from spacy.tests.util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_pickle_ner():
|
||||||
|
""" Ensure the pickling of the NER goes well"""
|
||||||
|
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||||
|
nlp = English(vocab=vocab)
|
||||||
|
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
||||||
|
with make_tempdir() as tmp_path:
|
||||||
|
with (tmp_path / "ner.pkl").open("wb") as file_:
|
||||||
|
pickle.dump(ner, file_)
|
||||||
|
assert ner.cfg["min_action_freq"] == 342
|
||||||
|
|
||||||
|
with (tmp_path / "ner.pkl").open("rb") as file_:
|
||||||
|
ner2 = pickle.load(file_)
|
||||||
|
assert ner2.cfg["min_action_freq"] == 342
|
||||||
|
|
||||||
|
|
||||||
def test_issue4725():
|
def test_issue4725():
|
||||||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||||
|
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
||||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||||
data = numpy.ndarray((5, 3), dtype="f")
|
data = numpy.ndarray((5, 3), dtype="f")
|
||||||
data[0] = 1.0
|
data[0] = 1.0
|
||||||
|
|
|
@ -12,7 +12,8 @@ test_parsers = [DependencyParser, EntityRecognizer]
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(en_vocab):
|
def parser(en_vocab):
|
||||||
parser = DependencyParser(en_vocab, default_parser())
|
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||||
|
parser = DependencyParser(en_vocab, default_parser(), **config)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
|
@ -186,7 +186,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
||||||
return nlp.from_disk(model_path, exclude=disable)
|
return nlp.from_disk(model_path, exclude=disable)
|
||||||
|
|
||||||
|
|
||||||
def load_model_from_config(nlp_config):
|
def load_model_from_config(nlp_config, replace=False):
|
||||||
if "name" in nlp_config:
|
if "name" in nlp_config:
|
||||||
nlp = load_model(**nlp_config)
|
nlp = load_model(**nlp_config)
|
||||||
elif "lang" in nlp_config:
|
elif "lang" in nlp_config:
|
||||||
|
@ -197,8 +197,15 @@ def load_model_from_config(nlp_config):
|
||||||
if "pipeline" in nlp_config:
|
if "pipeline" in nlp_config:
|
||||||
for name, component_cfg in nlp_config["pipeline"].items():
|
for name, component_cfg in nlp_config["pipeline"].items():
|
||||||
factory = component_cfg.pop("factory")
|
factory = component_cfg.pop("factory")
|
||||||
component = nlp.create_pipe(factory, config=component_cfg)
|
if name in nlp.pipe_names:
|
||||||
nlp.add_pipe(component, name=name)
|
if replace:
|
||||||
|
component = nlp.create_pipe(factory, config=component_cfg)
|
||||||
|
nlp.replace_pipe(name, component)
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E985.format(component=name))
|
||||||
|
else:
|
||||||
|
component = nlp.create_pipe(factory, config=component_cfg)
|
||||||
|
nlp.add_pipe(component, name=name)
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -46,17 +46,19 @@ Update the evaluation scores from a single [`Doc`](/api/doc) /
|
||||||
|
|
||||||
## Properties
|
## Properties
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------------------------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------- |
|
||||||
| `token_acc` | float | Tokenization accuracy. |
|
| `token_acc` | float | Tokenization accuracy. |
|
||||||
| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
|
| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
|
||||||
| `uas` | float | Unlabelled dependency score. |
|
| `uas` | float | Unlabelled dependency score. |
|
||||||
| `las` | float | Labelled dependency score. |
|
| `las` | float | Labelled dependency score. |
|
||||||
| `ents_p` | float | Named entity accuracy (precision). |
|
| `ents_p` | float | Named entity accuracy (precision). |
|
||||||
| `ents_r` | float | Named entity accuracy (recall). |
|
| `ents_r` | float | Named entity accuracy (recall). |
|
||||||
| `ents_f` | float | Named entity accuracy (F-score). |
|
| `ents_f` | float | Named entity accuracy (F-score). |
|
||||||
| `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
|
| `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
|
||||||
| `textcat_score` <Tag variant="new">2.2</Tag> | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). |
|
| `textcat_f` <Tag variant="new">3.0</Tag> | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. |
|
||||||
| `textcats_per_cat` <Tag variant="new">2.2</Tag> | dict | Scores per textcat label, keyed by label. |
|
| `textcat_auc` <Tag variant="new">3.0</Tag> | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). |
|
||||||
| `las_per_type` <Tag variant="new">2.2.3</Tag> | dict | Labelled dependency scores, keyed by label. |
|
| `textcats_f_per_cat` <Tag variant="new">3.0</Tag> | dict | F-scores per textcat label, keyed by label. |
|
||||||
| `scores` | dict | All scores, keyed by type. |
|
| `textcats_auc_per_cat` <Tag variant="new">3.0</Tag> | dict | ROC AUC scores per textcat label, keyed by label. |
|
||||||
|
| `las_per_type` <Tag variant="new">2.2.3</Tag> | dict | Labelled dependency scores, keyed by label. |
|
||||||
|
| `scores` | dict | All scores, keyed by type. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user