Merge from develop

Matthew Honnibal 2020-06-14 17:35:01 +02:00
commit 706e652820
34 changed files with 513 additions and 960 deletions

View File

@@ -9,6 +9,7 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
+noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -24,8 +25,8 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-vectors = null
 discard_oversize = false
+omit_extra_lookups = false

 [training.batch_size]
 @schedules = "compounding.v1"
@@ -52,7 +53,7 @@ learn_rate = 0.001

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@@ -62,12 +63,20 @@ factory = "senter"

 [nlp.pipeline.ner]
 factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger]
 factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.senter.model]
 @architectures = "spacy.Tagger.v1"

View File

@@ -9,6 +9,7 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
+noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -24,7 +25,6 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-vectors = null
 discard_oversize = false

 [training.batch_size]
@@ -72,7 +72,7 @@ normalize = true

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@@ -82,12 +82,20 @@ factory = "senter"

 [nlp.pipeline.ner]
 factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger]
 factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.senter.model]
 @architectures = "spacy.Tagger.v1"

View File

@@ -6,6 +6,7 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
+noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0
@@ -40,6 +41,10 @@ factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger.model]
 @architectures = "spacy.Tagger.v1"

View File

@@ -6,6 +6,7 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
+noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1
@@ -40,6 +41,10 @@ factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger.model]
 @architectures = "spacy.Tagger.v1"

View File

@@ -120,13 +120,22 @@ def load_data(dataset, threshold, limit=0, split=0.8):
     random.shuffle(train_data)
     texts, labels = zip(*train_data)
-    unique_labels = sorted(set([l for label_set in labels for l in label_set]))
+    unique_labels = set()
+    for label_set in labels:
+        if isinstance(label_set, int) or isinstance(label_set, str):
+            unique_labels.add(label_set)
+        elif isinstance(label_set, list) or isinstance(label_set, set):
+            unique_labels.update(label_set)
+    unique_labels = sorted(unique_labels)
     print(f"# of unique_labels: {len(unique_labels)}")
     count_values_train = dict()
     for text, annot_list in train_data:
-        for annot in annot_list:
-            count_values_train[annot] = count_values_train.get(annot, 0) + 1
+        if isinstance(annot_list, int) or isinstance(annot_list, str):
+            count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
+        else:
+            for annot in annot_list:
+                count_values_train[annot] = count_values_train.get(annot, 0) + 1
     for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
         if count < threshold:
             unique_labels.remove(value)
@@ -138,7 +147,7 @@ def load_data(dataset, threshold, limit=0, split=0.8):
     else:
         cats = []
         for y in labels:
-            if isinstance(y, str):
+            if isinstance(y, str) or isinstance(y, int):
                 cats.append({str(label): (label == y) for label in unique_labels})
             elif isinstance(y, set):
                 cats.append({str(label): (label in y) for label in unique_labels})
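
The revised load_data accepts per-example annotations given as a single int or str as well as a list or set of labels. A minimal standalone sketch of that normalization (sample data invented; key=str is only added here so mixed int/str labels sort without a TypeError):

    def collect_unique_labels(labels):
        # Each example's annotation may be one label (int/str) or several (list/set).
        unique_labels = set()
        for label_set in labels:
            if isinstance(label_set, (int, str)):
                unique_labels.add(label_set)
            elif isinstance(label_set, (list, set)):
                unique_labels.update(label_set)
        return sorted(unique_labels, key=str)

    print(collect_unique_labels(["spam", "ham", ["spam", "eggs"], {"ham"}]))
    # ['eggs', 'ham', 'spam']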

View File

@@ -54,7 +54,8 @@ def evaluate(
         "NER P": f"{scorer.ents_p:.2f}",
         "NER R": f"{scorer.ents_r:.2f}",
         "NER F": f"{scorer.ents_f:.2f}",
-        "Textcat": f"{scorer.textcat_score:.2f}",
+        "Textcat AUC": f"{scorer.textcat_auc:.2f}",
+        "Textcat F": f"{scorer.textcat_f:.2f}",
         "Sent P": f"{scorer.sent_p:.2f}",
         "Sent R": f"{scorer.sent_r:.2f}",
         "Sent F": f"{scorer.sent_f:.2f}",

View File

@@ -266,17 +266,15 @@ def create_pretraining_model(nlp, tok2vec):
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
+    The actual tok2vec layer is stored as a reference, and only this bit will be
+    serialized to file and read back in when calling the 'train' command.
     """
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = chain(
         Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
     )
-    # This is annoying, but the parser etc have the flatten step after
-    # the tok2vec. To load the weights in cleanly, we need to match
-    # the shape of the models' components exactly. So what we cann
-    # "tok2vec" has to be the same set of processes as what the components do.
-    tok2vec = chain(tok2vec, list2array())
-    model = chain(tok2vec, output_layer)
+    model = chain(tok2vec, list2array())
+    model = chain(model, output_layer)
     model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
     mlm_model = build_masked_language_model(nlp.vocab, model)
     mlm_model.set_ref("tok2vec", tok2vec)
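
The rewired head keeps the raw tok2vec layer as the serialized 'tok2vec' reference and applies the flatten step outside of it. A rough thinc sketch of the same wiring, assuming tok2vec maps a list of Docs to a list of per-token arrays (the layer sizes are simply the ones shown above):

    from thinc.api import chain, list2array, Maxout, Linear

    def sketch_pretraining_head(tok2vec, output_size):
        # Maxout -> Linear head over the token vectors.
        output_layer = chain(
            Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
        )
        # list2array concatenates the per-doc arrays into one (n_tokens, width) array,
        # so only the plain tok2vec layer needs to be saved and restored by 'train'.
        model = chain(tok2vec, list2array())
        return chain(model, output_layer)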

View File

@ -1,773 +0,0 @@
import os
import tqdm
from pathlib import Path
from thinc.api import use_ops
from timeit import default_timer as timer
import shutil
import srsly
from wasabi import msg
import contextlib
import random
from ..util import create_default_optimizer
from ..util import use_gpu as set_gpu
from ..gold import GoldCorpus
from ..lookups import Lookups
from .. import util
from .. import about
def train(
# fmt: off
lang: ("Model language", "positional", None, str),
output_path: ("Output directory to store model in", "positional", None, Path),
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
vectors: ("Model to load vectors from", "option", "v", str) = None,
replace_components: ("Replace components from base model", "flag", "R", bool) = False,
n_iter: ("Number of iterations", "option", "n", int) = 30,
n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
n_examples: ("Number of examples", "option", "ns", int) = 0,
use_gpu: ("Use GPU", "option", "g", int) = -1,
version: ("Model version", "option", "V", str) = "0.0.0",
meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
verbose: ("Display more information for debug", "flag", "VV", bool) = False,
debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
# fmt: on
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
"""
util.fix_random_seed()
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
train_path = util.ensure_path(train_path)
dev_path = util.ensure_path(dev_path)
meta_path = util.ensure_path(meta_path)
output_path = util.ensure_path(output_path)
if raw_text is not None:
raw_text = list(srsly.read_jsonl(raw_text))
if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path) if meta_path else {}
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
msg.warn(
"Output directory is not empty",
"This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.",
)
if not output_path.exists():
output_path.mkdir()
msg.good(f"Created output directory: {output_path}")
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.2),
util.env_opt("dropout_to", 0.2),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 100.0),
util.env_opt("batch_to", 1000.0),
util.env_opt("batch_compound", 1.001),
)
if not eval_beam_widths:
eval_beam_widths = [1]
else:
eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
if 1 not in eval_beam_widths:
eval_beam_widths.append(1)
eval_beam_widths.sort()
has_beam_widths = eval_beam_widths != [1]
default_dir = Path(__file__).parent.parent / "pipeline" / "defaults"
# Set up the base model and pipeline. If a base model is specified, load
# the model and make sure the pipeline matches the pipeline setting. If
# training starts from a blank model, intitalize the language class.
pipeline = [p.strip() for p in pipeline.split(",")]
msg.text(f"Training pipeline: {pipeline}")
disabled_pipes = None
pipes_added = False
if use_gpu >= 0:
activated_gpu = None
try:
activated_gpu = set_gpu(use_gpu)
except Exception as e:
msg.warn(f"Exception: {e}")
if activated_gpu is not None:
msg.text(f"Using GPU: {use_gpu}")
else:
msg.warn(f"Unable to activate GPU: {use_gpu}")
msg.text("Using CPU only")
use_gpu = -1
if base_model:
msg.text(f"Starting with base model '{base_model}'")
nlp = util.load_model(base_model)
if nlp.lang != lang:
msg.fail(
f"Model language ('{nlp.lang}') doesn't match language "
f"specified as `lang` argument ('{lang}') ",
exits=1,
)
if vectors:
msg.text(f"Loading vectors from model '{vectors}'")
_load_vectors(nlp, vectors)
nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline])
for pipe in pipeline:
# first, create the model.
# Bit of a hack after the refactor to get the vectors into a default config
# use train-from-config instead :-)
if pipe == "parser":
config_loc = default_dir / "parser_defaults.cfg"
elif pipe == "tagger":
config_loc = default_dir / "tagger_defaults.cfg"
elif pipe == "ner":
config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
config_loc = default_dir / "textcat_defaults.cfg"
elif pipe == "senter":
config_loc = default_dir / "senter_defaults.cfg"
else:
raise ValueError(f"Component {pipe} currently not supported.")
pipe_cfg = util.load_config(config_loc, create_objects=False)
if vectors:
pretrained_config = {
"@architectures": "spacy.VocabVectors.v1",
"name": vectors,
}
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
if pipe == "parser":
pipe_cfg["learn_tokens"] = learn_tokens
elif pipe == "textcat":
pipe_cfg["exclusive_classes"] = not textcat_multilabel
pipe_cfg["architecture"] = textcat_arch
pipe_cfg["positive_label"] = textcat_positive_label
if pipe not in nlp.pipe_names:
msg.text(f"Adding component to base model '{pipe}'")
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
elif replace_components:
msg.text(f"Replacing component from base model '{pipe}'")
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
else:
if pipe == "textcat":
textcat_cfg = nlp.get_pipe("textcat").cfg
base_cfg = {
"exclusive_classes": textcat_cfg["exclusive_classes"],
"architecture": textcat_cfg["architecture"],
"positive_label": textcat_cfg["positive_label"],
}
if base_cfg != pipe_cfg:
msg.fail(
f"The base textcat model configuration does"
f"not match the provided training options. "
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
exits=1,
)
msg.text(f"Extending component from base model '{pipe}'")
disabled_pipes = nlp.select_pipes(
disable=[p for p in nlp.pipe_names if p not in pipeline]
)
else:
msg.text(f"Starting with blank model '{lang}'")
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
if vectors:
msg.text(f"Loading vectors from model '{vectors}'")
_load_vectors(nlp, vectors)
for pipe in pipeline:
# first, create the model.
# Bit of a hack after the refactor to get the vectors into a default config
# use train-from-config instead :-)
if pipe == "parser":
config_loc = default_dir / "parser_defaults.cfg"
elif pipe == "tagger":
config_loc = default_dir / "tagger_defaults.cfg"
elif pipe == "morphologizer":
config_loc = default_dir / "morphologizer_defaults.cfg"
elif pipe == "ner":
config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
config_loc = default_dir / "textcat_defaults.cfg"
elif pipe == "senter":
config_loc = default_dir / "senter_defaults.cfg"
else:
raise ValueError(f"Component {pipe} currently not supported.")
pipe_cfg = util.load_config(config_loc, create_objects=False)
if vectors:
pretrained_config = {
"@architectures": "spacy.VocabVectors.v1",
"name": vectors,
}
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
if pipe == "parser":
pipe_cfg["learn_tokens"] = learn_tokens
elif pipe == "textcat":
pipe_cfg["exclusive_classes"] = not textcat_multilabel
pipe_cfg["architecture"] = textcat_arch
pipe_cfg["positive_label"] = textcat_positive_label
pipe = nlp.create_pipe(pipe, config=pipe_cfg)
nlp.add_pipe(pipe)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
if omit_extra_lookups:
nlp.vocab.lookups_extra = Lookups()
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
nlp.vocab.lookups_extra.add_table("lexeme_prob")
nlp.vocab.lookups_extra.add_table("lexeme_settings")
if vectors:
msg.text("Loading vector from model '{}'".format(vectors))
_load_vectors(nlp, vectors)
# Multitask objectives
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
for pipe_name, multitasks in multitask_options:
if multitasks:
if pipe_name not in pipeline:
msg.fail(
f"Can't use multitask objective without '{pipe_name}' in "
f"the pipeline"
)
pipe = nlp.get_pipe(pipe_name)
for objective in multitasks.split(","):
pipe.add_multitask_objective(objective)
# Prepare training corpus
msg.text(f"Counting training words (limit={n_examples})")
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
n_train_words = corpus.count_train()
if base_model and not pipes_added:
# Start with an existing model, use default optimizer
optimizer = create_default_optimizer()
else:
# Start with a blank model, call begin_training
cfg = {"device": use_gpu}
optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
nlp._optimizer = None
# Load in pretrained weights (TODO: this may be broken in the config rewrite)
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text(f"Loaded pretrained tok2vec for: {components}")
# Verify textcat config
if "textcat" in pipeline:
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
if textcat_positive_label and textcat_positive_label not in textcat_labels:
msg.fail(
f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
f"does not match any label in the training data.",
exits=1,
)
if textcat_positive_label and len(textcat_labels) != 2:
msg.fail(
"A textcat_positive_label (tpl) '{textcat_positive_label}' was "
"provided for training data that does not appear to be a "
"binary classification problem with two labels.",
exits=1,
)
train_data = corpus.train_data(
nlp,
noise_level=noise_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True,
)
train_labels = set()
if textcat_multilabel:
multilabel_found = False
for ex in train_data:
train_labels.update(ex.gold.cats.keys())
if list(ex.gold.cats.values()).count(1.0) != 1:
multilabel_found = True
if not multilabel_found and not base_model:
msg.warn(
"The textcat training instances look like they have "
"mutually-exclusive classes. Remove the flag "
"'--textcat-multilabel' to train a classifier with "
"mutually-exclusive classes."
)
if not textcat_multilabel:
for ex in train_data:
train_labels.update(ex.gold.cats.keys())
if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model:
msg.warn(
"Some textcat training instances do not have exactly "
"one positive label. Modifying training options to "
"include the flag '--textcat-multilabel' for classes "
"that are not mutually exclusive."
)
nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
textcat_multilabel = True
break
if base_model and set(textcat_labels) != train_labels:
msg.fail(
f"Cannot extend textcat model using data with different "
f"labels. Base model labels: {textcat_labels}, training data "
f"labels: {list(train_labels)}",
exits=1,
)
if textcat_multilabel:
msg.text(
f"Textcat evaluation score: ROC AUC score macro-averaged across "
f"the labels '{', '.join(textcat_labels)}'"
)
elif textcat_positive_label and len(textcat_labels) == 2:
msg.text(
f"Textcat evaluation score: F1-score for the "
f"label '{textcat_positive_label}'"
)
elif len(textcat_labels) > 1:
if len(textcat_labels) == 2:
msg.warn(
"If the textcat component is a binary classifier with "
"exclusive classes, provide '--textcat-positive-label' for "
"an evaluation on the positive class."
)
msg.text(
f"Textcat evaluation score: F1-score macro-averaged across "
f"the labels '{', '.join(textcat_labels)}'"
)
else:
msg.fail(
"Unsupported textcat configuration. Use `spacy debug-data` "
"for more information."
)
# fmt: off
row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
row_widths = [len(w) for w in row_head]
row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
# fmt: on
print("")
msg.row(row_head, **row_settings)
msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
try:
iter_since_best = 0
best_score = 0.0
for i in range(n_iter):
train_data = corpus.train_dataset(
nlp,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True,
)
if raw_text:
random.shuffle(raw_text)
raw_batches = util.minibatch(
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in util.minibatch_by_words(train_data, size=batch_sizes):
if not batch:
continue
try:
nlp.update(
batch,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
)
except ValueError as e:
err = "Error during training"
if init_tok2vec:
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
msg.fail(err, f"Original error message: {e}", exits=1)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
docs = [ex.doc for ex in batch]
if not int(os.environ.get("LOG_FRIENDLY", 0)):
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / f"model{i}"
nlp.to_disk(epoch_model_path)
nlp_loaded = util.load_model_from_path(epoch_model_path)
for beam_width in eval_beam_widths:
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_dataset = list(
corpus.dev_dataset(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
nwords = sum(len(ex.doc) for ex in dev_dataset)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords / (end_time - start_time)
else:
gpu_wps = nwords / (end_time - start_time)
# Evaluate on CPU in the first iteration only (for
# timing) when GPU is enabled
if i == 0:
with use_ops("numpy"):
nlp_loaded = util.load_model_from_path(epoch_model_path)
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_dataset = list(
corpus.dev_dataset(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / f"model{i}" / "accuracy.json"
srsly.write_json(acc_loc, scorer.scores)
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
"cpu": cpu_wps,
"gpu": gpu_wps,
}
meta.setdefault("accuracy", {})
for component in nlp.pipe_names:
for metric in _get_metrics(component):
meta["accuracy"][metric] = scorer.scores[metric]
else:
meta.setdefault("beam_accuracy", {})
meta.setdefault("beam_speed", {})
for component in nlp.pipe_names:
for metric in _get_metrics(component):
meta["beam_accuracy"][metric] = scorer.scores[metric]
meta["beam_speed"][beam_width] = {
"nwords": nwords,
"cpu": cpu_wps,
"gpu": gpu_wps,
}
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys,
"name": nlp.vocab.vectors.name,
}
meta.setdefault("name", f"model{i}")
meta.setdefault("version", version)
meta["labels"] = nlp.meta["labels"]
meta_loc = output_path / f"model{i}" / "meta.json"
srsly.write_json(meta_loc, meta)
util.set_env_log(verbose)
progress = _get_progress(
i,
losses,
scorer.scores,
output_stats,
beam_width=beam_width if has_beam_widths else None,
cpu_wps=cpu_wps,
gpu_wps=gpu_wps,
)
if i == 0 and "textcat" in pipeline:
textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
for cat, cat_score in textcats_per_cat.items():
if cat_score.get("roc_auc_score", 0) < 0:
msg.warn(
f"Textcat ROC AUC score is undefined due to "
f"only one value in label '{cat}'."
)
msg.row(progress, **row_settings)
# Early stopping
if n_early_stopping is not None:
current_score = _score_for_model(meta)
if current_score < best_score:
iter_since_best += 1
else:
iter_since_best = 0
best_score = current_score
if iter_since_best >= n_early_stopping:
msg.text(
f"Early stopping, best iteration is: {i - iter_since_best}"
)
msg.text(
f"Best score = {best_score}; Final iteration score = {current_score}"
)
break
except Exception as e:
msg.warn(f"Aborting and saving final best model. Encountered exception: {e}", exits=1)
finally:
best_pipes = nlp.pipe_names
if disabled_pipes:
disabled_pipes.restore()
with nlp.use_params(optimizer.averages):
final_model_path = output_path / "model-final"
nlp.to_disk(final_model_path)
meta_loc = output_path / "model-final" / "meta.json"
final_meta = srsly.read_json(meta_loc)
final_meta.setdefault("accuracy", {})
final_meta["accuracy"].update(meta.get("accuracy", {}))
final_meta.setdefault("speed", {})
final_meta["speed"].setdefault("cpu", None)
final_meta["speed"].setdefault("gpu", None)
meta.setdefault("speed", {})
meta["speed"].setdefault("cpu", None)
meta["speed"].setdefault("gpu", None)
# combine cpu and gpu speeds with the base model speeds
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
speed = _get_total_speed(
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
)
final_meta["speed"]["cpu"] = speed
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
speed = _get_total_speed(
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
)
final_meta["speed"]["gpu"] = speed
# if there were no speeds to update, overwrite with meta
if (
final_meta["speed"]["cpu"] is None
and final_meta["speed"]["gpu"] is None
):
final_meta["speed"].update(meta["speed"])
# note: beam speeds are not combined with the base model
if has_beam_widths:
final_meta.setdefault("beam_accuracy", {})
final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
final_meta.setdefault("beam_speed", {})
final_meta["beam_speed"].update(meta.get("beam_speed", {}))
srsly.write_json(meta_loc, final_meta)
msg.good("Saved model to output directory", final_model_path)
with msg.loading("Creating best model..."):
best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
msg.good("Created best model", best_model_path)
def _score_for_model(meta):
""" Returns mean score between tasks in pipeline that can be used for early stopping. """
mean_acc = list()
pipes = meta["pipeline"]
acc = meta["accuracy"]
if "tagger" in pipes:
mean_acc.append(acc["tags_acc"])
if "morphologizer" in pipes:
mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2)
if "parser" in pipes:
mean_acc.append((acc["uas"] + acc["las"]) / 2)
if "ner" in pipes:
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
if "textcat" in pipes:
mean_acc.append(acc["textcat_score"])
if "senter" in pipes:
mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3)
return sum(mean_acc) / len(mean_acc)
@contextlib.contextmanager
def _create_progress_bar(total):
if int(os.environ.get("LOG_FRIENDLY", 0)):
yield
else:
pbar = tqdm.tqdm(total=total, leave=False)
yield pbar
def _load_vectors(nlp, vectors):
util.load_model(vectors, vocab=nlp.vocab)
def _load_pretrained_tok2vec(nlp, loc):
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with loc.open("rb") as file_:
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
component.get_ref("tok2vec").from_bytes(weights_data)
loaded.append(name)
return loaded
def _collate_best_model(meta, output_path, components):
bests = {}
meta.setdefault("accuracy", {})
for component in components:
bests[component] = _find_best(output_path, component)
best_dest = output_path / "model-best"
shutil.copytree(str(output_path / "model-final"), str(best_dest))
for component, best_component_src in bests.items():
shutil.rmtree(str(best_dest / component))
shutil.copytree(str(best_component_src / component), str(best_dest / component))
accs = srsly.read_json(best_component_src / "accuracy.json")
for metric in _get_metrics(component):
meta["accuracy"][metric] = accs[metric]
srsly.write_json(best_dest / "meta.json", meta)
return best_dest
def _find_best(experiment_dir, component):
accuracies = []
for epoch_model in experiment_dir.iterdir():
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
accs = srsly.read_json(epoch_model / "accuracy.json")
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
# remove per_type dicts from score list for max() comparison
scores = [score for score in scores if isinstance(score, float)]
accuracies.append((scores, epoch_model))
if accuracies:
return max(accuracies)[1]
else:
return None
def _get_metrics(component):
if component == "parser":
return ("las", "uas", "las_per_type", "sent_f", "token_acc")
elif component == "tagger":
return ("tags_acc", "token_acc")
elif component == "morphologizer":
return ("morphs_acc", "pos_acc", "token_acc")
elif component == "ner":
return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
elif component == "senter":
return ("sent_f", "sent_p", "sent_r", "token_acc")
elif component == "textcat":
return ("textcat_score", "token_acc")
return ("token_acc",)
def _configure_training_output(pipeline, use_gpu, has_beam_widths):
row_head = ["Itn"]
output_stats = []
for pipe in pipeline:
if pipe == "tagger":
row_head.extend(["Tag Loss ", " Tag % "])
output_stats.extend(["tag_loss", "tags_acc"])
elif pipe == "morphologizer" or pipe == "morphologizertagger":
row_head.extend(["Morph Loss ", " Morph % ", " POS % "])
output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"])
elif pipe == "parser":
row_head.extend(
["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"]
)
output_stats.extend(
["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"]
)
elif pipe == "ner":
row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
elif pipe == "textcat":
row_head.extend(["Textcat Loss", "Textcat"])
output_stats.extend(["textcat_loss", "textcat_score"])
elif pipe == "senter":
row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"])
output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"])
row_head.extend(["Token %", "CPU WPS"])
output_stats.extend(["token_acc", "cpu_wps"])
if use_gpu >= 0:
row_head.extend(["GPU WPS"])
output_stats.extend(["gpu_wps"])
if has_beam_widths:
row_head.insert(1, "Beam W.")
# remove duplicates
row_head_dict = {k: 1 for k in row_head}
output_stats_dict = {k: 1 for k in output_stats}
return row_head_dict.keys(), output_stats_dict.keys()
def _get_progress(
itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
):
scores = {}
for stat in output_stats:
scores[stat] = 0.0
scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
scores["morph_loss"] = losses.get("morphologizer", 0.0)
scores["textcat_loss"] = losses.get("textcat", 0.0)
scores["senter_loss"] = losses.get("senter", 0.0)
scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0
scores.update(dev_scores)
formatted_scores = []
for stat in output_stats:
format_spec = "{:.3f}"
if stat.endswith("_wps"):
format_spec = "{:.0f}"
formatted_scores.append(format_spec.format(scores[stat]))
result = [itn + 1]
result.extend(formatted_scores)
if beam_width is not None:
result.insert(1, beam_width)
return result
def _get_total_speed(speeds):
seconds_per_word = 0.0
for words_per_second in speeds:
if words_per_second is None:
return None
seconds_per_word += 1.0 / words_per_second
return 1.0 / seconds_per_word
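
One detail worth keeping from the deleted CLI: _get_total_speed combines words-per-second figures for models run back to back by summing the per-word times, a harmonic-style combination. A small standalone illustration (numbers invented):

    def get_total_speed(speeds):
        # Total time per word is the sum of each stage's time per word.
        seconds_per_word = 0.0
        for words_per_second in speeds:
            if words_per_second is None:
                return None
            seconds_per_word += 1.0 / words_per_second
        return 1.0 / seconds_per_word

    print(get_total_speed([10000.0, 15000.0]))  # 6000.0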

View File

@ -1,5 +1,7 @@
from typing import Optional, Dict, List, Union, Sequence from typing import Optional, Dict, List, Union, Sequence
from timeit import default_timer as timer from timeit import default_timer as timer
import srsly
from pydantic import BaseModel, FilePath from pydantic import BaseModel, FilePath
import plac import plac
import tqdm import tqdm
@ -11,10 +13,14 @@ from thinc.api import Model, use_pytorch_for_gpu_memory
import random import random
from ..gold import GoldCorpus from ..gold import GoldCorpus
<<<<<<< HEAD
from ..gold import Example from ..gold import Example
=======
from ..lookups import Lookups
>>>>>>> origin/develop
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
from ..ml import models # don't remove - required to load the built-in architectures from ..ml import models # don't remove - required to load the built-in architectures
registry = util.registry registry = util.registry
@ -24,7 +30,6 @@ patience = 10
eval_frequency = 10 eval_frequency = 10
dropout = 0.2 dropout = 0.2
init_tok2vec = null init_tok2vec = null
vectors = null
max_epochs = 100 max_epochs = 100
orth_variant_level = 0.0 orth_variant_level = 0.0
gold_preproc = false gold_preproc = false
@@ -48,7 +53,7 @@ beta2 = 0.999

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@ -94,7 +99,6 @@ class ConfigSchema(BaseModel):
eval_frequency: int = 100 eval_frequency: int = 100
dropout: float = 0.2 dropout: float = 0.2
init_tok2vec: Optional[FilePath] = None init_tok2vec: Optional[FilePath] = None
vectors: Optional[str] = None
max_epochs: int = 100 max_epochs: int = 100
orth_variant_level: float = 0.0 orth_variant_level: float = 0.0
gold_preproc: bool = False gold_preproc: bool = False
@ -120,9 +124,14 @@ class ConfigSchema(BaseModel):
dev_path=("Location of JSON-formatted development data", "positional", None, Path), dev_path=("Location of JSON-formatted development data", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path), config_path=("Path to config file", "positional", None, Path),
output_path=("Output directory to store model in", "option", "o", Path), output_path=("Output directory to store model in", "option", "o", Path),
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), init_tok2vec=(
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
# fmt: on # fmt: on
) )
def train_cli( def train_cli(
@ -130,30 +139,54 @@ def train_cli(
dev_path, dev_path,
config_path, config_path,
output_path=None, output_path=None,
meta_path=None, init_tok2vec=None,
raw_text=None, raw_text=None,
debug=False,
verbose=False, verbose=False,
use_gpu=-1, use_gpu=-1,
tag_map_path=None,
omit_extra_lookups=False,
): ):
""" """
Train or update a spaCy model. Requires data to be formatted in spaCy's Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert` JSON format. To convert data from other formats, use the `spacy convert`
command. command.
""" """
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
if not config_path or not config_path.exists(): if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1) msg.fail("Config file not found", config_path, exits=1)
if not train_path or not train_path.exists(): if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1) msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists(): if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1) msg.fail("Development data not found", dev_path, exits=1)
-    if meta_path is not None and not meta_path.exists():
-        msg.fail("Can't find model meta.json", meta_path, exits=1)
-    if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+    if output_path is not None:
+        if not output_path.exists():
+            output_path.mkdir()
+            msg.good(f"Created output directory: {output_path}")
+        elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
+            msg.warn(
+                "Output directory is not empty.",
+                "This can lead to unintended side effects when saving the model. "
+                "Please use an empty directory or a different path instead. If "
+                "the specified output path doesn't exist, the directory will be "
+                "created for you.",
+            )
if raw_text is not None:
raw_text = list(srsly.read_jsonl(raw_text))
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
weights_data = None
if init_tok2vec is not None:
if not init_tok2vec.exists():
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
if use_gpu >= 0: if use_gpu >= 0:
msg.info("Using GPU") msg.info("Using GPU: {use_gpu}")
util.use_gpu(use_gpu) util.use_gpu(use_gpu)
else: else:
msg.info("Using CPU") msg.info("Using CPU")
@ -162,31 +195,126 @@ def train_cli(
config_path, config_path,
{"train": train_path, "dev": dev_path}, {"train": train_path, "dev": dev_path},
output_path=output_path, output_path=output_path,
meta_path=meta_path,
raw_text=raw_text, raw_text=raw_text,
tag_map=tag_map,
weights_data=weights_data,
omit_extra_lookups=omit_extra_lookups,
) )
 def train(
-    config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
+    config_path,
+    data_paths,
+    raw_text=None,
+    output_path=None,
+    tag_map=None,
+    weights_data=None,
+    omit_extra_lookups=False,
 ):
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config # Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False) config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"]) util.fix_random_seed(config["training"]["seed"])
if config["training"]["use_pytorch_for_gpu_memory"]: if config["training"].get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory() use_pytorch_for_gpu_memory()
nlp_config = config["nlp"] nlp_config = config["nlp"]
config = util.load_config(config_path, create_objects=True) config = util.load_config(config_path, create_objects=True)
training = config["training"]
msg.info("Creating nlp from config") msg.info("Creating nlp from config")
nlp = util.load_model_from_config(nlp_config) nlp = util.load_model_from_config(nlp_config)
training = config["training"]
optimizer = training["optimizer"] optimizer = training["optimizer"]
limit = training["limit"] limit = training["limit"]
msg.info("Loading training corpus") msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline") # verify textcat config
nlp.begin_training(lambda: corpus.train_dataset(nlp)) if "textcat" in nlp_config["pipeline"]:
textcat_labels = set(nlp.get_pipe("textcat").labels)
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"]
# check whether the setting 'exclusive_classes' corresponds to the provided training data
if textcat_multilabel:
multilabel_found = False
for ex in corpus.train_examples:
cats = ex.doc_annotation.cats
textcat_labels.update(cats.keys())
if list(cats.values()).count(1.0) != 1:
multilabel_found = True
if not multilabel_found:
msg.warn(
"The textcat training instances look like they have "
"mutually exclusive classes. Set 'exclusive_classes' "
"to 'true' in the config to train a classifier with "
"mutually exclusive classes more accurately."
)
else:
for ex in corpus.train_examples:
cats = ex.doc_annotation.cats
textcat_labels.update(cats.keys())
if list(cats.values()).count(1.0) != 1:
msg.fail(
"Some textcat training instances do not have exactly "
"one positive label. Set 'exclusive_classes' "
"to 'false' in the config to train a classifier with classes "
"that are not mutually exclusive."
)
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
# if 'positive_label' is provided: double check whether it's in the data and the task is binary
if nlp_config["pipeline"]["textcat"].get("positive_label", None):
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
if pos_label not in textcat_labels:
msg.fail(
f"The textcat's 'positive_label' config setting '{pos_label}' "
f"does not match any label in the training data.",
exits=1,
)
if len(textcat_labels) != 2:
msg.fail(
f"A textcat 'positive_label' '{pos_label}' was "
f"provided for training data that does not appear to be a "
f"binary classification problem with two labels.",
exits=1,
)
if training.get("resume", False):
msg.info("Resuming training")
nlp.resume_training()
else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
nlp.begin_training(lambda: corpus.train_dataset(nlp))
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
if omit_extra_lookups:
nlp.vocab.lookups_extra = Lookups()
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
nlp.vocab.lookups_extra.add_table("lexeme_prob")
nlp.vocab.lookups_extra.add_table("lexeme_settings")
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
if weights_data is not None:
tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
if tok2vec_path is None:
msg.fail(
f"To use a pretrained tok2vec model, the config needs to specify which "
f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
exits=1,
)
tok2vec = config
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
if not tok2vec:
msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.",
exits=1,
)
tok2vec.from_bytes(weights_data)
train_batches = create_train_batches(nlp, corpus, training) train_batches = create_train_batches(nlp, corpus, training)
evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
@ -203,6 +331,7 @@ def train(
patience=training.get("patience", 0), patience=training.get("patience", 0),
max_steps=training.get("max_steps", 0), max_steps=training.get("max_steps", 0),
eval_frequency=training["eval_frequency"], eval_frequency=training["eval_frequency"],
raw_text=raw_text,
) )
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
@ -216,7 +345,8 @@ def train(
progress.close() progress.close()
print_row(info) print_row(info)
                 if is_best_checkpoint and output_path is not None:
-                    nlp.to_disk(output_path)
+                    update_meta(training, nlp, info)
+                    nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
# Clean up the objects to faciliate garbage collection. # Clean up the objects to faciliate garbage collection.
for eg in batch: for eg in batch:
@ -224,6 +354,12 @@ def train(
eg.goldparse = None eg.goldparse = None
eg.doc_annotation = None eg.doc_annotation = None
eg.token_annotation = None eg.token_annotation = None
except Exception as e:
msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}",
exits=1,
)
finally: finally:
if output_path is not None: if output_path is not None:
final_model_path = output_path / "model-final" final_model_path = output_path / "model-final"
@ -232,12 +368,13 @@ def train(
nlp.to_disk(final_model_path) nlp.to_disk(final_model_path)
else: else:
nlp.to_disk(final_model_path) nlp.to_disk(final_model_path)
msg.good("Saved model to output directory", final_model_path) msg.good(f"Saved model to output directory {final_model_path}")
def create_train_batches(nlp, corpus, cfg): def create_train_batches(nlp, corpus, cfg):
epochs_todo = cfg.get("max_epochs", 0) epochs_todo = cfg.get("max_epochs", 0)
while True: while True:
<<<<<<< HEAD
train_examples = list(corpus.train_dataset( train_examples = list(corpus.train_dataset(
nlp, nlp,
noise_level=0.0, noise_level=0.0,
@ -246,10 +383,26 @@ def create_train_batches(nlp, corpus, cfg):
max_length=cfg["max_length"], max_length=cfg["max_length"],
ignore_misaligned=True ignore_misaligned=True
)) ))
=======
train_examples = list(
corpus.train_dataset(
nlp,
noise_level=0.0, # I think this is deprecated?
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],
ignore_misaligned=True,
)
)
>>>>>>> origin/develop
if len(train_examples) == 0: if len(train_examples) == 0:
raise ValueError(Errors.E988) raise ValueError(Errors.E988)
random.shuffle(train_examples) random.shuffle(train_examples)
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) batches = util.minibatch_by_words(
train_examples,
size=cfg["batch_size"],
discard_oversize=cfg["discard_oversize"],
)
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try: try:
first = next(batches) first = next(batches)
@@ -286,7 +439,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         scores = scorer.scores
         # Calculate a weighted sum based on score_weights for the main score
         weights = cfg["score_weights"]
-        weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
+        try:
+            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
+        except KeyError as e:
+            raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
         scores["speed"] = wps

         return weighted_score, scores
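
For reference, the weighted main score is just the configured score_weights applied to the matching Scorer outputs. With the weights used in the configs above and invented dev scores:

    scores = {"las": 80.0, "ents_f": 75.0, "tags_acc": 90.0, "speed": 12000.0}
    weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
    weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
    print(weighted_score)  # 0.4*80 + 0.4*75 + 0.2*90 = 80.0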
@ -294,8 +451,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def train_while_improving( def train_while_improving(
nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, nlp,
accumulate_gradient=1, patience=0, max_steps=0 optimizer,
train_data,
evaluate,
*,
dropout,
eval_frequency,
accumulate_gradient=1,
patience=0,
max_steps=0,
raw_text=None,
): ):
"""Train until an evaluation stops improving. Works as a generator, """Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@ -343,11 +509,22 @@ def train_while_improving(
losses = {} losses = {}
to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")] to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
if raw_text:
random.shuffle(raw_text)
raw_batches = util.minibatch(
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
)
for step, batch in enumerate(train_data): for step, batch in enumerate(train_data):
dropout = next(dropouts) dropout = next(dropouts)
with nlp.select_pipes(enable=to_enable): with nlp.select_pipes(enable=to_enable):
for subbatch in subdivide_batch(batch, accumulate_gradient): for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
for name, proc in nlp.pipeline: for name, proc in nlp.pipeline:
if hasattr(proc, "model"): if hasattr(proc, "model"):
proc.model.finish_update(optimizer) proc.model.finish_update(optimizer)
@ -388,7 +565,7 @@ def subdivide_batch(batch, accumulate_gradient):
if subbatch: if subbatch:
yield subbatch yield subbatch
start += len(subbatch) start += len(subbatch)
subbatch = batch[start : ] subbatch = batch[start:]
if subbatch: if subbatch:
yield subbatch yield subbatch
@ -407,14 +584,34 @@ def setup_printer(training, nlp):
msg.row(["-" * width for width in table_widths]) msg.row(["-" * width for width in table_widths])
     def print_row(info):
-        losses = [
-            "{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0)))
-            for pipe_name in nlp.pipe_names
-        ]
-        scores = [
-            "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols
-        ]
-        data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
+        try:
+            losses = [
+                "{0:.2f}".format(float(info["losses"][pipe_name]))
+                for pipe_name in nlp.pipe_names
+            ]
+        except KeyError as e:
+            raise KeyError(
+                Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
+        try:
+            scores = [
+                "{0:.2f}".format(float(info["other_scores"][col]))
+                for col in score_cols
+            ]
+        except KeyError as e:
+            raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
+        data = (
+            [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
+        )
         msg.row(data, widths=table_widths, aligns=table_aligns)

     return print_row
def update_meta(training, nlp, info):
score_cols = training["scores"]
nlp.meta["performance"] = {}
for metric in score_cols:
nlp.meta["performance"][metric] = info["other_scores"][metric]
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

View File

@ -587,6 +587,15 @@ class Errors(object):
"to the offsets of the 'entities' annotations.") "to the offsets of the 'entities' annotations.")
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing " E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.") "into {values}, but found {value}.")
E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
"{keys}")
E984 = ("Could not parse the {input} - double check the data is written "
"in the correct format as expected by spaCy.")
E985 = ("The pipeline component '{component}' is already available in the base "
"model. The settings in the component block in the config file are "
"being ignored. If you want to replace this component instead, set "
"'replace' to True in the training configuration.")
E986 = ("Could not create any training batches: check your input. " E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?") "Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or " E987 = ("The text of an example training instance is either a Doc or "

View File

@ -319,14 +319,14 @@ class Language(object):
         # transform the model's config to an actual Model
         factory_cfg = dict(config)
-        # check whether we have a proper model config, or load a default one
+        # check whether we have a proper model config, ignore if the type is wrong
         if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
             warnings.warn(
                 Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)
             )
         # refer to the model configuration in the cfg settings for this component
-        if "model" in factory_cfg:
+        elif "model" in factory_cfg:
             self.config[name] = {"model": factory_cfg["model"]}
# create all objects in the config # create all objects in the config
@ -1089,6 +1089,7 @@ class component(object):
requires=tuple(), requires=tuple(),
retokenizes=False, retokenizes=False,
default_model=lambda: None, default_model=lambda: None,
default_config=None,
): ):
"""Decorate a pipeline component. """Decorate a pipeline component.
@ -1102,6 +1103,7 @@ class component(object):
self.requires = validate_attrs(requires) self.requires = validate_attrs(requires)
self.retokenizes = retokenizes self.retokenizes = retokenizes
self.default_model = default_model self.default_model = default_model
self.default_config = default_config
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
obj = args[0] obj = args[0]
@ -1116,9 +1118,10 @@ class component(object):
         def factory(nlp, model, **cfg):
             if model is None:
                 model = self.default_model()
-                warnings.warn(Warnings.W098.format(name=self.name))
-            if model is None:
-                warnings.warn(Warnings.W097.format(name=self.name))
+            if self.default_config:
+                for key, value in self.default_config.items():
+                    if key not in cfg:
+                        cfg[key] = value
             if hasattr(obj, "from_nlp"):
return obj.from_nlp(nlp, model, **cfg) return obj.from_nlp(nlp, model, **cfg)
elif isinstance(obj, type): elif isinstance(obj, type):
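
The factory change above fills component settings from default_config without overriding anything the user passed in cfg. A minimal standalone sketch of that merge:

    def apply_default_config(cfg, default_config):
        # Defaults only fill gaps; user-provided keys win.
        if default_config:
            for key, value in default_config.items():
                if key not in cfg:
                    cfg[key] = value
        return cfg

    print(apply_default_config({"beam_width": 4}, {"beam_width": 1, "min_action_freq": 1}))
    # {'beam_width': 4, 'min_action_freq': 1}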

View File

@ -3,26 +3,31 @@ import numpy
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain( model = chain(
tok2vec, tok2vec,
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0), Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
LayerNorm(token_vector_width * 2), LayerNorm(token_vector_width * 2),
Softmax(nO=n_tags, nI=token_vector_width * 2), softmax,
) )
model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", softmax)
return model return model
def build_cloze_multi_task_model(vocab, tok2vec): def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
output_size = vocab.vectors.data.shape[1] # nO = vocab.vectors.data.shape[1]
output_layer = chain( output_layer = chain(
Maxout( Maxout(
nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0 nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
), ),
Linear(nO=output_size, nI=output_size, init_W=zero_init), Linear(nO=nO, nI=nO, init_W=zero_init),
) )
model = chain(tok2vec, output_layer) model = chain(tok2vec, output_layer)
model = build_masked_language_model(vocab, model) model = build_masked_language_model(vocab, model)
model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", output_layer)
return model return model
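A minimal sketch of calling the refactored builder directly. The import path and the Linear stand-in for a real tok2vec layer are assumptions for illustration; in practice the tok2vec comes from a config block like the one added below.

from thinc.api import Linear
from spacy.ml.models.multi_task import build_multi_task_model

token_vector_width = 96
tok2vec = Linear(nO=token_vector_width, nI=token_vector_width)  # stand-in for a real tok2vec layer
model = build_multi_task_model(
    tok2vec, maxout_pieces=3, token_vector_width=token_vector_width, nO=17
)
# the builder now exposes its parts via refs instead of hard-coding dimensions
assert model.get_ref("output_layer").get_dim("nO") == 17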

View File

@ -31,6 +31,7 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
model.set_ref("output_layer", linear_layer) model.set_ref("output_layer", linear_layer)
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nO) model.set_dim("nO", nO)
model.attrs["multi_label"] = not exclusive_classes
return model return model
@ -44,6 +45,7 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
output_layer = softmax_activation() if exclusive_classes else Logistic() output_layer = softmax_activation() if exclusive_classes else Logistic()
model = model >> with_cpu(output_layer, output_layer.ops) model = model >> with_cpu(output_layer, output_layer.ops)
model.set_ref("output_layer", sparse_linear) model.set_ref("output_layer", sparse_linear)
model.attrs["multi_label"] = not exclusive_classes
return model return model
@ -110,6 +112,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
if model.has_dim("nO") is not False: if model.has_dim("nO") is not False:
model.set_dim("nO", nO) model.set_dim("nO", nO)
model.set_ref("output_layer", linear_model.get_ref("output_layer")) model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.attrs["multi_label"] = not exclusive_classes
return model return model
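A minimal sketch of why the attribute is recorded: downstream code (see the Scorer changes further down) can now ask the model itself whether the classifier is multi-label instead of re-reading the pipe's config. The builder import path is an assumption.

from spacy.ml.models.textcat import build_bow_text_classifier

model = build_bow_text_classifier(
    exclusive_classes=True, ngram_size=1, no_output_layer=False, nO=3
)
assert model.attrs["multi_label"] is False  # exclusive classes, so not multi-label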

View File

@ -0,0 +1,15 @@
[model]
@architectures = "spacy.MultiTask.v1"
maxout_pieces = 3
token_vector_width = 96
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null

View File

@ -619,9 +619,10 @@ class MultitaskObjective(Tagger):
side-objective. side-objective.
""" """
def __init__(self, vocab, model, target='dep_tag_offset', **cfg): def __init__(self, vocab, model, **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
target = cfg["target"] # default: 'dep_tag_offset'
if target == "dep": if target == "dep":
self.make_label = self.make_dep self.make_label = self.make_dep
elif target == "tag": elif target == "tag":
@ -639,8 +640,6 @@ class MultitaskObjective(Tagger):
else: else:
raise ValueError(Errors.E016) raise ValueError(Errors.E016)
self.cfg = dict(cfg) self.cfg = dict(cfg)
# TODO: remove - put in config
self.cfg.setdefault("maxout_pieces", 2)
@property @property
def labels(self): def labels(self):
@ -653,7 +652,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids, tensors=None): def set_annotations(self, docs, dep_ids, tensors=None):
pass pass
def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, def begin_training(self, get_examples=lambda: [], pipeline=None,
sgd=None, **kwargs): sgd=None, **kwargs):
gold_examples = nonproj.preprocess_training_data(get_examples()) gold_examples = nonproj.preprocess_training_data(get_examples())
# for raw_text, doc_annot in gold_tuples: # for raw_text, doc_annot in gold_tuples:
@ -745,13 +744,13 @@ class ClozeMultitask(Pipe):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.cfg = cfg self.cfg = cfg
self.distance = CosineDistance(ignore_zeros=True, normalize=False) self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
def set_annotations(self, docs, dep_ids, tensors=None): def set_annotations(self, docs, dep_ids, tensors=None):
pass pass
def begin_training(self, get_examples=lambda: [], pipeline=None, def begin_training(self, get_examples=lambda: [], pipeline=None,
tok2vec=None, sgd=None, **kwargs): sgd=None, **kwargs):
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
self.model.initialize() self.model.initialize()
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
@ -960,28 +959,27 @@ cdef class DependencyParser(Parser):
output.append(merge_subtokens) output.append(merge_subtokens)
return tuple(output) return tuple(output)
def add_multitask_objective(self, target): def add_multitask_objective(self, mt_component):
if target == "cloze": self._multitasks.append(mt_component)
cloze = ClozeMultitask(self.vocab)
self._multitasks.append(cloze)
else:
labeller = MultitaskObjective(self.vocab, target=target)
self._multitasks.append(labeller)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks: for labeller in self._multitasks:
tok2vec = self.model.get_ref("tok2vec") labeller.model.set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, if labeller.model.has_ref("output_layer"):
tok2vec=tok2vec, sgd=sgd) labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
def __reduce__(self): def __reduce__(self):
return (DependencyParser, (self.vocab, self.model), self.moves) return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg))
def __getstate__(self): def __getstate__(self):
return self.moves return (self.moves, self.cfg)
def __setstate__(self, moves): def __setstate__(self, state):
moves, config = state
self.moves = moves self.moves = moves
self.cfg = config
@property @property
def labels(self): def labels(self):
@ -1007,28 +1005,27 @@ cdef class EntityRecognizer(Parser):
requires = [] requires = []
TransitionSystem = BiluoPushDown TransitionSystem = BiluoPushDown
def add_multitask_objective(self, target): def add_multitask_objective(self, mt_component):
if target == "cloze": self._multitasks.append(mt_component)
cloze = ClozeMultitask(self.vocab)
self._multitasks.append(cloze)
else:
labeller = MultitaskObjective(self.vocab, target=target)
self._multitasks.append(labeller)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks: for labeller in self._multitasks:
tok2vec = self.model.get_ref("tok2vec") labeller.model.set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, if labeller.model.has_ref("output_layer"):
tok2vec=tok2vec) labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline)
def __reduce__(self): def __reduce__(self):
return (EntityRecognizer, (self.vocab, self.model), self.moves) return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg))
def __getstate__(self): def __getstate__(self):
return self.moves return self.moves, self.cfg
def __setstate__(self, moves): def __setstate__(self, state):
moves, config = state
self.moves = moves self.moves = moves
self.cfg = config
@property @property
def labels(self): def labels(self):
@ -1487,15 +1484,23 @@ Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, mod
Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg) Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)
def parser_factory(nlp, model, **cfg): def parser_factory(nlp, model, **cfg):
default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
if model is None: if model is None:
model = default_parser() model = default_parser()
warnings.warn(Warnings.W098.format(name="parser")) warnings.warn(Warnings.W098.format(name="parser"))
for key, value in default_config.items():
if key not in cfg:
cfg[key] = value
return DependencyParser.from_nlp(nlp, model, **cfg) return DependencyParser.from_nlp(nlp, model, **cfg)
def ner_factory(nlp, model, **cfg): def ner_factory(nlp, model, **cfg):
default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
if model is None: if model is None:
model = default_ner() model = default_ner()
warnings.warn(Warnings.W098.format(name="ner")) warnings.warn(Warnings.W098.format(name="ner"))
for key, value in default_config.items():
if key not in cfg:
cfg[key] = value
return EntityRecognizer.from_nlp(nlp, model, **cfg) return EntityRecognizer.from_nlp(nlp, model, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]

View File

@ -172,7 +172,7 @@ class Tok2VecListener(Model):
def verify_inputs(self, inputs): def verify_inputs(self, inputs):
if self._batch_id is None and self._outputs is None: if self._batch_id is None and self._outputs is None:
raise ValueError raise ValueError("The Tok2Vec listener did not receive valid input.")
else: else:
batch_id = self.get_batch_id(inputs) batch_id = self.get_batch_id(inputs)
if batch_id != self._batch_id: if batch_id != self._batch_id:

View File

@ -88,24 +88,20 @@ class Scorer(object):
self.ner = PRFScore() self.ner = PRFScore()
self.ner_per_ents = dict() self.ner_per_ents = dict()
self.eval_punct = eval_punct self.eval_punct = eval_punct
self.textcat = None self.textcat = PRFScore()
self.textcat_per_cat = dict() self.textcat_f_per_cat = dict()
self.textcat_auc_per_cat = dict()
self.textcat_positive_label = None self.textcat_positive_label = None
self.textcat_multilabel = False self.textcat_multilabel = False
if pipeline: if pipeline:
for name, model in pipeline: for name, component in pipeline:
if name == "textcat": if name == "textcat":
self.textcat_positive_label = model.cfg.get("positive_label", None) self.textcat_multilabel = component.model.attrs["multi_label"]
if self.textcat_positive_label: self.textcat_positive_label = component.cfg.get("positive_label", None)
self.textcat = PRFScore() for label in component.cfg.get("labels", []):
if not model.cfg.get("exclusive_classes", False): self.textcat_auc_per_cat[label] = ROCAUCScore()
self.textcat_multilabel = True self.textcat_f_per_cat[label] = PRFScore()
for label in model.cfg.get("labels", []):
self.textcat_per_cat[label] = ROCAUCScore()
else:
for label in model.cfg.get("labels", []):
self.textcat_per_cat[label] = PRFScore()
@property @property
def tags_acc(self): def tags_acc(self):
@ -207,46 +203,52 @@ class Scorer(object):
} }
@property @property
def textcat_score(self): def textcat_f(self):
"""RETURNS (float): f-score on positive label for binary exclusive, """RETURNS (float): f-score on positive label for binary classification,
macro-averaged f-score for 3+ exclusive, macro-averaged f-score for multilabel classification
macro-averaged AUC ROC score for multilabel (-1 if undefined)
""" """
if not self.textcat_multilabel: if not self.textcat_multilabel:
# binary multiclass
if self.textcat_positive_label: if self.textcat_positive_label:
# binary classification
return self.textcat.fscore * 100 return self.textcat.fscore * 100
# other multiclass # multi-class and/or multi-label
return ( return (
sum([score.fscore for label, score in self.textcat_per_cat.items()]) sum([score.fscore for label, score in self.textcat_f_per_cat.items()])
/ (len(self.textcat_per_cat) + 1e-100) / (len(self.textcat_f_per_cat) + 1e-100)
* 100 * 100
) )
# multilabel
@property
def textcat_auc(self):
"""RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined)
"""
return max( return max(
sum([score.score for label, score in self.textcat_per_cat.items()]) sum([score.score for label, score in self.textcat_auc_per_cat.items()])
/ (len(self.textcat_per_cat) + 1e-100), / (len(self.textcat_auc_per_cat) + 1e-100),
-1, -1,
) )
@property @property
def textcats_per_cat(self): def textcats_auc_per_cat(self):
"""RETURNS (dict): Scores per textcat label. """RETURNS (dict): AUC ROC Scores per textcat label.
""" """
if not self.textcat_multilabel:
return {
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
for k, v in self.textcat_per_cat.items()
}
return { return {
k: {"roc_auc_score": max(v.score, -1)} k: {"roc_auc_score": max(v.score, -1)}
for k, v in self.textcat_per_cat.items() for k, v in self.textcat_auc_per_cat.items()
}
@property
def textcats_f_per_cat(self):
"""RETURNS (dict): F-scores per textcat label.
"""
return {
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
for k, v in self.textcat_f_per_cat.items()
} }
@property @property
def scores(self): def scores(self):
"""RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`, """RETURNS (dict): All scores mapped by key.
`ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`.
""" """
return { return {
"uas": self.uas, "uas": self.uas,
@ -264,8 +266,10 @@ class Scorer(object):
"sent_r": self.sent_r, "sent_r": self.sent_r,
"sent_f": self.sent_f, "sent_f": self.sent_f,
"token_acc": self.token_acc, "token_acc": self.token_acc,
"textcat_score": self.textcat_score, "textcat_f": self.textcat_f,
"textcats_per_cat": self.textcats_per_cat, "textcat_auc": self.textcat_auc,
"textcats_f_per_cat": self.textcats_f_per_cat,
"textcats_auc_per_cat": self.textcats_auc_per_cat,
} }
def score(self, example, verbose=False, punct_labels=("p", "punct")): def score(self, example, verbose=False, punct_labels=("p", "punct")):
@ -408,7 +412,7 @@ class Scorer(object):
) )
if ( if (
len(gold.cats) > 0 len(gold.cats) > 0
and set(self.textcat_per_cat) == set(gold.cats) and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
and set(gold.cats) == set(doc.cats) and set(gold.cats) == set(doc.cats)
): ):
goldcat = max(gold.cats, key=gold.cats.get) goldcat = max(gold.cats, key=gold.cats.get)
@ -418,17 +422,21 @@ class Scorer(object):
set([self.textcat_positive_label]) & set([candcat]), set([self.textcat_positive_label]) & set([candcat]),
set([self.textcat_positive_label]) & set([goldcat]), set([self.textcat_positive_label]) & set([goldcat]),
) )
for label in self.textcat_per_cat: for label in set(gold.cats):
if self.textcat_multilabel: self.textcat_auc_per_cat[label].score_set(
self.textcat_per_cat[label].score_set(
doc.cats[label], gold.cats[label] doc.cats[label], gold.cats[label]
) )
else: self.textcat_f_per_cat[label].score_set(
self.textcat_per_cat[label].score_set(
set([label]) & set([candcat]), set([label]) & set([goldcat]) set([label]) & set([candcat]), set([label]) & set([goldcat])
) )
elif len(self.textcat_per_cat) > 0: elif len(self.textcat_f_per_cat) > 0:
model_labels = set(self.textcat_per_cat) model_labels = set(self.textcat_f_per_cat)
eval_labels = set(gold.cats)
raise ValueError(
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
)
elif len(self.textcat_auc_per_cat) > 0:
model_labels = set(self.textcat_auc_per_cat)
eval_labels = set(gold.cats) eval_labels = set(gold.cats)
raise ValueError( raise ValueError(
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
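A minimal sketch of the renamed score keys as they appear on a freshly constructed Scorer (no documents scored yet, so only the initial values are shown):

from spacy.scorer import Scorer

scorer = Scorer()
results = scorer.scores
# "textcat_score" and "textcats_per_cat" are replaced by these four keys:
for key in ("textcat_f", "textcat_auc", "textcats_f_per_cat", "textcats_auc_per_cat"):
    print(key, results[key])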

View File

@ -64,15 +64,14 @@ cdef class Parser:
# defined by EntityRecognizer as a BiluoPushDown # defined by EntityRecognizer as a BiluoPushDown
moves = self.TransitionSystem(self.vocab.strings) moves = self.TransitionSystem(self.vocab.strings)
self.moves = moves self.moves = moves
cfg.setdefault('min_action_freq', 30)
cfg.setdefault('learn_tokens', False)
cfg.setdefault('beam_width', 1)
cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used)
self.model = model self.model = model
if self.moves.n_moves != 0: if self.moves.n_moves != 0:
self.set_output(self.moves.n_moves) self.set_output(self.moves.n_moves)
self.cfg = cfg self.cfg = cfg
self._multitasks = [] self._multitasks = []
for multitask in cfg.get("multitasks", []):
self.add_multitask_objective(multitask)
self._rehearsal_model = None self._rehearsal_model = None
@classmethod @classmethod
@ -80,13 +79,15 @@ cdef class Parser:
return cls(nlp.vocab, model, **cfg) return cls(nlp.vocab, model, **cfg)
def __reduce__(self): def __reduce__(self):
return (Parser, (self.vocab, self.model), self.moves) return (Parser, (self.vocab, self.model), (self.moves, self.cfg))
def __getstate__(self): def __getstate__(self):
return self.moves return (self.moves, self.cfg)
def __setstate__(self, moves): def __setstate__(self, state):
moves, config = state
self.moves = moves self.moves = moves
self.cfg = config
@property @property
def move_names(self): def move_names(self):
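Because the setdefault() calls are gone, constructing a parser directly now requires passing the config explicitly, exactly as the updated tests below do. A minimal sketch:

from spacy.vocab import Vocab
from spacy.pipeline import DependencyParser
from spacy.pipeline.defaults import default_parser

config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(Vocab(), default_parser(), **config)
assert parser.cfg["beam_width"] == 1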

View File

@ -9,7 +9,8 @@ from spacy.pipeline.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab): def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
ner = EntityRecognizer(en_vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([]) ner.begin_training([])
ner(doc) ner(doc)
assert len(list(doc.ents)) == 0 assert len(list(doc.ents)) == 0
@ -25,7 +26,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab): def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
ner = EntityRecognizer(en_vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([]) ner.begin_training([])
ner(doc) ner(doc)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))

View File

@ -17,7 +17,8 @@ def vocab():
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
parser = DependencyParser(vocab, default_parser()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(vocab, default_parser(), **config)
return parser return parser
@ -61,12 +62,13 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly(): def test_add_label_deserializes_correctly():
ner1 = EntityRecognizer(Vocab(), default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
ner1.add_label("C") ner1.add_label("C")
ner1.add_label("B") ner1.add_label("B")
ner1.add_label("A") ner1.add_label("A")
ner1.begin_training([]) ner1.begin_training([])
ner2 = EntityRecognizer(Vocab(), default_ner()) ner2 = EntityRecognizer(Vocab(), default_ner(), **config)
# the second model needs to be resized before we can call from_bytes # the second model needs to be resized before we can call from_bytes
ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves) ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)

View File

@ -138,7 +138,8 @@ def test_get_oracle_actions():
deps.append(dep) deps.append(dep)
ents.append(ent) ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
parser = DependencyParser(doc.vocab, default_parser()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(doc.vocab, default_parser(), **config)
parser.moves.add_action(0, "") parser.moves.add_action(0, "")
parser.moves.add_action(1, "") parser.moves.add_action(1, "")
parser.moves.add_action(1, "") parser.moves.add_action(1, "")

View File

@ -138,7 +138,8 @@ def test_accept_blocked_token():
# 1. test normal behaviour # 1. test normal behaviour
nlp1 = English() nlp1 = English()
doc1 = nlp1("I live in New York") doc1 = nlp1("I live in New York")
ner1 = EntityRecognizer(doc1.vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@ -156,7 +157,8 @@ def test_accept_blocked_token():
# 2. test blocking behaviour # 2. test blocking behaviour
nlp2 = English() nlp2 = English()
doc2 = nlp2("I live in New York") doc2 = nlp2("I live in New York")
ner2 = EntityRecognizer(doc2.vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
# set "New York" to a blocked entity # set "New York" to a blocked entity
doc2.ents = [(0, 3, 5)] doc2.ents = [(0, 3, 5)]
@ -213,7 +215,8 @@ def test_overwrite_token():
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O # Check that a new ner can overwrite O
ner2 = EntityRecognizer(doc.vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
ner2.moves.add_action(5, "") ner2.moves.add_action(5, "")
ner2.add_label("GPE") ner2.add_label("GPE")
state = ner2.moves.init_batch([doc])[0] state = ner2.moves.init_batch([doc])[0]

View File

@ -28,7 +28,8 @@ def tok2vec():
@pytest.fixture @pytest.fixture
def parser(vocab, arc_eager): def parser(vocab, arc_eager):
return Parser(vocab, model=default_parser(), moves=arc_eager) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
return Parser(vocab, model=default_parser(), moves=arc_eager, **config)
@pytest.fixture @pytest.fixture

View File

@ -94,7 +94,8 @@ def test_beam_advance_too_few_scores(beam, scores):
def test_beam_parse(): def test_beam_parse():
nlp = Language() nlp = Language()
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser") config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
nlp.parser.add_label("nsubj") nlp.parser.add_label("nsubj")
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
doc = nlp.make_doc("Australia is a country") doc = nlp.make_doc("Australia is a country")

View File

@ -15,7 +15,8 @@ def vocab():
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
parser = DependencyParser(vocab, default_parser()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(vocab, default_parser(), **config)
parser.cfg["token_vector_width"] = 4 parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32 parser.cfg["hidden_width"] = 32
# parser.add_label('right') # parser.add_label('right')

View File

@ -270,7 +270,8 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"]) @pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label): def test_issue1967(label):
ner = EntityRecognizer(Vocab(), default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example( example = Example(
doc=Doc(ner.vocab, words=["word"]), doc=Doc(ner.vocab, words=["word"]),
token_annotation=TokenAnnotation( token_annotation=TokenAnnotation(

View File

@ -196,7 +196,8 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
ner = EntityRecognizer(doc.vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(doc.vocab, default_ner(), **config)
# Add the OUT action. I wouldn't have thought this would be necessary... # Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "") ner.moves.add_action(5, "")
ner.add_label("GPE") ner.add_label("GPE")

View File

@ -6,7 +6,8 @@ from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok(): def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens""" """Test that the parser doesn't have subtok label if not learn_tokens"""
parser = DependencyParser(Vocab(), default_parser()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
parser.begin_training(lambda: []) parser.begin_training(lambda: [])
@ -15,7 +16,8 @@ def test_issue3830_no_subtok():
def test_issue3830_with_subtok(): def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True.""" """Test that the parser does have subtok label if learn_tokens=True."""
parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True) config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
parser.begin_training(lambda: []) parser.begin_training(lambda: [])

View File

@ -74,6 +74,7 @@ def test_issue4042_bug2():
output_dir.mkdir() output_dir.mkdir()
ner1.to_disk(output_dir) ner1.to_disk(output_dir)
ner2 = EntityRecognizer(vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir) ner2.from_disk(output_dir)
assert len(ner2.labels) == 2 assert len(ner2.labels) == 2

View File

@ -12,7 +12,8 @@ def test_issue4313():
beam_width = 16 beam_width = 16
beam_density = 0.0001 beam_density = 0.0001
nlp = English() nlp = English()
ner = EntityRecognizer(nlp.vocab, default_ner()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL") ner.add_label("SOME_LABEL")
ner.begin_training([]) ner.begin_training([])
nlp.add_pipe(ner) nlp.add_pipe(ner)

View File

@ -1,12 +1,30 @@
import pytest import pickle
import numpy import numpy
from spacy.lang.en import English from spacy.lang.en import English
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tests.util import make_tempdir
def test_pickle_ner():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
def test_issue4725(): def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors # ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f") data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0

View File

@ -12,7 +12,8 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture @pytest.fixture
def parser(en_vocab): def parser(en_vocab):
parser = DependencyParser(en_vocab, default_parser()) config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
parser = DependencyParser(en_vocab, default_parser(), **config)
parser.add_label("nsubj") parser.add_label("nsubj")
return parser return parser

View File

@ -186,7 +186,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
return nlp.from_disk(model_path, exclude=disable) return nlp.from_disk(model_path, exclude=disable)
def load_model_from_config(nlp_config): def load_model_from_config(nlp_config, replace=False):
if "name" in nlp_config: if "name" in nlp_config:
nlp = load_model(**nlp_config) nlp = load_model(**nlp_config)
elif "lang" in nlp_config: elif "lang" in nlp_config:
@ -197,8 +197,15 @@ def load_model_from_config(nlp_config):
if "pipeline" in nlp_config: if "pipeline" in nlp_config:
for name, component_cfg in nlp_config["pipeline"].items(): for name, component_cfg in nlp_config["pipeline"].items():
factory = component_cfg.pop("factory") factory = component_cfg.pop("factory")
component = nlp.create_pipe(factory, config=component_cfg) if name in nlp.pipe_names:
nlp.add_pipe(component, name=name) if replace:
component = nlp.create_pipe(factory, config=component_cfg)
nlp.replace_pipe(name, component)
else:
raise ValueError(Errors.E985.format(component=name))
else:
component = nlp.create_pipe(factory, config=component_cfg)
nlp.add_pipe(component, name=name)
return nlp return nlp
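A minimal sketch of the config shape this function consumes and of the new replace flag; the base model name is hypothetical and must already provide an "ner" pipe for the replace branch to be exercised.

from spacy.util import load_model_from_config

nlp_config = {
    "name": "en_core_web_sm",  # hypothetical installed base model that already has an "ner" pipe
    "pipeline": {
        "ner": {"factory": "ner", "min_action_freq": 10},
    },
}
# replace=False (the default) raises E985 because "ner" already exists in the base model;
# replace=True builds a fresh component from the config and swaps it in via nlp.replace_pipe.
nlp = load_model_from_config(nlp_config, replace=True)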

View File

@ -46,17 +46,19 @@ Update the evaluation scores from a single [`Doc`](/api/doc) /
## Properties ## Properties
| Name | Type | Description | | Name | Type | Description |
| ----------------------------------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------- |
| `token_acc` | float | Tokenization accuracy. | | `token_acc` | float | Tokenization accuracy. |
| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | | `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
| `uas` | float | Unlabelled dependency score. | | `uas` | float | Unlabelled dependency score. |
| `las` | float | Labelled dependency score. | | `las` | float | Labelled dependency score. |
| `ents_p` | float | Named entity accuracy (precision). | | `ents_p` | float | Named entity accuracy (precision). |
| `ents_r` | float | Named entity accuracy (recall). | | `ents_r` | float | Named entity accuracy (recall). |
| `ents_f` | float | Named entity accuracy (F-score). | | `ents_f` | float | Named entity accuracy (F-score). |
| `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | | `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
| `textcat_score` <Tag variant="new">2.2</Tag> | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). | | `textcat_f` <Tag variant="new">3.0</Tag> | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. |
| `textcats_per_cat` <Tag variant="new">2.2</Tag> | dict | Scores per textcat label, keyed by label. | | `textcat_auc` <Tag variant="new">3.0</Tag> | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). |
| `las_per_type` <Tag variant="new">2.2.3</Tag> | dict | Labelled dependency scores, keyed by label. | | `textcats_f_per_cat` <Tag variant="new">3.0</Tag> | dict | F-scores per textcat label, keyed by label. |
| `scores` | dict | All scores, keyed by type. | | `textcats_auc_per_cat` <Tag variant="new">3.0</Tag> | dict | ROC AUC scores per textcat label, keyed by label. |
| `las_per_type` <Tag variant="new">2.2.3</Tag> | dict | Labelled dependency scores, keyed by label. |
| `scores` | dict | All scores, keyed by type. |