Tidy up and auto-format

This commit is contained in:
Ines Montani 2020-06-20 14:15:04 +02:00
parent a1c5b694be
commit 8283df80e9
39 changed files with 421 additions and 232 deletions

View File

@ -24,8 +24,8 @@ from ..gold import Example
output_dir=("Directory to write models to on each epoch", "positional", None, Path), output_dir=("Directory to write models to on each epoch", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path), config_path=("Path to config file", "positional", None, Path),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
# fmt: on # fmt: on
) )
def pretrain( def pretrain(

View File

@ -3,7 +3,6 @@ from timeit import default_timer as timer
import srsly import srsly
from pydantic import BaseModel, FilePath from pydantic import BaseModel, FilePath
import plac
import tqdm import tqdm
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
@ -16,7 +15,9 @@ from ..gold import GoldCorpus
from ..lookups import Lookups from ..lookups import Lookups
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
from ..ml import models # don't remove - required to load the built-in architectures
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
registry = util.registry registry = util.registry
@ -114,33 +115,19 @@ class ConfigSchema(BaseModel):
extra = "allow" extra = "allow"
@plac.annotations(
# fmt: off
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
output_path=("Output directory to store model in", "option", "o", Path),
init_tok2vec=(
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
use_gpu=("Use GPU", "option", "g", int),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
# fmt: on
)
def train_cli( def train_cli(
train_path, # fmt: off
dev_path, train_path: ("Location of JSON-formatted training data", "positional", None, Path),
config_path, dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
output_path=None, config_path: ("Path to config file", "positional", None, Path),
init_tok2vec=None, output_path: ("Output directory to store model in", "option", "o", Path) = None,
raw_text=None, init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
verbose=False, raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
use_gpu=-1, verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
tag_map_path=None, use_gpu: ("Use GPU", "option", "g", int) = -1,
omit_extra_lookups=False, tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
# fmt: on
): ):
""" """
Train or update a spaCy model. Requires data to be formatted in spaCy's Train or update a spaCy model. Requires data to be formatted in spaCy's
@ -227,7 +214,9 @@ def train(
# verify textcat config # verify textcat config
if "textcat" in nlp_config["pipeline"]: if "textcat" in nlp_config["pipeline"]:
textcat_labels = set(nlp.get_pipe("textcat").labels) textcat_labels = set(nlp.get_pipe("textcat").labels)
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
"exclusive_classes"
]
# check whether the setting 'exclusive_classes' corresponds to the provided training data # check whether the setting 'exclusive_classes' corresponds to the provided training data
if textcat_multilabel: if textcat_multilabel:
@ -255,7 +244,9 @@ def train(
"to 'false' in the config to train a classifier with classes " "to 'false' in the config to train a classifier with classes "
"that are not mutually exclusive." "that are not mutually exclusive."
) )
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") msg.info(
f"Initialized textcat component for {len(textcat_labels)} unique labels"
)
nlp.get_pipe("textcat").labels = tuple(textcat_labels) nlp.get_pipe("textcat").labels = tuple(textcat_labels)
# if 'positive_label' is provided: double check whether it's in the data and the task is binary # if 'positive_label' is provided: double check whether it's in the data and the task is binary
@ -281,9 +272,7 @@ def train(
nlp.resume_training() nlp.resume_training()
else: else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
nlp.begin_training( nlp.begin_training(lambda: corpus.train_examples)
lambda: corpus.train_examples
)
# Update tag map with provided mapping # Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map) nlp.vocab.morphology.tag_map.update(tag_map)
@ -310,8 +299,7 @@ def train(
tok2vec = tok2vec.get(subpath) tok2vec = tok2vec.get(subpath)
if not tok2vec: if not tok2vec:
msg.fail( msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.", f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
exits=1,
) )
tok2vec.from_bytes(weights_data) tok2vec.from_bytes(weights_data)
@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg):
train_examples = list( train_examples = list(
corpus.train_dataset( corpus.train_dataset(
nlp, nlp,
noise_level=0.0, # I think this is deprecated? noise_level=0.0, # I think this is deprecated?
orth_variant_level=cfg["orth_variant_level"], orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"], gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"], max_length=cfg["max_length"],
@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
try: try:
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
except KeyError as e: except KeyError as e:
raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) raise KeyError(
Errors.E983.format(
dict_name="score_weights", key=str(e), keys=list(scores.keys())
)
)
scores["speed"] = wps scores["speed"] = wps
return weighted_score, scores return weighted_score, scores
@ -578,15 +570,25 @@ def setup_printer(training, nlp):
] ]
except KeyError as e: except KeyError as e:
raise KeyError( raise KeyError(
Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) Errors.E983.format(
dict_name="scores (losses)",
key=str(e),
keys=list(info["losses"].keys()),
)
)
try: try:
scores = [ scores = [
"{0:.2f}".format(float(info["other_scores"][col])) "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
for col in score_cols
] ]
except KeyError as e: except KeyError as e:
raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) raise KeyError(
Errors.E983.format(
dict_name="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
)
data = ( data = (
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
) )

View File

@ -1,4 +1,3 @@
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors from .errors import Errors
from .lookups import Lookups from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES from .parts_of_speech import NAMES as UPOS_NAMES
@ -51,7 +50,13 @@ class Lemmatizer(object):
index_table = self.lookups.get_table("lemma_index", {}) index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {}) exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {}) rules_table = self.lookups.get_table("lemma_rules", {})
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn": if univ_pos == "propn":
return [string] return [string]
else: else:

View File

@ -1 +1 @@
from .models import * from .models import * # noqa: F401, F403

View File

@ -1,11 +1,8 @@
"""Thinc layer to do simpler transition-based parsing, NER, etc.""" """Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional from typing import Dict, Optional
import numpy import numpy
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list from thinc.api import Model
from thinc.api import to_numpy from thinc.types import Padded, Floats3d
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from ..tokens import Doc
def BILUO() -> Model[Padded, Padded]: def BILUO() -> Model[Padded, Padded]:
@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]:
forward, forward,
init=init, init=init,
dims={"nO": None}, dims={"nO": None},
attrs={"get_num_actions": get_num_actions} attrs={"get_num_actions": get_num_actions},
) )
def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
if X is not None and Y is not None: if X is not None and Y is not None:
if X.data.shape != Y.data.shape: if X.data.shape != Y.data.shape:
# TODO: Fix error # TODO: Fix error
@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
masks = model.ops.alloc3f(*Y.shape) masks = model.ops.alloc3f(*Y.shape)
max_value = Xp.data.max() max_value = Xp.data.max()
for t in range(Xp.data.shape[0]): for t in range(Xp.data.shape[0]):
is_last = (Xp.lengths < (t+2)).astype("i") is_last = (Xp.lengths < (t + 2)).astype("i")
masks[t] = valid_transitions[is_last, prev_actions] masks[t] = valid_transitions[is_last, prev_actions]
# Don't train the out-of-bounds sequences. # Don't train the out-of-bounds sequences.
masks[t, Xp.size_at_t[t]:] = 0 masks[t, Xp.size_at_t[t] :] = 0
# Valid actions get 0*10e8, invalid get large negative value # Valid actions get 0*10e8, invalid get large negative value
Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
prev_actions = Y[t].argmax(axis=-1) prev_actions = Y[t].argmax(axis=-1)
def backprop_biluo(dY: Padded) -> Padded: def backprop_biluo(dY: Padded) -> Padded:
@ -83,13 +80,13 @@ def _get_transition_table(
B_start, B_end = (0, n_labels) B_start, B_end = (0, n_labels)
I_start, I_end = (B_end, B_end + n_labels) I_start, I_end = (B_end, B_end + n_labels)
L_start, L_end = (I_end, I_end + n_labels) L_start, L_end = (I_end, I_end + n_labels)
U_start, U_end = (L_end, L_end + n_labels) U_start, U_end = (L_end, L_end + n_labels) # noqa: F841
# Using ranges allows us to set specific cells, which is necessary to express # Using ranges allows us to set specific cells, which is necessary to express
# that only actions of the same label are valid continuations. # that only actions of the same label are valid continuations.
B_range = numpy.arange(B_start, B_end) B_range = numpy.arange(B_start, B_end)
I_range = numpy.arange(I_start, I_end) I_range = numpy.arange(I_start, I_end)
L_range = numpy.arange(L_start, L_end) L_range = numpy.arange(L_start, L_end)
O_action = U_end O_action = U_end # noqa: F841
# If this is the last token and the previous action was B or I, only L # If this is the last token and the previous action was B or I, only L
# of that label is valid # of that label is valid
table[1, B_range, L_range] = 1 table[1, B_range, L_range] = 1

View File

@ -1,9 +1,7 @@
"""Thinc layer to do simpler transition-based parsing, NER, etc.""" """Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional from typing import Dict, Optional
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list from thinc.api import Ops, Model
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d from thinc.types import Padded, Floats3d
from ..tokens import Doc
def IOB() -> Model[Padded, Padded]: def IOB() -> Model[Padded, Padded]:
@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]:
forward, forward,
init=init, init=init,
dims={"nO": None}, dims={"nO": None},
attrs={"get_num_actions": get_num_actions} attrs={"get_num_actions": get_num_actions},
) )
def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
if X is not None and Y is not None: if X is not None and Y is not None:
if X.data.shape != Y.data.shape: if X.data.shape != Y.data.shape:
# TODO: Fix error # TODO: Fix error
@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
for t in range(Xp.data.shape[0]): for t in range(Xp.data.shape[0]):
masks[t] = valid_transitions[prev_actions] masks[t] = valid_transitions[prev_actions]
# Don't train the out-of-bounds sequences. # Don't train the out-of-bounds sequences.
masks[t, Xp.size_at_t[t]:] = 0 masks[t, Xp.size_at_t[t] :] = 0
# Valid actions get 0*10e8, invalid get -1*10e8 # Valid actions get 0*10e8, invalid get -1*10e8
Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
prev_actions = Y[t].argmax(axis=-1) prev_actions = Y[t].argmax(axis=-1)
def backprop_biluo(dY: Padded) -> Padded: def backprop_biluo(dY: Padded) -> Padded:
# Masking the gradient seems to do poorly here. But why? # Masking the gradient seems to do poorly here. But why?
#dY.data *= masks # dY.data *= masks
return dY return dY
return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
@ -83,7 +81,7 @@ def _get_transition_table(
B_range = ops.xp.arange(B_start, B_end) B_range = ops.xp.arange(B_start, B_end)
I_range = ops.xp.arange(I_start, I_end) I_range = ops.xp.arange(I_start, I_end)
# B and O are always valid # B and O are always valid
table[:, B_start : B_end] = 1 table[:, B_start:B_end] = 1
table[:, O_action] = 1 table[:, O_action] = 1
# I can only follow a matching B # I can only follow a matching B
table[B_range, I_range] = 1 table[B_range, I_range] = 1

View File

@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
# #
# (ids < 0).T @ dY # (ids < 0).T @ dY
mask = model.ops.asarray(ids < 0, dtype="f") mask = model.ops.asarray(ids < 0, dtype="f")
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
return d_pad.reshape((1, nF, nO, nP)) return d_pad.reshape((1, nF, nO, nP))

View File

@ -1,6 +1,6 @@
from .entity_linker import * # noqa from .entity_linker import * # noqa
from .parser import * # noqa from .parser import * # noqa
from .simple_ner import * from .simple_ner import * # noqa
from .tagger import * # noqa from .tagger import * # noqa
from .textcat import * # noqa from .textcat import * # noqa
from .tok2vec import * # noqa from .tok2vec import * # noqa

View File

@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
softmax = Softmax(nO=nO, nI=token_vector_width * 2) softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain( model = chain(
tok2vec, tok2vec,
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), Maxout(
nO=token_vector_width * 2,
nI=token_vector_width,
nP=maxout_pieces,
dropout=0.0,
),
LayerNorm(token_vector_width * 2), LayerNorm(token_vector_width * 2),
softmax, softmax,
) )
@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
# nO = vocab.vectors.data.shape[1] # nO = vocab.vectors.data.shape[1]
output_layer = chain( output_layer = chain(
Maxout( Maxout(
nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 nO=nO,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
normalize=True,
dropout=0.0,
), ),
Linear(nO=nO, nI=nO, init_W=zero_init), Linear(nO=nO, nI=nO, init_W=zero_init),
) )
@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
def mlm_forward(model, docs, is_train): def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop output, backprop = model.get_ref("wrapped-model").begin_update(
docs
) # drop=drop
def mlm_backward(d_output): def mlm_backward(d_output):
d_output *= 1 - mask d_output *= 1 - mask

View File

@ -16,18 +16,14 @@ def build_tb_parser_model(
nO=None, nO=None,
): ):
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain( tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
tok2vec,
with_array(Linear(hidden_width, t2v_width)),
list2array(),
)
tok2vec.set_dim("nO", hidden_width) tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine( lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO, nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens, nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"), nI=tok2vec.get_dim("nO"),
nP=maxout_pieces nP=maxout_pieces,
) )
if use_upper: if use_upper:
with use_ops("numpy"): with use_ops("numpy"):

View File

@ -1,9 +1,8 @@
import functools from typing import List
from typing import List, Tuple, Dict, Optional from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
from thinc.api import chain, list2padded, configure_normal_init from thinc.api import chain, list2padded, configure_normal_init
from thinc.api import Dropout from thinc.api import Dropout
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d from thinc.types import Floats2d
from ...tokens import Doc from ...tokens import Doc
from .._biluo import BILUO from .._biluo import BILUO
@ -12,12 +11,12 @@ from ...util import registry
@registry.architectures.register("spacy.BiluoTagger.v1") @registry.architectures.register("spacy.BiluoTagger.v1")
def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: def BiluoTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = BILUO() biluo = BILUO()
linear = Linear( linear = Linear(
nO=None, nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
nI=tok2vec.get_dim("nO"),
init_W=configure_normal_init(mean=0.02)
) )
model = chain( model = chain(
tok2vec, tok2vec,
@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
with_array(chain(Dropout(0.1), linear)), with_array(chain(Dropout(0.1), linear)),
biluo, biluo,
with_array(softmax_activation()), with_array(softmax_activation()),
padded2list() padded2list(),
) )
return Model( return Model(
@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
layers=[model, linear], layers=[model, linear],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None}, dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]} attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
) )
@registry.architectures.register("spacy.IOBTagger.v1") @registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: def IOBTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = IOB() biluo = IOB()
linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
model = chain( model = chain(
@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
with_array(linear), with_array(linear),
biluo, biluo,
with_array(softmax_activation()), with_array(softmax_activation()),
padded2list() padded2list(),
) )
return Model( return Model(
@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
layers=[model], layers=[model],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None}, dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]} attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
) )
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
if model.get_dim("nO") is None and Y: if model.get_dim("nO") is None and Y:
model.set_dim("nO", Y[0].shape[1]) model.set_dim("nO", Y[0].shape[1])

View File

@ -1,5 +1,4 @@
from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout from thinc.api import zero_init, with_array, Softmax, chain, Model
from thinc.api import glorot_uniform_init
from ...util import registry from ...util import registry

View File

@ -1,11 +1,12 @@
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed
from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
from ..spacy_vectors import SpacyVectors from ..spacy_vectors import SpacyVectors
from ... import util from ... import util
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCat.v1") @registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, def build_text_classifier(
window_size, conv_depth, dropout, nO=None): width,
embed_size,
pretrained_vectors,
exclusive_classes,
ngram_size,
window_size,
conv_depth,
dropout,
nO=None,
):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) lower = HashEmbed(
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) )
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) prefix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array( trained_vectors = FeatureExtractor(cols) >> with_array(
@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
vectors_width = width vectors_width = width
tok2vec = vector_layer >> with_array( tok2vec = vector_layer >> with_array(
Maxout(width, vectors_width, normalize=True) Maxout(width, vectors_width, normalize=True)
>> residual((expand_window(window_size=window_size) >> residual(
>> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, (
expand_window(window_size=window_size)
>> Maxout(
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
)
)
)
** conv_depth,
pad=conv_depth, pad=conv_depth,
) )
cnn_model = ( cnn_model = (
tok2vec tok2vec
>> list2ragged() >> list2ragged()
>> ParametricAttention(width) >> ParametricAttention(width)
>> reduce_sum() >> reduce_sum()
>> residual(Maxout(nO=width, nI=width)) >> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width) >> Linear(nO=nO, nI=width)
>> Dropout(0.0) >> Dropout(0.0)
) )
linear_model = build_bow_text_classifier( linear_model = build_bow_text_classifier(
nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False nO=nO,
ngram_size=ngram_size,
exclusive_classes=exclusive_classes,
no_output_layer=False,
) )
nO_double = nO*2 if nO else None nO_double = nO * 2 if nO else None
if exclusive_classes: if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double) output_layer = Softmax(nO=nO, nI=nO_double)
else: else:
output_layer = ( output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
)
model = (linear_model | cnn_model) >> output_layer model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False: if model.has_dim("nO") is not False:

View File

@ -99,7 +99,13 @@ def hash_charembed_cnn(
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1") @registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1( def hash_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout pretrained_vectors,
width,
depth,
embed_size,
subword_features,
maxout_pieces,
dropout,
): ):
# Does not use character embeddings: set to False by default # Does not use character embeddings: set to False by default
return build_Tok2Vec_model( return build_Tok2Vec_model(
@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") @registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(width, maxout_pieces): def LayerNormalizedMaxout(width, maxout_pieces):
return Maxout( return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
nO=width,
nP=maxout_pieces,
dropout=0.0,
normalize=True,
)
@registry.architectures.register("spacy.MultiHashEmbed.v1") @registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): def MultiHashEmbed(
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
if use_subwords: if use_subwords:
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) prefix = HashEmbed(
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) )
suffix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
)
if pretrained_vectors: if pretrained_vectors:
glove = StaticVectors( glove = StaticVectors(
@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
cnn = chain( cnn = chain(
expand_window(window_size=window_size), expand_window(window_size=window_size),
Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), Maxout(
nO=width,
nI=width * ((window_size * 2) + 1),
nP=maxout_pieces,
dropout=0.0,
normalize=True,
),
) )
model = clone(residual(cnn), depth) model = clone(residual(cnn), depth)
model.set_dim("nO", width) model.set_dim("nO", width)
@ -247,11 +262,19 @@ def build_Tok2Vec_model(
subword_features = False subword_features = False
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) norm = HashEmbed(
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
)
if subword_features: if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) prefix = HashEmbed(
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) )
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
)
else: else:
prefix, suffix, shape = (None, None, None) prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None: if pretrained_vectors is not None:

View File

@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
attrs={ attrs={
"has_upper": has_upper, "has_upper": has_upper,
"unseen_classes": set(unseen_classes), "unseen_classes": set(unseen_classes),
"resize_output": resize_output "resize_output": resize_output,
} },
) )
@ -31,14 +31,14 @@ def forward(model, X, is_train):
model.layers, model.layers,
unseen_classes=model.attrs["unseen_classes"], unseen_classes=model.attrs["unseen_classes"],
train=is_train, train=is_train,
has_upper=model.attrs["has_upper"] has_upper=model.attrs["has_upper"],
) )
return step_model, step_model.finish_steps return step_model, step_model.finish_steps
def init(model, X=None, Y=None): def init(model, X=None, Y=None):
tok2vec = model.get_ref("tok2vec").initialize(X=X) tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841
lower = model.get_ref("lower").initialize() lower = model.get_ref("lower").initialize()
if model.attrs["has_upper"]: if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
@ -46,7 +46,7 @@ def init(model, X=None, Y=None):
def resize_output(model, new_nO): def resize_output(model, new_nO):
tok2vec = model.get_ref("tok2vec") tok2vec = model.get_ref("tok2vec") # noqa: F841
lower = model.get_ref("lower") lower = model.get_ref("lower")
upper = model.get_ref("upper") upper = model.get_ref("upper")
if not model.attrs["has_upper"]: if not model.attrs["has_upper"]:
@ -62,7 +62,7 @@ def resize_output(model, new_nO):
nI = None nI = None
if smaller.has_dim("nI"): if smaller.has_dim("nI"):
nI = smaller.get_dim("nI") nI = smaller.get_dim("nI")
with use_ops('numpy'): with use_ops("numpy"):
larger = Linear(nO=new_nO, nI=nI) larger = Linear(nO=new_nO, nI=nI)
larger.init = smaller.init larger.init = smaller.init
# it could be that the model is not initialized yet, then skip this bit # it could be that the model is not initialized yet, then skip this bit
@ -74,8 +74,8 @@ def resize_output(model, new_nO):
# Weights are stored in (nr_out, nr_in) format, so we're basically # Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here. # just adding rows here.
if smaller.has_dim("nO"): if smaller.has_dim("nO"):
larger_W[:smaller.get_dim("nO")] = smaller_W larger_W[: smaller.get_dim("nO")] = smaller_W
larger_b[:smaller.get_dim("nO")] = smaller_b larger_b[: smaller.get_dim("nO")] = smaller_b
for i in range(smaller.get_dim("nO"), new_nO): for i in range(smaller.get_dim("nO"), new_nO):
model.attrs["unseen_classes"].add(i) model.attrs["unseen_classes"].add(i)

View File

@ -21,9 +21,7 @@ class SimpleNER(Pipe):
self.model = model self.model = model
self.cfg = {"labels": []} self.cfg = {"labels": []}
self.loss_func = SequenceCategoricalCrossentropy( self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(), names=self.get_tag_names(), normalize=True, missing_value=None
normalize=True,
missing_value=None
) )
assert self.model is not None assert self.model is not None
@ -42,17 +40,17 @@ class SimpleNER(Pipe):
def get_tag_names(self): def get_tag_names(self):
if self.is_biluo: if self.is_biluo:
return ( return (
[f"B-{label}" for label in self.labels] + [f"B-{label}" for label in self.labels]
[f"I-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels]
[f"L-{label}" for label in self.labels] + + [f"L-{label}" for label in self.labels]
[f"U-{label}" for label in self.labels] + + [f"U-{label}" for label in self.labels]
["O"] + ["O"]
) )
else: else:
return ( return (
[f"B-{label}" for label in self.labels] + [f"B-{label}" for label in self.labels]
[f"I-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels]
["O"] + ["O"]
) )
def predict(self, docs: List[Doc]) -> List[Floats2d]: def predict(self, docs: List[Doc]) -> List[Floats2d]:
@ -108,7 +106,7 @@ class SimpleNER(Pipe):
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
self.cfg.update(kwargs) self.cfg.update(kwargs)
if not hasattr(get_examples, '__call__'): if not hasattr(get_examples, "__call__"):
gold_tuples = get_examples gold_tuples = get_examples
get_examples = lambda: gold_tuples get_examples = lambda: gold_tuples
labels = _get_labels(get_examples()) labels = _get_labels(get_examples())
@ -122,9 +120,7 @@ class SimpleNER(Pipe):
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
self.loss_func = SequenceCategoricalCrossentropy( self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(), names=self.get_tag_names(), normalize=True, missing_value=None
normalize=True,
missing_value=None
) )
return sgd return sgd
@ -135,7 +131,7 @@ class SimpleNER(Pipe):
def _has_ner(eg): def _has_ner(eg):
for ner_tag in eg.gold.ner: for ner_tag in eg.gold.ner:
if ner_tag != "-" and ner_tag != None: if ner_tag != "-" and ner_tag is not None:
return True return True
else: else:
return False return False
@ -145,7 +141,7 @@ def _get_labels(examples):
labels = set() labels = set()
for eg in examples: for eg in examples:
for ner_tag in eg.token_annotation.entities: for ner_tag in eg.token_annotation.entities:
if ner_tag != 'O' and ner_tag != '-': if ner_tag != "O" and ner_tag != "-":
_, label = ner_tag.split('-', 1) _, label = ner_tag.split("-", 1)
labels.add(label) labels.add(label)
return list(sorted(labels)) return list(sorted(labels))

View File

@ -98,7 +98,9 @@ class Scorer(object):
for name, component in pipeline: for name, component in pipeline:
if name == "textcat": if name == "textcat":
self.textcat_multilabel = component.model.attrs["multi_label"] self.textcat_multilabel = component.model.attrs["multi_label"]
self.textcat_positive_label = component.cfg.get("positive_label", None) self.textcat_positive_label = component.cfg.get(
"positive_label", None
)
for label in component.cfg.get("labels", []): for label in component.cfg.get("labels", []):
self.textcat_auc_per_cat[label] = ROCAUCScore() self.textcat_auc_per_cat[label] = ROCAUCScore()
self.textcat_f_per_cat[label] = PRFScore() self.textcat_f_per_cat[label] = PRFScore()
@ -119,19 +121,19 @@ class Scorer(object):
@property @property
def morphs_acc(self): def morphs_acc(self):
"""RETURNS (float): Morph tag accuracy (morphological features, """RETURNS (float): Morph tag accuracy (morphological features,
i.e. `Token.morph`). i.e. `Token.morph`).
""" """
return self.morphs.fscore * 100 return self.morphs.fscore * 100
@property @property
def morphs_per_type(self): def morphs_per_type(self):
"""RETURNS (dict): Scores per dependency label. """RETURNS (dict): Scores per dependency label.
""" """
return { return {
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
for k, v in self.morphs_per_feat.items() for k, v in self.morphs_per_feat.items()
} }
@property @property
def sent_p(self): def sent_p(self):
@ -302,7 +304,15 @@ class Scorer(object):
gold_morphs_per_feat = {} gold_morphs_per_feat = {}
gold_sent_starts = set() gold_sent_starts = set()
gold_ents = set(tags_to_entities(orig.entities)) gold_ents = set(tags_to_entities(orig.entities))
for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): for id_, tag, pos, morph, head, dep, sent_start in zip(
orig.ids,
orig.tags,
orig.pos,
orig.morphs,
orig.heads,
orig.deps,
orig.sent_starts,
):
gold_tags.add((id_, tag)) gold_tags.add((id_, tag))
gold_pos.add((id_, pos)) gold_pos.add((id_, pos))
gold_morphs.add((id_, morph)) gold_morphs.add((id_, morph))
@ -400,7 +410,10 @@ class Scorer(object):
self.pos.score_set(cand_pos, gold_pos) self.pos.score_set(cand_pos, gold_pos)
self.morphs.score_set(cand_morphs, gold_morphs) self.morphs.score_set(cand_morphs, gold_morphs)
for field in self.morphs_per_feat: for field in self.morphs_per_feat:
self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) self.morphs_per_feat[field].score_set(
cand_morphs_per_feat.get(field, set()),
gold_morphs_per_feat.get(field, set()),
)
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
self.labelled.score_set(cand_deps, gold_deps) self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep: for dep in self.labelled_per_dep:
@ -412,7 +425,9 @@ class Scorer(object):
) )
if ( if (
len(gold.cats) > 0 len(gold.cats) > 0
and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) and set(self.textcat_f_per_cat)
== set(self.textcat_auc_per_cat)
== set(gold.cats)
and set(gold.cats) == set(doc.cats) and set(gold.cats) == set(doc.cats)
): ):
goldcat = max(gold.cats, key=gold.cats.get) goldcat = max(gold.cats, key=gold.cats.get)
@ -424,10 +439,10 @@ class Scorer(object):
) )
for label in set(gold.cats): for label in set(gold.cats):
self.textcat_auc_per_cat[label].score_set( self.textcat_auc_per_cat[label].score_set(
doc.cats[label], gold.cats[label] doc.cats[label], gold.cats[label]
) )
self.textcat_f_per_cat[label].score_set( self.textcat_f_per_cat[label].score_set(
set([label]) & set([candcat]), set([label]) & set([goldcat]) set([label]) & set([candcat]), set([label]) & set([goldcat])
) )
elif len(self.textcat_f_per_cat) > 0: elif len(self.textcat_f_per_cat) > 0:
model_labels = set(self.textcat_f_per_cat) model_labels = set(self.textcat_f_per_cat)

View File

@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab): def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config) ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([]) ner.begin_training([])
ner(doc) ner(doc)
@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab): def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config) ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([]) ner.begin_training([])
ner(doc) ner(doc)

View File

@ -1,9 +1,8 @@
import pytest import pytest
from thinc.api import Adam, NumpyOps from thinc.api import Adam
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.gold import GoldParse from spacy.gold import GoldParse
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser, default_ner from spacy.pipeline.defaults import default_parser, default_ner
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline import DependencyParser, EntityRecognizer
@ -17,7 +16,12 @@ def vocab():
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config) parser = DependencyParser(vocab, default_parser(), **config)
return parser return parser
@ -58,7 +62,12 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly(): def test_add_label_deserializes_correctly():
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(Vocab(), default_ner(), **config) ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
ner1.add_label("C") ner1.add_label("C")
ner1.add_label("B") ner1.add_label("B")

View File

@ -138,7 +138,12 @@ def test_get_oracle_actions():
deps.append(dep) deps.append(dep)
ents.append(ent) ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(doc.vocab, default_parser(), **config) parser = DependencyParser(doc.vocab, default_parser(), **config)
parser.moves.add_action(0, "") parser.moves.add_action(0, "")
parser.moves.add_action(1, "") parser.moves.add_action(1, "")

View File

@ -138,7 +138,12 @@ def test_accept_blocked_token():
# 1. test normal behaviour # 1. test normal behaviour
nlp1 = English() nlp1 = English()
doc1 = nlp1("I live in New York") doc1 = nlp1("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@ -157,7 +162,12 @@ def test_accept_blocked_token():
# 2. test blocking behaviour # 2. test blocking behaviour
nlp2 = English() nlp2 = English()
doc2 = nlp2("I live in New York") doc2 = nlp2("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
# set "New York" to a blocked entity # set "New York" to a blocked entity
@ -215,7 +225,12 @@ def test_overwrite_token():
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O # Check that a new ner can overwrite O
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
ner2.moves.add_action(5, "") ner2.moves.add_action(5, "")
ner2.add_label("GPE") ner2.add_label("GPE")

View File

@ -28,7 +28,12 @@ def tok2vec():
@pytest.fixture @pytest.fixture
def parser(vocab, arc_eager): def parser(vocab, arc_eager):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
return Parser(vocab, model=default_parser(), moves=arc_eager, **config) return Parser(vocab, model=default_parser(), moves=arc_eager, **config)

View File

@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores):
def test_beam_parse(): def test_beam_parse():
nlp = Language() nlp = Language()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
nlp.parser.add_label("nsubj") nlp.parser.add_label("nsubj")
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)

View File

@ -16,7 +16,12 @@ def vocab():
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config) parser = DependencyParser(vocab, default_parser(), **config)
parser.cfg["token_vector_width"] = 4 parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32 parser.cfg["hidden_width"] = 32

View File

@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English() nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
ruler = EntityRuler(nlp) ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
nlp.add_pipe(ruler) nlp.add_pipe(ruler)
@ -285,7 +287,11 @@ def test_overfitting_IO():
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) mykb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.5, 0.5],
)
# Create the Entity Linker component and add it to the pipeline # Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})

View File

@ -15,8 +15,17 @@ def test_label_types():
TRAIN_DATA = [ TRAIN_DATA = [
("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), (
("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), "I like green eggs",
{
"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
"pos": ["NOUN", "VERB", "ADJ", "NOUN"],
},
),
(
"Eat blue ham",
{"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
),
] ]
@ -38,7 +47,12 @@ def test_overfitting_IO():
# test the trained model # test the trained model
test_text = "I like blue eggs" test_text = "I like blue eggs"
doc = nlp(test_text) doc = nlp(test_text)
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] gold_morphs = [
"Feat=N|POS=NOUN",
"Feat=V|POS=VERB",
"Feat=J|POS=ADJ",
"Feat=N|POS=NOUN",
]
assert gold_morphs == [t.morph_ for t in doc] assert gold_morphs == [t.morph_ for t in doc]
# Also test the results are still the same after IO # Also test the results are still the same after IO

View File

@ -1,30 +1,31 @@
import pytest import pytest
from collections import namedtuple from collections import namedtuple
from thinc.api import NumpyOps from thinc.api import NumpyOps
from spacy.ml._biluo import BILUO, _get_transition_table from spacy.ml._biluo import BILUO, _get_transition_table
from spacy.pipeline.simple_ner import SimpleNER
import spacy
@pytest.fixture(params=[ @pytest.fixture(
["PER", "ORG", "LOC", "MISC"], params=[
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] ["PER", "ORG", "LOC", "MISC"],
]) ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
]
)
def labels(request): def labels(request):
return request.param return request.param
@pytest.fixture @pytest.fixture
def ops(): def ops():
return NumpyOps() return NumpyOps()
def _get_actions(labels): def _get_actions(labels):
action_names = ( action_names = (
[f"B{label}" for label in labels] + \ [f"B{label}" for label in labels]
[f"I{label}" for label in labels] + \ + [f"I{label}" for label in labels]
[f"L{label}" for label in labels] + \ + [f"L{label}" for label in labels]
[f"U{label}" for label in labels] + \ + [f"U{label}" for label in labels]
["O"] + ["O"]
) )
A = namedtuple("actions", action_names) A = namedtuple("actions", action_names)
return A(**{name: i for i, name in enumerate(action_names)}) return A(**{name: i for i, name in enumerate(action_names)})

View File

@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"]) @pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label): def test_issue1967(label):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(Vocab(), default_ner(), **config) ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example(doc=None) example = Example(doc=None)
example.set_token_annotation( example.set_token_annotation(

View File

@ -196,7 +196,12 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(doc.vocab, default_ner(), **config) ner = EntityRecognizer(doc.vocab, default_ner(), **config)
# Add the OUT action. I wouldn't have thought this would be necessary... # Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "") ner.moves.add_action(5, "")

View File

@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok(): def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens""" """Test that the parser doesn't have subtok label if not learn_tokens"""
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config) parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
@ -16,7 +21,12 @@ def test_issue3830_no_subtok():
def test_issue3830_with_subtok(): def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True.""" """Test that the parser does have subtok label if learn_tokens=True."""
config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config) parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels

View File

@ -74,7 +74,12 @@ def test_issue4042_bug2():
output_dir.mkdir() output_dir.mkdir()
ner1.to_disk(output_dir) ner1.to_disk(output_dir)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config) ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir) ner2.from_disk(output_dir)
assert len(ner2.labels) == 2 assert len(ner2.labels) == 2

View File

@ -12,7 +12,12 @@ def test_issue4313():
beam_width = 16 beam_width = 16
beam_density = 0.0001 beam_density = 0.0001
nlp = English() nlp = English()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config) ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL") ner.add_label("SOME_LABEL")
ner.begin_training([]) ner.begin_training([])

View File

@ -1,4 +1,3 @@
import pytest
from spacy.language import Language from spacy.language import Language

View File

@ -112,7 +112,7 @@ def test_serialize_custom_nlp():
nlp.to_disk(d) nlp.to_disk(d)
nlp2 = spacy.load(d) nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model model = nlp2.get_pipe("parser").model
tok2vec = model.get_ref("tok2vec") tok2vec = model.get_ref("tok2vec") # noqa: F841
upper = model.get_ref("upper") upper = model.get_ref("upper")
# check that we have the correct settings, not the default ones # check that we have the correct settings, not the default ones
@ -132,7 +132,7 @@ def test_serialize_parser():
nlp.to_disk(d) nlp.to_disk(d)
nlp2 = spacy.load(d) nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model model = nlp2.get_pipe("parser").model
tok2vec = model.get_ref("tok2vec") tok2vec = model.get_ref("tok2vec") # noqa: F841
upper = model.get_ref("upper") upper = model.get_ref("upper")
# check that we have the correct settings, not the default ones # check that we have the correct settings, not the default ones

View File

@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture @pytest.fixture
def parser(en_vocab): def parser(en_vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(en_vocab, default_parser(), **config) parser = DependencyParser(en_vocab, default_parser(), **config)
parser.add_label("nsubj") parser.add_label("nsubj")
return parser return parser

View File

@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b) new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b assert new_vocab1.to_bytes() == vocab1_b
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) assert sorted([s for s in new_vocab1.strings]) == sorted(
strings1 + list(default_strings)
)
@pytest.mark.parametrize("strings1,strings2", test_strings) @pytest.mark.parametrize("strings1,strings2", test_strings)

View File

@ -40,6 +40,7 @@ test_ner_apple = [
] ]
] ]
@pytest.fixture @pytest.fixture
def tagged_doc(): def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London." text = "Sarah's sister flew to Silicon Valley via London."
@ -184,7 +185,7 @@ def test_tag_score(tagged_doc):
tagged_doc, tagged_doc,
tags=[t.tag_ for t in tagged_doc], tags=[t.tag_ for t in tagged_doc],
pos=[t.pos_ for t in tagged_doc], pos=[t.pos_ for t in tagged_doc],
morphs=[t.morph_ for t in tagged_doc] morphs=[t.morph_ for t in tagged_doc],
) )
scorer.score((tagged_doc, gold)) scorer.score((tagged_doc, gold))
results = scorer.scores results = scorer.scores

View File

@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words
([400, 400, 199, 3], [4]), ([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 200], [3, 2]), ([400, 400, 199, 3, 200], [3, 2]),
([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 200], [3, 3]),
([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]),
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches):
examples = [Example(doc=doc) for doc in docs] examples = [Example(doc=doc) for doc in docs]
tol = 0.2 tol = 0.2
batch_size = 1000 batch_size = 1000
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) batches = list(
minibatch_by_words(
examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
)
)
assert [len(batch) for batch in batches] == expected_batches assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol max_size = batch_size + batch_size * tol
@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
examples = [Example(doc=doc) for doc in docs] examples = [Example(doc=doc) for doc in docs]
tol = 0.2 tol = 0.2
batch_size = 1000 batch_size = 1000
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) batches = list(
minibatch_by_words(
examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
)
)
assert [len(batch) for batch in batches] == expected_batches assert [len(batch) for batch in batches] == expected_batches

View File

@ -697,7 +697,9 @@ def decaying(start, stop, decay):
curr -= decay curr -= decay
def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): def minibatch_by_words(
examples, size, count_words=len, tolerance=0.2, discard_oversize=False
):
"""Create minibatches of roughly a given number of words. If any examples """Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True.""" themselves, or be discarded if discard_oversize=True."""