Tidy up and auto-format

Ines Montani 2020-06-20 14:15:04 +02:00
parent a1c5b694be
commit 8283df80e9
39 changed files with 421 additions and 232 deletions
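Most hunks below are mechanical reformatting: long calls wrapped one argument per line with trailing commas, spacing and quotes normalized, and linter directives added. The pattern is consistent with an auto-formatter such as black combined with flake8; as an illustration only (not part of the commit), the two recurring directives look like this:

# Illustrative sketch: "# fmt: off" / "# fmt: on" tell an auto-formatter such
# as black to leave a block untouched, and "# noqa: F401" tells flake8 to skip
# its "imported but unused" warning for an import kept on purpose.
import json  # noqa: F401  # unused here; kept only to illustrate the directive
from pathlib import Path

# fmt: off
LOOKUP = [
    (1,   "one"),
    (10,  "ten"),
    (100, "hundred"),
]
# fmt: on


def describe(path: Path = Path(".")) -> str:
    return f"{len(LOOKUP)} aligned entries under {path}"


if __name__ == "__main__":
    print(describe())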

View File

@ -24,8 +24,8 @@ from ..gold import Example
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
use_gpu=("Use GPU", "option", "g", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
# fmt: on
)
def pretrain(

View File

@ -3,7 +3,6 @@ from timeit import default_timer as timer
import srsly
from pydantic import BaseModel, FilePath
import plac
import tqdm
from pathlib import Path
from wasabi import msg
@ -16,7 +15,9 @@ from ..gold import GoldCorpus
from ..lookups import Lookups
from .. import util
from ..errors import Errors
from ..ml import models # don't remove - required to load the built-in architectures
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
registry = util.registry
@ -114,33 +115,19 @@ class ConfigSchema(BaseModel):
extra = "allow"
@plac.annotations(
# fmt: off
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
output_path=("Output directory to store model in", "option", "o", Path),
init_tok2vec=(
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
use_gpu=("Use GPU", "option", "g", int),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
# fmt: on
)
def train_cli(
train_path,
dev_path,
config_path,
output_path=None,
init_tok2vec=None,
raw_text=None,
verbose=False,
use_gpu=-1,
tag_map_path=None,
omit_extra_lookups=False,
# fmt: off
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
config_path: ("Path to config file", "positional", None, Path),
output_path: ("Output directory to store model in", "option", "o", Path) = None,
init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
use_gpu: ("Use GPU", "option", "g", int) = -1,
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
# fmt: on
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
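In the hunk above, the CLI arguments move from a @plac.annotations decorator into the function signature itself; plac accepts both forms, each annotation being a (help, kind, abbreviation, type) tuple. A minimal standalone sketch of the signature style (hypothetical script, assumes plac is installed; not part of the commit):

import plac
from pathlib import Path


def main(
    # fmt: off
    data_path: ("Location of training data", "positional", None, Path),
    output_path: ("Directory to store output in", "option", "o", Path) = None,
    verbose: ("Display more information", "flag", "v", bool) = False,
    # fmt: on
):
    # plac builds the argument parser from the annotations above
    print(data_path, output_path, verbose)


if __name__ == "__main__":
    plac.call(main)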
@ -212,7 +199,7 @@ def train(
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"])
if config["training"].get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this.
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory()
nlp_config = config["nlp"]
config = util.load_config(config_path, create_objects=True)
@ -227,7 +214,9 @@ def train(
# verify textcat config
if "textcat" in nlp_config["pipeline"]:
textcat_labels = set(nlp.get_pipe("textcat").labels)
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"]
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
"exclusive_classes"
]
# check whether the setting 'exclusive_classes' corresponds to the provided training data
if textcat_multilabel:
@ -255,7 +244,9 @@ def train(
"to 'false' in the config to train a classifier with classes "
"that are not mutually exclusive."
)
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
msg.info(
f"Initialized textcat component for {len(textcat_labels)} unique labels"
)
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
# if 'positive_label' is provided: double check whether it's in the data and the task is binary
@ -281,9 +272,7 @@ def train(
nlp.resume_training()
else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
nlp.begin_training(
lambda: corpus.train_examples
)
nlp.begin_training(lambda: corpus.train_examples)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
@ -310,8 +299,7 @@ def train(
tok2vec = tok2vec.get(subpath)
if not tok2vec:
msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.",
exits=1,
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
)
tok2vec.from_bytes(weights_data)
@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg):
train_examples = list(
corpus.train_dataset(
nlp,
noise_level=0.0, # I think this is deprecated?
noise_level=0.0, # I think this is deprecated?
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],
@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
try:
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
except KeyError as e:
raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
raise KeyError(
Errors.E983.format(
dict_name="score_weights", key=str(e), keys=list(scores.keys())
)
)
scores["speed"] = wps
return weighted_score, scores
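For context, the line above combines per-metric scores into a single number via the score_weights mapping, and a weight key missing from the scores raises the E983 error shown. Illustrative only, with made-up numbers:

# Hypothetical scores and weights, not taken from the commit
scores = {"ents_f": 0.80, "tags_acc": 0.95, "uas": 0.90}
weights = {"ents_f": 0.5, "uas": 0.5}
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
print(weighted_score)  # 0.5 * 0.80 + 0.5 * 0.90 = 0.85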
@ -578,15 +570,25 @@ def setup_printer(training, nlp):
]
except KeyError as e:
raise KeyError(
Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
Errors.E983.format(
dict_name="scores (losses)",
key=str(e),
keys=list(info["losses"].keys()),
)
)
try:
scores = [
"{0:.2f}".format(float(info["other_scores"][col]))
for col in score_cols
"{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
]
except KeyError as e:
raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
raise KeyError(
Errors.E983.format(
dict_name="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
)
data = (
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
)

View File

@ -1,4 +1,3 @@
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors
from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
@ -51,7 +50,13 @@ class Lemmatizer(object):
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn":
return [string]
else:

View File

@ -1 +1 @@
from .models import *
from .models import * # noqa: F401, F403

View File

@ -1,11 +1,8 @@
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
from typing import Dict, Optional
import numpy
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.api import to_numpy
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from ..tokens import Doc
from thinc.api import Model
from thinc.types import Padded, Floats3d
def BILUO() -> Model[Padded, Padded]:
@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]:
forward,
init=init,
dims={"nO": None},
attrs={"get_num_actions": get_num_actions}
attrs={"get_num_actions": get_num_actions},
)
def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
if X is not None and Y is not None:
if X.data.shape != Y.data.shape:
# TODO: Fix error
@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
masks = model.ops.alloc3f(*Y.shape)
max_value = Xp.data.max()
for t in range(Xp.data.shape[0]):
is_last = (Xp.lengths < (t+2)).astype("i")
is_last = (Xp.lengths < (t + 2)).astype("i")
masks[t] = valid_transitions[is_last, prev_actions]
# Don't train the out-of-bounds sequences.
masks[t, Xp.size_at_t[t]:] = 0
masks[t, Xp.size_at_t[t] :] = 0
# Valid actions get 0*10e8, invalid get large negative value
Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10)
Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
prev_actions = Y[t].argmax(axis=-1)
def backprop_biluo(dY: Padded) -> Padded:
@ -83,13 +80,13 @@ def _get_transition_table(
B_start, B_end = (0, n_labels)
I_start, I_end = (B_end, B_end + n_labels)
L_start, L_end = (I_end, I_end + n_labels)
U_start, U_end = (L_end, L_end + n_labels)
U_start, U_end = (L_end, L_end + n_labels) # noqa: F841
# Using ranges allows us to set specific cells, which is necessary to express
# that only actions of the same label are valid continuations.
B_range = numpy.arange(B_start, B_end)
I_range = numpy.arange(I_start, I_end)
L_range = numpy.arange(L_start, L_end)
O_action = U_end
O_action = U_end # noqa: F841
# If this is the last token and the previous action was B or I, only L
# of that label is valid
table[1, B_range, L_range] = 1
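The "using ranges" comment above refers to numpy fancy indexing: passing two equal-length index arrays addresses the element-wise pairs (B_range[i], L_range[i]) rather than the whole B x L block, which encodes "a B action may only be continued by the L action of the same label". A small sketch with a hypothetical three-label scheme (not part of the commit):

import numpy

n_labels = 3  # e.g. PER, ORG, LOC
n_actions = n_labels * 4 + 1  # B-, I-, L-, U- per label, plus O
table = numpy.zeros((2, n_actions, n_actions), dtype="f")
B_range = numpy.arange(0, n_labels)                  # B-PER, B-ORG, B-LOC
L_range = numpy.arange(2 * n_labels, 3 * n_labels)   # L-PER, L-ORG, L-LOC
# Element-wise pairing: B-X may only be followed by L-X of the same label
table[1, B_range, L_range] = 1
print(table[1, 0])  # previous action B-PER: only L-PER is marked valid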

View File

@ -1,9 +1,7 @@
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from ..tokens import Doc
from typing import Dict, Optional
from thinc.api import Ops, Model
from thinc.types import Padded, Floats3d
def IOB() -> Model[Padded, Padded]:
@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]:
forward,
init=init,
dims={"nO": None},
attrs={"get_num_actions": get_num_actions}
attrs={"get_num_actions": get_num_actions},
)
def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
if X is not None and Y is not None:
if X.data.shape != Y.data.shape:
# TODO: Fix error
@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
for t in range(Xp.data.shape[0]):
masks[t] = valid_transitions[prev_actions]
# Don't train the out-of-bounds sequences.
masks[t, Xp.size_at_t[t]:] = 0
masks[t, Xp.size_at_t[t] :] = 0
# Valid actions get 0*10e8, invalid get -1*10e8
Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8)
Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
prev_actions = Y[t].argmax(axis=-1)
def backprop_biluo(dY: Padded) -> Padded:
# Masking the gradient seems to do poorly here. But why?
#dY.data *= masks
# dY.data *= masks
return dY
return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
@ -83,10 +81,10 @@ def _get_transition_table(
B_range = ops.xp.arange(B_start, B_end)
I_range = ops.xp.arange(I_start, I_end)
# B and O are always valid
table[:, B_start : B_end] = 1
table[:, B_start:B_end] = 1
table[:, O_action] = 1
# I can only follow a matching B
table[B_range, I_range] = 1
_cache[n_actions] = table
return table

View File

@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
#
# (ids < 0).T @ dY
mask = model.ops.asarray(ids < 0, dtype="f")
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True)
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
return d_pad.reshape((1, nF, nO, nP))

View File

@ -1,6 +1,6 @@
from .entity_linker import * # noqa
from .parser import * # noqa
from .simple_ner import *
from .simple_ner import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain(
tok2vec,
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
Maxout(
nO=token_vector_width * 2,
nI=token_vector_width,
nP=maxout_pieces,
dropout=0.0,
),
LayerNorm(token_vector_width * 2),
softmax,
)
@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
# nO = vocab.vectors.data.shape[1]
output_layer = chain(
Maxout(
nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
nO=nO,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
normalize=True,
dropout=0.0,
),
Linear(nO=nO, nI=nO, init_W=zero_init),
)
@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop
output, backprop = model.get_ref("wrapped-model").begin_update(
docs
) # drop=drop
def mlm_backward(d_output):
d_output *= 1 - mask

View File

@ -16,18 +16,14 @@ def build_tb_parser_model(
nO=None,
):
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
with_array(Linear(hidden_width, t2v_width)),
list2array(),
)
tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces
nP=maxout_pieces,
)
if use_upper:
with use_ops("numpy"):

View File

@ -1,9 +1,8 @@
import functools
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
from typing import List
from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
from thinc.api import chain, list2padded, configure_normal_init
from thinc.api import Dropout
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from thinc.types import Floats2d
from ...tokens import Doc
from .._biluo import BILUO
@ -12,12 +11,12 @@ from ...util import registry
@registry.architectures.register("spacy.BiluoTagger.v1")
def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
def BiluoTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = BILUO()
linear = Linear(
nO=None,
nI=tok2vec.get_dim("nO"),
init_W=configure_normal_init(mean=0.02)
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
)
model = chain(
tok2vec,
@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
with_array(chain(Dropout(0.1), linear)),
biluo,
with_array(softmax_activation()),
padded2list()
padded2list(),
)
return Model(
@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
layers=[model, linear],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
)
@registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
def IOBTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = IOB()
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
model = chain(
@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
with_array(linear),
biluo,
with_array(softmax_activation()),
padded2list()
padded2list(),
)
return Model(
@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
layers=[model],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
)
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
if model.get_dim("nO") is None and Y:
model.set_dim("nO", Y[0].shape[1])

View File

@ -1,5 +1,4 @@
from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
from thinc.api import glorot_uniform_init
from thinc.api import zero_init, with_array, Softmax, chain, Model
from ...util import registry

View File

@ -1,11 +1,12 @@
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
from thinc.api import chain, concatenate, clone, Dropout
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout
from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed
from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
from ..spacy_vectors import SpacyVectors
from ... import util
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
window_size, conv_depth, dropout, nO=None):
def build_text_classifier(
width,
embed_size,
pretrained_vectors,
exclusive_classes,
ngram_size,
window_size,
conv_depth,
dropout,
nO=None,
):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
lower = HashEmbed(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
)
prefix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
vectors_width = width
tok2vec = vector_layer >> with_array(
Maxout(width, vectors_width, normalize=True)
>> residual((expand_window(window_size=window_size)
>> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
>> residual(
(
expand_window(window_size=window_size)
>> Maxout(
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
)
)
)
** conv_depth,
pad=conv_depth,
)
cnn_model = (
tok2vec
>> list2ragged()
>> ParametricAttention(width)
>> reduce_sum()
>> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width)
>> Dropout(0.0)
tok2vec
>> list2ragged()
>> ParametricAttention(width)
>> reduce_sum()
>> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width)
>> Dropout(0.0)
)
linear_model = build_bow_text_classifier(
nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
nO=nO,
ngram_size=ngram_size,
exclusive_classes=exclusive_classes,
no_output_layer=False,
)
nO_double = nO*2 if nO else None
nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = (
Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
)
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:

View File

@ -99,7 +99,13 @@ def hash_charembed_cnn(
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
pretrained_vectors,
width,
depth,
embed_size,
subword_features,
maxout_pieces,
dropout,
):
# Does not use character embeddings: set to False by default
return build_Tok2Vec_model(
@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(width, maxout_pieces):
return Maxout(
nO=width,
nP=maxout_pieces,
dropout=0.0,
normalize=True,
)
return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
def MultiHashEmbed(
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
if use_subwords:
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
prefix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
)
suffix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
)
if pretrained_vectors:
glove = StaticVectors(
@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
cnn = chain(
expand_window(window_size=window_size),
Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
Maxout(
nO=width,
nI=width * ((window_size * 2) + 1),
nP=maxout_pieces,
dropout=0.0,
normalize=True,
),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
@ -247,11 +262,19 @@ def build_Tok2Vec_model(
subword_features = False
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
norm = HashEmbed(
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
)
if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
prefix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:

View File

@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
attrs={
"has_upper": has_upper,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output
}
"resize_output": resize_output,
},
)
@ -31,14 +31,14 @@ def forward(model, X, is_train):
model.layers,
unseen_classes=model.attrs["unseen_classes"],
train=is_train,
has_upper=model.attrs["has_upper"]
has_upper=model.attrs["has_upper"],
)
return step_model, step_model.finish_steps
def init(model, X=None, Y=None):
tok2vec = model.get_ref("tok2vec").initialize(X=X)
tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841
lower = model.get_ref("lower").initialize()
if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
@ -46,7 +46,7 @@ def init(model, X=None, Y=None):
def resize_output(model, new_nO):
tok2vec = model.get_ref("tok2vec")
tok2vec = model.get_ref("tok2vec") # noqa: F841
lower = model.get_ref("lower")
upper = model.get_ref("upper")
if not model.attrs["has_upper"]:
@ -62,7 +62,7 @@ def resize_output(model, new_nO):
nI = None
if smaller.has_dim("nI"):
nI = smaller.get_dim("nI")
with use_ops('numpy'):
with use_ops("numpy"):
larger = Linear(nO=new_nO, nI=nI)
larger.init = smaller.init
# it could be that the model is not initialized yet, then skip this bit
@ -74,8 +74,8 @@ def resize_output(model, new_nO):
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
if smaller.has_dim("nO"):
larger_W[:smaller.get_dim("nO")] = smaller_W
larger_b[:smaller.get_dim("nO")] = smaller_b
larger_W[: smaller.get_dim("nO")] = smaller_W
larger_b[: smaller.get_dim("nO")] = smaller_b
for i in range(smaller.get_dim("nO"), new_nO):
model.attrs["unseen_classes"].add(i)

View File

@ -21,9 +21,7 @@ class SimpleNER(Pipe):
self.model = model
self.cfg = {"labels": []}
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(),
normalize=True,
missing_value=None
names=self.get_tag_names(), normalize=True, missing_value=None
)
assert self.model is not None
@ -38,21 +36,21 @@ class SimpleNER(Pipe):
def add_label(self, label):
if label not in self.cfg["labels"]:
self.cfg["labels"].append(label)
def get_tag_names(self):
if self.is_biluo:
return (
[f"B-{label}" for label in self.labels] +
[f"I-{label}" for label in self.labels] +
[f"L-{label}" for label in self.labels] +
[f"U-{label}" for label in self.labels] +
["O"]
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ [f"L-{label}" for label in self.labels]
+ [f"U-{label}" for label in self.labels]
+ ["O"]
)
else:
return (
[f"B-{label}" for label in self.labels] +
[f"I-{label}" for label in self.labels] +
["O"]
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ ["O"]
)
def predict(self, docs: List[Doc]) -> List[Floats2d]:
@ -108,7 +106,7 @@ class SimpleNER(Pipe):
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
self.cfg.update(kwargs)
if not hasattr(get_examples, '__call__'):
if not hasattr(get_examples, "__call__"):
gold_tuples = get_examples
get_examples = lambda: gold_tuples
labels = _get_labels(get_examples())
@ -117,14 +115,12 @@ class SimpleNER(Pipe):
labels = self.labels
n_actions = self.model.attrs["get_num_actions"](len(labels))
self.model.set_dim("nO", n_actions)
self.model.initialize()
self.model.initialize()
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(),
normalize=True,
missing_value=None
names=self.get_tag_names(), normalize=True, missing_value=None
)
return sgd
@ -135,7 +131,7 @@ class SimpleNER(Pipe):
def _has_ner(eg):
for ner_tag in eg.gold.ner:
if ner_tag != "-" and ner_tag != None:
if ner_tag != "-" and ner_tag is not None:
return True
else:
return False
@ -145,7 +141,7 @@ def _get_labels(examples):
labels = set()
for eg in examples:
for ner_tag in eg.token_annotation.entities:
if ner_tag != 'O' and ner_tag != '-':
_, label = ner_tag.split('-', 1)
if ner_tag != "O" and ner_tag != "-":
_, label = ner_tag.split("-", 1)
labels.add(label)
return list(sorted(labels))

View File

@ -98,7 +98,9 @@ class Scorer(object):
for name, component in pipeline:
if name == "textcat":
self.textcat_multilabel = component.model.attrs["multi_label"]
self.textcat_positive_label = component.cfg.get("positive_label", None)
self.textcat_positive_label = component.cfg.get(
"positive_label", None
)
for label in component.cfg.get("labels", []):
self.textcat_auc_per_cat[label] = ROCAUCScore()
self.textcat_f_per_cat[label] = PRFScore()
@ -119,19 +121,19 @@ class Scorer(object):
@property
def morphs_acc(self):
"""RETURNS (float): Morph tag accuracy (morphological features,
"""RETURNS (float): Morph tag accuracy (morphological features,
i.e. `Token.morph`).
"""
return self.morphs.fscore * 100
return self.morphs.fscore * 100
@property
def morphs_per_type(self):
"""RETURNS (dict): Scores per dependency label.
"""RETURNS (dict): Scores per dependency label.
"""
return {
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
for k, v in self.morphs_per_feat.items()
}
return {
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
for k, v in self.morphs_per_feat.items()
}
@property
def sent_p(self):
@ -302,7 +304,15 @@ class Scorer(object):
gold_morphs_per_feat = {}
gold_sent_starts = set()
gold_ents = set(tags_to_entities(orig.entities))
for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
for id_, tag, pos, morph, head, dep, sent_start in zip(
orig.ids,
orig.tags,
orig.pos,
orig.morphs,
orig.heads,
orig.deps,
orig.sent_starts,
):
gold_tags.add((id_, tag))
gold_pos.add((id_, pos))
gold_morphs.add((id_, morph))
@ -400,7 +410,10 @@ class Scorer(object):
self.pos.score_set(cand_pos, gold_pos)
self.morphs.score_set(cand_morphs, gold_morphs)
for field in self.morphs_per_feat:
self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
self.morphs_per_feat[field].score_set(
cand_morphs_per_feat.get(field, set()),
gold_morphs_per_feat.get(field, set()),
)
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep:
@ -412,7 +425,9 @@ class Scorer(object):
)
if (
len(gold.cats) > 0
and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
and set(self.textcat_f_per_cat)
== set(self.textcat_auc_per_cat)
== set(gold.cats)
and set(gold.cats) == set(doc.cats)
):
goldcat = max(gold.cats, key=gold.cats.get)
@ -424,10 +439,10 @@ class Scorer(object):
)
for label in set(gold.cats):
self.textcat_auc_per_cat[label].score_set(
doc.cats[label], gold.cats[label]
doc.cats[label], gold.cats[label]
)
self.textcat_f_per_cat[label].score_set(
set([label]) & set([candcat]), set([label]) & set([goldcat])
set([label]) & set([candcat]), set([label]) & set([goldcat])
)
elif len(self.textcat_f_per_cat) > 0:
model_labels = set(self.textcat_f_per_cat)

View File

@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([])
ner(doc)
@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([])
ner(doc)

View File

@ -1,9 +1,8 @@
import pytest
from thinc.api import Adam, NumpyOps
from thinc.api import Adam
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser, default_ner
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer
@ -17,7 +16,12 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config)
return parser
@ -58,7 +62,12 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly():
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
ner1.add_label("C")
ner1.add_label("B")

View File

@ -138,7 +138,12 @@ def test_get_oracle_actions():
deps.append(dep)
ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(doc.vocab, default_parser(), **config)
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")

View File

@ -138,7 +138,12 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@ -157,7 +162,12 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
# set "New York" to a blocked entity
@ -215,7 +225,12 @@ def test_overwrite_token():
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")

View File

@ -28,7 +28,12 @@ def tok2vec():
@pytest.fixture
def parser(vocab, arc_eager):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
return Parser(vocab, model=default_parser(), moves=arc_eager, **config)

View File

@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores):
def test_beam_parse():
nlp = Language()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
nlp.parser.add_label("nsubj")
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)

View File

@ -16,7 +16,12 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config)
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32

View File

@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
@ -285,7 +287,11 @@ def test_overfitting_IO():
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
mykb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.5, 0.5],
)
# Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})

View File

@ -15,8 +15,17 @@ def test_label_types():
TRAIN_DATA = [
("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
(
"I like green eggs",
{
"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
"pos": ["NOUN", "VERB", "ADJ", "NOUN"],
},
),
(
"Eat blue ham",
{"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
),
]
@ -38,7 +47,12 @@ def test_overfitting_IO():
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
gold_morphs = [
"Feat=N|POS=NOUN",
"Feat=V|POS=VERB",
"Feat=J|POS=ADJ",
"Feat=N|POS=NOUN",
]
assert gold_morphs == [t.morph_ for t in doc]
# Also test the results are still the same after IO

View File

@ -1,30 +1,31 @@
import pytest
from collections import namedtuple
from thinc.api import NumpyOps
from spacy.ml._biluo import BILUO, _get_transition_table
from spacy.pipeline.simple_ner import SimpleNER
import spacy
@pytest.fixture(params=[
["PER", "ORG", "LOC", "MISC"],
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
])
@pytest.fixture(
params=[
["PER", "ORG", "LOC", "MISC"],
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
]
)
def labels(request):
return request.param
@pytest.fixture
def ops():
return NumpyOps()
def _get_actions(labels):
action_names = (
[f"B{label}" for label in labels] + \
[f"I{label}" for label in labels] + \
[f"L{label}" for label in labels] + \
[f"U{label}" for label in labels] + \
["O"]
[f"B{label}" for label in labels]
+ [f"I{label}" for label in labels]
+ [f"L{label}" for label in labels]
+ [f"U{label}" for label in labels]
+ ["O"]
)
A = namedtuple("actions", action_names)
return A(**{name: i for i, name in enumerate(action_names)})
@ -228,7 +229,7 @@ def test_transition_table(ops):
assert table[0, a.O, a.Uloc] == 1
assert table[0, a.O, a.Uorg] == 1
assert table[0, a.O, a.O] == 1
# Last token, prev action was B
assert table[1, a.Bper, a.Bper] == 0
assert table[1, a.Bper, a.Bloc] == 0

View File

@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example(doc=None)
example.set_token_annotation(

View File

@ -196,7 +196,12 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(doc.vocab, default_ner(), **config)
# Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "")

View File

@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
@ -16,7 +21,12 @@ def test_issue3830_no_subtok():
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels

View File

@ -74,7 +74,12 @@ def test_issue4042_bug2():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@ -12,7 +12,12 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])

View File

@ -1,4 +1,3 @@
import pytest
from spacy.language import Language

View File

@ -112,7 +112,7 @@ def test_serialize_custom_nlp():
nlp.to_disk(d)
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
tok2vec = model.get_ref("tok2vec")
tok2vec = model.get_ref("tok2vec") # noqa: F841
upper = model.get_ref("upper")
# check that we have the correct settings, not the default ones
@ -132,7 +132,7 @@ def test_serialize_parser():
nlp.to_disk(d)
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
tok2vec = model.get_ref("tok2vec")
tok2vec = model.get_ref("tok2vec") # noqa: F841
upper = model.get_ref("upper")
# check that we have the correct settings, not the default ones

View File

@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture
def parser(en_vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(en_vocab, default_parser(), **config)
parser.add_label("nsubj")
return parser

View File

@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
assert sorted([s for s in new_vocab1.strings]) == sorted(
strings1 + list(default_strings)
)
@pytest.mark.parametrize("strings1,strings2", test_strings)

View File

@ -40,6 +40,7 @@ test_ner_apple = [
]
]
@pytest.fixture
def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London."
@ -184,7 +185,7 @@ def test_tag_score(tagged_doc):
tagged_doc,
tags=[t.tag_ for t in tagged_doc],
pos=[t.pos_ for t in tagged_doc],
morphs=[t.morph_ for t in tagged_doc]
morphs=[t.morph_ for t in tagged_doc],
)
scorer.score((tagged_doc, gold))
results = scorer.scores

View File

@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words
([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 200], [3, 2]),
([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 200], [3, 3]),
([400, 400, 199, 3, 1, 999], [3, 3]),
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches):
examples = [Example(doc=doc) for doc in docs]
tol = 0.2
batch_size = 1000
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
batches = list(
minibatch_by_words(
examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol
@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
examples = [Example(doc=doc) for doc in docs]
tol = 0.2
batch_size = 1000
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
batches = list(
minibatch_by_words(
examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
)
)
assert [len(batch) for batch in batches] == expected_batches

View File

@ -697,7 +697,9 @@ def decaying(start, stop, decay):
curr -= decay
def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
def minibatch_by_words(
examples, size, count_words=len, tolerance=0.2, discard_oversize=False
):
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True."""
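A self-contained sketch of the behaviour that docstring describes, simplified and not spaCy's actual implementation: accumulate examples until the word budget (size plus tolerance) is exceeded, and let oversized examples either form their own batch or be dropped:

def batch_by_words(word_counts, size, tolerance=0.2, discard_oversize=False):
    # Simplified illustration of the behaviour described above
    max_size = size + size * tolerance
    batches, batch, batch_words = [], [], 0
    for n_words in word_counts:
        if n_words > max_size:
            if not discard_oversize:
                batches.append([n_words])  # oversized example on its own
            continue
        if batch_words + n_words > max_size:
            batches.append(batch)
            batch, batch_words = [], 0
        batch.append(n_words)
        batch_words += n_words
    if batch:
        batches.append(batch)
    return batches


# Mirrors one of the test cases above: the 1500-word example is discarded,
# leaving a single batch of five examples
print(batch_by_words([400, 400, 199, 3, 1, 1500], size=1000, discard_oversize=True))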