Merge pull request #5617 from explosion/chore/tidy-auto-format

Ines Montani 2020-06-20 05:47:44 -07:00 committed by GitHub
commit dbe9c29f61
38 changed files with 415 additions and 226 deletions

View File

@ -3,7 +3,6 @@ from timeit import default_timer as timer
import srsly
from pydantic import BaseModel, FilePath
import plac
import tqdm
from pathlib import Path
from wasabi import msg
@ -16,7 +15,9 @@ from ..gold import GoldCorpus
from ..lookups import Lookups
from .. import util
from ..errors import Errors
from ..ml import models # don't remove - required to load the built-in architectures
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
registry = util.registry
@ -114,33 +115,19 @@ class ConfigSchema(BaseModel):
extra = "allow"
@plac.annotations(
# fmt: off
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
output_path=("Output directory to store model in", "option", "o", Path),
init_tok2vec=(
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
use_gpu=("Use GPU", "option", "g", int),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
# fmt: on
)
def train_cli(
train_path,
dev_path,
config_path,
output_path=None,
init_tok2vec=None,
raw_text=None,
verbose=False,
use_gpu=-1,
tag_map_path=None,
omit_extra_lookups=False,
# fmt: off
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
config_path: ("Path to config file", "positional", None, Path),
output_path: ("Output directory to store model in", "option", "o", Path) = None,
init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
use_gpu: ("Use GPU", "option", "g", int) = -1,
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
# fmt: on
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
@ -227,7 +214,9 @@ def train(
# verify textcat config
if "textcat" in nlp_config["pipeline"]:
textcat_labels = set(nlp.get_pipe("textcat").labels)
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"]
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
"exclusive_classes"
]
# check whether the setting 'exclusive_classes' corresponds to the provided training data
if textcat_multilabel:
@ -255,7 +244,9 @@ def train(
"to 'false' in the config to train a classifier with classes "
"that are not mutually exclusive."
)
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
msg.info(
f"Initialized textcat component for {len(textcat_labels)} unique labels"
)
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
# if 'positive_label' is provided: double check whether it's in the data and the task is binary
@ -281,9 +272,7 @@ def train(
nlp.resume_training()
else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
nlp.begin_training(
lambda: corpus.train_examples
)
nlp.begin_training(lambda: corpus.train_examples)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
@ -310,8 +299,7 @@ def train(
tok2vec = tok2vec.get(subpath)
if not tok2vec:
msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.",
exits=1,
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
)
tok2vec.from_bytes(weights_data)
@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
try:
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
except KeyError as e:
raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
raise KeyError(
Errors.E983.format(
dict_name="score_weights", key=str(e), keys=list(scores.keys())
)
)
scores["speed"] = wps
return weighted_score, scores
@ -578,15 +570,25 @@ def setup_printer(training, nlp):
]
except KeyError as e:
raise KeyError(
Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
Errors.E983.format(
dict_name="scores (losses)",
key=str(e),
keys=list(info["losses"].keys()),
)
)
try:
scores = [
"{0:.2f}".format(float(info["other_scores"][col]))
for col in score_cols
"{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
]
except KeyError as e:
raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
raise KeyError(
Errors.E983.format(
dict_name="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
)
data = (
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
)
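
The first hunk above rewrites the @plac.annotations(...) decorator into plac's Python 3 annotation style, where each parameter carries a (help, kind, abbreviation, type) tuple and optional arguments get defaults, while the "# fmt: off" / "# fmt: on" markers tell black to leave the long annotation lines alone. A minimal standalone sketch of the same pattern, with hypothetical parameter names, assuming plac is installed:

import plac
from pathlib import Path


def convert(
    # fmt: off
    input_path: ("Path to the input file", "positional", None, Path),
    output_dir: ("Directory to write output to", "option", "o", Path) = None,
    verbose: ("Print extra information", "flag", "v", bool) = False,
    # fmt: on
):
    # plac reads the annotation tuples to build the argument parser.
    print(input_path, output_dir, verbose)


if __name__ == "__main__":
    plac.call(convert)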

View File

@ -1,4 +1,3 @@
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors
from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES
@ -51,7 +50,13 @@ class Lemmatizer(object):
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn":
return [string]
else:

View File

@ -1 +1 @@
from .models import *
from .models import * # noqa: F401, F403
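
The "# noqa: F401, F403" comment added above suppresses two flake8 warnings on this re-export line: F401 ("imported but unused") and F403 ("star import used"). A tiny illustration of the same suppressions on ordinary modules (flake8 assumed as the linter; the imports here are arbitrary):

import os            # noqa: F401  - F401: module imported but unused (kept for side effects / re-export)
from math import *   # noqa: F401, F403  - F403: star import; names are re-exported deliberately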

View File

@ -1,11 +1,8 @@
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
from typing import Dict, Optional
import numpy
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.api import to_numpy
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from ..tokens import Doc
from thinc.api import Model
from thinc.types import Padded, Floats3d
def BILUO() -> Model[Padded, Padded]:
@ -14,7 +11,7 @@ def BILUO() -> Model[Padded, Padded]:
forward,
init=init,
dims={"nO": None},
attrs={"get_num_actions": get_num_actions}
attrs={"get_num_actions": get_num_actions},
)

View File

@ -1,9 +1,7 @@
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from ..tokens import Doc
from typing import Dict, Optional
from thinc.api import Ops, Model
from thinc.types import Padded, Floats3d
def IOB() -> Model[Padded, Padded]:
@ -12,7 +10,7 @@ def IOB() -> Model[Padded, Padded]:
forward,
init=init,
dims={"nO": None},
attrs={"get_num_actions": get_num_actions}
attrs={"get_num_actions": get_num_actions},
)

View File

@ -1,6 +1,6 @@
from .entity_linker import * # noqa
from .parser import * # noqa
from .simple_ner import *
from .simple_ner import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain(
tok2vec,
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
Maxout(
nO=token_vector_width * 2,
nI=token_vector_width,
nP=maxout_pieces,
dropout=0.0,
),
LayerNorm(token_vector_width * 2),
softmax,
)
@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
# nO = vocab.vectors.data.shape[1]
output_layer = chain(
Maxout(
nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
nO=nO,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
normalize=True,
dropout=0.0,
),
Linear(nO=nO, nI=nO, init_W=zero_init),
)
@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop
output, backprop = model.get_ref("wrapped-model").begin_update(
docs
) # drop=drop
def mlm_backward(d_output):
d_output *= 1 - mask

View File

@ -16,18 +16,14 @@ def build_tb_parser_model(
nO=None,
):
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
with_array(Linear(hidden_width, t2v_width)),
list2array(),
)
tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces
nP=maxout_pieces,
)
if use_upper:
with use_ops("numpy"):

View File

@ -1,9 +1,8 @@
import functools
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
from typing import List
from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
from thinc.api import chain, list2padded, configure_normal_init
from thinc.api import Dropout
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from thinc.types import Floats2d
from ...tokens import Doc
from .._biluo import BILUO
@ -12,12 +11,12 @@ from ...util import registry
@registry.architectures.register("spacy.BiluoTagger.v1")
def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
def BiluoTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = BILUO()
linear = Linear(
nO=None,
nI=tok2vec.get_dim("nO"),
init_W=configure_normal_init(mean=0.02)
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
)
model = chain(
tok2vec,
@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
with_array(chain(Dropout(0.1), linear)),
biluo,
with_array(softmax_activation()),
padded2list()
padded2list(),
)
return Model(
@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
layers=[model, linear],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
)
@registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
def IOBTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = IOB()
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
model = chain(
@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
with_array(linear),
biluo,
with_array(softmax_activation()),
padded2list()
padded2list(),
)
return Model(
@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
layers=[model],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
)
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
if model.get_dim("nO") is None and Y:
model.set_dim("nO", Y[0].shape[1])

View File

@ -1,5 +1,4 @@
from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
from thinc.api import glorot_uniform_init
from thinc.api import zero_init, with_array, Softmax, chain, Model
from ...util import registry

View File

@ -1,11 +1,12 @@
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
from thinc.api import chain, concatenate, clone, Dropout
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout
from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed
from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
from ..spacy_vectors import SpacyVectors
from ... import util
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
window_size, conv_depth, dropout, nO=None):
def build_text_classifier(
width,
embed_size,
pretrained_vectors,
exclusive_classes,
ngram_size,
window_size,
conv_depth,
dropout,
nO=None,
):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
lower = HashEmbed(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
)
prefix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
@ -83,8 +101,15 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
vectors_width = width
tok2vec = vector_layer >> with_array(
Maxout(width, vectors_width, normalize=True)
>> residual((expand_window(window_size=window_size)
>> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
>> residual(
(
expand_window(window_size=window_size)
>> Maxout(
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
)
)
)
** conv_depth,
pad=conv_depth,
)
cnn_model = (
@ -98,15 +123,16 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
)
linear_model = build_bow_text_classifier(
nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
nO=nO,
ngram_size=ngram_size,
exclusive_classes=exclusive_classes,
no_output_layer=False,
)
nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = (
Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
)
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:
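
The textcat hunks above lean on thinc's operator overloading: inside Model.define_operators({">>": chain, "|": concatenate, "**": clone}), ">>" pipes layers together and "|" concatenates their outputs. A minimal sketch of that mechanism on its own (thinc 8.x assumed, as used on this branch; the layers and sizes are arbitrary):

from thinc.api import Linear, Model, Relu, chain, concatenate

with Model.define_operators({">>": chain, "|": concatenate}):
    # Two parallel Relu blocks whose outputs are concatenated, then a Linear head.
    model = (Relu(nO=32, nI=16) | Relu(nO=32, nI=16)) >> Linear(nO=4)
print(model.name)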

View File

@ -99,7 +99,13 @@ def hash_charembed_cnn(
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
pretrained_vectors,
width,
depth,
embed_size,
subword_features,
maxout_pieces,
dropout,
):
# Does not use character embeddings: set to False by default
return build_Tok2Vec_model(
@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(width, maxout_pieces):
return Maxout(
nO=width,
nP=maxout_pieces,
dropout=0.0,
normalize=True,
)
return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
def MultiHashEmbed(
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
if use_subwords:
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
prefix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
)
suffix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
)
if pretrained_vectors:
glove = StaticVectors(
@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
cnn = chain(
expand_window(window_size=window_size),
Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
Maxout(
nO=width,
nI=width * ((window_size * 2) + 1),
nP=maxout_pieces,
dropout=0.0,
normalize=True,
),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
@ -247,11 +262,19 @@ def build_Tok2Vec_model(
subword_features = False
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
norm = HashEmbed(
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
)
if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
prefix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:

View File

@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
attrs={
"has_upper": has_upper,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output
}
"resize_output": resize_output,
},
)
@ -31,7 +31,7 @@ def forward(model, X, is_train):
model.layers,
unseen_classes=model.attrs["unseen_classes"],
train=is_train,
has_upper=model.attrs["has_upper"]
has_upper=model.attrs["has_upper"],
)
return step_model, step_model.finish_steps
@ -62,7 +62,7 @@ def resize_output(model, new_nO):
nI = None
if smaller.has_dim("nI"):
nI = smaller.get_dim("nI")
with use_ops('numpy'):
with use_ops("numpy"):
larger = Linear(nO=new_nO, nI=nI)
larger.init = smaller.init
# it could be that the model is not initialized yet, then skip this bit

View File

@ -21,9 +21,7 @@ class SimpleNER(Pipe):
self.model = model
self.cfg = {"labels": []}
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(),
normalize=True,
missing_value=None
names=self.get_tag_names(), normalize=True, missing_value=None
)
assert self.model is not None
@ -42,17 +40,17 @@ class SimpleNER(Pipe):
def get_tag_names(self):
if self.is_biluo:
return (
[f"B-{label}" for label in self.labels] +
[f"I-{label}" for label in self.labels] +
[f"L-{label}" for label in self.labels] +
[f"U-{label}" for label in self.labels] +
["O"]
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ [f"L-{label}" for label in self.labels]
+ [f"U-{label}" for label in self.labels]
+ ["O"]
)
else:
return (
[f"B-{label}" for label in self.labels] +
[f"I-{label}" for label in self.labels] +
["O"]
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ ["O"]
)
def predict(self, docs: List[Doc]) -> List[Floats2d]:
@ -108,7 +106,7 @@ class SimpleNER(Pipe):
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
self.cfg.update(kwargs)
if not hasattr(get_examples, '__call__'):
if not hasattr(get_examples, "__call__"):
gold_tuples = get_examples
get_examples = lambda: gold_tuples
labels = _get_labels(get_examples())
@ -122,9 +120,7 @@ class SimpleNER(Pipe):
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(),
normalize=True,
missing_value=None
names=self.get_tag_names(), normalize=True, missing_value=None
)
return sgd
@ -135,7 +131,7 @@ class SimpleNER(Pipe):
def _has_ner(eg):
for ner_tag in eg.gold.ner:
if ner_tag != "-" and ner_tag != None:
if ner_tag != "-" and ner_tag is not None:
return True
else:
return False
@ -145,7 +141,7 @@ def _get_labels(examples):
labels = set()
for eg in examples:
for ner_tag in eg.token_annotation.entities:
if ner_tag != 'O' and ner_tag != '-':
_, label = ner_tag.split('-', 1)
if ner_tag != "O" and ner_tag != "-":
_, label = ner_tag.split("-", 1)
labels.add(label)
return list(sorted(labels))
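
For reference, the reformatted get_tag_names() above builds the usual BILUO tag inventory: one B-, I-, L- and U- tag per label plus "O". A quick standalone illustration with a made-up label set:

labels = ["PER", "LOC"]
biluo_tags = (
    [f"B-{label}" for label in labels]
    + [f"I-{label}" for label in labels]
    + [f"L-{label}" for label in labels]
    + [f"U-{label}" for label in labels]
    + ["O"]
)
print(biluo_tags)
# ['B-PER', 'B-LOC', 'I-PER', 'I-LOC', 'L-PER', 'L-LOC', 'U-PER', 'U-LOC', 'O']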

View File

@ -98,7 +98,9 @@ class Scorer(object):
for name, component in pipeline:
if name == "textcat":
self.textcat_multilabel = component.model.attrs["multi_label"]
self.textcat_positive_label = component.cfg.get("positive_label", None)
self.textcat_positive_label = component.cfg.get(
"positive_label", None
)
for label in component.cfg.get("labels", []):
self.textcat_auc_per_cat[label] = ROCAUCScore()
self.textcat_f_per_cat[label] = PRFScore()
@ -302,7 +304,15 @@ class Scorer(object):
gold_morphs_per_feat = {}
gold_sent_starts = set()
gold_ents = set(tags_to_entities(orig.entities))
for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
for id_, tag, pos, morph, head, dep, sent_start in zip(
orig.ids,
orig.tags,
orig.pos,
orig.morphs,
orig.heads,
orig.deps,
orig.sent_starts,
):
gold_tags.add((id_, tag))
gold_pos.add((id_, pos))
gold_morphs.add((id_, morph))
@ -400,7 +410,10 @@ class Scorer(object):
self.pos.score_set(cand_pos, gold_pos)
self.morphs.score_set(cand_morphs, gold_morphs)
for field in self.morphs_per_feat:
self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
self.morphs_per_feat[field].score_set(
cand_morphs_per_feat.get(field, set()),
gold_morphs_per_feat.get(field, set()),
)
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep:
@ -412,7 +425,9 @@ class Scorer(object):
)
if (
len(gold.cats) > 0
and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
and set(self.textcat_f_per_cat)
== set(self.textcat_auc_per_cat)
== set(gold.cats)
and set(gold.cats) == set(doc.cats)
):
goldcat = max(gold.cats, key=gold.cats.get)

View File

@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([])
ner(doc)
@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([])
ner(doc)
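
The recurring change in the test files is purely cosmetic: black expands the one-line parser/NER config dict over multiple lines once it exceeds the line length, without changing its value. A trivial check of that equivalence:

config_one_line = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config_multi_line = {
    "learn_tokens": False,
    "min_action_freq": 30,
    "beam_width": 1,
    "beam_update_prob": 1.0,
}
assert config_one_line == config_multi_line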

View File

@ -1,9 +1,8 @@
import pytest
from thinc.api import Adam, NumpyOps
from thinc.api import Adam
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser, default_ner
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer
@ -17,7 +16,12 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config)
return parser
@ -58,7 +62,12 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly():
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
ner1.add_label("C")
ner1.add_label("B")

View File

@ -138,7 +138,12 @@ def test_get_oracle_actions():
deps.append(dep)
ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(doc.vocab, default_parser(), **config)
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")

View File

@ -138,7 +138,12 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@ -157,7 +162,12 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
# set "New York" to a blocked entity
@ -215,7 +225,12 @@ def test_overwrite_token():
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")

View File

@ -28,7 +28,12 @@ def tok2vec():
@pytest.fixture
def parser(vocab, arc_eager):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
return Parser(vocab, model=default_parser(), moves=arc_eager, **config)

View File

@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores):
def test_beam_parse():
nlp = Language()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
nlp.parser.add_label("nsubj")
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)

View File

@ -16,7 +16,12 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config)
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32

View File

@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
@ -285,7 +287,11 @@ def test_overfitting_IO():
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
mykb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.5, 0.5],
)
# Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})

View File

@ -15,8 +15,17 @@ def test_label_types():
TRAIN_DATA = [
("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
(
"I like green eggs",
{
"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
"pos": ["NOUN", "VERB", "ADJ", "NOUN"],
},
),
(
"Eat blue ham",
{"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
),
]
@ -38,7 +47,12 @@ def test_overfitting_IO():
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
gold_morphs = [
"Feat=N|POS=NOUN",
"Feat=V|POS=VERB",
"Feat=J|POS=ADJ",
"Feat=N|POS=NOUN",
]
assert gold_morphs == [t.morph_ for t in doc]
# Also test the results are still the same after IO

View File

@ -1,30 +1,31 @@
import pytest
from collections import namedtuple
from thinc.api import NumpyOps
from spacy.ml._biluo import BILUO, _get_transition_table
from spacy.pipeline.simple_ner import SimpleNER
import spacy
@pytest.fixture(params=[
@pytest.fixture(
params=[
["PER", "ORG", "LOC", "MISC"],
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
])
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
]
)
def labels(request):
return request.param
@pytest.fixture
def ops():
return NumpyOps()
def _get_actions(labels):
action_names = (
[f"B{label}" for label in labels] + \
[f"I{label}" for label in labels] + \
[f"L{label}" for label in labels] + \
[f"U{label}" for label in labels] + \
["O"]
[f"B{label}" for label in labels]
+ [f"I{label}" for label in labels]
+ [f"L{label}" for label in labels]
+ [f"U{label}" for label in labels]
+ ["O"]
)
A = namedtuple("actions", action_names)
return A(**{name: i for i, name in enumerate(action_names)})
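
The fixture above is pytest's parametrized-fixture pattern: each entry in params produces one run of every test that requests the fixture, and request.param exposes the current value. A self-contained sketch (the label sets here are arbitrary):

import pytest


@pytest.fixture(params=[["PER", "ORG"], ["GPE", "EVENT"]])
def labels(request):
    return request.param


def test_labels_are_strings(labels):
    assert all(isinstance(label, str) for label in labels)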

View File

@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example(doc=None)
example.set_token_annotation(

View File

@ -196,7 +196,12 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(doc.vocab, default_ner(), **config)
# Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "")

View File

@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
@ -16,7 +21,12 @@ def test_issue3830_no_subtok():
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels

View File

@ -74,7 +74,12 @@ def test_issue4042_bug2():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@ -12,7 +12,12 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])

View File

@ -1,4 +1,3 @@
import pytest
from spacy.language import Language

View File

@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture
def parser(en_vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(en_vocab, default_parser(), **config)
parser.add_label("nsubj")
return parser

View File

@ -36,7 +36,9 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
assert sorted([s for s in new_vocab1.strings]) == sorted(
strings1 + list(default_strings)
)
@pytest.mark.parametrize("strings1,strings2", test_strings)

View File

@ -40,6 +40,7 @@ test_ner_apple = [
]
]
@pytest.fixture
def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London."
@ -184,7 +185,7 @@ def test_tag_score(tagged_doc):
tagged_doc,
tags=[t.tag_ for t in tagged_doc],
pos=[t.pos_ for t in tagged_doc],
morphs=[t.morph_ for t in tagged_doc]
morphs=[t.morph_ for t in tagged_doc],
)
scorer.score((tagged_doc, gold))
results = scorer.scores

View File

@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches):
examples = [Example(doc=doc) for doc in docs]
tol = 0.2
batch_size = 1000
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
batches = list(
minibatch_by_words(
examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol
@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
examples = [Example(doc=doc) for doc in docs]
tol = 0.2
batch_size = 1000
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
batches = list(
minibatch_by_words(
examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
)
)
assert [len(batch) for batch in batches] == expected_batches

View File

@ -697,7 +697,9 @@ def decaying(start, stop, decay):
curr -= decay
def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
def minibatch_by_words(
examples, size, count_words=len, tolerance=0.2, discard_oversize=False
):
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True."""
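
To make the docstring concrete, here is a simplified, standalone re-implementation of the batching policy it describes (an illustration, not spaCy's actual function): pack items into batches of roughly "size" words with a tolerance margin, and either isolate or drop oversized items.

def toy_minibatch_by_words(word_counts, size, tolerance=0.2, discard_oversize=False):
    # word_counts: number of words per example; size: target words per batch.
    max_size = size + size * tolerance
    batch, batch_words = [], 0
    for n_words in word_counts:
        if n_words > max_size:
            if not discard_oversize:
                yield [n_words]  # oversized example gets a batch of its own
            continue
        if batch_words + n_words > max_size:
            yield batch
            batch, batch_words = [], 0
        batch.append(n_words)
        batch_words += n_words
    if batch:
        yield batch


print([len(b) for b in toy_minibatch_by_words([1500, 400, 400, 199, 3], size=1000)])
# -> [1, 4]  (the 1500-word example is isolated; pass discard_oversize=True to drop it instead)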