diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 4f4707b52..4f4029834 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -24,8 +24,8 @@ from ..gold import Example output_dir=("Directory to write models to on each epoch", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), - resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), - epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), + resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), + epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int), # fmt: on ) def pretrain( diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index f24feffab..6080b698b 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -3,7 +3,6 @@ from timeit import default_timer as timer import srsly from pydantic import BaseModel, FilePath -import plac import tqdm from pathlib import Path from wasabi import msg @@ -16,7 +15,9 @@ from ..gold import GoldCorpus from ..lookups import Lookups from .. import util from ..errors import Errors -from ..ml import models # don't remove - required to load the built-in architectures + +# Don't remove - required to load the built-in architectures +from ..ml import models # noqa: F401 registry = util.registry @@ -114,33 +115,19 @@ class ConfigSchema(BaseModel): extra = "allow" -@plac.annotations( - # fmt: off - train_path=("Location of JSON-formatted training data", "positional", None, Path), - dev_path=("Location of JSON-formatted development data", "positional", None, Path), - config_path=("Path to config file", "positional", None, Path), - output_path=("Output directory to store model in", "option", "o", Path), - init_tok2vec=( - "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", - Path), - raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), - verbose=("Display more information for debugging purposes", "flag", "VV", bool), - use_gpu=("Use GPU", "option", "g", int), - tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), - omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), - # fmt: on -) def train_cli( - train_path, - dev_path, - config_path, - output_path=None, - init_tok2vec=None, - raw_text=None, - verbose=False, - use_gpu=-1, - tag_map_path=None, - omit_extra_lookups=False, + # fmt: off + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: ("Location of JSON-formatted development data", "positional", None, Path), + config_path: ("Path to config file", "positional", None, Path), + output_path: ("Output directory to store model in", "option", "o", Path) = None, + init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental.", "option", "t2v", Path) = None, + raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, + verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, + use_gpu: ("Use GPU", "option", "g", int) = -1, + tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, + omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, + # fmt: on ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's @@ -212,7 +199,7 @@ def train( config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): - # It feels kind of weird to not have a default for this. + # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) @@ -227,7 +214,9 @@ def train( # verify textcat config if "textcat" in nlp_config["pipeline"]: textcat_labels = set(nlp.get_pipe("textcat").labels) - textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] + textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][ + "exclusive_classes" + ] # check whether the setting 'exclusive_classes' corresponds to the provided training data if textcat_multilabel: @@ -255,7 +244,9 @@ def train( "to 'false' in the config to train a classifier with classes " "that are not mutually exclusive." ) - msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + msg.info( + f"Initialized textcat component for {len(textcat_labels)} unique labels" + ) nlp.get_pipe("textcat").labels = tuple(textcat_labels) # if 'positive_label' is provided: double check whether it's in the data and the task is binary @@ -281,9 +272,7 @@ def train( nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - nlp.begin_training( - lambda: corpus.train_examples - ) + nlp.begin_training(lambda: corpus.train_examples) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) @@ -310,8 +299,7 @@ def train( tok2vec = tok2vec.get(subpath) if not tok2vec: msg.fail( - f"Could not locate the tok2vec model at {tok2vec_path}.", - exits=1, + f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1, ) tok2vec.from_bytes(weights_data) @@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=0.0, # I think this is deprecated? + noise_level=0.0, # I think this is deprecated? 
orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], @@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): try: weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + raise KeyError( + Errors.E983.format( + dict_name="score_weights", key=str(e), keys=list(scores.keys()) + ) + ) scores["speed"] = wps return weighted_score, scores @@ -578,15 +570,25 @@ def setup_printer(training, nlp): ] except KeyError as e: raise KeyError( - Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + Errors.E983.format( + dict_name="scores (losses)", + key=str(e), + keys=list(info["losses"].keys()), + ) + ) try: scores = [ - "{0:.2f}".format(float(info["other_scores"][col])) - for col in score_cols + "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols ] except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + raise KeyError( + Errors.E983.format( + dict_name="scores (other)", + key=str(e), + keys=list(info["other_scores"].keys()), + ) + ) data = ( [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c4944407f..7d6bfbc12 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,3 @@ -from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups from .parts_of_speech import NAMES as UPOS_NAMES @@ -51,7 +50,13 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) - if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): + if not any( + ( + index_table.get(univ_pos), + exc_table.get(univ_pos), + rules_table.get(univ_pos), + ) + ): if univ_pos == "propn": return [string] else: diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index aed4fa323..c382d915b 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1 +1 @@ -from .models import * +from .models import * # noqa: F401, F403 diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 28339089a..77a2a6a77 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -1,11 +1,8 @@ """Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import List, Tuple, Dict, Optional +from typing import Dict, Optional import numpy -from thinc.api import Ops, Model, with_array, softmax_activation, padded2list -from thinc.api import to_numpy -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d - -from ..tokens import Doc +from thinc.api import Model +from thinc.types import Padded, Floats3d def BILUO() -> Model[Padded, Padded]: @@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): masks = 
model.ops.alloc3f(*Y.shape) max_value = Xp.data.max() for t in range(Xp.data.shape[0]): - is_last = (Xp.lengths < (t+2)).astype("i") + is_last = (Xp.lengths < (t + 2)).astype("i") masks[t] = valid_transitions[is_last, prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get large negative value - Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) + Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: @@ -83,13 +80,13 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, U_end = (L_end, L_end + n_labels) + U_start, U_end = (L_end, L_end + n_labels) # noqa: F841 # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. B_range = numpy.arange(B_start, B_end) I_range = numpy.arange(I_start, I_end) L_range = numpy.arange(L_start, L_end) - O_action = U_end + O_action = U_end # noqa: F841 # If this is the last token and the previous action was B or I, only L # of that label is valid table[1, B_range, L_range] = 1 diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 0ce9a71e6..9f385ec0d 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -1,9 +1,7 @@ """Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, with_array, softmax_activation, padded2list -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d - -from ..tokens import Doc +from typing import Dict, Optional +from thinc.api import Ops, Model +from thinc.types import Padded, Floats3d def IOB() -> Model[Padded, Padded]: @@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): for t in range(Xp.data.shape[0]): masks[t] = valid_transitions[prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get -1*10e8 - Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) + Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: # Masking the gradient seems to do poorly here. But why? 
- #dY.data *= masks + # dY.data *= masks return dY return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo @@ -83,10 +81,10 @@ def _get_transition_table( B_range = ops.xp.arange(B_start, B_end) I_range = ops.xp.arange(I_start, I_end) # B and O are always valid - table[:, B_start : B_end] = 1 + table[:, B_start:B_end] = 1 table[:, O_action] = 1 # I can only follow a matching B table[B_range, I_range] = 1 - + _cache[n_actions] = table return table diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index f4b5b16fe..215cdeda1 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # # (ids < 0).T @ dY mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) + d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) return d_pad.reshape((1, nF, nO, nP)) diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index 40cde2437..dd58dab00 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,6 @@ from .entity_linker import * # noqa from .parser import * # noqa -from .simple_ner import * +from .simple_ner import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 4a360a9e6..b3a9e0815 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), + Maxout( + nO=token_vector_width * 2, + nI=token_vector_width, + nP=maxout_pieces, + dropout=0.0, + ), LayerNorm(token_vector_width * 2), softmax, ) @@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 + nO=nO, + nI=tok2vec.get_dim("nO"), + nP=maxout_pieces, + normalize=True, + dropout=0.0, ), Linear(nO=nO, nI=nO, init_W=zero_init), ) @@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update( + docs + ) # drop=drop def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index bdcd709b1..47c94cfa1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -16,18 +16,14 @@ def build_tb_parser_model( nO=None, ): t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain( - tok2vec, - with_array(Linear(hidden_width, t2v_width)), - list2array(), - ) + tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), - nP=maxout_pieces + nP=maxout_pieces, ) if use_upper: with use_ops("numpy"): diff --git 
a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index 01661f55b..1fb5a71c0 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -1,9 +1,8 @@ -import functools -from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list +from typing import List +from thinc.api import Model, Linear, with_array, softmax_activation, padded2list from thinc.api import chain, list2padded, configure_normal_init from thinc.api import Dropout -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d +from thinc.types import Floats2d from ...tokens import Doc from .._biluo import BILUO @@ -12,12 +11,12 @@ from ...util import registry @registry.architectures.register("spacy.BiluoTagger.v1") -def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def BiluoTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = BILUO() linear = Linear( - nO=None, - nI=tok2vec.get_dim("nO"), - init_W=configure_normal_init(mean=0.02) + nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) ) model = chain( tok2vec, @@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L with_array(chain(Dropout(0.1), linear)), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L layers=[model, linear], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) + @registry.architectures.register("spacy.IOBTagger.v1") -def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def IOBTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = IOB() linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) model = chain( @@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis with_array(linear), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis layers=[model], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) - def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: if model.get_dim("nO") is None and Y: model.set_dim("nO", Y[0].shape[1]) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 00e268ede..7fe417321 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,5 +1,4 @@ -from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout -from thinc.api import glorot_uniform_init +from thinc.api import zero_init, with_array, Softmax, chain, Model from ...util import registry diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index a02e1a5a1..9db6f982f 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,11 +1,12 @@ -from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention -from thinc.api import chain, concatenate, clone, Dropout -from thinc.api import 
SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window -from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor +from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic +from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout +from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout +from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed +from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor from ..spacy_vectors import SpacyVectors from ... import util -from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER +from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams @@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") -def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, dropout, nO=None): +def build_text_classifier( + width, + embed_size, + pretrained_vectors, + exclusive_classes, + ngram_size, + window_size, + conv_depth, + dropout, + nO=None, +): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) + lower = HashEmbed( + nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout + ) + prefix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout + ) + shape = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout + ) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class vectors_width = width tok2vec = vector_layer >> with_array( Maxout(width, vectors_width, normalize=True) - >> residual((expand_window(window_size=window_size) - >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, + >> residual( + ( + expand_window(window_size=window_size) + >> Maxout( + nO=width, nI=width * ((window_size * 2) + 1), normalize=True + ) + ) + ) + ** conv_depth, pad=conv_depth, ) cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention(width) - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) + tok2vec + >> list2ragged() + >> ParametricAttention(width) + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) ) linear_model = build_bow_text_classifier( - nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False + nO=nO, + ngram_size=ngram_size, + exclusive_classes=exclusive_classes, + no_output_layer=False, ) - nO_double = nO*2 if nO else None + nO_double = nO * 2 if nO else None if exclusive_classes: output_layer = 
Softmax(nO=nO, nI=nO_double) else: - output_layer = ( - Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() - ) + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() model = (linear_model | cnn_model) >> output_layer model.set_ref("tok2vec", tok2vec) if model.has_dim("nO") is not False: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 53798e57c..b1bed1ea1 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -99,7 +99,13 @@ def hash_charembed_cnn( @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout + pretrained_vectors, + width, + depth, + embed_size, + subword_features, + maxout_pieces, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout( - nO=width, - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): +def MultiHashEmbed( + columns, width, rows, use_subwords, pretrained_vectors, mix, dropout +): norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout + ) if pretrained_vectors: glove = StaticVectors( @@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): cnn = chain( expand_window(window_size=window_size), - Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), + Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) @@ -247,11 +262,19 @@ def build_Tok2Vec_model( subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) + norm = HashEmbed( + nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout + ) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), 
dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout + ) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 251189389..69b40cbcf 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): attrs={ "has_upper": has_upper, "unseen_classes": set(unseen_classes), - "resize_output": resize_output - } + "resize_output": resize_output, + }, ) @@ -31,14 +31,14 @@ def forward(model, X, is_train): model.layers, unseen_classes=model.attrs["unseen_classes"], train=is_train, - has_upper=model.attrs["has_upper"] + has_upper=model.attrs["has_upper"], ) return step_model, step_model.finish_steps def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize(X=X) + tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841 lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) @@ -46,7 +46,7 @@ def init(model, X=None, Y=None): def resize_output(model, new_nO): - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 lower = model.get_ref("lower") upper = model.get_ref("upper") if not model.attrs["has_upper"]: @@ -62,7 +62,7 @@ def resize_output(model, new_nO): nI = None if smaller.has_dim("nI"): nI = smaller.get_dim("nI") - with use_ops('numpy'): + with use_ops("numpy"): larger = Linear(nO=new_nO, nI=nI) larger.init = smaller.init # it could be that the model is not initialized yet, then skip this bit @@ -74,8 +74,8 @@ def resize_output(model, new_nO): # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. 
if smaller.has_dim("nO"): - larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b + larger_W[: smaller.get_dim("nO")] = smaller_W + larger_b[: smaller.get_dim("nO")] = smaller_b for i in range(smaller.get_dim("nO"), new_nO): model.attrs["unseen_classes"].add(i) diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index c674046af..58f647b67 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -21,9 +21,7 @@ class SimpleNER(Pipe): self.model = model self.cfg = {"labels": []} self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) assert self.model is not None @@ -38,21 +36,21 @@ class SimpleNER(Pipe): def add_label(self, label): if label not in self.cfg["labels"]: self.cfg["labels"].append(label) - + def get_tag_names(self): if self.is_biluo: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - [f"L-{label}" for label in self.labels] + - [f"U-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + [f"L-{label}" for label in self.labels] + + [f"U-{label}" for label in self.labels] + + ["O"] ) else: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + ["O"] ) def predict(self, docs: List[Doc]) -> List[Floats2d]: @@ -108,7 +106,7 @@ class SimpleNER(Pipe): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) - if not hasattr(get_examples, '__call__'): + if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples labels = _get_labels(get_examples()) @@ -117,14 +115,12 @@ class SimpleNER(Pipe): labels = self.labels n_actions = self.model.attrs["get_num_actions"](len(labels)) self.model.set_dim("nO", n_actions) - self.model.initialize() + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) return sgd @@ -135,7 +131,7 @@ class SimpleNER(Pipe): def _has_ner(eg): for ner_tag in eg.gold.ner: - if ner_tag != "-" and ner_tag != None: + if ner_tag != "-" and ner_tag is not None: return True else: return False @@ -145,7 +141,7 @@ def _get_labels(examples): labels = set() for eg in examples: for ner_tag in eg.token_annotation.entities: - if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) + if ner_tag != "O" and ner_tag != "-": + _, label = ner_tag.split("-", 1) labels.add(label) return list(sorted(labels)) diff --git a/spacy/scorer.py b/spacy/scorer.py index 288da23aa..af74db80e 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -98,7 +98,9 @@ class Scorer(object): for name, component in pipeline: if name == "textcat": self.textcat_multilabel = component.model.attrs["multi_label"] - self.textcat_positive_label = component.cfg.get("positive_label", None) + self.textcat_positive_label = component.cfg.get( + "positive_label", None + ) for label in component.cfg.get("labels", []): self.textcat_auc_per_cat[label] = ROCAUCScore() 
self.textcat_f_per_cat[label] = PRFScore() @@ -119,19 +121,19 @@ class Scorer(object): @property def morphs_acc(self): - """RETURNS (float): Morph tag accuracy (morphological features, + """RETURNS (float): Morph tag accuracy (morphological features, i.e. `Token.morph`). """ - return self.morphs.fscore * 100 + return self.morphs.fscore * 100 @property def morphs_per_type(self): - """RETURNS (dict): Scores per dependency label. + """RETURNS (dict): Scores per dependency label. """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.morphs_per_feat.items() - } + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.morphs_per_feat.items() + } @property def sent_p(self): @@ -302,7 +304,15 @@ class Scorer(object): gold_morphs_per_feat = {} gold_sent_starts = set() gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): + for id_, tag, pos, morph, head, dep, sent_start in zip( + orig.ids, + orig.tags, + orig.pos, + orig.morphs, + orig.heads, + orig.deps, + orig.sent_starts, + ): gold_tags.add((id_, tag)) gold_pos.add((id_, pos)) gold_morphs.add((id_, morph)) @@ -400,7 +410,10 @@ class Scorer(object): self.pos.score_set(cand_pos, gold_pos) self.morphs.score_set(cand_morphs, gold_morphs) for field in self.morphs_per_feat: - self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) + self.morphs_per_feat[field].score_set( + cand_morphs_per_feat.get(field, set()), + gold_morphs_per_feat.get(field, set()), + ) self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: @@ -412,7 +425,9 @@ class Scorer(object): ) if ( len(gold.cats) > 0 - and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) + and set(self.textcat_f_per_cat) + == set(self.textcat_auc_per_cat) + == set(gold.cats) and set(gold.cats) == set(doc.cats) ): goldcat = max(gold.cats, key=gold.cats.get) @@ -424,10 +439,10 @@ class Scorer(object): ) for label in set(gold.cats): self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold.cats[label] + doc.cats[label], gold.cats[label] ) self.textcat_f_per_cat[label].score_set( - set([label]) & set([candcat]), set([label]) & set([goldcat]) + set([label]) & set([candcat]), set([label]) & set([goldcat]) ) elif len(self.textcat_f_per_cat) > 0: model_labels = set(self.textcat_f_per_cat) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 879334056..b9c230516 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) @@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 
1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index f9663ba32..893465b45 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,9 +1,8 @@ import pytest -from thinc.api import Adam, NumpyOps +from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab - from spacy.pipeline.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer @@ -17,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) return parser @@ -58,7 +62,12 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(Vocab(), default_ner(), **config) ner1.add_label("C") ner1.add_label("B") diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 5d265261f..42b62251e 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -138,7 +138,12 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(doc.vocab, default_parser(), **config) parser.moves.add_action(0, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b0a8109dc..e82de03bf 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -138,7 +138,12 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -157,7 +162,12 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) # set "New York" to a blocked entity @@ -215,7 +225,12 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) ner2.moves.add_action(5, "") ner2.add_label("GPE") diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 7f3e981ea..d88517fb5 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -28,7 +28,12 @@ def tok2vec(): @pytest.fixture def parser(vocab, arc_eager): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } return Parser(vocab, model=default_parser(), moves=arc_eager, **config) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index fa5d59f9e..841eb058c 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") nlp.parser.add_label("nsubj") nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index ccf7d3ba3..37a9136aa 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -16,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 32b434e04..62c7fbf17 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() - nlp.add_pipe(nlp.create_pipe('sentencizer')) + nlp.add_pipe(nlp.create_pipe("sentencizer")) # Add a custom component to recognize "Russ Cochran" as an entity for the example training data ruler = EntityRuler(nlp) - patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] + 
patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} + ] ruler.add_patterns(patterns) nlp.add_pipe(ruler) @@ -285,7 +287,11 @@ def test_overfitting_IO(): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index f9307afc2..f052c4380 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -15,8 +15,17 @@ def test_label_types(): TRAIN_DATA = [ - ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), - ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), + ( + "I like green eggs", + { + "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], + "pos": ["NOUN", "VERB", "ADJ", "NOUN"], + }, + ), + ( + "Eat blue ham", + {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, + ), ] @@ -38,7 +47,12 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] + gold_morphs = [ + "Feat=N|POS=NOUN", + "Feat=V|POS=VERB", + "Feat=J|POS=ADJ", + "Feat=N|POS=NOUN", + ] assert gold_morphs == [t.morph_ for t in doc] # Also test the results are still the same after IO diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py index 9d4acf2fd..024d7bd26 100644 --- a/spacy/tests/pipeline/test_simple_ner.py +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -1,30 +1,31 @@ import pytest from collections import namedtuple - from thinc.api import NumpyOps from spacy.ml._biluo import BILUO, _get_transition_table -from spacy.pipeline.simple_ner import SimpleNER -import spacy -@pytest.fixture(params=[ - ["PER", "ORG", "LOC", "MISC"], - ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] -]) +@pytest.fixture( + params=[ + ["PER", "ORG", "LOC", "MISC"], + ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"], + ] +) def labels(request): return request.param + @pytest.fixture def ops(): return NumpyOps() + def _get_actions(labels): action_names = ( - [f"B{label}" for label in labels] + \ - [f"I{label}" for label in labels] + \ - [f"L{label}" for label in labels] + \ - [f"U{label}" for label in labels] + \ - ["O"] + [f"B{label}" for label in labels] + + [f"I{label}" for label in labels] + + [f"L{label}" for label in labels] + + [f"U{label}" for label in labels] + + ["O"] ) A = namedtuple("actions", action_names) return A(**{name: i for i, name in enumerate(action_names)}) @@ -228,7 +229,7 @@ def test_transition_table(ops): assert table[0, a.O, a.Uloc] == 1 assert table[0, a.O, a.Uorg] == 1 assert table[0, a.O, a.O] == 1 - + # Last token, prev action was B assert table[1, a.Bper, a.Bper] == 0 assert table[1, a.Bper, a.Bloc] == 0 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 177b6bb3d..6a2d16733 100644 --- 
a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(Vocab(), default_ner(), **config) example = Example(doc=None) example.set_token_annotation( diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 6df437b3c..a37707379 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -196,7 +196,12 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(doc.vocab, default_ner(), **config) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 15632bdf8..06b7893a7 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels @@ -16,7 +21,12 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 4978aba44..f47290b92 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -74,7 +74,12 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(vocab, default_ner(), **config) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index 946316d85..5e2764618 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -12,7 +12,12 @@ def test_issue4313(): beam_width = 16 beam_density 
= 0.0001 nlp = English() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(nlp.vocab, default_ner(), **config) ner.add_label("SOME_LABEL") ner.begin_training([]) diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index b240f6d4a..10c7868a0 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -1,4 +1,3 @@ -import pytest from spacy.language import Language diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 870a980f2..cfb9d7381 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -112,7 +112,7 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 upper = model.get_ref("upper") # check that we have the correct settings, not the default ones @@ -132,7 +132,7 @@ def test_serialize_parser(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 upper = model.get_ref("upper") # check that we have the correct settings, not the default ones diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 9c4e1f61e..abb5ccb27 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(en_vocab, default_parser(), **config) parser.add_label("nsubj") return parser diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index d3e82296e..e570b1025 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE - assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) + assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE + assert sorted([s for s in new_vocab1.strings]) == sorted( + strings1 + list(default_strings) + ) @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index d750a8202..2e1cf2730 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -40,6 +40,7 @@ test_ner_apple = [ ] ] + @pytest.fixture def tagged_doc(): text = "Sarah's sister flew to Silicon Valley via London." 
@@ -184,7 +185,7 @@ def test_tag_score(tagged_doc): tagged_doc, tags=[t.tag_ for t in tagged_doc], pos=[t.pos_ for t in tagged_doc], - morphs=[t.morph_ for t in tagged_doc] + morphs=[t.morph_ for t in tagged_doc], ) scorer.score((tagged_doc, gold)) results = scorer.scores diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 1410755db..a7258449d 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 200], [3, 2]), ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), @@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches): examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) + batches = list( + minibatch_by_words( + examples=examples, size=batch_size, tolerance=tol, discard_oversize=True + ) + ) assert [len(batch) for batch in batches] == expected_batches max_size = batch_size + batch_size * tol @@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) + batches = list( + minibatch_by_words( + examples=examples, size=batch_size, tolerance=tol, discard_oversize=False + ) + ) assert [len(batch) for batch in batches] == expected_batches - - diff --git a/spacy/util.py b/spacy/util.py index d2d87bef9..ad3dc3635 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -697,7 +697,9 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): +def minibatch_by_words( + examples, size, count_words=len, tolerance=0.2, discard_oversize=False +): """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by themselves, or be discarded if discard_oversize=True."""
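
For reference, the reformatted `minibatch_by_words` call above is exercised by the updated tests in `spacy/tests/test_util.py`. Below is a minimal usage sketch assembled from those tests, assuming this branch's `Doc`/`Example` constructors; the stand-in docs built here are hypothetical replacements for the tests' `get_random_doc` helper, and exact APIs may differ.

from spacy.gold import Example
from spacy.tokens import Doc
from spacy.util import minibatch_by_words
from spacy.vocab import Vocab

vocab = Vocab()
# Stand-in docs with known word counts, mirroring the doc_sizes used in the tests.
doc_sizes = [400, 400, 199, 3, 1, 1500]
docs = [Doc(vocab, words=["word"] * size) for size in doc_sizes]
examples = [Example(doc=doc) for doc in docs]

# Batches hold roughly `size` words each, give or take `tolerance`; the
# 1500-word example exceeds the budget and is dropped because discard_oversize=True.
batches = list(
    minibatch_by_words(
        examples=examples, size=1000, tolerance=0.2, discard_oversize=True
    )
)
print([len(batch) for batch in batches])  # the tests expect [5] for these sizes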