diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index f4bddac39..40c6d861f 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -14,9 +14,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs # imported from /converters. CONVERTERS = { - #"conllubio": conllu2docs, TODO - #"conllu": conllu2docs, TODO - #"conll": conllu2docs, TODO + # "conllubio": conllu2docs, TODO + # "conllu": conllu2docs, TODO + # "conll": conllu2docs, TODO "ner": conll_ner2docs, "iob": iob2docs, "json": json2docs, @@ -134,7 +134,7 @@ def verify_cli_args( merge_subtokens, converter, ner_map, - lang + lang, ): if converter == "ner" or converter == "iob": input_data = input_path.open("r", encoding="utf-8").read() @@ -148,7 +148,7 @@ def verify_cli_args( else: msg.warn( "Can't automatically detect NER format. Conversion may not", - "succeed. See https://spacy.io/api/cli#convert" + "succeed. See https://spacy.io/api/cli#convert", ) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? @@ -176,7 +176,7 @@ def verify_cli_args( if converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) return converter - + def _get_converter(msg, converter, input_path): if input_path.is_dir(): diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 06e9be15b..ff8b9dc96 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -158,7 +158,7 @@ def train_cli( weights_data = None if init_tok2vec is not None: - with init_tok2vec.open("rb") as file_: + with init_tok2vec.open("rb") as file_: weights_data = file_.read() if use_gpu >= 0: @@ -178,7 +178,6 @@ def train_cli( ) - def train( config_path, data_paths, @@ -193,7 +192,7 @@ def train( config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): - # It feels kind of weird to not have a default for this. + # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) @@ -238,8 +237,7 @@ def train( tok2vec = tok2vec.get(subpath) if not tok2vec: msg.fail( - f"Could not locate the tok2vec model at {tok2vec_path}.", - exits=1, + f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1, ) tok2vec.from_bytes(weights_data) @@ -351,7 +349,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): try: weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) except KeyError as e: - raise KeyError(Errors.E983.format(dict='score_weights', key=str(e), keys=list(scores.keys()))) + raise KeyError( + Errors.E983.format( + dict="score_weights", key=str(e), keys=list(scores.keys()) + ) + ) scores["speed"] = wps return weighted_score, scores @@ -500,15 +502,23 @@ def setup_printer(training, nlp): ] except KeyError as e: raise KeyError( - Errors.E983.format(dict='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + Errors.E983.format( + dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()) + ) + ) try: scores = [ - "{0:.2f}".format(float(info["other_scores"][col])) - for col in score_cols + "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols ] except KeyError as e: - raise KeyError(Errors.E983.format(dict='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + raise KeyError( + Errors.E983.format( + dict="scores (other)", + key=str(e), + keys=list(info["other_scores"].keys()), + ) + ) data = ( [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) @@ -564,7 +574,7 @@ def verify_cli_args( def verify_textcat_config(nlp, nlp_config): msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") nlp.get_pipe("textcat").labels = tuple(textcat_labels) - # if 'positive_label' is provided: double check whether it's in the data and + # if 'positive_label' is provided: double check whether it's in the data and # the task is binary if nlp_config["pipeline"]["textcat"].get("positive_label", None): textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index dda51cda6..45cfc0abe 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -5,7 +5,9 @@ import itertools def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming raw_text = example.text orig_dict = example.to_dict() - variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level) + variant_text, variant_token_annot = make_orth_variants( + nlp, raw_text, orig_dict["token_annotation"], orth_variant_level + ) doc = nlp.make_doc(variant_text) orig_dict["token_annotation"] = variant_token_annot return example.from_dict(doc, orig_dict) diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index 0a1242fb4..3e366933a 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,6 +1,6 @@ -from .iob2docs import iob2docs # noqa: F401 +from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 from .json2docs import json2docs # TODO: Update this one -#from .conllu2docs import conllu2docs # noqa: F401 +# from .conllu2docs import conllu2docs # noqa: F401 diff --git a/spacy/gold/converters/conll_ner2docs.py b/spacy/gold/converters/conll_ner2docs.py index 7042bd7d6..4b32893f4 100644 --- a/spacy/gold/converters/conll_ner2docs.py +++ b/spacy/gold/converters/conll_ner2docs.py @@ -119,7 +119,7 @@ def conll_ner2docs( token.tag_ = pos_tags[i] token.is_sent_start = sent_starts[i] entities = tags_to_entities(biluo_tags) - doc.ents = [Span(doc, start=s, end=e+1, label=L) for L, s, e in entities] + doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities] output_docs.append(doc) return output_docs diff --git a/spacy/gold/converters/conllu2json.py b/spacy/gold/converters/conllu2json.py index 25ca1d4eb..8f54965f6 100644 --- a/spacy/gold/converters/conllu2json.py +++ b/spacy/gold/converters/conllu2json.py @@ -43,10 +43,7 @@ def conllu2json( raw += example.text sentences.append( generate_sentence( - example.to_dict(), - has_ner_tags, - MISC_NER_PATTERN, - ner_map=ner_map, + example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map, ) ) # Real-sized documents could be extracted using the comments on the diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py index 98219bb04..8f94e169e 100644 --- a/spacy/gold/converters/json2docs.py +++ b/spacy/gold/converters/json2docs.py @@ -8,6 +8,7 @@ from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model from ...lang.xx import MultiLanguage + @contextlib.contextmanager def make_tempdir(): d = Path(tempfile.mkdtemp()) @@ -15,11 +16,7 @@ def make_tempdir(): shutil.rmtree(str(d)) -def json2docs( - input_data, - model=None, - **kwargs -): +def json2docs(input_data, model=None, **kwargs): nlp = load_model(model) if model is not None else MultiLanguage() docs = [] with make_tempdir() as tmp_dir: @@ -29,10 +26,6 @@ def json2docs( for json_annot in read_json_file(json_path): example_dict = _fix_legacy_dict_data(json_annot) tok_dict, doc_dict = _parse_example_dict_data(example_dict) - doc = annotations2doc( - nlp.vocab, - tok_dict, - doc_dict - ) + doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) docs.append(doc) return docs diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 8e1c1d204..9efa71ff7 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -12,6 +12,7 @@ class Corpus: DOCS: https://spacy.io/api/goldcorpus """ + def __init__(self, train_loc, dev_loc, limit=0): """Create a GoldCorpus. @@ -19,7 +20,7 @@ class Corpus: dev (str / Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ - self.train_loc = train_loc + self.train_loc = train_loc self.dev_loc = dev_loc @staticmethod @@ -56,7 +57,7 @@ class Corpus: with loc.open("rb") as file_: doc_bin = DocBin().from_bytes(file_.read()) yield from doc_bin.get_docs(vocab) - + def count_train(self, nlp): """Returns count of words in train examples""" n = 0 diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py index c74ef5671..3ae911418 100644 --- a/spacy/gold/iob_utils.py +++ b/spacy/gold/iob_utils.py @@ -54,7 +54,7 @@ def biluo_tags_from_doc(doc, missing="O"): return biluo_tags_from_offsets( doc, [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], - missing=missing + missing=missing, ) diff --git a/spacy/language.py b/spacy/language.py index 01a31400a..1eb53149c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -542,7 +542,6 @@ class Language(object): raise ValueError(Errors.E979.format(type=type(eg))) return converted_examples - def update( self, examples, @@ -822,7 +821,7 @@ class Language(object): batch_size=batch_size, disable=disable, n_process=n_process, - component_cfg=component_cfg + component_cfg=component_cfg, ) for doc, context in zip(docs, contexts): yield (doc, context) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c4944407f..b8a45db01 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -51,7 +51,13 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) - if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): + if not any( + ( + index_table.get(univ_pos), + exc_table.get(univ_pos), + rules_table.get(univ_pos), + ) + ): if univ_pos == "propn": return [string] else: diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 28339089a..16dcae792 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -14,11 +14,11 @@ def BILUO() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -49,12 +49,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): masks = model.ops.alloc3f(*Y.shape) max_value = Xp.data.max() for t in range(Xp.data.shape[0]): - is_last = (Xp.lengths < (t+2)).astype("i") + is_last = (Xp.lengths < (t + 2)).astype("i") masks[t] = valid_transitions[is_last, prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get large negative value - Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) + Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 0ce9a71e6..39feb3285 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -12,11 +12,11 @@ def IOB() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -48,14 +48,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): for t in range(Xp.data.shape[0]): masks[t] = valid_transitions[prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get -1*10e8 - Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) + Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: # Masking the gradient seems to do poorly here. But why? - #dY.data *= masks + # dY.data *= masks return dY return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo @@ -83,10 +83,10 @@ def _get_transition_table( B_range = ops.xp.arange(B_start, B_end) I_range = ops.xp.arange(I_start, I_end) # B and O are always valid - table[:, B_start : B_end] = 1 + table[:, B_start:B_end] = 1 table[:, O_action] = 1 # I can only follow a matching B table[B_range, I_range] = 1 - + _cache[n_actions] = table return table diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index f4b5b16fe..215cdeda1 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # # (ids < 0).T @ dY mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) + d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) return d_pad.reshape((1, nF, nO, nP)) diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 4a360a9e6..b3a9e0815 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), + Maxout( + nO=token_vector_width * 2, + nI=token_vector_width, + nP=maxout_pieces, + dropout=0.0, + ), LayerNorm(token_vector_width * 2), softmax, ) @@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 + nO=nO, + nI=tok2vec.get_dim("nO"), + nP=maxout_pieces, + normalize=True, + dropout=0.0, ), Linear(nO=nO, nI=nO, init_W=zero_init), ) @@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update( + docs + ) # drop=drop def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index bdcd709b1..47c94cfa1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -16,18 +16,14 @@ def build_tb_parser_model( nO=None, ): t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain( - tok2vec, - with_array(Linear(hidden_width, t2v_width)), - list2array(), - ) + tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), - nP=maxout_pieces + nP=maxout_pieces, ) if use_upper: with use_ops("numpy"): diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index 01661f55b..d857813ac 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -1,6 +1,14 @@ import functools from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list +from thinc.api import ( + Ops, + Model, + Linear, + Softmax, + with_array, + softmax_activation, + padded2list, +) from thinc.api import chain, list2padded, configure_normal_init from thinc.api import Dropout from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d @@ -12,12 +20,12 @@ from ...util import registry @registry.architectures.register("spacy.BiluoTagger.v1") -def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def BiluoTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = BILUO() linear = Linear( - nO=None, - nI=tok2vec.get_dim("nO"), - init_W=configure_normal_init(mean=0.02) + nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) ) model = chain( tok2vec, @@ -25,7 +33,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L with_array(chain(Dropout(0.1), linear)), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -35,11 +43,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L layers=[model, linear], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) + @registry.architectures.register("spacy.IOBTagger.v1") -def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def IOBTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = IOB() linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) model = chain( @@ -48,7 +59,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis with_array(linear), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -58,11 +69,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis layers=[model], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) - def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: if model.get_dim("nO") is None and Y: model.set_dim("nO", Y[0].shape[1]) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index a02e1a5a1..12a60345c 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,7 +1,30 @@ -from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention +from thinc.api import ( + Model, + reduce_mean, + Linear, + list2ragged, + Logistic, + ParametricAttention, +) from thinc.api import chain, concatenate, clone, Dropout -from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window -from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor +from thinc.api import ( + SparseLinear, + Softmax, + softmax_activation, + Maxout, + reduce_sum, + Relu, + residual, + expand_window, +) +from thinc.api import ( + HashEmbed, + with_ragged, + with_array, + with_cpu, + uniqued, + FeatureExtractor, +) from ..spacy_vectors import SpacyVectors from ... import util @@ -50,14 +73,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") -def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, dropout, nO=None): +def build_text_classifier( + width, + embed_size, + pretrained_vectors, + exclusive_classes, + ngram_size, + window_size, + conv_depth, + dropout, + nO=None, +): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) + lower = HashEmbed( + nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout + ) + prefix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout + ) + shape = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout + ) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -83,30 +123,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class vectors_width = width tok2vec = vector_layer >> with_array( Maxout(width, vectors_width, normalize=True) - >> residual((expand_window(window_size=window_size) - >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, + >> residual( + ( + expand_window(window_size=window_size) + >> Maxout( + nO=width, nI=width * ((window_size * 2) + 1), normalize=True + ) + ) + ) + ** conv_depth, pad=conv_depth, ) cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention(width) - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) + tok2vec + >> list2ragged() + >> ParametricAttention(width) + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) ) linear_model = build_bow_text_classifier( - nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False + nO=nO, + ngram_size=ngram_size, + exclusive_classes=exclusive_classes, + no_output_layer=False, ) - nO_double = nO*2 if nO else None + nO_double = nO * 2 if nO else None if exclusive_classes: output_layer = Softmax(nO=nO, nI=nO_double) else: - output_layer = ( - Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() - ) + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() model = (linear_model | cnn_model) >> output_layer model.set_ref("tok2vec", tok2vec) if model.has_dim("nO") is not False: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 53798e57c..b1bed1ea1 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -99,7 +99,13 @@ def hash_charembed_cnn( @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout + pretrained_vectors, + width, + depth, + embed_size, + subword_features, + maxout_pieces, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout( - nO=width, - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): +def MultiHashEmbed( + columns, width, rows, use_subwords, pretrained_vectors, mix, dropout +): norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout + ) if pretrained_vectors: glove = StaticVectors( @@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): cnn = chain( expand_window(window_size=window_size), - Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), + Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) @@ -247,11 +262,19 @@ def build_Tok2Vec_model( subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) + norm = HashEmbed( + nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout + ) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout + ) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 251189389..f7dad565e 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): attrs={ "has_upper": has_upper, "unseen_classes": set(unseen_classes), - "resize_output": resize_output - } + "resize_output": resize_output, + }, ) @@ -31,7 +31,7 @@ def forward(model, X, is_train): model.layers, unseen_classes=model.attrs["unseen_classes"], train=is_train, - has_upper=model.attrs["has_upper"] + has_upper=model.attrs["has_upper"], ) return step_model, step_model.finish_steps @@ -62,7 +62,7 @@ def resize_output(model, new_nO): nI = None if smaller.has_dim("nI"): nI = smaller.get_dim("nI") - with use_ops('numpy'): + with use_ops("numpy"): larger = Linear(nO=new_nO, nI=nI) larger.init = smaller.init # it could be that the model is not initialized yet, then skip this bit @@ -74,8 +74,8 @@ def resize_output(model, new_nO): # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. if smaller.has_dim("nO"): - larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b + larger_W[: smaller.get_dim("nO")] = smaller_W + larger_b[: smaller.get_dim("nO")] = smaller_b for i in range(smaller.get_dim("nO"), new_nO): model.attrs["unseen_classes"].add(i) diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 3ef6a48ce..08453becc 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -21,9 +21,7 @@ class SimpleNER(Pipe): self.model = model self.cfg = {"labels": []} self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) assert self.model is not None @@ -38,21 +36,21 @@ class SimpleNER(Pipe): def add_label(self, label): if label not in self.cfg["labels"]: self.cfg["labels"].append(label) - + def get_tag_names(self): if self.is_biluo: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - [f"L-{label}" for label in self.labels] + - [f"U-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + [f"L-{label}" for label in self.labels] + + [f"U-{label}" for label in self.labels] + + ["O"] ) else: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + ["O"] ) def predict(self, docs: List[Doc]) -> List[Floats2d]: @@ -107,7 +105,7 @@ class SimpleNER(Pipe): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) - if not hasattr(get_examples, '__call__'): + if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples labels = _get_labels(get_examples()) @@ -116,14 +114,12 @@ class SimpleNER(Pipe): labels = self.labels n_actions = self.model.attrs["get_num_actions"](len(labels)) self.model.set_dim("nO", n_actions) - self.model.initialize() + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) return sgd @@ -144,6 +140,6 @@ def _get_labels(examples): labels = set() for eg in examples: for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True): - if ner_tag != 'O' and ner_tag != '-': + if ner_tag != "O" and ner_tag != "-": labels.add(ner_tag) return list(sorted(labels)) diff --git a/spacy/scorer.py b/spacy/scorer.py index 71cbc019a..31ff9ca33 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -97,7 +97,9 @@ class Scorer(object): for name, component in pipeline: if name == "textcat": self.textcat_multilabel = component.model.attrs["multi_label"] - self.textcat_positive_label = component.cfg.get("positive_label", None) + self.textcat_positive_label = component.cfg.get( + "positive_label", None + ) for label in component.cfg.get("labels", []): self.textcat_auc_per_cat[label] = ROCAUCScore() self.textcat_f_per_cat[label] = PRFScore() @@ -118,19 +120,19 @@ class Scorer(object): @property def morphs_acc(self): - """RETURNS (float): Morph tag accuracy (morphological features, + """RETURNS (float): Morph tag accuracy (morphological features, i.e. `Token.morph`). """ - return self.morphs.fscore * 100 + return self.morphs.fscore * 100 @property def morphs_per_type(self): - """RETURNS (dict): Scores per dependency label. + """RETURNS (dict): Scores per dependency label. """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.morphs_per_feat.items() - } + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.morphs_per_feat.items() + } @property def sent_p(self): @@ -359,7 +361,9 @@ class Scorer(object): (gold_i, gold_head, token.dep_.lower()) ) # Find all NER labels in gold and doc - ent_labels = set([k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]) + ent_labels = set( + [k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents] + ) # Set up all labels for per type scoring and prepare gold per type gold_per_ents = {ent_label: set() for ent_label in ent_labels} for ent_label in ent_labels: @@ -392,7 +396,10 @@ class Scorer(object): self.pos.score_set(cand_pos, gold_pos) self.morphs.score_set(cand_morphs, gold_morphs) for field in self.morphs_per_feat: - self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) + self.morphs_per_feat[field].score_set( + cand_morphs_per_feat.get(field, set()), + gold_morphs_per_feat.get(field, set()), + ) self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: @@ -404,7 +411,9 @@ class Scorer(object): ) if ( len(gold_doc.cats) > 0 - and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold_doc.cats) + and set(self.textcat_f_per_cat) + == set(self.textcat_auc_per_cat) + == set(gold_doc.cats) and set(gold_doc.cats) == set(doc.cats) ): goldcat = max(gold_doc.cats, key=gold_doc.cats.get) @@ -416,10 +425,10 @@ class Scorer(object): ) for label in set(gold_doc.cats): self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold_doc.cats[label] + doc.cats[label], gold_doc.cats[label] ) self.textcat_f_per_cat[label].score_set( - set([label]) & set([candcat]), set([label]) & set([goldcat]) + set([label]) & set([candcat]), set([label]) & set([goldcat]) ) elif len(self.textcat_f_per_cat) > 0: model_labels = set(self.textcat_f_per_cat) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 879334056..b9c230516 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) @@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 4afa11963..87675e94d 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -17,7 +17,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) return parser @@ -35,10 +40,7 @@ def _train_parser(parser): for i in range(5): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = { - "heads": [1, 1, 3, 3], - "deps": ["left", "ROOT", "left", "ROOT"] - } + gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} example = Example.from_dict(doc, gold) parser.update([example], sgd=sgd, losses=losses) return parser @@ -51,10 +53,7 @@ def test_add_label(parser): for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = { - "heads": [1, 1, 3, 3], - "deps": ["right", "ROOT", "left", "ROOT"] - } + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} parser.update((doc, gold), sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) @@ -63,7 +62,12 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(Vocab(), default_ner(), **config) ner1.add_label("C") ner1.add_label("B") @@ -78,6 +82,7 @@ def test_add_label_deserializes_correctly(): for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) + @pytest.mark.parametrize( "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())], diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 12883ee08..f0f41e645 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -139,7 +139,12 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(doc.vocab, default_parser(), **config) parser.moves.add_action(0, "") parser.moves.add_action(1, "") @@ -151,7 +156,9 @@ def test_get_oracle_actions(): parser.moves.add_action(2, dep) elif head < i: parser.moves.add_action(3, dep) - example = Example.from_dict(doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}) + example = Example.from_dict( + doc, {"words": words, "tags": tags, "heads": heads, "deps": deps} + ) parser.moves.get_oracle_sequence(example) @@ -179,41 +186,41 @@ def test_oracle_dev_sentence(vocab, arc_eager): . punct said """ expected_transitions = [ - "S", # Shift 'Motor' - "S", # Shift 'Cars' - "L-nn", # Attach 'Cars' to 'Inc.' - "L-nn", # Attach 'Motor' to 'Inc.' - "L-nn", # Attach 'Rolls-Royce' to 'Inc.', force shift - "L-nsubj", # Attach 'Inc.' to 'said' - "S", # Shift 'it' - "L-nsubj", # Attach 'it.' to 'expects' - "R-ccomp", # Attach 'expects' to 'said' - "S", # Shift 'its' - "S", # Shift 'U.S.' - "L-nn", # Attach 'U.S.' to 'sales' - "L-poss", # Attach 'its' to 'sales' - "S", # Shift 'sales' - "S", # Shift 'to' - "S", # Shift 'remain' - "L-cop", # Attach 'remain' to 'steady' - "L-aux", # Attach 'to' to 'steady' - "L-nsubj", # Attach 'sales' to 'steady' - "R-xcomp", # Attach 'steady' to 'expects' - "R-prep", # Attach 'at' to 'steady' - "S", # Shift 'about' - "L-quantmod", # Attach "about" to "1,200" - "S", # Shift "1,200" - "L-num", # Attach "1,200" to "cars" - "R-pobj", # Attach "cars" to "at" - "D", # Reduce "cars" - "D", # Reduce "at" - "R-prep", # Attach "in" to "steady" - "R-pobj", # Attach "1990" to "in" - "D", # Reduce "1990" - "D", # Reduce "in" - "D", # Reduce "steady" - "D", # Reduce "expects" - "R-punct", # Attach "." to "said" + "S", # Shift 'Motor' + "S", # Shift 'Cars' + "L-nn", # Attach 'Cars' to 'Inc.' + "L-nn", # Attach 'Motor' to 'Inc.' + "L-nn", # Attach 'Rolls-Royce' to 'Inc.', force shift + "L-nsubj", # Attach 'Inc.' to 'said' + "S", # Shift 'it' + "L-nsubj", # Attach 'it.' to 'expects' + "R-ccomp", # Attach 'expects' to 'said' + "S", # Shift 'its' + "S", # Shift 'U.S.' + "L-nn", # Attach 'U.S.' to 'sales' + "L-poss", # Attach 'its' to 'sales' + "S", # Shift 'sales' + "S", # Shift 'to' + "S", # Shift 'remain' + "L-cop", # Attach 'remain' to 'steady' + "L-aux", # Attach 'to' to 'steady' + "L-nsubj", # Attach 'sales' to 'steady' + "R-xcomp", # Attach 'steady' to 'expects' + "R-prep", # Attach 'at' to 'steady' + "S", # Shift 'about' + "L-quantmod", # Attach "about" to "1,200" + "S", # Shift "1,200" + "L-num", # Attach "1,200" to "cars" + "R-pobj", # Attach "cars" to "at" + "D", # Reduce "cars" + "D", # Reduce "at" + "R-prep", # Attach "in" to "steady" + "R-pobj", # Attach "1990" to "in" + "D", # Reduce "1990" + "D", # Reduce "in" + "D", # Reduce "steady" + "D", # Reduce "expects" + "R-punct", # Attach "." to "said" ] gold_words = [] @@ -229,8 +236,8 @@ def test_oracle_dev_sentence(vocab, arc_eager): gold_heads.append(head) gold_heads = [gold_words.index(head) for head in gold_heads] for dep in gold_deps: - arc_eager.add_action(2, dep) # Left - arc_eager.add_action(3, dep) # Right + arc_eager.add_action(2, dep) # Left + arc_eager.add_action(3, dep) # Right doc = Doc(Vocab(), words=gold_words) example = Example.from_dict(doc, {"heads": gold_heads, "deps": gold_deps}) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index ff8117196..61e25ffee 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -143,7 +143,12 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -162,7 +167,12 @@ def test_accept_blocked_token(): # 2. test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) # set "New York" to a blocked entity @@ -220,7 +230,12 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) ner2.moves.add_action(5, "") ner2.add_label("GPE") diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 32177d947..93d92e26b 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -29,7 +29,12 @@ def tok2vec(): @pytest.fixture def parser(vocab, arc_eager): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } return Parser(vocab, model=default_parser(), moves=arc_eager, **config) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 0d9e257b9..06e363b6b 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -33,7 +33,7 @@ def test_parser_root(en_tokenizer): @pytest.mark.xfail -#@pytest.mark.parametrize("text", ["Hello"]) +# @pytest.mark.parametrize("text", ["Hello"]) def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) doc = get_doc( @@ -180,6 +180,7 @@ def test_parser_set_sent_starts(en_vocab): for token in sent: assert token.head in sent + def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 5a29d84f4..ffd0c5df4 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -16,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 @@ -28,7 +33,9 @@ def parser(vocab): for i in range(10): losses = {} doc = Doc(vocab, words=["a", "b", "c", "d"]) - example = Example.from_dict(doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) parser.update([example], sgd=sgd, losses=losses) return parser diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 37dddc63e..a50ad8499 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -272,11 +272,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() - nlp.add_pipe(nlp.create_pipe('sentencizer')) + nlp.add_pipe(nlp.create_pipe("sentencizer")) # Add a custom component to recognize "Russ Cochran" as an entity for the example training data ruler = EntityRuler(nlp) - patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} + ] ruler.add_patterns(patterns) nlp.add_pipe(ruler) @@ -293,7 +295,11 @@ def test_overfitting_IO(): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 647e1a429..c853de232 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -15,8 +15,17 @@ def test_label_types(): TRAIN_DATA = [ - ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), - ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), + ( + "I like green eggs", + { + "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], + "pos": ["NOUN", "VERB", "ADJ", "NOUN"], + }, + ), + ( + "Eat blue ham", + {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, + ), ] @@ -38,7 +47,12 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] + gold_morphs = [ + "Feat=N|POS=NOUN", + "Feat=V|POS=VERB", + "Feat=J|POS=ADJ", + "Feat=N|POS=NOUN", + ] assert [t.morph_ for t in doc] == gold_morphs # Also test the results are still the same after IO diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py index 9d4acf2fd..939786b0a 100644 --- a/spacy/tests/pipeline/test_simple_ner.py +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -7,24 +7,28 @@ from spacy.pipeline.simple_ner import SimpleNER import spacy -@pytest.fixture(params=[ - ["PER", "ORG", "LOC", "MISC"], - ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] -]) +@pytest.fixture( + params=[ + ["PER", "ORG", "LOC", "MISC"], + ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"], + ] +) def labels(request): return request.param + @pytest.fixture def ops(): return NumpyOps() + def _get_actions(labels): action_names = ( - [f"B{label}" for label in labels] + \ - [f"I{label}" for label in labels] + \ - [f"L{label}" for label in labels] + \ - [f"U{label}" for label in labels] + \ - ["O"] + [f"B{label}" for label in labels] + + [f"I{label}" for label in labels] + + [f"L{label}" for label in labels] + + [f"U{label}" for label in labels] + + ["O"] ) A = namedtuple("actions", action_names) return A(**{name: i for i, name in enumerate(action_names)}) @@ -228,7 +232,7 @@ def test_transition_table(ops): assert table[0, a.O, a.Uloc] == 1 assert table[0, a.O, a.Uorg] == 1 assert table[0, a.O, a.O] == 1 - + # Last token, prev action was B assert table[1, a.Bper, a.Bper] == 0 assert table[1, a.Bper, a.Bloc] == 0 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 129c00d99..94996c410 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(Vocab(), default_ner(), **config) example = Example.from_dict( Doc(ner.vocab, words=["word"]), @@ -280,8 +285,8 @@ def test_issue1967(label): "tags": ["tag"], "heads": [0], "deps": ["dep"], - "entities": [label] - } + "entities": [label], + }, ) assert "JOB-NAME" in ner.moves.get_actions(gold_parses=[example])[1] diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 6df437b3c..a37707379 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -196,7 +196,12 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(doc.vocab, default_ner(), **config) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 15632bdf8..06b7893a7 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels @@ -16,7 +21,12 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 4978aba44..f47290b92 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -74,7 +74,12 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(vocab, default_ner(), **config) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index 46f79d6f5..3bddc26ca 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -16,7 +16,12 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(nlp.vocab, default_ner(), **config) ner.add_label("SOME_LABEL") ner.begin_training([]) diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py index cb9279250..2e1a6e549 100644 --- a/spacy/tests/regression/test_issue4665.py +++ b/spacy/tests/regression/test_issue4665.py @@ -1,6 +1,7 @@ import pytest + # TODO -#from spacy.gold.converters.conllu2docs import conllu2docs +# from spacy.gold.converters.conllu2docs import conllu2docs input_data = """ 1 [ _ PUNCT -LRB- _ _ punct _ _ diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 9c4e1f61e..abb5ccb27 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(en_vocab, default_parser(), **config) parser.add_label("nsubj") return parser diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index d3e82296e..e570b1025 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE - assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) + assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE + assert sorted([s for s in new_vocab1.strings]) == sorted( + strings1 + list(default_strings) + ) @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 4b244a3ce..3eb43ab92 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -3,6 +3,7 @@ import pytest from spacy.lang.en import English from spacy.gold.converters import iob2docs, conll_ner2docs from spacy.cli.pretrain import make_docs + # TODO # from spacy.gold.converters import conllu2docs diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 7af62accb..61b9ca57c 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -155,7 +155,18 @@ def test_gold_biluo_misalign(en_vocab): def test_split_sentences(en_vocab): words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"] doc = Doc(en_vocab, words=words) - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"] + gold_words = [ + "I", + "flew", + "to", + "San", + "Francisco", + "Valley", + "had", + "loads", + "of", + "fun", + ] sent_starts = [True, False, False, False, False, False, True, False, False, False] example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) assert example.text == "I flew to San Francisco Valley had loads of fun " @@ -166,7 +177,16 @@ def test_split_sentences(en_vocab): words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"] doc = Doc(en_vocab, words=words) - gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"] + gold_words = [ + "I", + "flew", + "to", + "San Francisco", + "Valley", + "had", + "loads of", + "fun", + ] sent_starts = [True, False, False, False, False, True, False, False] example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) assert example.text == "I flew to San Francisco Valley had loads of fun " @@ -195,7 +215,15 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): gold_words = ["I", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2] - assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""] + assert example.get_aligned("ENT_TYPE", as_string=True) == [ + "", + "", + "", + "LOC", + "LOC", + "LOC", + "", + ] # misaligned words = ["I flew", "to", "San Francisco", "Valley", "."] @@ -206,11 +234,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): entities = [(offset_start, offset_end, "LOC")] links = {(offset_start, offset_end): {"Q816843": 1.0}} gold_words = ["I", "flew to", "San", "Francisco Valley", "."] - example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links}) + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "links": links} + ) assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2] assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""] - assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""] - assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0} + assert example.get_aligned("ENT_KB_ID", as_string=True) == [ + "", + "", + "Q816843", + "Q816843", + "", + ] + assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == { + "Q816843": 1.0 + } # additional whitespace tokens in GoldParse words words, spaces = get_words_and_spaces( @@ -221,26 +259,55 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."] gold_spaces = [True, True, False, True, False, False] - example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) + example = Example.from_dict( + doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities} + ) assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2] - assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""] + assert example.get_aligned("ENT_TYPE", as_string=True) == [ + "", + "", + "", + "", + "LOC", + "LOC", + "", + ] # from issue #4791 doc = en_tokenizer("I'll return the ₹54 amount") gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] entities = [(16, 19, "MONEY")] - example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) + example = Example.from_dict( + doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities} + ) assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2] - assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""] + assert example.get_aligned("ENT_TYPE", as_string=True) == [ + "", + "", + "", + "", + "MONEY", + "", + ] doc = en_tokenizer("I'll return the $54 amount") gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] entities = [(16, 19, "MONEY")] - example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) + example = Example.from_dict( + doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities} + ) assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2] - assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""] + assert example.get_aligned("ENT_TYPE", as_string=True) == [ + "", + "", + "", + "", + "MONEY", + "MONEY", + "", + ] def test_roundtrip_offsets_biluo_conversion(en_tokenizer): @@ -311,14 +378,16 @@ def test_roundtrip_docs_to_json(doc): assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference] - assert ents == [(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents] + assert ents == [ + (e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents + ] assert "TRAVEL" in reloaded_example.reference.cats assert "BAKING" in reloaded_example.reference.cats assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.xfail # TODO do we need to do the projectivity differently? +@pytest.mark.xfail # TODO do we need to do the projectivity differently? def test_projective_train_vs_nonprojective_dev(doc): nlp = English() deps = [t.dep_ for t in doc] @@ -348,9 +417,9 @@ def test_projective_train_vs_nonprojective_dev(doc): # Hm, not sure where misalignment check would be handled? In the components too? -# I guess that does make sense. A text categorizer doesn't care if it's +# I guess that does make sense. A text categorizer doesn't care if it's # misaligned... -@pytest.mark.xfail # TODO +@pytest.mark.xfail # TODO def test_ignore_misaligned(doc): nlp = English() text = doc.text @@ -375,7 +444,9 @@ def test_ignore_misaligned(doc): # doesn't raise an AlignmentError, but there is nothing to iterate over # because the only example can't be aligned - train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) + train_reloaded_example = list( + goldcorpus.train_dataset(nlp, ignore_misaligned=True) + ) assert len(train_reloaded_example) == 0 @@ -389,7 +460,9 @@ def test_make_orth_variants(doc): # due to randomness, test only that this runs with no errors for now train_example = next(goldcorpus.train_dataset(nlp)) - variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) + variant_example = make_orth_variants_example( + nlp, train_example, orth_variant_level=0.2 + ) @pytest.mark.parametrize( @@ -430,7 +503,9 @@ def test_goldparse_startswith_space(en_tokenizer): entities = ["U-DATE"] deps = ["ROOT"] heads = [0] - example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "deps":deps, "heads": heads}) + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) assert example.get_aligned("ENT_IOB") == [None, 3] assert example.get_aligned("ENT_TYPE", as_string=True) == [None, "DATE"] assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] @@ -441,7 +516,12 @@ def test_gold_constructor(): nlp = English() doc = nlp("This is a sentence") example = Example.from_dict(doc, {"cats": {"cat1": 1.0, "cat2": 0.0}}) - assert example.get_aligned("ORTH", as_string=True) == ["This", "is", "a", "sentence"] + assert example.get_aligned("ORTH", as_string=True) == [ + "This", + "is", + "a", + "sentence", + ] assert example.reference.cats["cat1"] assert not example.reference.cats["cat2"] @@ -496,7 +576,7 @@ def test_split_sents(merged_dict): nlp = English() example = Example.from_dict( Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]), - merged_dict + merged_dict, ) assert example.text == "Hi there everyone It is just me" @@ -517,15 +597,12 @@ def test_split_sents(merged_dict): # This fails on some None value? Need to look into that. -@pytest.mark.xfail # TODO +@pytest.mark.xfail # TODO def test_tuples_to_example(vocab, merged_dict): cats = {"TRAVEL": 1.0, "BAKING": 0.0} merged_dict = dict(merged_dict) merged_dict["cats"] = cats - ex = Example.from_dict( - Doc(vocab, words=merged_dict["words"]), - merged_dict - ) + ex = Example.from_dict(Doc(vocab, words=merged_dict["words"]), merged_dict) words = [token.text for token in ex.reference] assert words == merged_dict["words"] tags = [token.tag_ for token in ex.reference] diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 9da89e947..e5555bbc7 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -36,9 +36,7 @@ def test_language_update(nlp): def test_language_evaluate(nlp): text = "hello world" - annots = { - "doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} - } + annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} doc = Doc(nlp.vocab, words=text.split(" ")) # Evaluate with text and dict nlp.evaluate([(text, annots)]) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 4c44543c4..b89654554 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -32,7 +32,9 @@ def test_Example_from_dict_invalid(annots): Example.from_dict(predicted, annots) -@pytest.mark.parametrize("pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]) +@pytest.mark.parametrize( + "pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]] +) @pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}]) def test_Example_from_dict_with_tags(pred_words, annots): vocab = Vocab() @@ -161,7 +163,15 @@ def test_Example_from_dict_with_entities(annots): example = Example.from_dict(predicted, annots) assert len(list(example.reference.ents)) == 2 - assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"] + assert [example.reference[i].ent_iob_ for i in range(7)] == [ + "O", + "O", + "B", + "I", + "O", + "B", + "O", + ] assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2] assert example.reference[2].ent_type_ == "LOC" @@ -174,7 +184,10 @@ def test_Example_from_dict_with_entities(annots): [ { "words": ["I", "like", "New", "York", "and", "Berlin", "."], - "entities": [(0, 4, "LOC"), (21, 27, "LOC")], # not aligned to token boundaries + "entities": [ + (0, 4, "LOC"), + (21, 27, "LOC"), + ], # not aligned to token boundaries } ], ) @@ -182,7 +195,7 @@ def test_Example_from_dict_with_entities_invalid(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) example = Example.from_dict(predicted, annots) - # TODO: shouldn't this throw some sort of warning ? + # TODO: shouldn't this throw some sort of warning ? assert len(list(example.reference.ents)) == 0 @@ -192,7 +205,10 @@ def test_Example_from_dict_with_entities_invalid(annots): { "words": ["I", "like", "New", "York", "and", "Berlin", "."], "entities": [(7, 15, "LOC"), (20, 26, "LOC")], - "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}}, + "links": { + (7, 15): {"Q60": 1.0, "Q64": 0.0}, + (20, 26): {"Q60": 0.0, "Q64": 1.0}, + }, } ], ) @@ -224,4 +240,3 @@ def test_Example_from_dict_with_links_invalid(annots): predicted = Doc(vocab, words=annots["words"]) with pytest.raises(ValueError): Example.from_dict(predicted, annots) - diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 5574b7d6a..a6684b706 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -42,6 +42,7 @@ test_ner_apple = [ ] ] + @pytest.fixture def tagged_doc(): text = "Sarah's sister flew to Silicon Valley via London." diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 9d02c6c6a..65c33c54a 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -12,7 +12,7 @@ from spacy.util import minibatch_by_words ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 200], [3, 2]), ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), @@ -26,7 +26,9 @@ def test_util_minibatch(doc_sizes, expected_batches): docs = [get_random_doc(doc_size) for doc_size in doc_sizes] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)) + batches = list( + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True) + ) assert [len(batch) for batch in batches] == expected_batches max_size = batch_size + batch_size * tol @@ -50,7 +52,7 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): docs = [get_random_doc(doc_size) for doc_size in doc_sizes] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)) + batches = list( + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) + ) assert [len(batch) for batch in batches] == expected_batches - - diff --git a/spacy/tests/util.py b/spacy/tests/util.py index a5d1737f1..7c3eaf8ad 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -27,7 +27,15 @@ def make_tempdir(): def get_doc( - vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None + vocab, + words=[], + pos=None, + heads=None, + deps=None, + tags=None, + ents=None, + lemmas=None, + morphs=None, ): """Create Doc object from given vocab, words and annotations.""" if deps and not heads: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 97f336eb3..a3b089222 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,16 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors -ALL_ATTRS = ( - "ORTH", - "TAG", - "HEAD", - "DEP", - "ENT_IOB", - "ENT_TYPE", - "LEMMA", - "MORPH" -) +ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH") class DocBin(object):