From 5eeb25f0432d8a23246eed6fcb75eee2da8e5a63 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 28 Jun 2021 11:48:00 +0200 Subject: [PATCH] Tidy up code --- spacy/__init__.py | 1 + spacy/cli/debug_model.py | 5 ++- spacy/cli/evaluate.py | 4 +- spacy/displacy/__init__.py | 4 +- spacy/lang/bg/lex_attrs.py | 8 ++-- spacy/lang/bg/tokenizer_exceptions.py | 1 - spacy/lang/fi/tokenizer_exceptions.py | 28 +++++++++--- spacy/lang/it/lemmatizer.py | 2 +- spacy/lang/it/tokenizer_exceptions.py | 2 +- spacy/language.py | 43 +++++++++++++------ spacy/lookups.py | 4 +- spacy/ml/models/__init__.py | 2 +- spacy/pipeline/entity_linker.py | 26 ++++------- spacy/pipeline/entityruler.py | 4 +- spacy/pipeline/spancat.py | 4 +- spacy/pipeline/textcat.py | 4 +- spacy/scorer.py | 16 ++++--- spacy/tests/lang/bg/test_text.py | 1 + spacy/tests/lang/fi/test_tokenizer.py | 7 +-- spacy/tests/matcher/test_matcher_logic.py | 16 +++++-- spacy/tests/parser/test_ner.py | 6 ++- spacy/tests/pipeline/test_entity_linker.py | 20 ++++++--- spacy/tests/pipeline/test_models.py | 4 +- spacy/tests/pipeline/test_pipe_factories.py | 18 ++++++-- spacy/tests/pipeline/test_pipe_methods.py | 7 ++- spacy/tests/pipeline/test_spancat.py | 5 +-- spacy/tests/pipeline/test_textcat.py | 16 ++++--- spacy/tests/pipeline/test_tok2vec.py | 14 ++++-- spacy/tests/regression/test_issue6501-7000.py | 3 +- spacy/tests/regression/test_issue7056.py | 3 +- spacy/tests/regression/test_issue7062.py | 2 +- spacy/tests/regression/test_issue7065.py | 32 +++++++++++--- spacy/tests/regression/test_issue8168.py | 19 ++++++-- spacy/tests/regression/test_issue8190.py | 11 +---- .../tests/serialize/test_serialize_config.py | 9 +++- spacy/tests/serialize/test_serialize_doc.py | 4 +- spacy/tests/test_architectures.py | 1 - spacy/tests/test_language.py | 4 +- spacy/tests/test_scorer.py | 8 +++- spacy/tests/training/test_pretraining.py | 7 ++- spacy/tests/training/test_training.py | 14 ++++-- spacy/tests/vocab_vectors/test_vectors.py | 1 + spacy/training/batchers.py | 6 ++- spacy/training/initialize.py | 8 +++- spacy/training/loop.py | 3 +- spacy/util.py | 5 +-- 46 files changed, 276 insertions(+), 136 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index d07931cfd..f20c32eb5 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,6 +4,7 @@ import sys # set library-specific custom warning handling before doing anything else from .errors import setup_default_warnings + setup_default_warnings() # These are imported as part of the API diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 015e3a76b..ed4293902 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -139,7 +139,10 @@ def debug_model( upstream_component = None if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name: upstream_component = nlp.get_pipe("tok2vec") - if model.has_ref("tok2vec") and "transformer-listener" in model.get_ref("tok2vec").name: + if ( + model.has_ref("tok2vec") + and "transformer-listener" in model.get_ref("tok2vec").name + ): upstream_component = nlp.get_pipe("transformer") goldY = None for e in range(3): diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 687d63ad2..c563f24d3 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -127,7 +127,9 @@ def evaluate( data["ents_per_type"] = scores["ents_per_type"] if f"spans_{spans_key}_per_type" in scores: if scores[f"spans_{spans_key}_per_type"]: - print_prf_per_type(msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type") + print_prf_per_type( + msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type" + ) data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"] if "cats_f_per_type" in scores: if scores["cats_f_per_type"]: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index aa61fb9f7..78b83f2e5 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -120,7 +120,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: doc (Doc): Document do parse. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ - doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"])) + doc = Doc(orig_doc.vocab).from_bytes( + orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) + ) if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py index 62b69d6cc..bba3c74cd 100644 --- a/spacy/lang/bg/lex_attrs.py +++ b/spacy/lang/bg/lex_attrs.py @@ -22,13 +22,13 @@ _num_words = [ "тринадесет", "тринайсет", "четиринадесет", - "четиринайсет" + "четиринайсет", "петнадесет", - "петнайсет" + "петнайсет", "шестнадесет", "шестнайсет", "седемнадесет", - "седемнайсет" + "седемнайсет", "осемнадесет", "осемнайсет", "деветнадесет", @@ -36,7 +36,7 @@ _num_words = [ "двадесет", "двайсет", "тридесет", - "трийсет" + "трийсет", "четиридесет", "четиресет", "петдесет", diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py index defa00ef7..0b7487c64 100644 --- a/spacy/lang/bg/tokenizer_exceptions.py +++ b/spacy/lang/bg/tokenizer_exceptions.py @@ -58,7 +58,6 @@ _abbr_dot_exc = [ {ORTH: "стр.", NORM: "страница"}, {ORTH: "ул.", NORM: "улица"}, {ORTH: "чл.", NORM: "член"}, - ] for abbr in _abbr_dot_exc: diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index f0161f8b3..465333b0a 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -81,16 +81,32 @@ for exc_data in [ # Source: https://kaino.kotus.fi/visk/sisallys.php?p=141 conj_contraction_bases = [ - ("ett", "että"), ("jott", "jotta"), ("kosk", "koska"), ("mutt", "mutta"), - ("vaikk", "vaikka"), ("ehk", "ehkä"), ("miks", "miksi"), ("siks", "siksi"), - ("joll", "jos"), ("ell", "jos") + ("ett", "että"), + ("jott", "jotta"), + ("kosk", "koska"), + ("mutt", "mutta"), + ("vaikk", "vaikka"), + ("ehk", "ehkä"), + ("miks", "miksi"), + ("siks", "siksi"), + ("joll", "jos"), + ("ell", "jos"), ] conj_contraction_negations = [ - ("en", "en"), ("et", "et"), ("ei", "ei"), ("emme", "emme"), - ("ette", "ette"), ("eivat", "eivät"), ("eivät", "eivät")] + ("en", "en"), + ("et", "et"), + ("ei", "ei"), + ("emme", "emme"), + ("ette", "ette"), + ("eivat", "eivät"), + ("eivät", "eivät"), +] for (base_lower, base_norm) in conj_contraction_bases: for base in [base_lower, base_lower.title()]: for (suffix, suffix_norm) in conj_contraction_negations: - _exc[base + suffix] = [{ORTH: base, NORM: base_norm}, {ORTH: suffix, NORM: suffix_norm}] + _exc[base + suffix] = [ + {ORTH: base, NORM: base_norm}, + {ORTH: suffix, NORM: suffix_norm}, + ] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/it/lemmatizer.py b/spacy/lang/it/lemmatizer.py index fced97d35..e44e64e3a 100644 --- a/spacy/lang/it/lemmatizer.py +++ b/spacy/lang/it/lemmatizer.py @@ -4,12 +4,12 @@ from ...pipeline import Lemmatizer from ...tokens import Token - class ItalianLemmatizer(Lemmatizer): """This lemmatizer was adapted from the Polish one (version of April 2021). It implements lookup lemmatization based on the morphological lexicon morph-it (Baroni and Zanchetta). The table lemma_lookup with non-POS-aware entries is used as a backup for words that aren't handled by morph-it.""" + @classmethod def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "pos_lookup": diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 87c2929bf..42883863b 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -25,7 +25,7 @@ for orth in [ "artt.", "att.", "avv.", - "Avv." + "Avv.", "by-pass", "c.d.", "c/c", diff --git a/spacy/language.py b/spacy/language.py index c35a8c016..e1cb1cb05 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -687,9 +687,11 @@ class Language: if not isinstance(source, Language): raise ValueError(Errors.E945.format(name=source_name, source=type(source))) # Check vectors, with faster checks first - if self.vocab.vectors.shape != source.vocab.vectors.shape or \ - self.vocab.vectors.key2row != source.vocab.vectors.key2row or \ - self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes(): + if ( + self.vocab.vectors.shape != source.vocab.vectors.shape + or self.vocab.vectors.key2row != source.vocab.vectors.key2row + or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes() + ): warnings.warn(Warnings.W113.format(name=source_name)) if not source_name in source.component_names: raise KeyError( @@ -1539,15 +1541,21 @@ class Language: # Cycle channels not to break the order of docs. # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. - byte_tuples = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch)) + byte_tuples = chain.from_iterable( + recv.recv() for recv in cycle(bytedocs_recv_ch) + ) try: - for i, (_, (byte_doc, byte_error)) in enumerate(zip(raw_texts, byte_tuples), 1): + for i, (_, (byte_doc, byte_error)) in enumerate( + zip(raw_texts, byte_tuples), 1 + ): if byte_doc is not None: doc = Doc(self.vocab).from_bytes(byte_doc) yield doc elif byte_error is not None: error = srsly.msgpack_loads(byte_error) - self.default_error_handler(None, None, None, ValueError(Errors.E871.format(error=error))) + self.default_error_handler( + None, None, None, ValueError(Errors.E871.format(error=error)) + ) if i % batch_size == 0: # tell `sender` that one batch was consumed. sender.step() @@ -1707,7 +1715,9 @@ class Language: if "replace_listeners" in pipe_cfg: for name, proc in source_nlps[model].pipeline: if source_name in getattr(proc, "listening_components", []): - source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"]) + source_nlps[model].replace_listeners( + name, source_name, pipe_cfg["replace_listeners"] + ) listeners_replaced = True nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) # Delete from cache if listeners were replaced @@ -1727,12 +1737,16 @@ class Language: for name, proc in nlp.pipeline: # Remove listeners not in the pipeline listener_names = getattr(proc, "listening_components", []) - unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names] + unused_listener_names = [ + ll for ll in listener_names if ll not in nlp.pipe_names + ] for listener_name in unused_listener_names: for listener in proc.listener_map.get(listener_name, []): proc.remove_listener(listener, listener_name) - for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer + for listener in getattr( + proc, "listening_components", [] + ): # e.g. tok2vec/transformer # If it's a component sourced from another pipeline, we check if # the tok2vec listeners should be replaced with standalone tok2vec # models (e.g. so component can be frozen without its performance @@ -1827,7 +1841,9 @@ class Language: new_config = tok2vec_cfg["model"] if "replace_listener_cfg" in tok2vec_model.attrs: replace_func = tok2vec_model.attrs["replace_listener_cfg"] - new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]) + new_config = replace_func( + tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"] + ) util.set_dot_to_object(pipe_cfg, listener_path, new_config) # Go over the listener layers and replace them for listener in pipe_listeners: @@ -1866,8 +1882,11 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), - overrides: Dict[str, Any] = SimpleFrozenDict(), + self, + path: Union[str, Path], + *, + exclude: Iterable[str] = SimpleFrozenList(), + overrides: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the diff --git a/spacy/lookups.py b/spacy/lookups.py index f635f0dcf..025afa04b 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -12,9 +12,7 @@ from .strings import get_string_id UNSET = object() -def load_lookups( - lang: str, tables: List[str], strict: bool = True -) -> 'Lookups': +def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups": """Load the data from the spacy-lookups-data package for a given language, if available. Returns an empty `Lookups` container if there's no data or if the package is not installed. diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index daf47ef27..9b7628f0e 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,7 +1,7 @@ from .entity_linker import * # noqa from .multi_task import * # noqa from .parser import * # noqa -from .spancat import * # noqa +from .spancat import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a8c80df05..1c7f0ac8a 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -309,9 +309,7 @@ class EntityLinker(TrainablePipe): assert sent_index >= 0 # get n_neighbour sentences, clipped to the length of the document start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents - ) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() @@ -337,22 +335,16 @@ class EntityLinker(TrainablePipe): else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray( - [c.prior_prob for c in candidates] - ) + prior_probs = xp.asarray([c.prior_prob for c in candidates]) if not self.incl_prior: - prior_probs = xp.asarray( - [0.0 for _ in candidates] - ) + prior_probs = xp.asarray([0.0 for _ in candidates]) scores = prior_probs # add in similarity from the context if self.incl_context: entity_encodings = xp.asarray( [c.entity_vector for c in candidates] ) - entity_norm = xp.linalg.norm( - entity_encodings, axis=1 - ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) if len(entity_encodings) != len(prior_probs): raise RuntimeError( Errors.E147.format( @@ -361,14 +353,12 @@ class EntityLinker(TrainablePipe): ) ) # cosine similarity - sims = xp.dot( - entity_encodings, sentence_encoding_t - ) / (sentence_norm * entity_norm) + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm + ) if sims.shape != prior_probs.shape: raise ValueError(Errors.E161) - scores = ( - prior_probs + sims - (prior_probs * sims) - ) + scores = prior_probs + sims - (prior_probs * sims) # TODO: thresholding best_index = scores.argmax().item() best_candidate = candidates[best_index] diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 761ff12bf..ea14dae69 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -278,9 +278,7 @@ class EntityRuler(Pipe): if self == pipe: current_index = i break - subsequent_pipes = [ - pipe for pipe in self.nlp.pipe_names[current_index :] - ] + subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]] except ValueError: subsequent_pipes = [] with self.nlp.select_pipes(disable=subsequent_pipes): diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index f59d8ddc9..fdf6f9f5e 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -61,7 +61,7 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]: length = 0 for size in sizes: if size <= len(doc): - starts_size = starts[:len(doc) - (size - 1)] + starts_size = starts[: len(doc) - (size - 1)] spans.append(ops.xp.hstack((starts_size, starts_size + size))) length += spans[-1].shape[0] if spans: @@ -70,7 +70,7 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]: if len(spans) > 0: output = Ragged(ops.xp.vstack(spans), ops.asarray(lengths, dtype="i")) else: - output = Ragged(ops.xp.zeros((0,0)), ops.asarray(lengths, dtype="i")) + output = Ragged(ops.xp.zeros((0, 0)), ops.asarray(lengths, dtype="i")) assert output.dataXd.ndim == 2 return output diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 0d3bbdf35..72a6dcd61 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -299,7 +299,9 @@ class TextCategorizer(TrainablePipe): self._allow_extra_label() self.cfg["labels"].append(label) if self.model and "resize_output" in self.model.attrs: - self.model = self.model.attrs["resize_output"](self.model, len(self.cfg["labels"])) + self.model = self.model.attrs["resize_output"]( + self.model, len(self.cfg["labels"]) + ) self.vocab.strings.add(label) return 1 diff --git a/spacy/scorer.py b/spacy/scorer.py index 25df44f14..f4ccb2269 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -365,7 +365,9 @@ class Scorer: gold_spans.add(gold_span) gold_per_type[span.label_].add(gold_span) pred_per_type = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y(getter(pred_doc, attr), allow_overlap): + for span in example.get_aligned_spans_x2y( + getter(pred_doc, attr), allow_overlap + ): if labeled: pred_span = (span.label_, span.start, span.end - 1) else: @@ -381,10 +383,10 @@ class Scorer: score.score_set(pred_spans, gold_spans) # Assemble final result final_scores = { - f"{attr}_p": None, - f"{attr}_r": None, - f"{attr}_f": None, - } + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + } if labeled: final_scores[f"{attr}_per_type"] = None if len(score) > 0: @@ -392,7 +394,9 @@ class Scorer: final_scores[f"{attr}_r"] = score.recall final_scores[f"{attr}_f"] = score.fscore if labeled: - final_scores[f"{attr}_per_type"] = {k: v.to_dict() for k, v in score_per_type.items()} + final_scores[f"{attr}_per_type"] = { + k: v.to_dict() for k, v in score_per_type.items() + } return final_scores @staticmethod diff --git a/spacy/tests/lang/bg/test_text.py b/spacy/tests/lang/bg/test_text.py index 3d35ba997..63ae4ffd8 100644 --- a/spacy/tests/lang/bg/test_text.py +++ b/spacy/tests/lang/bg/test_text.py @@ -1,6 +1,7 @@ import pytest from spacy.lang.bg.lex_attrs import like_num + @pytest.mark.parametrize( "word,match", [ diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index b2f23f7fd..dc40e18a3 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -40,20 +40,21 @@ CONTRACTION_TESTS = [ ( "Päätimme ettemme tule.", ["Päätimme", "ett", "emme", "tule", "."], - ["päätimme", "että", "emme", "tule", "."] + ["päätimme", "että", "emme", "tule", "."], ), ( "Miksei puhuttaisi?", ["Miks", "ei", "puhuttaisi", "?"], - ["miksi", "ei", "puhuttaisi", "?"] + ["miksi", "ei", "puhuttaisi", "?"], ), ( "He tottelivat vaikkeivat halunneet", ["He", "tottelivat", "vaikk", "eivat", "halunneet"], - ["he", "tottelivat", "vaikka", "eivät", "halunneet"] + ["he", "tottelivat", "vaikka", "eivät", "halunneet"], ), ] + @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens): tokens = fi_tokenizer(text) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 36708edd0..dcbe1ff33 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -255,13 +255,23 @@ def test_matcher_with_alignments_nongreedy(en_vocab): (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]), (1, "baab", "b a* b", [[0, 1, 1, 2]]), (2, "aaab", "a a a b", [[0, 1, 2, 3]]), - (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]), + (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]), (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]), - (5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2] ]), + ( + 5, + "aabaa", + "a+ b a+", + [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]], + ), (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]), (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]), (8, "baab", "b a* b b*", [[0, 1, 1, 2]]), - (9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]), + ( + 9, + "aabb", + "a* b* a*", + [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]], + ), (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]), (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]), (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]), diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index eccfbf174..00617df56 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -557,7 +557,11 @@ def test_neg_annotation(neg_key): ner.add_label("PERSON") ner.add_label("ORG") example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) - example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")] + example.reference.spans[neg_key] = [ + Span(neg_doc, 2, 4, "ORG"), + Span(neg_doc, 2, 3, "PERSON"), + Span(neg_doc, 1, 4, "PERSON"), + ] optimizer = nlp.initialize() for i in range(2): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 13c8cb72e..78259be6a 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -254,7 +254,9 @@ def test_nel_nsents(nlp): """Test that n_sents can be set through the configuration""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert entity_linker.n_sents == 0 - entity_linker = nlp.replace_pipe("entity_linker", "entity_linker", config={"n_sents": 2}) + entity_linker = nlp.replace_pipe( + "entity_linker", "entity_linker", config={"n_sents": 2} + ) assert entity_linker.n_sents == 2 @@ -596,7 +598,9 @@ def test_kb_to_bytes(): kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3]) kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5]) - kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2]) + kb_1.add_alias( + alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2] + ) assert kb_1.contains_alias("Russ Cochran") kb_bytes = kb_1.to_bytes() kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) @@ -611,8 +615,12 @@ def test_kb_to_bytes(): assert kb_2.contains_alias("Russ Cochran") assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_alias_strings() == kb_2.get_alias_strings() - assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(kb_2.get_alias_candidates("Russ Cochran")) - assert len(kb_1.get_alias_candidates("Randomness")) == len(kb_2.get_alias_candidates("Randomness")) + assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( + kb_2.get_alias_candidates("Russ Cochran") + ) + assert len(kb_1.get_alias_candidates("Randomness")) == len( + kb_2.get_alias_candidates("Randomness") + ) def test_nel_to_bytes(): @@ -640,7 +648,9 @@ def test_nel_to_bytes(): kb_2 = nlp_2.get_pipe("entity_linker").kb assert kb_2.contains_alias("Russ Cochran") assert kb_2.get_vector("Q2146908") == [6, -4, 3] - assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8) + assert_almost_equal( + kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8 + ) def test_scorer_links(): diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index 302c307e2..e3fd28d0f 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -82,7 +82,9 @@ def util_batch_unbatch_docs_list( Y_batched = model.predict(in_data) Y_not_batched = [model.predict([u])[0] for u in in_data] for i in range(len(Y_batched)): - assert_almost_equal(OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4) + assert_almost_equal( + OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4 + ) def util_batch_unbatch_docs_array( diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index b28886925..f1f0c8a6e 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -351,9 +351,21 @@ def test_language_factories_invalid(): ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}), ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}), ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}), - ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}), - ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0, "f": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0}), + ( + [{"a": 0.0, "b": 0.0}, {"c": 0.0}], + {"c": 0.2}, + {"a": 0.0, "b": 0.0, "c": 1.0}, + ), + ( + [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], + {"a": 0.0, "b": 0.0}, + {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}, + ), + ( + [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], + {"a": 0.0, "b": 0.0, "f": 0.0}, + {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0}, + ), ], ) def test_language_factories_combine_score_weights(weights, override, expected): diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 0b84db4c0..e530cb5c4 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -446,7 +446,12 @@ def test_update_with_annotates(): for text in texts: examples.append(Example(nlp.make_doc(text), nlp.make_doc(text))) - for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]: + for components_to_annotate in [ + [], + [f"{name}1"], + [f"{name}1", f"{name}2"], + [f"{name}2", f"{name}1"], + ]: for key in results: results[key] = "" nlp = English(vocab=nlp.vocab) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index d5e5db63c..f70df7478 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -79,10 +79,7 @@ def test_ngram_suggester(en_tokenizer): assert spans.shape[0] == len(spans_set) offset += ngrams.lengths[i] # the number of spans is correct - assert_equal( - ngrams.lengths, - [max(0, len(doc) - (size - 1)) for doc in docs] - ) + assert_equal(ngrams.lengths, [max(0, len(doc) - (size - 1)) for doc in docs]) # test 1-3-gram suggestions ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[1, 2, 3]) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 6f1d22eba..fdb44b412 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -131,7 +131,7 @@ def test_implicit_label(name, get_examples): nlp.initialize(get_examples=get_examples(nlp)) -#fmt: off +# fmt: off @pytest.mark.parametrize( "name,textcat_config", [ @@ -150,7 +150,7 @@ def test_implicit_label(name, get_examples): ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], ) -#fmt: on +# fmt: on def test_no_resize(name, textcat_config): """The old textcat architectures weren't resizable""" nlp = Language() @@ -165,7 +165,7 @@ def test_no_resize(name, textcat_config): textcat.add_label("NEUTRAL") -#fmt: off +# fmt: off @pytest.mark.parametrize( "name,textcat_config", [ @@ -179,7 +179,7 @@ def test_no_resize(name, textcat_config): ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], ) -#fmt: on +# fmt: on def test_resize(name, textcat_config): """The new textcat architectures are resizable""" nlp = Language() @@ -194,7 +194,7 @@ def test_resize(name, textcat_config): assert textcat.model.maybe_get_dim("nO") in [3, None] -#fmt: off +# fmt: off @pytest.mark.parametrize( "name,textcat_config", [ @@ -208,7 +208,7 @@ def test_resize(name, textcat_config): ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], ) -#fmt: on +# fmt: on def test_resize_same_results(name, textcat_config): # Ensure that the resized textcat classifiers still produce the same results for old labels fix_random_seed(0) @@ -511,7 +511,9 @@ def test_textcat_threshold(): macro_f = scores["cats_score"] assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 - scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"}) + scores = nlp.evaluate( + train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"} + ) pos_f = scores["cats_score"] assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 assert pos_f > macro_f diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 809a79dd6..eeea906bb 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -129,8 +129,14 @@ cfg_string = """ """ TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}), - ("Eat blue ham", {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}), + ( + "I like green eggs", + {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}, + ), + ( + "Eat blue ham", + {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}, + ), ] @@ -405,5 +411,5 @@ def test_tok2vec_listeners_textcat(): cats1 = docs[1].cats assert cats1["preference"] > 0.1 assert cats1["imperative"] < 0.9 - assert([t.tag_ for t in docs[0]] == ["V", "J", "N"]) - assert([t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]) + assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] + assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py index 3007f1dc6..f57e4085c 100644 --- a/spacy/tests/regression/test_issue6501-7000.py +++ b/spacy/tests/regression/test_issue6501-7000.py @@ -152,7 +152,8 @@ labels = ['label1', 'label2'] @pytest.mark.parametrize( - "component_name", ["textcat", "textcat_multilabel"], + "component_name", + ["textcat", "textcat_multilabel"], ) def test_issue6908(component_name): """Test intializing textcat with labels in a list""" diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py index 541144877..e94a975d4 100644 --- a/spacy/tests/regression/test_issue7056.py +++ b/spacy/tests/regression/test_issue7056.py @@ -8,8 +8,7 @@ def test_issue7056(): sentence segmentation errors.""" vocab = Vocab() ae = ArcEager( - vocab.strings, - ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) ) doc = Doc(vocab, words="Severe pain , after trauma".split()) state = ae.init_batch([doc])[0] diff --git a/spacy/tests/regression/test_issue7062.py b/spacy/tests/regression/test_issue7062.py index 88e5d2520..66bf09523 100644 --- a/spacy/tests/regression/test_issue7062.py +++ b/spacy/tests/regression/test_issue7062.py @@ -41,7 +41,7 @@ def test_partial_links(): nlp.add_pipe("sentencizer", first=True) patterns = [ {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, - {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]} + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, ] ruler = nlp.add_pipe("entity_ruler", before="entity_linker") ruler.add_patterns(patterns) diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py index 63d36552a..d40763c63 100644 --- a/spacy/tests/regression/test_issue7065.py +++ b/spacy/tests/regression/test_issue7065.py @@ -8,7 +8,17 @@ def test_issue7065(): nlp = English() nlp.add_pipe("sentencizer") ruler = nlp.add_pipe("entity_ruler") - patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}] + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] ruler.add_patterns(patterns) doc = nlp(text) @@ -28,11 +38,15 @@ def test_issue7065_b(): text = "Mahler 's Symphony No. 8 was beautiful." entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = {(0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}} + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] doc = nlp(text) - example = Example.from_dict(doc, {"entities": entities, "links": links, "sent_starts": sent_starts}) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) train_examples = [example] def create_kb(vocab): @@ -65,7 +79,15 @@ def test_issue7065_b(): # Add a custom rule-based component to mimick NER patterns = [ {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - {"label": "WORK", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]} + { + "label": "WORK", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + }, ] ruler = nlp.add_pipe("entity_ruler", before="entity_linker") ruler.add_patterns(patterns) diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py index cf5a9fc7a..fbddf643c 100644 --- a/spacy/tests/regression/test_issue8168.py +++ b/spacy/tests/regression/test_issue8168.py @@ -1,11 +1,22 @@ from spacy.lang.en import English + def test_issue8168(): nlp = English() ruler = nlp.add_pipe("entity_ruler") - patterns = [{"label": "ORG", "pattern": "Apple"}, - {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}, - {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}] + patterns = [ + {"label": "ORG", "pattern": "Apple"}, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], + "id": "san-francisco", + }, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], + "id": "san-francisco", + }, + ] ruler.add_patterns(patterns) - assert ruler._ent_ids == {8043148519967183733: ('GPE', 'san-francisco')} \ No newline at end of file + assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py index 800a1638d..6ddbe53e0 100644 --- a/spacy/tests/regression/test_issue8190.py +++ b/spacy/tests/regression/test_issue8190.py @@ -9,20 +9,13 @@ def test_issue8190(): "nlp": { "lang": "en", }, - "custom": { - "key": "value" - } - + "custom": {"key": "value"}, } source_nlp = English.from_config(source_cfg) with make_tempdir() as dir_path: # We need to create a loadable source pipeline source_path = dir_path / "test_model" source_nlp.to_disk(source_path) - nlp = spacy.load(source_path, config={ - "custom": { - "key": "updated_value" - } - }) + nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 2cd0e4ab6..114d4865c 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -4,7 +4,12 @@ import spacy from spacy.lang.en import English from spacy.lang.de import German from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH -from spacy.util import registry, load_model_from_config, load_config, load_config_from_str +from spacy.util import ( + registry, + load_model_from_config, + load_config, + load_config_from_str, +) from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder from spacy.schemas import ConfigSchema, ConfigSchemaPretrain @@ -493,4 +498,4 @@ def test_hyphen_in_config(): self.punctuation = punctuation nlp = English.from_config(load_config_from_str(hyphen_config_str)) - assert nlp.get_pipe("my_punctual_component").punctuation == ['?', '-'] + assert nlp.get_pipe("my_punctual_component").punctuation == ["?", "-"] diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 5ce2549aa..e51c7f45b 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -64,7 +64,9 @@ def test_serialize_doc_span_groups(en_vocab): def test_serialize_doc_bin(): - doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True) + doc_bin = DocBin( + attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True + ) texts = ["Some text", "Lots of texts...", "..."] cats = {"A": 0.5} nlp = English() diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index c9e451471..26eabd4e5 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -5,7 +5,6 @@ from catalogue import RegistryError def test_get_architecture(): - @registry.architectures("my_test_function") def create_model(nr_in, nr_out): return Linear(nr_in, nr_out) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 916247d4d..57ec4bbb8 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -143,7 +143,9 @@ def sample_vectors(): @pytest.fixture def nlp2(nlp, sample_vectors): - Language.component("test_language_vector_modification_pipe", func=vector_modification_pipe) + Language.component( + "test_language_vector_modification_pipe", func=vector_modification_pipe + ) Language.component("test_language_userdata_pipe", func=userdata_pipe) Language.component("test_language_ner_pipe", func=ner_pipe) add_vecs_to_vocab(nlp.vocab, sample_vectors) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index c044d8afe..16cc97f6d 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -444,7 +444,9 @@ def test_score_spans(): assert f"{key}_per_type" in scores # Discard labels from the evaluation - scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False) + scores = Scorer.score_spans( + [eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False + ) assert scores[f"{key}_p"] == 1.0 assert scores[f"{key}_r"] == 1.0 assert f"{key}_per_type" not in scores @@ -467,4 +469,6 @@ def test_prf_score(): assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333)) a += b - assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore)) \ No newline at end of file + assert (a.precision, a.recall, a.fscore) == approx( + (c.precision, c.recall, c.fscore) + ) diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index bd8810a5c..8ee54b544 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -278,7 +278,9 @@ def test_pretraining_training(): filled = filled.interpolate() P = filled["pretraining"] nlp_base = init_nlp(filled) - model_base = nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") + model_base = ( + nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") + ) embed_base = None for node in model_base.walk(): if node.name == "hashembed": @@ -331,11 +333,12 @@ def write_sample_training(tmp_dir): def write_vectors_model(tmp_dir): import numpy + vocab = Vocab() vector_data = { "dog": numpy.random.uniform(-1, 1, (300,)), "cat": numpy.random.uniform(-1, 1, (300,)), - "orange": numpy.random.uniform(-1, 1, (300,)) + "orange": numpy.random.uniform(-1, 1, (300,)), } for word, vector in vector_data.items(): vocab.set_vector(word, vector) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 0ea5f0fcc..cd428be15 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -434,8 +434,14 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer): gold_doc = nlp.make_doc(text) spans = [] prefix = "I flew to " - spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY")) - spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY")) + spans.append( + gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY") + ) + spans.append( + gold_doc.char_span( + len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY" + ) + ) spans_key = "overlap_ents" gold_doc.spans[spans_key] = spans example = Example(doc, gold_doc) @@ -443,7 +449,9 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer): assert [(ent.start, ent.end) for ent in spans_gold] == [(3, 5), (3, 6)] # Ensure that 'get_aligned_spans_y2x' has the aligned entities correct - spans_y2x_no_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=False) + spans_y2x_no_overlap = example.get_aligned_spans_y2x( + spans_gold, allow_overlap=False + ) assert [(ent.start, ent.end) for ent in spans_y2x_no_overlap] == [(3, 5)] spans_y2x_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=True) assert [(ent.start, ent.end) for ent in spans_y2x_overlap] == [(3, 5), (3, 6)] diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 37d48ad0f..8a7dd22c3 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -12,6 +12,7 @@ from ..util import add_vecs_to_vocab, get_cosine, make_tempdir OPS = get_current_ops() + @pytest.fixture def strings(): return ["apple", "orange"] diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index e9fa86c83..e79ba79b0 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -66,7 +66,11 @@ def configure_minibatch_by_words( """ optionals = {"get_length": get_length} if get_length is not None else {} return partial( - minibatch_by_words, size=size, tolerance=tolerance, discard_oversize=discard_oversize, **optionals + minibatch_by_words, + size=size, + tolerance=tolerance, + discard_oversize=discard_oversize, + **optionals ) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 36384d67b..c1fda9181 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -70,14 +70,18 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": nlp._link_components() with nlp.select_pipes(disable=[*frozen_components, *resume_components]): if T["max_epochs"] == -1: - logger.debug("Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels") + logger.debug( + "Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels" + ) nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) logger.info(f"Initialized pipeline components: {nlp.pipe_names}") # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: - for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer + for listener in getattr( + proc, "listening_components", [] + ): # e.g. tok2vec/transformer # Don't warn about components not in the pipeline if listener not in nlp.pipe_names: continue diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 85aa458f0..0c4aba7e3 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -96,8 +96,7 @@ def train( stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") if annotating_components: stdout.write( - msg.info(f"Set annotations on update for: {annotating_components}") - + "\n" + msg.info(f"Set annotations on update for: {annotating_components}") + "\n" ) stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): diff --git a/spacy/util.py b/spacy/util.py index 984445d81..421287ce2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -57,13 +57,13 @@ if TYPE_CHECKING: from .vocab import Vocab # noqa: F401 +# fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. -# fmt: off CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] # fmt: on @@ -649,8 +649,7 @@ def get_model_version_range(spacy_version: str) -> str: def get_model_lower_version(constraint: str) -> Optional[str]: - """From a version range like >=1.2.3,<1.3.0 return the lower pin. - """ + """From a version range like >=1.2.3,<1.3.0 return the lower pin.""" try: specset = SpecifierSet(constraint) for spec in specset: