diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c67863ef1..040434c05 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -6,7 +6,6 @@ from wasabi import msg import srsly import hashlib import typer -import subprocess from click import NoSuchOption from typer.main import get_command from contextlib import contextmanager @@ -327,7 +326,7 @@ def git_checkout( ) with make_tempdir() as tmp_dir: cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" - ret = run_command(cmd, capture=True) + run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) diff --git a/spacy/language.py b/spacy/language.py index 7d463731a..4dffd9679 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -156,11 +156,7 @@ class Language: raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - vocab = create_vocab( - self.lang, - self.Defaults, - vectors_name=vectors_name, - ) + vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -1462,7 +1458,7 @@ class Language: # here :( for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): - for name2, proc2 in self.pipeline[i+1:]: + for name2, proc2 in self.pipeline[i + 1 :]: if isinstance(getattr(proc2, "model", None), Model): proc1.find_listeners(proc2.model) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 7ced4bd04..fec478e21 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -164,7 +164,9 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool): +def CharacterEmbed( + width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool +): """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is @@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect ), StaticVectors(width, dropout=0.0), ), - with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)), + with_array( + Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) + ), ragged2list(), - ) + ) else: model = chain( concatenate( @@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), ), ), - with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + with_array( + Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) + ), ragged2list(), - ) + ) return model diff --git a/spacy/schemas.py b/spacy/schemas.py index 60655da8c..b0f26dcd7 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple +from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator @@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel): batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") - + # TODO: use a more detailed schema for this? objective: Dict[str, Any] = Field(..., title="Pretraining objective") # fmt: on diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index ad4f49042..0c538a0eb 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[2, 1, 1, 0], + deps=["dep"] * 4, + ) lca = doc[:2].get_lca_matrix() assert lca.shape == (2, 2) assert lca[0, 0] == 0 # the & the -> the diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index db1e98ba0..f181a799a 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=["dep"] * len(heads), + ) lefts = {} rights = {} diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 1cf06d97f..881460704 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -345,10 +345,7 @@ def test_language_factories_invalid(): [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), - ( - [{"a": 0.5, "b": 0.5}, {"b": 1.0}], - {"a": 0.25, "b": 0.75}, - ), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},), ], ) def test_language_factories_combine_score_weights(weights, expected): @@ -363,16 +360,10 @@ def test_language_factories_scores(): weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} Language.factory( - f"{name}1", - scores=list(weights1), - default_score_weights=weights1, - func=func, + f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, ) Language.factory( - f"{name}2", - scores=list(weights2), - default_score_weights=weights2, - func=func, + f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func, ) meta1 = Language.get_factory_meta(f"{name}1") assert meta1.default_score_weights == weights1 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index e226c8524..71ed2ea03 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -212,9 +212,17 @@ def test_issue1834(): heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], deps=["dep"] * len(words), ) - print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc]) + print( + doc.has_annotation("DEP"), + [t.head.i for t in doc], + [t.is_sent_start for t in doc], + ) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc]) + print( + new_doc.has_annotation("DEP"), + [t.head.i for t in new_doc], + [t.is_sent_start for t in new_doc], + ) assert new_doc[6].sent_start assert new_doc.has_annotation("DEP") assert new_doc.has_annotation("TAG") diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index eedad31e0..d1c4553be 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None) + textcat = TextCategorizer( + en_vocab, + model, + labels=["ENTITY", "ACTION", "MODIFIER"], + threshold=0.5, + positive_label=None, + ) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0a2300455..422ae74b4 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -3,7 +3,6 @@ from click import NoSuchOption from spacy.training import docs_to_json, biluo_tags_from_offsets from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs -from spacy.lang.en import English from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 2a24d368a..da46ad424 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -291,8 +291,7 @@ def test_spacy_blank(): @pytest.mark.parametrize( - "value", - [False, None, ["x", "y"], Language, Vocab], + "value", [False, None, ["x", "y"], Language, Vocab], ) def test_language_init_invalid_vocab(value): err_fragment = "invalid value" diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 8c931d31e..1668991cd 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -95,7 +95,7 @@ def test_util_dot_section(): assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False # Test that default values got overwritten assert en_config["nlp"]["pipeline"] == ["textcat"] - assert nl_config["nlp"]["pipeline"] == [] # default value [] + assert nl_config["nlp"]["pipeline"] == [] # default value [] # Test proper functioning of 'dot_to_object' with pytest.raises(KeyError): dot_to_object(en_config, "nlp.pipeline.tagger") diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 898746c2a..d20a032e8 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,7 +1,6 @@ from typing import Dict, Iterable, Callable import pytest from thinc.api import Config - from spacy import Language from spacy.util import load_model_from_config, registry, dot_to_object from spacy.training import Example @@ -10,19 +9,19 @@ from spacy.training import Example def test_readers(): config_string = """ [training] - + [corpora] @readers = "myreader.v1" [nlp] lang = "en" pipeline = ["tok2vec", "textcat"] - + [components] - + [components.tok2vec] factory = "tok2vec" - + [components.textcat] factory = "textcat" """ @@ -69,19 +68,19 @@ def test_readers(): def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] - + [corpora] @readers = "PLACEHOLDER" [nlp] lang = "en" pipeline = ["tok2vec", "textcat"] - + [components] - + [components.tok2vec] factory = "tok2vec" - + [components.textcat] factory = "textcat" """ diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 1d3c72a8b..b09487965 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -34,7 +34,17 @@ def doc(): # fmt: on nlp = English() words = [t.text for t in nlp.make_doc(text)] - doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents) + doc = get_doc( + nlp.vocab, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + heads=heads, + deps=deps, + lemmas=lemmas, + ents=ents, + ) doc.cats = cats return doc