diff --git a/spacy/gold/batchers.py b/spacy/gold/batchers.py
index c15b88502..ec1f35815 100644
--- a/spacy/gold/batchers.py
+++ b/spacy/gold/batchers.py
@@ -1,4 +1,4 @@
-from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
+from typing import Union, Iterable, Sequence, TypeVar, List, Callable
 from typing import Optional, Any
 from functools import partial
 import itertools
@@ -20,7 +20,7 @@ def configure_minibatch_by_padded_size(
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the `batch_by_padded_size` strategy.
-    
+
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
@@ -92,7 +92,7 @@ def minibatch_by_padded_size(
 ) -> Iterable[List[ItemT]]:
     """Minibatch a sequence by the size of padded batches that would result,
     with sequences binned by length within a window.
-    
+
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
@@ -123,7 +123,11 @@ def minibatch_by_padded_size(
 
 
 def minibatch_by_words(
-    seqs: Iterable[ItemT], size: Sizing, tolerance=0.2, discard_oversize=False, get_length=len
+    seqs: Iterable[ItemT],
+    size: Sizing,
+    tolerance=0.2,
+    discard_oversize=False,
+    get_length=len,
 ) -> Iterable[List[ItemT]]:
     """Create minibatches of roughly a given number of words. If any examples
     are longer than the specified batch length, they will appear in a batch by
diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py
index b8bef39b9..be389f117 100644
--- a/spacy/lang/en/lemmatizer.py
+++ b/spacy/lang/en/lemmatizer.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from ...pipeline import Lemmatizer
 from ...tokens import Token
 
diff --git a/spacy/language.py b/spacy/language.py
index 96661915a..85aac15ef 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
-from .lookups import load_lookups
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
@@ -1439,10 +1438,7 @@ class Language:
             or lang_cls is not cls
         ):
             raise ValueError(Errors.E943.format(value=type(lang_cls)))
-        nlp = lang_cls(
-            vocab=vocab,
-            create_tokenizer=create_tokenizer,
-        )
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):
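Review note (not part of the patch): the `spacy/gold/batchers.py` docstrings in this patch define the padded size as the maximum sequence length within the batch multiplied by the number of sequences. A minimal sketch of that quantity, with `padded_size` as a hypothetical helper and assuming only that `get_length` defaults to `len` as in the `minibatch_by_words` signature above:

```python
def padded_size(batch, get_length=len):
    # Cost of a batch once every sequence is padded to the longest one:
    # max length within the batch times the number of sequences.
    return max(get_length(seq) for seq in batch) * len(batch)

assert padded_size([[1, 2, 3], [1], [1, 2]]) == 9  # 3 (longest) * 3 (sequences)
```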
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 54b7987ff..c9f0a99e9 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -34,11 +34,11 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
 class Tok2Vec(Pipe):
     """Apply a "token-to-vector" model and set its outputs in the doc.tensor
     attribute. This is mostly useful to share a single subnetwork between multiple
-    components, e.g. to have one embedding and CNN network shared between a 
+    components, e.g. to have one embedding and CNN network shared between a
     parser, tagger and NER.
 
     In order to use the `Tok2Vec` predictions, subsequent components should use
-    the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This 
+    the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
     layer will read data from the `doc.tensor` attribute during prediction.
     During training, the `Tok2Vec` component will save its prediction and backprop
     callback for each batch, so that the subsequent components can backpropagate
@@ -46,6 +46,7 @@ class Tok2Vec(Pipe):
     avoid relying on object identity within the models to achieve the parameter
     sharing.
     """
+
     def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
         """Initialize a tok2vec component.
 
@@ -239,6 +240,7 @@ class Tok2VecListener(Model):
     from the Tok2Vec component into downstream components, and communicating
     gradients back upstream.
     """
+
     name = "tok2vec-listener"
 
     def __init__(self, upstream_name: str, width: int) -> None:
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 4a81d39d0..d77881ad0 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -426,7 +426,7 @@ class Scorer:
             f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
-            positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
+            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
             results[f"{attr}_score"] = positive_label_f
             results[f"{attr}_score_desc"] = f"F ({positive_label})"
         elif not multi_label:
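Review note (not part of the patch): per the `spacy/scorer.py` hunk above, when a non-multilabel task has exactly two labels and a `positive_label` is configured, the summary `{attr}_score` collapses to that label's F. For reference, a sketch of the standard F computation this relies on; this is illustrative, not spaCy's internal code:

```python
def f_score(tp: int, fp: int, fn: int) -> float:
    # Standard F1: harmonic mean of precision and recall.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    total = precision + recall
    return 2 * precision * recall / total if total else 0.0

assert abs(f_score(tp=8, fp=2, fn=2) - 0.8) < 1e-9
```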
diff --git a/spacy/tests/morphology/test_morph_pickle.py b/spacy/tests/morphology/test_morph_pickle.py
index 0758a6c01..d9b0e3476 100644
--- a/spacy/tests/morphology/test_morph_pickle.py
+++ b/spacy/tests/morphology/test_morph_pickle.py
@@ -15,5 +15,7 @@ def morphology():
 def test_morphology_pickle_roundtrip(morphology):
     b = pickle.dumps(morphology)
     reloaded_morphology = pickle.loads(b)
-    assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
-    assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
+    feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
+    assert feat == "Feat1=Val1|Feat2=Val2"
+    feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
+    assert feat == "Feat3=Val3|Feat4=Val4"
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 60ba5246f..0ffe74273 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -144,8 +144,7 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
     doc1 = nlp1("I live in New York")
-    config = {
-    }
+    config = {}
     ner1 = nlp1.create_pipe("ner", config=config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -164,8 +163,7 @@ def test_accept_blocked_token():
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    config = {
-    }
+    config = {}
     ner2 = nlp2.create_pipe("ner", config=config)
 
     # set "New York" to a blocked entity
@@ -220,8 +218,7 @@ def test_overwrite_token():
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
     # Check that a new ner can overwrite O
-    config = {
-    }
+    config = {}
     ner2 = nlp.create_pipe("ner", config=config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 644fa0f01..8a70fdeeb 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -1,8 +1,7 @@
 import pytest
-
 from spacy import util, registry
 from spacy.lang.en import English
-from spacy.lookups import Lookups, load_lookups
+from spacy.lookups import Lookups
 from ..util import make_tempdir
 
 
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5f27a0afa..1af4a5121 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,10 +1,8 @@
 import pytest
-
 from spacy import util
 from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.symbols import POS, NOUN
 from ..util import make_tempdir
 
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 363a16a11..17add7391 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -117,9 +117,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
 
     # Test scoring
-    scores = nlp.evaluate(
-        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-    )
+    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
     assert scores["cats_micro_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index b642ca229..5c93ea3c8 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -1,11 +1,9 @@
 import pytest
 import random
-
 from spacy import util
 from spacy.gold import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.lookups import Lookups
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index 0ac895546..d6a4600e3 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -6,8 +6,7 @@ from spacy.lang.en import English
 from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
-from spacy.lookups import Lookups
-from spacy.symbols import ORTH, LEMMA, POS, VERB
+from spacy.symbols import ORTH, LEMMA, POS
 
 
 def test_issue1061():
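Review note (not part of the patch): several hunks in this patch, in `test_ner.py` above and `test_issue1967` below, collapse an empty `config` literal onto one line. For reference, the pattern those tests exercise, as used verbatim in the diff:

```python
from spacy.lang.en import English

nlp = English()
config = {}  # empty config: the pipe is created with its default settings
ner = nlp.create_pipe("ner", config=config)
ner.add_label("GPE")
```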
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index eee22c93d..4988575ea 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -271,8 +271,7 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     nlp = Language()
-    config = {
-    }
+    config = {}
     ner = nlp.create_pipe("ner", config=config)
     example = Example.from_dict(
         Doc(ner.vocab, words=["word"]),
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index e42779ad7..de554a5ec 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
 
     with doc.retokenize() as retokenizer:
         heads = [(doc[3], 1), doc[2]]
-        attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
+        attrs = {
+            "POS": ["PROPN", "PROPN"],
+            "LEMMA": ["New", "York"],
+            "DEP": ["pobj", "compound"],
+        }
         retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
 
     gold_text = ["I", "live", "in", "New", "York", "right", "now"]
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index ad577cbe5..423015106 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -138,8 +138,7 @@ def test_issue4042_bug2():
     if not output_dir.exists():
         output_dir.mkdir()
     ner1.to_disk(output_dir)
-    config = {
-    }
+    config = {}
     ner2 = nlp1.create_pipe("ner", config=config)
     ner2.from_disk(output_dir)
     assert len(ner2.labels) == 2
@@ -301,8 +300,7 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    config = {
-    }
+    config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])
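Review note (not part of the patch): the final hunks only reflow test code around NER serialization and beam search. For context, a minimal sketch of the `to_disk`/`from_disk` roundtrip that `test_issue4042_bug2` relies on; this is an outline under the same nightly API, not a verbatim excerpt, and the temporary path is illustrative:

```python
from pathlib import Path
from spacy.lang.en import English

nlp = English()
ner1 = nlp.create_pipe("ner", config={})
ner1.add_label("SOME_LABEL")
ner1.begin_training([])  # initialize the model before serializing

output_dir = Path("/tmp/ner_roundtrip")  # illustrative path
output_dir.mkdir(exist_ok=True)
ner1.to_disk(output_dir)

ner2 = nlp.create_pipe("ner", config={})
ner2.from_disk(output_dir)
assert set(ner2.labels) == set(ner1.labels)  # labels survive the roundtrip
```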