Tidy up and auto-format

Mirror of https://github.com/explosion/spaCy.git
commit e68459296d
parent 2a4d56e730
@@ -7,8 +7,6 @@ import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 from .. import util
-from ..lang.en import English
-from ..util import dot_to_object


 @debug_cli.command("model")

@@ -130,8 +128,8 @@ def _sentences():
     ]


-def _get_docs():
-    nlp = English()
+def _get_docs(lang: str = "en"):
+    nlp = util.get_lang_class(lang)()
     return list(nlp.pipe(_sentences()))

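Note: the debug helper now builds its example docs from a configurable language code instead of hard-coding `English`. A minimal sketch of the same pattern, assuming only that `spacy.util.get_lang_class` looks up a `Language` subclass by its code (as the new line above does); the helper name and sample text are illustrative:

    from spacy import util

    def make_blank_docs(texts, lang: str = "en"):
        # Look up the Language subclass for the given code, instantiate a
        # blank pipeline, and tokenize the texts with it.
        nlp = util.get_lang_class(lang)()
        return list(nlp.pipe(texts))

    docs = make_blank_docs(["This is a sentence."], lang="en")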
@@ -1,5 +1,4 @@
 from typing import Optional, List, Dict
-from timeit import default_timer as timer
 from wasabi import Printer
 from pathlib import Path
 import re

@@ -1,7 +1,6 @@
 from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import tqdm
 import re
 import shutil
 import requests

@@ -1,14 +1,8 @@
-from .corpus import Corpus
-from .example import Example
-from .align import Alignment
-
-from .iob_utils import iob_to_biluo, biluo_to_iob
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
-from .iob_utils import spans_from_biluo_tags
-from .iob_utils import tags_to_entities
-
-from .gold_io import docs_to_json
-from .gold_io import read_json_file
-
-
-from .batchers import minibatch_by_padded_size, minibatch_by_words
+from .corpus import Corpus  # noqa: F401
+from .example import Example  # noqa: F401
+from .align import Alignment  # noqa: F401
+from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
+from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
+from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .gold_io import docs_to_json, read_json_file  # noqa: F401
+from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
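Note: these `__init__.py` re-exports exist only so callers can import the names from the package, so each line carries `# noqa: F401` to silence flake8's "imported but unused" warning. The same convention in a hypothetical package, for illustration:

    # mypkg/__init__.py (hypothetical)
    # Re-export the public API from a private submodule; the noqa comment
    # tells flake8 the "unused" imports are intentional.
    from ._tokenize import tokenize, detokenize  # noqa: F401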
@@ -3,7 +3,6 @@ from typing import Optional, Any
 from functools import partial
 import itertools

-from .example import Example
 from ..util import registry, minibatch

@@ -41,16 +40,13 @@ def configure_minibatch_by_words(
 ) -> BatcherT:
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
-        minibatch_by_words,
-        size=size,
-        discard_oversize=discard_oversize,
-        **optionals
+        minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
     )


 @registry.batchers("batch_by_sequence.v1")
 def configure_minibatch(size: Sizing, get_length=None) -> BatcherT:
-    optionals = ({"get_length": get_length} if get_length is not None else {})
+    optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(minibatch, size=size, **optionals)

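Note: both registered factories return a partially applied batching function rather than computing batches up front; the registry call binds the configuration, and the returned callable is applied to the data later. A minimal sketch of that idea with a hypothetical `batch_by_items` helper (not spaCy's real batchers):

    from functools import partial
    from typing import Callable, Iterable, List, TypeVar

    ItemT = TypeVar("ItemT")
    BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]

    def batch_by_items(items: Iterable[ItemT], size: int) -> Iterable[List[ItemT]]:
        # Yield fixed-size lists of items; the last batch may be smaller.
        batch: List[ItemT] = []
        for item in items:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch

    def configure_batch_by_items(size: int) -> BatcherT:
        # Bind the config now, run the batching later, exactly like the
        # partial(...) returned by the registered factories above.
        return partial(batch_by_items, size=size)

    batcher = configure_batch_by_items(size=3)
    print(list(batcher(range(7))))  # [[0, 1, 2], [3, 4, 5], [6]]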
@@ -1,4 +1,4 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .json2docs import json2docs
+from .json2docs import json2docs  # noqa: F401
 from .conllu2docs import conllu2docs  # noqa: F401

@@ -1,6 +1,5 @@
-from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable, Tuple
+from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
 from pathlib import Path
-import random

 from .. import util
 from .example import Example
@@ -25,7 +24,7 @@ class Corpus:

     path (Path): The directory or filename to read from.
     gold_preproc (bool): Whether to set up the Example object with gold-standard
         sentences and tokens for the predictions. Gold preprocessing helps
         the annotations align to the tokenization, and may result in sequences
         of more consistent length. However, it may reduce run-time accuracy due
         to train/test skew. Defaults to False.
@@ -39,7 +38,12 @@ class Corpus:
     """

     def __init__(
-        self, path, *, limit: int = 0, gold_preproc: bool = False, max_length: bool = False,
+        self,
+        path,
+        *,
+        limit: int = 0,
+        gold_preproc: bool = False,
+        max_length: bool = False,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc

@@ -80,7 +80,7 @@ def _get_transition_table(
     B_start, B_end = (0, n_labels)
     I_start, I_end = (B_end, B_end + n_labels)
     L_start, L_end = (I_end, I_end + n_labels)
-    U_start, _ = (L_end, L_end + n_labels)
+    U_start, _ = (L_end, L_end + n_labels)  # noqa: F841
     # Using ranges allows us to set specific cells, which is necessary to express
     # that only actions of the same label are valid continuations.
     B_range = numpy.arange(B_start, B_end)
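Note: the flagged line keeps an intentionally unused local (hence `# noqa: F841`), but the surrounding layout is the interesting part: each action type (B, I, L, U) owns a consecutive block of `n_labels` columns in the transition table, and `numpy.arange` turns each block into addressable indices. A small illustrative sketch of that layout, independent of the parser internals:

    import numpy

    n_labels = 3  # e.g. PERSON, ORG, GPE

    # Consecutive blocks of n_labels indices: B, then I, then L, then U.
    B_start, B_end = 0, n_labels
    I_start, I_end = B_end, B_end + n_labels
    L_start, L_end = I_end, I_end + n_labels
    U_start, U_end = L_end, L_end + n_labels

    # numpy.arange gives the column indices of each block, so "the I action
    # for label k" can be addressed as I_range[k].
    B_range = numpy.arange(B_start, B_end)
    I_range = numpy.arange(I_start, I_end)
    print(B_range, I_range)  # [0 1 2] [3 4 5]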
@@ -17,9 +17,7 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
 AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]


-@Language.factory(
-    "attribute_ruler",
-)
+@Language.factory("attribute_ruler")
 def make_attribute_ruler(
     nlp: Language,
     name: str,
@@ -58,7 +56,7 @@ class AttributeRuler(Pipe):
         self.vocab = vocab
         self.matcher = Matcher(self.vocab)
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []

         if pattern_dicts:
@@ -1,17 +1,23 @@
 from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type
-from typing import Iterable, TypeVar
+from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic import root_validator
 from collections import defaultdict
 from thinc.api import Optimizer
-from pathlib import Path

 from .attrs import NAMES

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
+    from .gold import Example  # noqa: F401
+
+
 ItemT = TypeVar("ItemT")
 Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
+Reader = Callable[["Language", str], Iterable["Example"]]


 def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
@@ -183,7 +189,6 @@ class ModelMetaSchema(BaseModel):
 # check that against this schema in the test suite to make sure it's always
 # up to date.

-Reader = Callable[["Language", str], Iterable["Example"]]

 class ConfigSchemaTraining(BaseModel):
     # fmt: off

@@ -209,7 +214,6 @@ class ConfigSchemaTraining(BaseModel):
         extra = "forbid"
         arbitrary_types_allowed = True

-    #eval_batch_size: StrictInt = Field(..., title="Evaluation batch size")

 class ConfigSchemaNlp(BaseModel):
     # fmt: off
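Note: the imports added above under `if TYPE_CHECKING:` give the schema annotations access to `Language` and `Example` without importing them at runtime, which would otherwise create a circular import; the string forward references such as `"Language"` are resolved only by type checkers. A minimal sketch of the pattern with two hypothetical modules:

    # models.py (hypothetical module that registry.py also imports)
    from typing import TYPE_CHECKING, List

    if TYPE_CHECKING:
        # Evaluated by mypy/pyright only, never at runtime, so the circular
        # import between models.py and registry.py is avoided.
        from registry import Registry  # noqa: F401


    def collect_names(registry: "Registry") -> List[str]:
        # "Registry" is a forward reference; at runtime it is just a string.
        return list(registry.names)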
@@ -291,6 +291,6 @@ def test_span_boundaries(doc):
     for i in range(start, end):
         assert span[i - start] == doc[i]
     with pytest.raises(IndexError):
-        _ = span[-5]
+        span[-5]
     with pytest.raises(IndexError):
-        _ = span[5]
+        span[5]
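Note: dropping the `_ =` assignment leaves a bare indexing expression inside `pytest.raises`, which is all the assertion needs; the throwaway variable would only have triggered flake8's F841. A tiny illustrative test in the same style:

    import pytest


    def test_out_of_range_raises():
        items = [1, 2, 3]
        with pytest.raises(IndexError):
            # The bare expression is enough to trigger and capture the error.
            items[10]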
@@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     nlp = Chinese(
         meta={
-            "tokenizer": {
-                "config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",}
-            }
+            "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
         }
     )
     zh_tokenizer_serialize(nlp.tokenizer)
@@ -21,7 +21,7 @@ re_pattern5 = "B*A*B"
 longest1 = "A A A A A"
 longest2 = "A A A A A"
 longest3 = "A A"
 longest4 = "B A A A A A B"  # "FIRST" would be "B B"
 longest5 = "B B A A A A A B"

@@ -1,6 +1,6 @@
 import pytest
-from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
-from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
+from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
+from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
 from spacy.pipeline._parser_internals import nonproj

 from ..util import get_doc

@@ -75,19 +75,18 @@ def test_attributeruler_init(nlp, pattern_dicts):

 def test_attributeruler_init_patterns(nlp, pattern_dicts):
     # initialize with patterns
-    a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
+    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})

     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert doc[3].morph_ == "Case=Nom|Number=Sing"

     nlp.remove_pipe("attribute_ruler")

     # initialize with patterns from asset
-    a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}})
+    nlp.add_pipe(
+        "attribute_ruler",
+        config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
+    )
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
@@ -117,12 +117,15 @@ def test_kb_default(nlp):
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
-    assert entity_linker.kb.entity_vector_length == 64  # default value from pipeline.entity_linker
+    # default value from pipeline.entity_linker
+    assert entity_linker.kb.entity_vector_length == 64


 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
-    entity_linker = nlp.add_pipe("entity_linker", config={"kb": {"entity_vector_length": 35}})
+    entity_linker = nlp.add_pipe(
+        "entity_linker", config={"kb": {"entity_vector_length": 35}}
+    )
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0

@@ -117,9 +117,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)

     # Test scoring
-    scores = nlp.evaluate(
-        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-    )
+    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
     assert scores["cats_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores
@@ -88,14 +88,9 @@ def my_parser():
             width=321,
             rows=5432,
             also_embed_subwords=True,
-            also_use_static_vectors=False
+            also_use_static_vectors=False,
         ),
-        MaxoutWindowEncoder(
-            width=321,
-            window_size=3,
-            maxout_pieces=4,
-            depth=2
-        )
+        MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
         tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

@@ -1,5 +1,4 @@
 import spacy
-import pytest
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin

@@ -711,16 +711,18 @@ def test_alignment_different_texts():
     with pytest.raises(ValueError):
         Alignment.from_strings(other_tokens, spacy_tokens)


 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
     doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
     example = Example(doc1, doc2)
-    assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
+    # fmt: off
+    expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
+    expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
+    # fmt: on
+    assert example.get_aligned("ORTH", as_string=True) == expected1
     with doc1.retokenize() as retokenizer:
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
-    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
+    assert example.get_aligned("ORTH", as_string=True) == expected2
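Note: the `# fmt: off` / `# fmt: on` pair tells the auto-formatter (Black honors these pragmas) to leave the enclosed long literal lists on single lines instead of reflowing them. The same pragma on an unrelated, hypothetical table, for illustration:

    # fmt: off
    # Black leaves these rows untouched, so the columns stay visually aligned.
    COLOR_TABLE = [
        ("red",   (255,   0,   0)),
        ("green", (  0, 255,   0)),
        ("blue",  (  0,   0, 255)),
    ]
    # fmt: on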
@@ -24,6 +24,7 @@ def get_textcat_kwargs():
         "nO": 7,
     }

+
 def get_textcat_cnn_kwargs():
     return {
         "tok2vec": test_tok2vec(),

@@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs():
         "nO": 13,
     }

+
 def get_all_params(model):
     params = []
     for node in model.walk():

@@ -59,17 +61,11 @@ def get_tok2vec_kwargs():
     # This actually creates models, so seems best to put it in a function.
     return {
         "embed": MultiHashEmbed(
-            width=32,
-            rows=500,
-            also_embed_subwords=True,
-            also_use_static_vectors=False
+            width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
         ),
         "encode": MaxoutWindowEncoder(
-            width=32,
-            depth=2,
-            maxout_pieces=2,
-            window_size=1,
-        )
+            width=32, depth=2, maxout_pieces=2, window_size=1,
+        ),
     }

@@ -19,14 +19,9 @@ def test_empty_doc():
             width=width,
             rows=embed_size,
             also_use_static_vectors=False,
-            also_embed_subwords=True
+            also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(
-            width=width,
-            depth=4,
-            window_size=1,
-            maxout_pieces=3
-        )
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update([doc])

@@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
             width=width,
             rows=embed_size,
             also_use_static_vectors=False,
-            also_embed_subwords=True
+            also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(
-            width=width,
-            depth=4,
-            window_size=1,
-            maxout_pieces=3,
-        )
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update(batch)
@@ -85,27 +85,24 @@ def test_util_dot_section():
     """
     nlp_config = Config().from_str(cfg_string)
     en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)

     default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
     default_config["nlp"]["lang"] = "nl"
     nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)

     # Test that creation went OK
     assert isinstance(en_nlp, English)
     assert isinstance(nl_nlp, Dutch)
     assert nl_nlp.pipe_names == []
     assert en_nlp.pipe_names == ["textcat"]
-    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False  # not exclusive_classes
+    # not exclusive_classes
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert not en_config["nlp"]["load_vocab_data"]
     assert nl_config["nlp"]["load_vocab_data"]  # default value True

     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
-        obj = dot_to_object(en_config, "nlp.pipeline.tagger")
+        dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
-        obj = dot_to_object(en_config, "nlp.unknownattribute")
+        dot_to_object(en_config, "nlp.unknownattribute")
     assert not dot_to_object(en_config, "nlp.load_vocab_data")
     assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
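Note: `dot_to_object` resolves a dotted path such as "training.optimizer" against the nested config and raises `KeyError` for unknown keys, which is exactly what this test exercises. A minimal sketch of the idea with a hypothetical `dot_get` over a plain nested dict (not spaCy's actual implementation):

    from typing import Any, Dict

    def dot_get(config: Dict[str, Any], path: str) -> Any:
        # Walk one key per dotted segment; a missing segment raises KeyError,
        # mirroring how the test expects unknown paths to fail loudly.
        node: Any = config
        for key in path.split("."):
            node = node[key]
        return node

    cfg = {"nlp": {"lang": "nl", "load_vocab_data": True}}
    print(dot_get(cfg, "nlp.lang"))        # "nl"
    # dot_get(cfg, "nlp.pipeline.tagger")  # would raise KeyError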
@@ -1,5 +1,5 @@
 from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING
+from typing import Iterator, Type, Pattern, TYPE_CHECKING
 from types import ModuleType
 import os
 import importlib

@@ -764,7 +764,6 @@ def normalize_slice(
     return start, stop

-

 def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
     """Filter a sequence of spans and remove duplicates or overlaps. Useful for
     creating named entities (where one token can only be part of one entity) or

@@ -1113,6 +1112,3 @@ def minibatch(items, size):
         if len(batch) == 0:
             break
         yield list(batch)
-
-
-