Merge pull request #6729 from explosion/chore/tidy-up

Commit 330f9818c0, authored by Ines Montani on 2021-01-15 13:27:59 +11:00 and committed by GitHub.
23 changed files with 141 additions and 144 deletions

@ -384,7 +384,10 @@ def debug_data(
# rare labels in projectivized train
rare_projectivized_labels = []
for label in gold_train_data["deps"]:
- if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and DELIMITER in label:
+ if (
+     gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
+     and DELIMITER in label
+ ):
rare_projectivized_labels.append(
f"{label}: {gold_train_data['deps'][label]}"
)

@ -30,7 +30,11 @@ def info_cli(
def info(
- model: Optional[str] = None, *, markdown: bool = False, silent: bool = True, exclude: List[str]
+ model: Optional[str] = None,
+ *,
+ markdown: bool = False,
+ silent: bool = True,
+ exclude: List[str],
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if model:
@ -98,7 +102,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
}
- def get_markdown(data: Dict[str, Any], title: Optional[str] = None, exclude: List[str] = None) -> str:
+ def get_markdown(
+     data: Dict[str, Any], title: Optional[str] = None, exclude: List[str] = None
+ ) -> str:
"""Get data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
@ -115,7 +121,7 @@ def get_markdown(data: Dict[str, Any], title: Optional[str] = None, exclude: Lis
if isinstance(value, str):
try:
existing_path = Path(value).exists()
- except:
+ except Exception:
# invalid Path, like a URL string
existing_path = False
if existing_path:
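
Editorial note: a minimal usage sketch of the two helpers touched above, not part of the diff. The import path and the example label/value pairs are assumptions; the keyword-only signature of info() and the parameters of get_markdown() are taken from the hunk itself.

# Hypothetical call into spacy.cli.info after this tidy-up (values are made up)
from spacy.cli.info import get_markdown, info

# info() keeps `model` positional; everything after the bare `*` must be passed by keyword
overview = info(None, markdown=False, silent=True, exclude=[])

# get_markdown() renders label/value pairs as GitHub-flavoured Markdown for issue reports
print(get_markdown({"spaCy version": "3.0.0", "Platform": "Linux"}, title="Info about spaCy"))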

@ -36,7 +36,7 @@ _num_words = [
"ትሪሊዮን",
"ኳድሪሊዮን",
"ገጅሊዮን",
"ባዝሊዮን"
"ባዝሊዮን",
]
_ordinal_words = [
@ -60,8 +60,7 @@ _ordinal_words = [
"አስራ ስምንተኛ",
"አስራ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ"
"አርባኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
@ -71,8 +70,10 @@ _ordinal_words = [
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ"
"ትሪሊዮንኛ",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
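
Editorial note: an illustrative check of like_num from this file, not part of the diff. The ordinal-word case is an assumption based on the _ordinal_words list shown above.

# Illustrative only: like_num strips a leading sign, then matches digits and
# the Amharic number words defined in this module (assumed behaviour)
from spacy.lang.am.lex_attrs import like_num

assert like_num("10")
assert like_num("-10")  # the leading sign is stripped by the branch shown above
# The number and ordinal word lists above are presumably matched as well (assumption):
print(like_num("ሃያኛ"))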

@ -7,7 +7,6 @@ _exc = {}
for exc_data in [
{ORTH: "ት/ቤት"},
{ORTH: "ወ/ሮ", NORM: "ወይዘሮ"},
]:
_exc[exc_data[ORTH]] = [exc_data]

@ -1,4 +1,4 @@
- from typing import Union, Iterator, Optional, List, Tuple
+ from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors

@ -36,7 +36,7 @@ _num_words = [
"ትሪልዮን",
"ኳድሪልዮን",
"ገጅልዮን",
"ባዝልዮን"
"ባዝልዮን",
]
_ordinal_words = [
@ -60,8 +60,7 @@ _ordinal_words = [
"ኣሰርተ ስምንተኛ",
"ኣሰርተ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ"
"አርባኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
@ -71,8 +70,10 @@ _ordinal_words = [
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ"
"ትሪሊዮንኛ",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]

@ -8,7 +8,6 @@ for exc_data in [
{ORTH: "ት/ቤት"},
{ORTH: "ወ/ሮ", NORM: "ወይዘሮ"},
{ORTH: "ወ/ሪ", NORM: "ወይዘሪት"},
]:
_exc[exc_data[ORTH]] = [exc_data]

@ -71,7 +71,9 @@ def build_text_classifier_v2(
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.maybe_get_dim("nO")
- attention_layer = ParametricAttention(width) # TODO: benchmark performance difference of this layer
+ attention_layer = ParametricAttention(
+     width
+ )  # TODO: benchmark performance difference of this layer
maxout_layer = Maxout(nO=width, nI=width)
linear_layer = Linear(nO=nO, nI=width)
cnn_model = (
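
Editorial note: a self-contained sketch of the Thinc operator composition used around this hunk, not part of the diff. The dimensions and the final chain are placeholders; only the layer names come from the code above.

# Assumed shapes; shows how ">>" (chain) wires the layers touched in this hunk
from thinc.api import Linear, Maxout, Model, ParametricAttention, chain, concatenate

width, nO = 64, 2  # hypothetical token width and number of output classes
with Model.define_operators({">>": chain, "|": concatenate}):
    attention_layer = ParametricAttention(width)
    maxout_layer = Maxout(nO=width, nI=width)
    linear_layer = Linear(nO=nO, nI=width)
    # ">>" feeds each layer's output into the next one
    head = attention_layer >> maxout_layer >> linear_layer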

@ -89,7 +89,7 @@ def build_hash_embed_cnn_tok2vec(
# TODO: archive
@registry.architectures.register("spacy.Tok2Vec.v1")
- def build_Tok2Vec_model(
+ def _build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
@ -109,7 +109,6 @@ def build_Tok2Vec_model(
return tok2vec
@registry.architectures.register("spacy.Tok2Vec.v2")
def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
@ -130,7 +129,6 @@ def build_Tok2Vec_model(
return tok2vec
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
width: int,
@ -280,7 +278,7 @@ def CharacterEmbed(
# TODO: archive
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
- def MaxoutWindowEncoder(
+ def _MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with maxout activation, layer
@ -310,6 +308,7 @@ def MaxoutWindowEncoder(
model.attrs["receptive_field"] = window_size * depth
return model
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
@ -344,7 +343,7 @@ def MaxoutWindowEncoder(
# TODO: archive
@registry.architectures.register("spacy.MishWindowEncoder.v1")
- def MishWindowEncoder(
+ def _MishWindowEncoder(
width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with mish activation, layer

@ -388,7 +388,6 @@ class TextCategorizer(TrainablePipe):
**kwargs,
)
def _validate_categories(self, examples: List[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:

@ -187,5 +187,5 @@ class MultiLabel_TextCategorizer(TextCategorizer):
def _validate_categories(self, examples: List[Example]):
"""This component allows any type of single- or multi-label annotations.
- This method overwrites the more strict one from 'textcat'. """
+ This method overwrites the more strict one from 'textcat'."""
pass
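
Editorial note: an illustrative contrast between the annotations the two _validate_categories docstrings describe, not part of the diff. The category names are invented.

# "textcat" expects exactly one category per example to score 1.0 (single-label) ...
single_label_cats = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
# ... while "textcat_multilabel" accepts any combination of 0.0/1.0 scores
multi_label_cats = {"cats": {"SPORTS": 1.0, "POLITICS": 1.0, "WEATHER": 0.0}}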

@ -28,10 +28,12 @@ def pytest_runtest_setup(item):
def tokenizer():
return get_lang_class("xx")().tokenizer
@pytest.fixture(scope="session")
def am_tokenizer():
return get_lang_class("am")().tokenizer
@pytest.fixture(scope="session")
def ar_tokenizer():
return get_lang_class("ar")().tokenizer
@ -247,10 +249,12 @@ def th_tokenizer():
pytest.importorskip("pythainlp")
return get_lang_class("th")().tokenizer
@pytest.fixture(scope="session")
def ti_tokenizer():
return get_lang_class("ti")().tokenizer
@pytest.fixture(scope="session")
def tr_tokenizer():
return get_lang_class("tr")().tokenizer

@ -17,17 +17,8 @@ def test_graph_edges_and_nodes():
assert graph.get_node((0,)) == node1
node2 = graph.add_node((1, 3))
assert list(node2) == [1, 3]
- graph.add_edge(
-     node1,
-     node2,
-     label="one",
-     weight=-10.5
- )
- assert graph.has_edge(
-     node1,
-     node2,
-     label="one"
- )
+ graph.add_edge(node1, node2, label="one", weight=-10.5)
+ assert graph.has_edge(node1, node2, label="one")
assert node1.heads() == []
assert [tuple(h) for h in node2.heads()] == [(0,)]
assert [tuple(t) for t in node1.tails()] == [(1, 3)]
@ -42,7 +33,7 @@ def test_graph_walk():
nodes=[(0,), (1,), (2,), (3,)],
edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
labels=None,
- weights=None
+ weights=None,
)
node0, node1, node2, node3 = list(graph.nodes)
assert [tuple(h) for h in node0.heads()] == [(3,)]

@ -1,5 +1,4 @@
import pytest
from spacy.lang.am.lex_attrs import like_num
def test_am_tokenizer_handles_long_text(am_tokenizer):

@ -121,9 +121,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
assert [token.norm_ for token in tokens] == norms
- @pytest.mark.parametrize(
-     "text,norm", [("Jan.", "January"), ("'cuz", "because")]
- )
+ @pytest.mark.parametrize("text,norm", [("Jan.", "January"), ("'cuz", "because")])
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
tokens = en_tokenizer(text)
assert tokens[0].norm_ == norm

@ -1,5 +1,4 @@
import pytest
from spacy.lang.ti.lex_attrs import like_num
def test_ti_tokenizer_handles_long_text(ti_tokenizer):

@ -389,7 +389,7 @@ def test_beam_ner_scores():
for j in range(len(doc)):
for label in ner.labels:
- score = entity_scores[(j, j+1, label)]
+ score = entity_scores[(j, j + 1, label)]
eps = 0.00001
assert 0 - eps <= score <= 1 + eps

@ -146,12 +146,12 @@ def test_no_resize(name):
def test_error_with_multi_labels():
nlp = Language()
- textcat = nlp.add_pipe("textcat")
+ nlp.add_pipe("textcat")
train_examples = []
for text, annotations in TRAIN_DATA_MULTI_LABEL:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
with pytest.raises(ValueError):
- optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ nlp.initialize(get_examples=lambda: train_examples)
@pytest.mark.parametrize(
@ -226,7 +226,8 @@ def test_overfitting_IO():
assert_equal(batch_cats_1, no_batch_cats)
- def test_overfitting_IO_multi():
+ @pytest.mark.skip(reason="TODO: Can this be removed?")
+ def test_overfitting_IO_multi_old():
# Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
fix_random_seed(0)
nlp = English()
@ -362,7 +363,9 @@ def test_positive_class():
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
get_examples = make_get_examples_multi_label(nlp)
with pytest.raises(TypeError):
- textcat_multilabel.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
+ textcat_multilabel.initialize(
+     get_examples, labels=["POS", "NEG"], positive_label="POS"
+ )
textcat_multilabel.initialize(get_examples, labels=["FICTION", "DRAMA"])
assert textcat_multilabel.labels == ("FICTION", "DRAMA")
assert "positive_label" not in textcat_multilabel.cfg
@ -381,7 +384,9 @@ def test_positive_class_not_binary():
textcat = nlp.add_pipe("textcat")
get_examples = make_get_examples_multi_label(nlp)
with pytest.raises(ValueError):
- textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
+ textcat.initialize(
+     get_examples, labels=["SOME", "THING", "POS"], positive_label="POS"
+ )
def test_textcat_evaluation():

@ -13,7 +13,6 @@ from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict
from ..util import make_tempdir
@ -311,9 +310,7 @@ def test_issue4313():
# ensure the beam_parse still works with the new label
docs = [doc]
ner = nlp.get_pipe("beam_ner")
- beams = ner.beam_parse(
-     docs, drop=0.0, beam_width=beam_width, beam_density=beam_density
- )
+ ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
def test_issue4348():

@ -251,7 +251,9 @@ def test_language_from_config_before_after_init():
nlp.initialize()
assert nlp.meta["before_init"] == "before"
assert nlp.meta["after_init"] == "after"
- assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
+ assert all(
+     [ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init]
+ )
def test_language_from_config_before_after_init_invalid():

@ -166,19 +166,11 @@ def test_Example_from_dict_with_entities(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots)
assert len(list(example.reference.ents)) == 2
- assert [example.reference[i].ent_iob_ for i in range(7)] == [
-     "O",
-     "O",
-     "B",
-     "I",
-     "O",
-     "B",
-     "O",
- ]
+ # fmt: off
+ assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"]
assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
+ # fmt: on
assert example.reference[2].ent_type_ == "LOC"
assert example.reference[3].ent_type_ == "LOC"
assert example.reference[5].ent_type_ == "LOC"
@ -299,7 +291,8 @@ def test_Example_missing_heads():
assert parsed_heads[2] == heads[2]
assert parsed_heads[4] == heads[4]
assert parsed_heads[5] == heads[5]
- assert [t.has_head() for t in example.reference] == [True, True, True, False, True, True]
+ expected = [True, True, True, False, True, True]
+ assert [t.has_head() for t in example.reference] == expected
# Ensure that the missing head doesn't create an artificial new sentence start
- assert example.get_aligned_sent_starts() == [True, False, False, False, False, False]
+ expected = [True, False, False, False, False, False]
+ assert example.get_aligned_sent_starts() == expected