Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)

Tidy up and auto-format

This commit is contained in:
  parent fb11852750
  commit 181c01f629

@@ -1,15 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
-#from .lemmatizer import LOOKUP
-#from .syntax_iterators import SYNTAX_ITERATORS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -21,17 +17,18 @@ from ...util import update_exc, add_lookups

 class LuxembourgishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: 'lb'
-    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+    lex_attr_getters[LANG] = lambda text: "lb"
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
+    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     #suffixes = TOKENIZER_SUFFIXES
     #lemma_lookup = LOOKUP

     tag_map = TAG_MAP


 class Luxembourgish(Language):
-    lang = 'lb'
+    lang = "lb"
     Defaults = LuxembourgishDefaults


-__all__ = ['Luxembourgish']
+__all__ = ["Luxembourgish"]
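
For context, a minimal sketch of how the Luxembourgish class above can be exercised (not part of this commit), assuming a spaCy 2.x install that ships this lb module:

# Sketch only: spacy.blank("lb") resolves to the Luxembourgish class defined above.
import spacy

nlp = spacy.blank("lb")
doc = nlp("Um Enn huet den Nordwand säi Kampf opginn.")
print([token.text for token in doc])
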
@@ -9,10 +9,10 @@ Example sentences to test spaCy and its language models.
 """

 sentences = [
     "An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
     "Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
     "Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
     "Um Enn huet den Nordwand säi Kampf opginn.",
     "Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
-    "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier."
+    "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier.",
 ]
@@ -4,29 +4,34 @@ from __future__ import unicode_literals
 from ...attrs import LIKE_NUM


-_num_words = set("""
+_num_words = set(
+    """
 null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
 véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
 honnert dausend millioun milliard billioun billiard trillioun triliard
-""".split())
+""".split()
+)

-_ordinal_words = set("""
+_ordinal_words = set(
+    """
 éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
 zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
 drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
 honnertsten dausendsten milliounsten
 milliardsten billiounsten billiardsten trilliounsten trilliardsten
-""".split())
+""".split()
+)


 def like_num(text):
     """
     check if text resembles a number
     """
-    text = text.replace(',', '').replace('.', '')
+    text = text.replace(",", "").replace(".", "")
     if text.isdigit():
         return True
-    if text.count('/') == 1:
-        num, denom = text.split('/')
+    if text.count("/") == 1:
+        num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
     if text in _num_words:

@@ -36,6 +41,4 @@ def like_num(text):
     return False


-LEX_ATTRS = {
-    LIKE_NUM: like_num
-}
+LEX_ATTRS = {LIKE_NUM: like_num}
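
A small sketch of what like_num is expected to return for the words and patterns defined above, assuming the module is importable as spacy.lang.lb.lex_attrs:

# Sketch only; mirrors the behaviour defined in the hunk above.
from spacy.lang.lb.lex_attrs import like_num

print(like_num("zwee"))     # True  -- listed in _num_words
print(like_num("10.000"))   # True  -- separators are stripped, then isdigit()
print(like_num("3/4"))      # True  -- numerator and denominator are digits
print(like_num("Mantel"))   # False
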
@@ -2,19 +2,15 @@
 from __future__ import unicode_literals

 # TODO
-# norm execptions: find a possibility to deal with the zillions of spelling variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
+# norm execptions: find a possibility to deal with the zillions of spelling
+# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
 # here one could include the most common spelling mistakes

-_exc = {
-    "datt": "dass",
-    "wgl.": "weg.",
-    "wgl.": "wegl.",
-    "vläicht": "viläicht"}
+_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}


 NORM_EXCEPTIONS = {}

 for string, norm in _exc.items():
     NORM_EXCEPTIONS[string] = norm
     NORM_EXCEPTIONS[string.title()] = norm
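
The loop above registers each entry twice, once as written and once title-cased; a standalone sketch of the effect:

# Sketch only; reproduces the expansion loop from the hunk above.
_exc = {"datt": "dass", "vläicht": "viläicht"}

NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm

print(NORM_EXCEPTIONS)
# {'datt': 'dass', 'Datt': 'dass', 'vläicht': 'viläicht', 'Vläicht': 'viläicht'}
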
@@ -1,25 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
-
-_quotes = CONCAT_QUOTES.replace("'", "")
-
-_infixes = (
-    LIST_ELLIPSES
-    + LIST_ICONS
-    + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:;<>=](?=[{a}])'.format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[0-9])-(?=[0-9])",
-    ]
-)
-
-
-TOKENIZER_INFIXES = _infixes
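
The deleted infix rules are plain regular expressions; a sketch of what the digit-hyphen and comma-between-letters patterns match, using re directly (the character class below is a simplified stand-in for spaCy's ALPHA, not the real definition):

# Sketch only; the spaCy tokenizer keeps infix matches as separate tokens,
# re.split() is used here just to show where the patterns match.
import re

ALPHA = "a-zA-Zäéëöü"  # simplified stand-in, not spaCy's full ALPHA class
print(re.split(r"(?<=[0-9])-(?=[0-9])", "1999-2019"))               # ['1999', '2019']
print(re.split(r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), "jo,nee"))  # ['jo', 'nee']
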
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
 a
 à
 äis

@@ -209,4 +210,5 @@ ze
 zu
 zum
 zwar
-""".split())
+""".split()
+)
@@ -1,11 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, SPACE, AUX

 # TODO: tag map is still using POS tags from an internal training set.
 # These POS tags have to be modified to match those from Universal Dependencies

 TAG_MAP = {
     "$": {POS: PUNCT},
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...symbols import ORTH, LEMMA, NORM
 from ..punctuation import TOKENIZER_PREFIXES

 # TODO

@@ -9,16 +9,20 @@ from ..punctuation import TOKENIZER_PREFIXES
 # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)

 # how to write the tokenisation exeption for the articles d' / D' ? This one is not working.
-_prefixes = [prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’", r"\' "]]
+_prefixes = [
+    prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’"]
+]


 _exc = {
     "d'mannst": [
         {ORTH: "d'", LEMMA: "d'"},
-        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"}],
+        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
+    ],
     "d'éischt": [
         {ORTH: "d'", LEMMA: "d'"},
-        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}]
+        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
+    ],
 }

 # translate / delete what is not necessary

@@ -26,20 +30,38 @@ _exc = {
 for exc_data in [
     {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
     {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
     {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
     {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
     {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
-    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}]:
+    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
+]:
     _exc[exc_data[ORTH]] = [exc_data]


 # to be extended
 for orth in [
-    "z.B.", "Dipl.", "Dr.", "etc.", "i.e.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "phil.",
-    "q.e.d.", "R.I.P.", "rer.", "sen.", "ë.a.", "U.S.", "U.S.A."]:
+    "z.B.",
+    "Dipl.",
+    "Dr.",
+    "etc.",
+    "i.e.",
+    "o.k.",
+    "O.K.",
+    "p.a.",
+    "p.s.",
+    "P.S.",
+    "phil.",
+    "q.e.d.",
+    "R.I.P.",
+    "rer.",
+    "sen.",
+    "ë.a.",
+    "U.S.",
+    "U.S.A.",
+]:
     _exc[orth] = [{ORTH: orth}]
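
A sketch of what these exceptions are meant to produce at runtime, assuming a spaCy build that includes this lb data; the expected outputs follow the entries above:

# Sketch only; expected splits follow the _exc entries in the hunk above.
import spacy

nlp = spacy.blank("lb")
print([t.text for t in nlp("d'mannst")])  # expected: ["d'", "mannst"]
print([t.text for t in nlp("wgl.")])      # expected: ["wgl."] -- kept as one token
print([t.text for t in nlp("z.B.")])      # expected: ["z.B."]
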
@@ -183,7 +183,9 @@ class EntityRuler(object):
         # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
         try:
             current_index = self.nlp.pipe_names.index(self.name)
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+            subsequent_pipes = [
+                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+            ]
         except ValueError:
             subsequent_pipes = []
         with self.nlp.disable_pipes(*subsequent_pipes):
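
The reformatted block computes the pipes that come after the ruler so they can be temporarily disabled; a standalone sketch of that index arithmetic with a plain list:

# Sketch only; mirrors the pattern in the hunk above without a real pipeline.
pipe_names = ["tagger", "parser", "entity_ruler", "ner"]
name = "entity_ruler"

try:
    current_index = pipe_names.index(name)
    subsequent_pipes = [pipe for pipe in pipe_names[current_index + 1 :]]
except ValueError:
    subsequent_pipes = []

print(subsequent_pipes)  # ['ner'] -- these would be passed to nlp.disable_pipes(...)
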
@@ -219,7 +219,9 @@ class Scorer(object):
         DOCS: https://spacy.io/api/scorer#score
         """
         if len(doc) != len(gold):
-            gold = GoldParse.from_annot_tuples(doc, tuple(zip(*gold.orig_annot)) + (gold.cats,))
+            gold = GoldParse.from_annot_tuples(
+                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
+            )
         gold_deps = set()
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
@@ -134,10 +134,12 @@ def ko_tokenizer():
     pytest.importorskip("natto")
     return get_lang_class("ko").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt").Defaults.create_tokenizer()
@@ -1,5 +1,4 @@
 # coding: utf-8
-# from __future__ import unicolb_literals
 from __future__ import unicode_literals

 import pytest

@@ -9,4 +8,3 @@ import pytest
 def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 1
-
@@ -1,5 +1,4 @@
 # coding: utf-8
-#from __future__ import unicolb_literals
 from __future__ import unicode_literals

 import pytest

@@ -21,6 +20,3 @@ def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text):
 def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 3
-
-
-
@@ -1,6 +1,5 @@
 # coding: utf-8
 from __future__ import unicode_literals
-from __future__ import unicode_literals

 import pytest
@@ -159,14 +159,14 @@ def test_matcher_remove():

     # should give two matches
     results1 = matcher(nlp(text))
-    assert(len(results1) == 2)
+    assert len(results1) == 2

     # removing once should work
     matcher.remove("Rule")

     # should not return any maches anymore
     results2 = matcher(nlp(text))
-    assert (len(results2) == 0)
+    assert len(results2) == 0

     # removing again should throw an error
     with pytest.raises(ValueError):
@@ -103,7 +103,7 @@ def test_oracle_moves_missing_B(en_vocab):
     moves.add_action(move_types.index("L"), label)
     moves.add_action(move_types.index("U"), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)


 def test_oracle_moves_whitespace(en_vocab):
@@ -323,7 +323,7 @@ def test_issue3456():
     nlp = English()
     nlp.add_pipe(nlp.create_pipe("tagger"))
     nlp.begin_training()
-    list(nlp.pipe(['hi', '']))
+    list(nlp.pipe(["hi", ""]))


 def test_issue3468():
@@ -76,7 +76,6 @@ def test_issue4042_bug2():
     output_dir.mkdir()
     ner1.to_disk(output_dir)
-
     nlp2 = English(vocab)
     ner2 = EntityRecognizer(vocab)
     ner2.from_disk(output_dir)
     assert len(ner2.labels) == 2
@@ -1,13 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

 import pytest
-
-import spacy
-
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
-
-

 def test_issue4267():
@@ -6,6 +6,6 @@ from spacy.tokens import DocBin

 def test_issue4367():
     """Test that docbin init goes well"""
-    doc_bin_1 = DocBin()
-    doc_bin_2 = DocBin(attrs=["LEMMA"])
-    doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
+    DocBin()
+    DocBin(attrs=["LEMMA"])
+    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@@ -74,4 +74,4 @@ def test_serialize_doc_bin():
     # Deserialize later, e.g. in a new process
     nlp = spacy.blank("en")
     doc_bin = DocBin().from_bytes(bytes_data)
-    docs = list(doc_bin.get_docs(nlp.vocab))
+    list(doc_bin.get_docs(nlp.vocab))
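
For context, a minimal DocBin round trip matching the pattern in this test, assuming spaCy >= 2.2 (where DocBin was introduced):

# Sketch only; serialize a Doc collection to bytes and read it back.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
doc_bin.add(nlp("Hello world"))
bytes_data = doc_bin.to_bytes()          # e.g. write this to disk

docs = list(DocBin().from_bytes(bytes_data).get_docs(nlp.vocab))
print([t.text for t in docs[0]])         # ['Hello', 'world']
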
@@ -48,8 +48,13 @@ URLS_SHOULD_MATCH = [
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
     "ssh://login@server.com:12345/repository.git",
     "svn+ssh://user@ssh.yourdomain.com/path",
-    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
-    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param(
+        "chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai",
+        marks=pytest.mark.xfail(),
+    ),
+    pytest.param(
+        "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
+    ),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
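
A minimal sketch of the pytest.param(..., marks=pytest.mark.xfail()) pattern used above, outside of the spaCy test suite; the second case is expected to fail and is reported as xfail rather than as an error:

# Sketch only; hypothetical test, not part of the spaCy code base.
import pytest

@pytest.mark.parametrize(
    "url",
    [
        "http://example.com/path",
        pytest.param("http://foo.com/blah_(wikipedia)", marks=pytest.mark.xfail()),
    ],
)
def test_url_has_no_parentheses(url):
    assert "(" not in url
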
@@ -50,12 +50,13 @@ def ngrams_vocab(en_vocab, ngrams_vectors):
 def data():
     return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")


 @pytest.fixture
 def most_similar_vectors_data():
-    return numpy.asarray([[0.0, 1.0, 2.0],
-                          [1.0, -2.0, 4.0],
-                          [1.0, 1.0, -1.0],
-                          [2.0, 3.0, 1.0]], dtype="f")
+    return numpy.asarray(
+        [[0.0, 1.0, 2.0], [1.0, -2.0, 4.0], [1.0, 1.0, -1.0], [2.0, 3.0, 1.0]],
+        dtype="f",
+    )


 @pytest.fixture