Merge branch 'master' into fix/travis-tests

2026-02-01 21:16:05 +03:00 · 2020-05-21 14:23:04 +02:00 · 2020-05-21 14:23:04 +02:00 · bd6353715a
commit bd6353715a
parent 56de520afd d8f3190c0a
43 changed files with 132 additions and 79 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -187,12 +187,17 @@ def debug_data(
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors,
-                n_missing_vectors / gold_train_data["n_words"],
+                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
            ),
        )
        msg.text(
-            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+            "10 most common words without vectors: {}".format(
+                _format_labels(
+                    gold_train_data["words_missing_vectors"].most_common(10),
+                    counts=True,
+                )
+            ),
+            show=verbose,
        )
    else:
        msg.info("No word vectors present in the model")
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
        str,
    ),
    model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+    base_model=(
+        "Base model (for languages with custom tokenizers)",
+        "option",
+        "b",
+        str,
+    ),
 )
 def init_model(
    lang,
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -8,7 +8,7 @@ def add_codes(err_cls):
    class ErrorsWithCodes(err_cls):
        def __getattribute__(self, code):
            msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            if code.startswith("__"):  # python system attributes like __class__
                return msg
            else:
                return "[{code}] {msg}".format(code=code, msg=msg)
@ -116,6 +116,7 @@ class Warnings(object):
            " to check the alignment. Misaligned entities ('-') will be "
            "ignored during training.")

+
@add_codes
 class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc
--- a/spacy/lang/de/stop_words.py
+++ b/spacy/lang/de/stop_words.py
@ -47,7 +47,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz
 lang lange leicht leider lieber los

 machen macht machte mag magst man manche manchem manchen mancher manches mehr
-mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten 
+mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
 mögen möglich mögt morgen muss muß müssen musst müsst musste mussten

 na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:

        _exc[orth + "d"] = [
            {ORTH: orth, LEMMA: word, NORM: word},
-            {ORTH: "d", NORM: "'d"}
+            {ORTH: "d", NORM: "'d"},
        ]

        _exc[orth + "'d've"] = [
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 from ..char_classes import merge_chars
-from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


 _list_units = [u for u in LIST_UNITS if u != "%"]
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)

 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-        "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
+    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match
--- a/spacy/lang/gu/stop_words.py
+++ b/spacy/lang/gu/stop_words.py
@ -3,7 +3,7 @@ from __future__ import unicode_literals

 STOP_WORDS = set(
    """
-એમ 
+એમ
 આ
 એ
 રહી
@ -24,7 +24,7 @@ STOP_WORDS = set(
 તેમને
 તેમના
 તેમણે
-તેમનું 
+તેમનું
 તેમાં
 અને
 અહીં
@ -33,12 +33,12 @@ STOP_WORDS = set(
 થાય
 જે
 ને
-કે 
+કે
 ના
 ની
 નો
 ને
-નું 
+નું
 શું
 માં
 પણ
@ -69,12 +69,12 @@ STOP_WORDS = set(
 કોઈ
 કેમ
 કર્યો
-કર્યુ 
+કર્યુ
 કરે
 સૌથી
-ત્યારબાદ 
+ત્યારબાદ
 તથા
-દ્વારા 
+દ્વારા
 જુઓ
 જાઓ
 જ્યારે
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@ -1,11 +1,12 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP

-
 from ...attrs import LANG
 from ...language import Language
-from ...tokens import Doc


 class ArmenianDefaults(Language.Defaults):
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-
 STOP_WORDS = set(
    """
 նա
--- a/spacy/lang/hy/tag_map.py
+++ b/spacy/lang/hy/tag_map.py
@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

 TAG_MAP = {
@ -716,7 +716,7 @@ TAG_MAP = {
        POS: NOUN,
        "Animacy": "Nhum",
        "Case": "Dat",
-        "Number": "Coll",
+        # "Number": "Coll",
        "Number": "Sing",
        "Person": "1",
    },
@ -815,7 +815,7 @@ TAG_MAP = {
        "Animacy": "Nhum",
        "Case": "Nom",
        "Definite": "Def",
-        "Number": "Plur",
+        # "Number": "Plur",
        "Number": "Sing",
        "Poss": "Yes",
    },
@ -880,7 +880,7 @@ TAG_MAP = {
        POS: NOUN,
        "Animacy": "Nhum",
        "Case": "Nom",
-        "Number": "Plur",
+        # "Number": "Plur",
        "Number": "Sing",
        "Person": "2",
    },
@ -1223,9 +1223,9 @@ TAG_MAP = {
    "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
        POS: PRON,
        "Case": "Nom",
-        "Number": "Sing",
+        # "Number": "Sing",
        "Number": "Plur",
-        "Person": "3",
+        # "Person": "3",
        "Person": "1",
        "PronType": "Emp",
    },
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@ -55,7 +55,7 @@ _num_words = [
    "തൊണ്ണൂറ് ",
    "നുറ് ",
    "ആയിരം ",
-    "പത്തുലക്ഷം"
+    "പത്തുലക്ഷം",
 ]


--- a/spacy/lang/ml/stop_words.py
+++ b/spacy/lang/ml/stop_words.py
@ -3,7 +3,6 @@ from __future__ import unicode_literals


 STOP_WORDS = set(
-
    """
 അത്
 ഇത്
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
 from ...lookups import Lookups


--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@ -3,7 +3,6 @@ from __future__ import unicode_literals

 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
-from ...errors import Errors


 class PolishLemmatizer(Lemmatizer):
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

 _quotes = CONCAT_QUOTES.replace("'", "")

-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+    r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES

 _infixes = (
    LIST_ELLIPSES
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@ -40,7 +40,7 @@ _num_words = [
    "miljard",
    "biljon",
    "biljard",
-    "kvadriljon"
+    "kvadriljon",
 ]


--- a/spacy/lang/ur/tag_map.py
+++ b/spacy/lang/ur/tag_map.py
@ -38,7 +38,6 @@ TAG_MAP = {
    "NNPC": {POS: PROPN},
    "NNC": {POS: NOUN},
    "PSP": {POS: ADP},
-
    ".": {POS: PUNCT},
    ",": {POS: PUNCT},
    "-LRB-": {POS: PUNCT},
--- a/spacy/language.py
+++ b/spacy/language.py
@ -79,7 +79,9 @@ class BaseDefaults(object):
            lookups=lookups,
        )
        vocab.lex_attr_getters[NORM] = util.add_lookups(
-            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+            BASE_NORMS,
+            vocab.lookups.get_table("lexeme_norm"),
        )
        for tag_str, exc in cls.morph_rules.items():
            for orth_str, attrs in exc.items():
@ -974,7 +976,9 @@ class Language(object):
        serializers = OrderedDict()
        serializers["vocab"] = lambda: self.vocab.to_bytes()
        serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
-        serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+        serializers["meta.json"] = lambda: srsly.json_dumps(
+            OrderedDict(sorted(self.meta.items()))
+        )
        for name, proc in self.pipeline:
            if name in exclude:
                continue
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -112,6 +112,7 @@ def ga_tokenizer():
 def gu_tokenizer():
    return get_lang_class("gu").Defaults.create_tokenizer()

+
@pytest.fixture(scope="session")
 def he_tokenizer():
    return get_lang_class("he").Defaults.create_tokenizer()
@ -246,7 +247,9 @@ def yo_tokenizer():

@pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"use_jieba": False, "use_pkuseg": False}
+    )


@pytest.fixture(scope="session")
@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
@pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
    pytest.importorskip("pkuseg")
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+    )


@pytest.fixture(scope="session")
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
    assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
    assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]

    # partial whitespace in words
    words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]
@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
    assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
    assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]

    # non-standard whitespace tokens
    words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
    assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
    assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]

    # mismatch between words and text
    with pytest.raises(ValueError):
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
    doc.is_parsed = True
    assert len(list(doc.sents)) == 2

+
 def test_is_sent_end(en_tokenizer):
    doc = en_tokenizer("This is a sentence. This is another.")
    assert doc[4].is_sent_end is None
@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
    assert doc[1].is_sent_start is None
    assert not doc.is_sentenced

+
 def test_tokenlast_has_sent_end_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_end is None
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_de(de_tokenizer):
-    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = de_tokenizer("Er lag auf seinem")
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_el(el_tokenizer):
-    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@ -13,9 +13,9 @@ from ...util import get_doc


 def test_noun_chunks_is_parsed(en_tokenizer):
-    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = en_tokenizer("This is a sentence")
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_es(es_tokenizer):
-    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = es_tokenizer("en Oxford este verano")
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@ -62,4 +62,4 @@ def test_lex_attrs_like_number(es_tokenizer, text, match):
@pytest.mark.parametrize("word", ["once"])
 def test_es_lex_attrs_capitals(word):
    assert like_num(word)
-    assert like_num(word.upper())
+    assert like_num(word.upper())
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
-    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = fr_tokenizer("trouver des travaux antérieurs")
--- a/spacy/tests/lang/gu/test_text.py
+++ b/spacy/tests/lang/gu/test_text.py
@ -3,17 +3,16 @@ from __future__ import unicode_literals

 import pytest

+
 def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
    text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
    tokens = gu_tokenizer(text)
    assert len(tokens) == 9

+
@pytest.mark.parametrize(
    "text,length",
-    [
-        ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
-        ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
-    ],
+    [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
 )
 def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
    tokens = gu_tokenizer(text)
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_id(id_tokenizer):
-    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = id_tokenizer("sebelas")
--- a/spacy/tests/lang/ml/test_text.py
+++ b/spacy/tests/lang/ml/test_text.py
@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
    assert len(tokens) == 5


-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+            10,
+        ),
+        ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+    ],
+)
 def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
    tokens = ml_tokenizer(text)
    assert len(tokens) == length
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_nb(nb_tokenizer):
-    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = nb_tokenizer("Smørsausen brukes bl.a. til")
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@ -7,9 +7,9 @@ from ...util import get_doc


 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
-    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. 
+    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False' 
+    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = sv_tokenizer("Studenten läste den bästa boken")
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):

@pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+    nlp = Chinese(
+        meta={
+            "tokenizer": {
+                "config": {
+                    "use_jieba": False,
+                    "use_pkuseg": True,
+                    "pkuseg_model": "medicine",
+                }
+            }
+        }
+    )
    zh_tokenizer_serialize(nlp.tokenizer)
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
 def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
    user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
    zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
-    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    updated_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
    assert len(user_dict) == len(updated_user_dict) - 1

    # reset user dict
    zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
-    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    reset_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
    assert len(reset_user_dict) == 0


--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -265,15 +265,15 @@ def test_matcher_regex_shape(en_vocab):


@pytest.mark.parametrize(
-    "cmp, bad", 
+    "cmp, bad",
    [
        ("==", ["a", "aaa"]),
        ("!=", ["aa"]),
        (">=", ["a"]),
        ("<=", ["aaa"]),
        (">", ["a", "aa"]),
-        ("<", ["aa", "aaa"])
-    ]
+        ("<", ["aa", "aaa"]),
+    ],
 )
 def test_matcher_compare_length(en_vocab, cmp, bad):
    matcher = Matcher(en_vocab)
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
        ),
    ],
 )
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+    en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@ -37,7 +37,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
-    assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
+    assert len(new_vocab1.strings) == len(strings1) + 1  # adds _SP
    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])


@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
        assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
        assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
        if strings1 == strings2:
-            assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] == [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
        else:
-            assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] != [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -76,9 +80,8 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
 def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
-    length = len(vocab)
    vocab.from_bytes(vocab.to_bytes())
-    assert len(vocab.strings) == len(strings) + 1 # adds _SP
+    assert len(vocab.strings) == len(strings) + 1  # adds _SP


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
        else:
            assert list(sstore1_d) != list(sstore2_d)

+
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_pickle_vocab(strings, lex_attr):
    vocab = Vocab(strings=strings)
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
    data = (
        "I'll return the ₹54 amount",
        {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
            "entities": [(16, 19, "MONEY")],
        },
    )
@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
    data = (
        "I'll return the $54 amount",
        {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
            "entities": [(16, 19, "MONEY")],
        },
    )
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@ -366,6 +366,7 @@ def test_vectors_serialize():
        assert row == row_r
        assert_equal(v.data, v_r.data)

+
 def test_vector_is_oov():
    vocab = Vocab(vectors_name="test_vocab_is_oov")
    data = numpy.ndarray((5, 3), dtype="f")
@ -375,4 +376,4 @@ def test_vector_is_oov():
    vocab.set_vector("dog", data[1])
    assert vocab["cat"].is_oov is True
    assert vocab["dog"].is_oov is True
-    assert vocab["hamster"].is_oov is False
+    assert vocab["hamster"].is_oov is False
--- a/spacy/util.py
+++ b/spacy/util.py
@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
        except ValueError:
            raise ValueError(Errors.E194.format(text=text, words=words))
        if word_start > 0:
-            text_words.append(text[text_pos:text_pos+word_start])
+            text_words.append(text[text_pos : text_pos + word_start])
            text_spaces.append(False)
            text_pos += word_start
        text_words.append(word)