Merge branch 'master' into fix/travis-tests

Ines Montani 2020-05-21 14:23:04 +02:00
commit bd6353715a
43 changed files with 132 additions and 79 deletions

View File

@@ -187,12 +187,17 @@ def debug_data(
             n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
             msg.warn(
                 "{} words in training data without vectors ({:0.2f}%)".format(
-                    n_missing_vectors,
-                    n_missing_vectors / gold_train_data["n_words"],
+                    n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
                 ),
             )
             msg.text(
-                "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+                "10 most common words without vectors: {}".format(
+                    _format_labels(
+                        gold_train_data["words_missing_vectors"].most_common(10),
+                        counts=True,
+                    )
+                ),
+                show=verbose,
             )
     else:
         msg.info("No word vectors present in the model")

View File

@@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+    base_model=(
+        "Base model (for languages with custom tokenizers)",
+        "option",
+        "b",
+        str,
+    ),
 )
 def init_model(
     lang,

View File

@@ -8,7 +8,7 @@ def add_codes(err_cls):
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
             msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)
@@ -116,6 +116,7 @@ class Warnings(object):
            " to check the alignment. Misaligned entities ('-') will be "
            "ignored during training.")
 
+
 @add_codes
 class Errors(object):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

View File

@@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc

View File

@@ -47,7 +47,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz
 lang lange leicht leider lieber los
 machen macht machte mag magst man manche manchem manchen mancher manches mehr
 mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
 mögen möglich mögt morgen muss muß müssen musst müsst musste mussten
 na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter

View File

@@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
     _exc[orth + "d"] = [
         {ORTH: orth, LEMMA: word, NORM: word},
-        {ORTH: "d", NORM: "'d"}
+        {ORTH: "d", NORM: "'d"},
     ]
 
     _exc[orth + "'d've"] = [

View File

@@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 from ..char_classes import merge_chars
-from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 _list_units = [u for u in LIST_UNITS if u != "%"]

View File

@@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
 TOKENIZER_EXCEPTIONS = _exc
 
 TOKEN_MATCH = re.compile(
     "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 STOP_WORDS = set(
     """
 એમ
 રહ
@@ -24,7 +24,7 @@ STOP_WORDS = set(
 મન
 મન
 મણ
 મન
 અન
 અહ
@@ -33,12 +33,12 @@ STOP_WORDS = set(
 પણ
@@ -69,12 +69,12 @@ STOP_WORDS = set(
 કર
 કર
 કર
 રબ
 તથ

View File

@@ -1,11 +1,12 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 
 from ...attrs import LANG
 from ...language import Language
-from ...tokens import Doc
 
 
 class ArmenianDefaults(Language.Defaults):

View File

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences

View File

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
 STOP_WORDS = set(
     """
 նա

View File

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
 TAG_MAP = {
@@ -716,7 +716,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Dat",
-        "Number": "Coll",
+        # "Number": "Coll",
         "Number": "Sing",
         "Person": "1",
     },
@@ -815,7 +815,7 @@ TAG_MAP = {
         "Animacy": "Nhum",
         "Case": "Nom",
         "Definite": "Def",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Poss": "Yes",
     },
@@ -880,7 +880,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Nom",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Person": "2",
     },
@@ -1223,9 +1223,9 @@ TAG_MAP = {
     "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
         POS: PRON,
         "Case": "Nom",
-        "Number": "Sing",
+        # "Number": "Sing",
         "Number": "Plur",
-        "Person": "3",
+        # "Person": "3",
         "Person": "1",
         "PronType": "Emp",
     },

View File

@@ -55,7 +55,7 @@ _num_words = [
     "തൊണ്ണൂറ് ",
     "നുറ് ",
     "ആയിരം ",
-    "പത്തുലക്ഷം"
+    "പത്തുലക്ഷം",
 ]

View File

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 STOP_WORDS = set(
     """
 അത
 ഇത

View File

@@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
 from ...lookups import Lookups

View File

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
-from ...errors import Errors
 
 
 class PolishLemmatizer(Lemmatizer):

View File

@@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 _quotes = CONCAT_QUOTES.replace("'", "")
 
-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+    r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES
 
 _infixes = (
     LIST_ELLIPSES

View File

@@ -40,7 +40,7 @@ _num_words = [
     "miljard",
     "biljon",
     "biljard",
-    "kvadriljon"
+    "kvadriljon",
 ]

View File

@@ -38,7 +38,6 @@ TAG_MAP = {
     "NNPC": {POS: PROPN},
     "NNC": {POS: NOUN},
     "PSP": {POS: ADP},
     ".": {POS: PUNCT},
     ",": {POS: PUNCT},
     "-LRB-": {POS: PUNCT},

View File

@@ -79,7 +79,9 @@ class BaseDefaults(object):
             lookups=lookups,
         )
         vocab.lex_attr_getters[NORM] = util.add_lookups(
-            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+            BASE_NORMS,
+            vocab.lookups.get_table("lexeme_norm"),
         )
         for tag_str, exc in cls.morph_rules.items():
             for orth_str, attrs in exc.items():
@@ -974,7 +976,9 @@ class Language(object):
         serializers = OrderedDict()
         serializers["vocab"] = lambda: self.vocab.to_bytes()
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
-        serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+        serializers["meta.json"] = lambda: srsly.json_dumps(
+            OrderedDict(sorted(self.meta.items()))
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue

View File

@@ -112,6 +112,7 @@ def ga_tokenizer():
 def gu_tokenizer():
     return get_lang_class("gu").Defaults.create_tokenizer()
 
+
 @pytest.fixture(scope="session")
 def he_tokenizer():
     return get_lang_class("he").Defaults.create_tokenizer()
@@ -246,7 +247,9 @@ def yo_tokenizer():
 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"use_jieba": False, "use_pkuseg": False}
+    )
 
 
 @pytest.fixture(scope="session")
@@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+    )
 
 
 @pytest.fixture(scope="session")

View File

@@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # partial whitespace in words
     words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
@@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # non-standard whitespace tokens
     words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
@@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # mismatch between words and text
     with pytest.raises(ValueError):

View File

@@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
+
 def test_is_sent_end(en_tokenizer):
     doc = en_tokenizer("This is a sentence. This is another.")
     assert doc[4].is_sent_end is None
@@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
     assert doc[1].is_sent_start is None
     assert not doc.is_sentenced
 
+
 def test_tokenlast_has_sent_end_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_end is None

View File

@@ -5,9 +5,9 @@ import pytest
 def test_noun_chunks_is_parsed_de(de_tokenizer):
     """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = de_tokenizer("Er lag auf seinem")

View File

@@ -5,9 +5,9 @@ import pytest
 def test_noun_chunks_is_parsed_el(el_tokenizer):
     """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")

View File

@@ -13,9 +13,9 @@ from ...util import get_doc
 def test_noun_chunks_is_parsed(en_tokenizer):
     """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = en_tokenizer("This is a sentence")

View File

@@ -5,9 +5,9 @@ import pytest
 def test_noun_chunks_is_parsed_es(es_tokenizer):
     """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = es_tokenizer("en Oxford este verano")

View File

@@ -62,4 +62,4 @@ def test_lex_attrs_like_number(es_tokenizer, text, match):
 @pytest.mark.parametrize("word", ["once"])
 def test_es_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())

View File

@@ -5,9 +5,9 @@ import pytest
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
     """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = fr_tokenizer("trouver des travaux antérieurs")

View File

@@ -3,17 +3,16 @@ from __future__ import unicode_literals
 import pytest
 
 
 def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
     text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
     tokens = gu_tokenizer(text)
     assert len(tokens) == 9
 
 
 @pytest.mark.parametrize(
     "text,length",
-    [
-        ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
-        ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
-    ],
+    [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
 )
 def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
     tokens = gu_tokenizer(text)

View File

@@ -5,9 +5,9 @@ import pytest
 def test_noun_chunks_is_parsed_id(id_tokenizer):
     """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = id_tokenizer("sebelas")

View File

@@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+            10,
+        ),
+        ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+    ],
+)
 def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
     tokens = ml_tokenizer(text)
     assert len(tokens) == length

View File

@@ -5,9 +5,9 @@ import pytest
 def test_noun_chunks_is_parsed_nb(nb_tokenizer):
     """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = nb_tokenizer("Smørsausen brukes bl.a. til")

View File

@@ -7,9 +7,9 @@ from ...util import get_doc
 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
     """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
     To check this test, we're constructing a Doc
     with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = sv_tokenizer("Studenten läste den bästa boken")

View File

@@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+    nlp = Chinese(
+        meta={
+            "tokenizer": {
+                "config": {
+                    "use_jieba": False,
+                    "use_pkuseg": True,
+                    "pkuseg_model": "medicine",
+                }
+            }
+        }
+    )
     zh_tokenizer_serialize(nlp.tokenizer)

View File

@@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
 def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
-    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    updated_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(user_dict) == len(updated_user_dict) - 1
 
     # reset user dict
     zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
-    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    reset_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(reset_user_dict) == 0

View File

@@ -265,15 +265,15 @@ def test_matcher_regex_shape(en_vocab):
 @pytest.mark.parametrize(
     "cmp, bad",
     [
         ("==", ["a", "aaa"]),
         ("!=", ["aa"]),
         (">=", ["a"]),
         ("<=", ["aaa"]),
         (">", ["a", "aa"]),
-        ("<", ["aa", "aaa"])
-    ]
+        ("<", ["aa", "aaa"]),
+    ],
 )
 def test_matcher_compare_length(en_vocab, cmp, bad):
     matcher = Matcher(en_vocab)

View File

@@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
         ),
     ],
 )
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+    en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=punct_chars)
     doc = sentencizer(doc)

View File

@@ -37,7 +37,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     assert vocab1.to_bytes() == vocab1_b
     new_vocab1 = Vocab().from_bytes(vocab1_b)
     assert new_vocab1.to_bytes() == vocab1_b
     assert len(new_vocab1.strings) == len(strings1) + 1  # adds _SP
     assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
@@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
     assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
     assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
     if strings1 == strings2:
-        assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+        assert [s for s in vocab1_d.strings if s != "_SP"] == [
+            s for s in vocab2_d.strings if s != "_SP"
+        ]
     else:
-        assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+        assert [s for s in vocab1_d.strings if s != "_SP"] != [
+            s for s in vocab2_d.strings if s != "_SP"
+        ]
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -76,9 +80,8 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
 def test_deserialize_vocab_seen_entries(strings, lex_attr):
     # Reported in #2153
     vocab = Vocab(strings=strings)
-    length = len(vocab)
     vocab.from_bytes(vocab.to_bytes())
     assert len(vocab.strings) == len(strings) + 1  # adds _SP
@@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
     else:
         assert list(sstore1_d) != list(sstore2_d)
 
+
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_pickle_vocab(strings, lex_attr):
     vocab = Vocab(strings=strings)

View File

@@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the ₹54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )
@@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the $54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
            "entities": [(16, 19, "MONEY")],
         },
     )

View File

@@ -366,6 +366,7 @@ def test_vectors_serialize():
     assert row == row_r
     assert_equal(v.data, v_r.data)
 
+
 def test_vector_is_oov():
     vocab = Vocab(vectors_name="test_vocab_is_oov")
     data = numpy.ndarray((5, 3), dtype="f")
@@ -375,4 +376,4 @@ def test_vector_is_oov():
     vocab.set_vector("dog", data[1])
     assert vocab["cat"].is_oov is True
     assert vocab["dog"].is_oov is True
     assert vocab["hamster"].is_oov is False

View File

@@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))
         if word_start > 0:
-            text_words.append(text[text_pos:text_pos+word_start])
+            text_words.append(text[text_pos : text_pos + word_start])
             text_spaces.append(False)
             text_pos += word_start
         text_words.append(word)