Mirror of https://github.com/explosion/spaCy.git
Synced: 2025-11-04 01:48:04 +03:00

Commit bd6353715a: Merge branch 'master' into fix/travis-tests

Changed hunks:

@@ -187,12 +187,17 @@ def debug_data(
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
             "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors,
-                n_missing_vectors / gold_train_data["n_words"],
+                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
             ),
         )
         msg.text(
-            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+            "10 most common words without vectors: {}".format(
+                _format_labels(
+                    gold_train_data["words_missing_vectors"].most_common(10),
+                    counts=True,
+                )
+            ),
+            show=verbose,
         )
     else:
         msg.info("No word vectors present in the model")

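The msg.warn / msg.text calls in this hunk come from the wasabi console printer that spaCy's CLI uses. A minimal standalone sketch of the same call shapes, with made-up placeholder counts (wasabi installed is the only assumption):

from wasabi import Printer  # console printer library used by spaCy's CLI

msg = Printer()
# Placeholder numbers, only to illustrate the calls used in the hunk above.
n_missing_vectors, n_words = 120, 4000
msg.warn(
    "{} words in training data without vectors ({:0.2f}%)".format(
        n_missing_vectors, n_missing_vectors / n_words,
    ),
)
msg.text("10 most common words without vectors: ...", show=True)
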
@@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+    base_model=(
+        "Base model (for languages with custom tokenizers)",
+        "option",
+        "b",
+        str,
+    ),
 )
 def init_model(
     lang,

@@ -8,7 +8,7 @@ def add_codes(err_cls):
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
             msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)

@@ -116,6 +116,7 @@ class Warnings(object):
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
 
+
 @add_codes
 class Errors(object):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

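For context, the two hunks above touch the error-code mechanism: add_codes wraps the class so that attribute access returns the message prefixed with its code. A minimal self-contained sketch of that pattern; only the __getattribute__ body and the decorator usage are shown in the diff, the closing return of an instance is filled in by assumption:

def add_codes(err_cls):
    """Wrap an errors class so lookups return '[CODE] message'."""

    class ErrorsWithCodes(err_cls):
        def __getattribute__(self, code):
            msg = super().__getattribute__(code)
            if code.startswith("__"):  # python system attributes like __class__
                return msg
            return "[{code}] {msg}".format(code=code, msg=msg)

    # Returning an instance makes every attribute lookup go through __getattribute__.
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = "No component '{name}' found in pipeline. Available names: {opts}"


print(Errors.E001)
# [E001] No component '{name}' found in pipeline. Available names: {opts}
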
@@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc

@@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
 
         _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
-            {ORTH: "d", NORM: "'d"}
+            {ORTH: "d", NORM: "'d"},
         ]
 
         _exc[orth + "'d've"] = [

@@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 from ..char_classes import merge_chars
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
 
 _list_units = [u for u in LIST_UNITS if u != "%"]

@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 
 
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
 
 
 class ArmenianDefaults(Language.Defaults):

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
 STOP_WORDS = set(
     """
 նա

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
 TAG_MAP = {

@@ -716,7 +716,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Dat",
-        "Number": "Coll",
+        # "Number": "Coll",
         "Number": "Sing",
         "Person": "1",
     },

@@ -815,7 +815,7 @@ TAG_MAP = {
         "Animacy": "Nhum",
         "Case": "Nom",
         "Definite": "Def",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Poss": "Yes",
     },

@@ -880,7 +880,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Nom",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Person": "2",
     },

@@ -1223,9 +1223,9 @@ TAG_MAP = {
     "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
         POS: PRON,
         "Case": "Nom",
-        "Number": "Sing",
+        # "Number": "Sing",
         "Number": "Plur",
-        "Person": "3",
+        # "Person": "3",
         "Person": "1",
         "PronType": "Emp",
     },

@@ -55,7 +55,7 @@ _num_words = [
     "തൊണ്ണൂറ് ",
     "നുറ് ",
     "ആയിരം ",
-    "പത്തുലക്ഷം"
+    "പത്തുലക്ഷം",
 ]
 
 

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 
 STOP_WORDS = set(
-
     """
 അത്
 ഇത്

@@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
 from ...lookups import Lookups
 
 

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 from ...errors import Errors
 
 
 class PolishLemmatizer(Lemmatizer):

@@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 _quotes = CONCAT_QUOTES.replace("'", "")
 
-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+    r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES
 
 _infixes = (
     LIST_ELLIPSES

@@ -40,7 +40,7 @@ _num_words = [
     "miljard",
     "biljon",
     "biljard",
-    "kvadriljon"
+    "kvadriljon",
 ]
 
 

@@ -38,7 +38,6 @@ TAG_MAP = {
     "NNPC": {POS: PROPN},
     "NNC": {POS: NOUN},
     "PSP": {POS: ADP},
-
     ".": {POS: PUNCT},
     ",": {POS: PUNCT},
     "-LRB-": {POS: PUNCT},

@@ -79,7 +79,9 @@ class BaseDefaults(object):
             lookups=lookups,
         )
         vocab.lex_attr_getters[NORM] = util.add_lookups(
-            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+            BASE_NORMS,
+            vocab.lookups.get_table("lexeme_norm"),
         )
         for tag_str, exc in cls.morph_rules.items():
             for orth_str, attrs in exc.items():

@@ -974,7 +976,9 @@ class Language(object):
         serializers = OrderedDict()
         serializers["vocab"] = lambda: self.vocab.to_bytes()
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
-        serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+        serializers["meta.json"] = lambda: srsly.json_dumps(
+            OrderedDict(sorted(self.meta.items()))
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue

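The hunk above builds a registry of named zero-argument callables, each producing one serialized chunk, with srsly (Explosion's serialization helpers) supplying json_dumps. A rough standalone sketch of that idea, illustrative only and not spaCy's actual to_bytes implementation:

from collections import OrderedDict

import srsly  # serialization helpers: json_dumps, msgpack_dumps, ...

meta = {"lang": "en", "name": "demo", "version": "0.0.0"}

# Each entry maps a component name to a zero-argument callable returning bytes.
serializers = OrderedDict()
serializers["meta.json"] = lambda: srsly.json_dumps(
    OrderedDict(sorted(meta.items()))
).encode("utf8")
serializers["strings.json"] = lambda: srsly.json_dumps(["hello", "world"]).encode("utf8")

exclude = ["strings.json"]
blobs = OrderedDict(
    (name, getter()) for name, getter in serializers.items() if name not in exclude
)
# Bundle everything into a single byte string, e.g. with msgpack.
data = srsly.msgpack_dumps(blobs)
print(len(data), list(blobs.keys()))
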
@@ -112,6 +112,7 @@ def ga_tokenizer():
 def gu_tokenizer():
     return get_lang_class("gu").Defaults.create_tokenizer()
 
+
 @pytest.fixture(scope="session")
 def he_tokenizer():
     return get_lang_class("he").Defaults.create_tokenizer()

@@ -246,7 +247,9 @@ def yo_tokenizer():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"use_jieba": False, "use_pkuseg": False}
+    )
 
 
 @pytest.fixture(scope="session")

@@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+    )
 
 
 @pytest.fixture(scope="session")

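The pytest.importorskip call in the fixture above is standard pytest: it skips, rather than errors, every test that requests the fixture when the optional dependency is not installed. A minimal illustration of the pattern, with the module name taken from the hunk:

import pytest


@pytest.fixture(scope="session")
def optional_pkuseg():
    # Skips all dependent tests if pkuseg cannot be imported;
    # otherwise returns the imported module.
    return pytest.importorskip("pkuseg")
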
@@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # partial whitespace in words
     words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]

@@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # non-standard whitespace tokens
     words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]

@@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # mismatch between words and text
     with pytest.raises(ValueError):

@@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
+
 def test_is_sent_end(en_tokenizer):
     doc = en_tokenizer("This is a sentence. This is another.")
     assert doc[4].is_sent_end is None

@@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
     assert doc[1].is_sent_start is None
     assert not doc.is_sentenced
 
+
 def test_tokenlast_has_sent_end_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_end is None

@@ -3,17 +3,16 @@ from __future__ import unicode_literals
 
 import pytest
 
+
 def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
     text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
     tokens = gu_tokenizer(text)
     assert len(tokens) == 9
 
+
 @pytest.mark.parametrize(
     "text,length",
-    [
-        ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
-        ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
-    ],
+    [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
 )
 def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
     tokens = gu_tokenizer(text)

@@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+            10,
+        ),
+        ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+    ],
+)
 def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
     tokens = ml_tokenizer(text)
     assert len(tokens) == length

@@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
 
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+    nlp = Chinese(
+        meta={
+            "tokenizer": {
+                "config": {
+                    "use_jieba": False,
+                    "use_pkuseg": True,
+                    "pkuseg_model": "medicine",
+                }
+            }
+        }
+    )
     zh_tokenizer_serialize(nlp.tokenizer)

@@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
 def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
-    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    updated_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(user_dict) == len(updated_user_dict) - 1
 
     # reset user dict
     zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
-    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    reset_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(reset_user_dict) == 0
 
 

@@ -272,8 +272,8 @@ def test_matcher_regex_shape(en_vocab):
         (">=", ["a"]),
         ("<=", ["aaa"]),
         (">", ["a", "aa"]),
-        ("<", ["aa", "aaa"])
-    ]
+        ("<", ["aa", "aaa"]),
+    ],
 )
 def test_matcher_compare_length(en_vocab, cmp, bad):
     matcher = Matcher(en_vocab)

@@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
         ),
     ],
 )
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+    en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=punct_chars)
     doc = sentencizer(doc)

@@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
         assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
         assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
         if strings1 == strings2:
-            assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] == [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
         else:
-            assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] != [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

@@ -76,7 +80,6 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
 def test_deserialize_vocab_seen_entries(strings, lex_attr):
     # Reported in #2153
     vocab = Vocab(strings=strings)
-    length = len(vocab)
     vocab.from_bytes(vocab.to_bytes())
     assert len(vocab.strings) == len(strings) + 1  # adds _SP
 
 

@@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
         else:
             assert list(sstore1_d) != list(sstore2_d)
 
+
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_pickle_vocab(strings, lex_attr):
     vocab = Vocab(strings=strings)

@@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the ₹54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the $54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -366,6 +366,7 @@ def test_vectors_serialize():
     assert row == row_r
     assert_equal(v.data, v_r.data)
 
+
 def test_vector_is_oov():
     vocab = Vocab(vectors_name="test_vocab_is_oov")
     data = numpy.ndarray((5, 3), dtype="f")

@@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))
         if word_start > 0:
-            text_words.append(text[text_pos:text_pos+word_start])
+            text_words.append(text[text_pos : text_pos + word_start])
             text_spaces.append(False)
             text_pos += word_start
         text_words.append(word)

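The hunk above sits inside a helper that aligns a list of words with the original text, producing the parallel word/space lists that Doc(words=..., spaces=...) expects. A simplified standalone sketch of that alignment idea; this is not spaCy's exact function (the real get_words_and_spaces also validates its input and raises E194 on mismatch, as the hunk shows):

def words_and_spaces(words, text):
    """Simplified sketch: align words with text, returning (words, spaces)."""
    out_words, out_spaces = [], []
    pos = 0
    for word in words:
        if word.isspace():
            continue  # pure-whitespace tokens are recovered from the text itself
        start = text.index(word, pos)  # raises ValueError if the word isn't found
        if start > pos:
            out_words.append(text[pos:start])  # leftover whitespace becomes a token
            out_spaces.append(False)
        out_words.append(word)
        pos = start + len(word)
        if pos < len(text) and text[pos] == " ":
            out_spaces.append(True)  # a single trailing space folds into the token
            pos += 1
        else:
            out_spaces.append(False)
    if pos < len(text):
        out_words.append(text[pos:])
        out_spaces.append(False)
    return out_words, out_spaces


print(words_and_spaces(["hello", "world"], "hello  world"))
# (['hello', ' ', 'world'], [True, False, False])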