Tidy up and auto-format

Ines Montani 2020-06-21 22:38:04 +02:00
parent f77e0bc028
commit ef5f548fb0
10 changed files with 39 additions and 47 deletions

View File

@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:
 # W-words, relative pronouns, prepositions etc.
-for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]:
+for word in [
+    "who",
+    "what",
+    "when",
+    "where",
+    "why",
+    "how",
+    "there",
+    "that",
+    "this",
+    "these",
+    "those",
+]:
     for orth in [word, word.title()]:
         _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
@@ -396,14 +408,8 @@ _other_exc = {
         {ORTH: "Let", LEMMA: "let", NORM: "let"},
         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
     ],
-    "c'mon": [
-        {ORTH: "c'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ],
-    "C'mon": [
-        {ORTH: "C'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ]
+    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
+    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 }

 _exc.update(_other_exc)
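Note: the reformatted entries are behaviourally identical; each key in _exc still maps a surface form to the sub-tokens it is split into. A quick way to sanity-check this, sketched under the assumption of a spaCy v2.x install from around this commit:

    from spacy.lang.en import English

    nlp = English()  # blank pipeline; the tokenizer exceptions are applied by the tokenizer
    doc = nlp("C'mon, who's there?")
    # "C'mon" is split into "C'm" + "on" by the exception above,
    # and "who's" into "who" + "'s" via the W-word loop.
    print([token.text for token in doc])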

View File

@@ -24,7 +24,7 @@ def noun_chunks(obj):
     doc = obj.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
     for i, word in enumerate(obj):
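Dropping the unused conj binding silences the linter while keeping the side effect: StringStore.add interns the string in the vocab and returns its 64-bit hash. A minimal sketch of that behaviour, assuming the spaCy v2.x StringStore API:

    from spacy.strings import StringStore

    strings = StringStore()
    conj_hash = strings.add("conj")       # returns the hash of the interned string
    assert strings[conj_hash] == "conj"   # the hash resolves back to the text
    assert "conj" in strings              # the string is now registered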

View File

@@ -1,21 +1,15 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
+from ...symbols import ADJ, AUX, NOUN, PART, VERB

 # mapping from tag bi-gram to pos of previous token
 TAG_BIGRAM_MAP = {
     # This covers only small part of AUX.
     ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
-
     ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
     # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
-
     # This covers acl, advcl, obl and root, but has side effect for compound.
     ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
     # This covers almost all of the deps
     ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
-
     ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
     ("副詞", "動詞-非自立可能"): (None, VERB),
     ("形容詞-一般", "動詞-非自立可能"): (None, VERB),
@@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = {
     ("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
     ("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
     ("補助記号-読点", "動詞-非自立可能"): (None, VERB),
-
     ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
-
     ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
     ("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
-
     ("動詞-一般", "助詞-副助詞"): (None, PART),
     ("動詞-非自立可能", "助詞-副助詞"): (None, PART),
     ("助動詞", "助詞-副助詞"): (None, PART),

View File

@@ -1,17 +1,9 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X
+from ...symbols import DET, PART, PRON, SPACE, X

 # mapping from tag bi-gram to pos of previous token
 TAG_ORTH_MAP = {
-    "空白": {
-        " ": SPACE,
-        "　": X,
-    },
-    "助詞-副助詞": {
-        "たり": PART,
-    },
+    "空白": {" ": SPACE, "　": X},
+    "助詞-副助詞": {"たり": PART},
     "連体詞": {
         "あの": DET,
         "かの": DET,

View File

@@ -18,5 +18,5 @@ sentences = [
     "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
     "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
     "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
-    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்."
+    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
 ]

View File

@@ -1,6 +1,6 @@
 import re
-from .char_classes import ALPHA_LOWER, ALPHA
+from .char_classes import ALPHA_LOWER
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE

View File

@@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
     nlp = Japanese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-    assert nlp.tokenizer.split_mode == None
+    assert nlp.tokenizer.split_mode is None

     with make_tempdir() as d:
         file_path = d / "tokenizer"
@@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp = Japanese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-        assert nlp.tokenizer.split_mode == None
+        assert nlp.tokenizer.split_mode is None

     # split mode is (de)serialized correctly
     nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
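x is None is the idiomatic form here (PEP 8; flake8 flags == None as E711): equality dispatches to __eq__, which a class may override, whereas an identity check cannot be fooled. A small self-contained illustration:

    class AlwaysEqual:
        def __eq__(self, other):
            return True  # pathological __eq__ that claims equality with everything

    obj = AlwaysEqual()
    print(obj == None)  # True  -- the overridden __eq__ answers the comparison
    print(obj is None)  # False -- identity is unaffected by __eq__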

View File

@@ -29,10 +29,9 @@ POS_TESTS = [
 ]

 SENTENCE_TESTS = [
-    ('あれ。これ。', ['あれ。', 'これ。']),
-    ('「伝染るんです。」という漫画があります。',
-     ['「伝染るんです。」という漫画があります。']),
+    ("あれ。これ。", ["あれ。", "これ。"]),
+    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
 ]
 # fmt: on
@@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags


-#XXX This isn't working? Always passes
+# XXX This isn't working? Always passes
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
@@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
 @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
-def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
     assert sents == expected_sents
@@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
     assert tokens.text_with_ws == text


-@pytest.mark.parametrize("text,len_a,len_b,len_c",
+@pytest.mark.parametrize(
+    "text,len_a,len_b,len_c",
     [
         ("選挙管理委員会", 4, 3, 1),
         ("客室乗務員", 3, 2, 1),
         ("労働者協同組合", 4, 3, 1),
         ("機能性食品", 3, 2, 1),
-    ]
+    ],
 )
 def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
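The rename to test_ja_tokenizer_sents is more than cosmetic: two module-level functions with the same name mean the later def silently replaces the earlier one, so before this change pytest could only ever collect the second (skipped) test_ja_tokenizer_pos and the POS assertions never ran, which may be related to the "Always passes" note above. A minimal illustration of the shadowing:

    ran = []

    def test_example():
        ran.append("first")

    def test_example():  # same name: this definition replaces the one above
        ran.append("second")

    test_example()
    print(ran)  # ['second'] -- only the later definition exists to be collected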

View File

@@ -10,7 +10,13 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
-    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
+    libs_ignore_setup = [
+        "fugashi",
+        "natto-py",
+        "pythainlp",
+        "sudachipy",
+        "sudachidict_core",
+    ]

     # check requirements.txt
     req_dict = {}

View File

@@ -1,7 +1,6 @@
 import pytest
 import os
 import ctypes
-import srsly
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
@@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from .util import make_tempdir
-


 @pytest.fixture
 def is_admin():