From ef5f548fb0b8f4737a41a838c0d1123752e12346 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 21 Jun 2020 22:38:04 +0200
Subject: [PATCH] Tidy up and auto-format

---
 spacy/lang/en/tokenizer_exceptions.py    | 24 +++++++++++++++---------
 spacy/lang/ja/syntax_iterators.py        |  2 +-
 spacy/lang/ja/tag_bigram_map.py          | 11 +----------
 spacy/lang/ja/tag_orth_map.py            | 14 +++-----------
 spacy/lang/ta/examples.py                |  2 +-
 spacy/lang/tokenizer_exceptions.py       |  2 +-
 spacy/tests/lang/ja/test_serialize.py    |  4 ++--
 spacy/tests/lang/ja/test_tokenizer.py    | 16 ++++++++--------
 spacy/tests/package/test_requirements.py |  8 +++++++-
 spacy/tests/test_misc.py                 |  3 ---
 10 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index e024dd483..dc8a5c04d 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:
 
 # W-words, relative pronouns, prepositions etc.
 
-for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]:
+for word in [
+    "who",
+    "what",
+    "when",
+    "where",
+    "why",
+    "how",
+    "there",
+    "that",
+    "this",
+    "these",
+    "those",
+]:
     for orth in [word, word.title()]:
         _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
@@ -396,14 +408,8 @@ _other_exc = {
         {ORTH: "Let", LEMMA: "let", NORM: "let"},
         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
     ],
-    "c'mon": [
-        {ORTH: "c'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ],
-    "C'mon": [
-        {ORTH: "C'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ]
+    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
+    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 }
 
 _exc.update(_other_exc)
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index 3f6e4bfa3..bb0554cf9 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -24,7 +24,7 @@ def noun_chunks(obj):
     doc = obj.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
     for i, word in enumerate(obj):
diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py
index 5ed9aec89..9d15fc520 100644
--- a/spacy/lang/ja/tag_bigram_map.py
+++ b/spacy/lang/ja/tag_bigram_map.py
@@ -1,21 +1,15 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
+from ...symbols import ADJ, AUX, NOUN, PART, VERB
 
 # mapping from tag bi-gram to pos of previous token
 TAG_BIGRAM_MAP = {
     # This covers only small part of AUX.
     ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
-
     ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
     # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
-
     # This covers acl, advcl, obl and root, but has side effect for compound.
     ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
     # This covers almost all of the deps
     ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
-
     ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
     ("副詞", "動詞-非自立可能"): (None, VERB),
     ("形容詞-一般", "動詞-非自立可能"): (None, VERB),
@@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = {
     ("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
     ("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
     ("補助記号-読点", "動詞-非自立可能"): (None, VERB),
-
     ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
-
     ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
     ("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
-
     ("動詞-一般", "助詞-副助詞"): (None, PART),
     ("動詞-非自立可能", "助詞-副助詞"): (None, PART),
     ("助動詞", "助詞-副助詞"): (None, PART),
diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py
index 355cc655b..9d32cdea7 100644
--- a/spacy/lang/ja/tag_orth_map.py
+++ b/spacy/lang/ja/tag_orth_map.py
@@ -1,17 +1,9 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X
+from ...symbols import DET, PART, PRON, SPACE, X
 
 # mapping from tag bi-gram to pos of previous token
 TAG_ORTH_MAP = {
-    "空白": {
-        " ": SPACE,
-        "　": X,
-    },
-    "助詞-副助詞": {
-        "たり": PART,
-    },
+    "空白": {" ": SPACE, "　": X},
+    "助詞-副助詞": {"たり": PART},
     "連体詞": {
         "あの": DET,
         "かの": DET,
diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py
index 245b8ba1a..c3c47e66e 100644
--- a/spacy/lang/ta/examples.py
+++ b/spacy/lang/ta/examples.py
@@ -18,5 +18,5 @@ sentences = [
     "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
     "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
     "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
-    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்."
+    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
 ]
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 28bc51228..f732a9097 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
 import re
 
-from .char_classes import ALPHA_LOWER, ALPHA
+from .char_classes import ALPHA_LOWER
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 
 
diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py
index 9e703e63d..4d4174b03 100644
--- a/spacy/tests/lang/ja/test_serialize.py
+++ b/spacy/tests/lang/ja/test_serialize.py
@@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
     nlp = Japanese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-    assert nlp.tokenizer.split_mode == None
+    assert nlp.tokenizer.split_mode is None
 
     with make_tempdir() as d:
         file_path = d / "tokenizer"
@@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp = Japanese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-        assert nlp.tokenizer.split_mode == None
+        assert nlp.tokenizer.split_mode is None
 
     # split mode is (de)serialized correctly
     nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index ee532cb81..f76a9067a 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -29,10 +29,9 @@ POS_TESTS = [
 ]
 
 SENTENCE_TESTS = [
-    ('あれ。これ。', ['あれ。', 'これ。']),
-    ('「伝染るんです。」という漫画があります。',
-     ['「伝染るんです。」という漫画があります。']),
-    ]
+    ("あれ。これ。", ["あれ。", "これ。"]),
+    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
+]
 # fmt: on
 
 
@@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags
 
 
-#XXX This isn't working? Always passes
+# XXX This isn't working? Always passes
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
@@ -57,7 +56,7 @@
 
 @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
-def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
     assert sents == expected_sents
 
@@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
     assert tokens.text_with_ws == text
 
 
-@pytest.mark.parametrize("text,len_a,len_b,len_c",
+@pytest.mark.parametrize(
+    "text,len_a,len_b,len_c",
     [
         ("選挙管理委員会", 4, 3, 1),
         ("客室乗務員", 3, 2, 1),
         ("労働者協同組合", 4, 3, 1),
         ("機能性食品", 3, 2, 1),
-    ]
+    ],
 )
 def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index a7c9a3ea4..6cc8fa6a8 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -10,7 +10,13 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
-    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
+    libs_ignore_setup = [
+        "fugashi",
+        "natto-py",
+        "pythainlp",
+        "sudachipy",
+        "sudachidict_core",
+    ]
 
     # check requirements.txt
     req_dict = {}
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 5f9e72f79..f6724f632 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,7 +1,6 @@
 import pytest
 import os
 import ctypes
-import srsly
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
@@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 
-from .util import make_tempdir
-
 
 @pytest.fixture
 def is_admin():
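
Note on the Japanese tokenizer changes above: the tests exercise SudachiPy's split mode ("A"/"B"/"C") through the Japanese language class and the split_mode attribute asserted in test_serialize.py. The sketch below is a minimal illustration assuming only what the tests show (the meta-config API of this spaCy development branch, with sudachipy and sudachidict_core installed); the expected token counts are taken from the test_ja_tokenizer_split_modes parametrize data, not verified independently.

    from spacy.lang.ja import Japanese

    # Split mode "A" yields the shortest units, "C" the longest; a tokenizer
    # created without a config keeps split_mode == None (see test_serialize.py).
    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})

    print(len(nlp_a("選挙管理委員会")))  # expected 4 tokens in mode A per the test data
    print(len(nlp_c("選挙管理委員会")))  # expected 1 token in mode C per the test data
    print(nlp_a.tokenizer.split_mode)   # the attribute the serialization test asserts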