Tidy up and auto-format

Ines Montani 2020-06-21 22:38:04 +02:00
parent f77e0bc028
commit ef5f548fb0
10 changed files with 39 additions and 47 deletions

View File

@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:
 # W-words, relative pronouns, prepositions etc.
-for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]:
+for word in [
+    "who",
+    "what",
+    "when",
+    "where",
+    "why",
+    "how",
+    "there",
+    "that",
+    "this",
+    "these",
+    "those",
+]:
     for orth in [word, word.title()]:
         _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
@@ -396,14 +408,8 @@ _other_exc = {
         {ORTH: "Let", LEMMA: "let", NORM: "let"},
         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
     ],
-    "c'mon": [
-        {ORTH: "c'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ],
-    "C'mon": [
-        {ORTH: "C'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ]
+    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
+    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 }

 _exc.update(_other_exc)
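Note: the reformatted entries are behaviourally identical; each key in _exc still maps a surface form to the sub-tokens it is split into. A quick way to sanity-check this, sketched under the assumption of a spaCy v2.x install from around this commit:

    from spacy.lang.en import English

    nlp = English()  # blank pipeline; the tokenizer exceptions are applied by the tokenizer
    doc = nlp("C'mon, who's there?")
    # "C'mon" is split into "C'm" + "on" by the exception above,
    # and "who's" into "who" + "'s" via the W-word loop.
    print([token.text for token in doc])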

View File

@@ -24,7 +24,7 @@ def noun_chunks(obj):
     doc = obj.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
     for i, word in enumerate(obj):
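Dropping the unused conj binding silences the linter while keeping the side effect: StringStore.add interns the string in the vocab and returns its 64-bit hash. A minimal sketch of that behaviour, assuming the spaCy v2.x StringStore API:

    from spacy.strings import StringStore

    strings = StringStore()
    conj_hash = strings.add("conj")       # returns the hash of the interned string
    assert strings[conj_hash] == "conj"   # the hash resolves back to the text
    assert "conj" in strings              # the string is now registered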

View File

@@ -1,21 +1,15 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
+from ...symbols import ADJ, AUX, NOUN, PART, VERB

 # mapping from tag bi-gram to pos of previous token
 TAG_BIGRAM_MAP = {
     # This covers only small part of AUX.
     ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
-
     ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
     # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
-
     # This covers acl, advcl, obl and root, but has side effect for compound.
     ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
     # This covers almost all of the deps
     ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
-
     ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
     ("副詞", "動詞-非自立可能"): (None, VERB),
     ("形容詞-一般", "動詞-非自立可能"): (None, VERB),
@@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = {
     ("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
     ("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
     ("補助記号-読点", "動詞-非自立可能"): (None, VERB),
-
     ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
-
     ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
     ("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
-
     ("動詞-一般", "助詞-副助詞"): (None, PART),
     ("動詞-非自立可能", "助詞-副助詞"): (None, PART),
     ("助動詞", "助詞-副助詞"): (None, PART),

View File

@@ -1,17 +1,9 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X
+from ...symbols import DET, PART, PRON, SPACE, X

 # mapping from tag bi-gram to pos of previous token
 TAG_ORTH_MAP = {
-    "空白": {
-        " ": SPACE,
-        "　": X,
-    },
-    "助詞-副助詞": {
-        "たり": PART,
-    },
+    "空白": {" ": SPACE, "　": X},
+    "助詞-副助詞": {"たり": PART},
     "連体詞": {
         "あの": DET,
         "かの": DET,

View File

@@ -18,5 +18,5 @@ sentences = [
     "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
     "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
     "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
-    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்."
+    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
 ]

View File

@@ -1,6 +1,6 @@
 import re
-from .char_classes import ALPHA_LOWER, ALPHA
+from .char_classes import ALPHA_LOWER
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE

View File

@@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
     nlp = Japanese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-    assert nlp.tokenizer.split_mode == None
+    assert nlp.tokenizer.split_mode is None

     with make_tempdir() as d:
         file_path = d / "tokenizer"
@@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp = Japanese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-        assert nlp.tokenizer.split_mode == None
+        assert nlp.tokenizer.split_mode is None

     # split mode is (de)serialized correctly
     nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
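x is None is the idiomatic form here (PEP 8; flake8 flags == None as E711): equality dispatches to __eq__, which a class may override, whereas an identity check cannot be fooled. A small self-contained illustration:

    class AlwaysEqual:
        def __eq__(self, other):
            return True  # pathological __eq__ that claims equality with everything

    obj = AlwaysEqual()
    print(obj == None)  # True  -- the overridden __eq__ answers the comparison
    print(obj is None)  # False -- identity is unaffected by __eq__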

View File

@@ -29,10 +29,9 @@ POS_TESTS = [
 ]

 SENTENCE_TESTS = [
-    ('あれ。これ。', ['あれ。', 'これ。']),
-    ('「伝染るんです。」という漫画があります。',
-     ['「伝染るんです。」という漫画があります。']),
+    ("あれ。これ。", ["あれ。", "これ。"]),
+    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
 ]
 # fmt: on
@@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags


-#XXX This isn't working? Always passes
+# XXX This isn't working? Always passes
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
@@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
 @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
-def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
     assert sents == expected_sents
@@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
     assert tokens.text_with_ws == text


-@pytest.mark.parametrize("text,len_a,len_b,len_c",
+@pytest.mark.parametrize(
+    "text,len_a,len_b,len_c",
     [
         ("選挙管理委員会", 4, 3, 1),
         ("客室乗務員", 3, 2, 1),
         ("労働者協同組合", 4, 3, 1),
         ("機能性食品", 3, 2, 1),
-    ]
+    ],
 )
 def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
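The rename to test_ja_tokenizer_sents is more than cosmetic: two module-level functions with the same name mean the later def silently replaces the earlier one, so before this change pytest could only ever collect the second (skipped) test_ja_tokenizer_pos and the POS assertions never ran, which may be related to the "Always passes" note above. A minimal illustration of the shadowing:

    ran = []

    def test_example():
        ran.append("first")

    def test_example():  # same name: this definition replaces the one above
        ran.append("second")

    test_example()
    print(ran)  # ['second'] -- only the later definition exists to be collected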

View File

@@ -10,7 +10,13 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
-    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
+    libs_ignore_setup = [
+        "fugashi",
+        "natto-py",
+        "pythainlp",
+        "sudachipy",
+        "sudachidict_core",
+    ]

     # check requirements.txt
     req_dict = {}

View File

@@ -1,7 +1,6 @@
 import pytest
 import os
 import ctypes
-import srsly
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
@@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from .util import make_tempdir
-


 @pytest.fixture
 def is_admin():