Tidy up and auto-format

Ines Montani 2020-06-21 22:38:04 +02:00
parent f77e0bc028
commit ef5f548fb0
10 changed files with 39 additions and 47 deletions


@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc.
for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]:
for word in [
"who",
"what",
"when",
"where",
"why",
"how",
"there",
"that",
"this",
"these",
"those",
]:
for orth in [word, word.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word, NORM: word},
@@ -396,14 +408,8 @@ _other_exc = {
{ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
],
"c'mon": [
{ORTH: "c'm", NORM: "come", LEMMA: "come"},
{ORTH: "on"}
],
"C'mon": [
{ORTH: "C'm", NORM: "come", LEMMA: "come"},
{ORTH: "on"}
]
"c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
"C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
}
_exc.update(_other_exc)
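
The two hunks above only reflow the English tokenizer exceptions; the entries themselves are unchanged. As a rough illustration of what these special cases do at runtime (a sketch assuming a blank spaCy English pipeline, not part of this commit), contractions such as "C'mon" and "who's" are still split according to the tables:

from spacy.lang.en import English

nlp = English()  # blank pipeline: tokenizer only, no trained models needed
doc = nlp("C'mon, who's there?")
print([t.text for t in doc])
# expected, per the exceptions above: ["C'm", "on", ",", "who", "'s", "there", "?"]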


@@ -24,7 +24,7 @@ def noun_chunks(obj):
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
for i, word in enumerate(obj):
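
The only change above drops an unused local variable: doc.vocab.strings.add("conj") is still called, so the string stays interned in the vocab, but the unused conj name (the kind of thing flake8 flags) goes away. For context, a hedged usage sketch of noun_chunks (assumes a pipeline with a dependency parser such as en_core_web_sm, which is not part of this diff):

import spacy

nlp = spacy.load("en_core_web_sm")  # any pipeline with a parser provides doc.noun_chunks
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
print([chunk.text for chunk in doc.noun_chunks])
# e.g. ['Autonomous cars', 'insurance liability', 'manufacturers']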


@@ -1,21 +1,15 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
from ...symbols import ADJ, AUX, NOUN, PART, VERB
# mapping from tag bi-gram to pos of previous token
TAG_BIGRAM_MAP = {
# This covers only small part of AUX.
("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
# ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
# This covers acl, advcl, obl and root, but has side effect for compound.
("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
# This covers almost all of the deps
("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
("副詞", "動詞-非自立可能"): (None, VERB),
("形容詞-一般", "動詞-非自立可能"): (None, VERB),
@@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = {
("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
("補助記号-読点", "動詞-非自立可能"): (None, VERB),
("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
("動詞-一般", "助詞-副助詞"): (None, PART),
("動詞-非自立可能", "助詞-副助詞"): (None, PART),
("助動詞", "助詞-副助詞"): (None, PART),


@@ -1,17 +1,9 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X
from ...symbols import DET, PART, PRON, SPACE, X
# mapping from tag bi-gram to pos of previous token
TAG_ORTH_MAP = {
"空白": {
" ": SPACE,
" ": X,
},
"助詞-副助詞": {
"たり": PART,
},
"空白": {" ": SPACE, " ": X},
"助詞-副助詞": {"たり": PART},
"連体詞": {
"あの": DET,
"かの": DET,


@@ -18,5 +18,5 @@ sentences = [
"ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
"தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
"நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
"லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்."
"லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
]


@@ -1,6 +1,6 @@
import re
from .char_classes import ALPHA_LOWER, ALPHA
from .char_classes import ALPHA_LOWER
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE


@@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
nlp = Japanese()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
assert nlp.tokenizer.split_mode == None
assert nlp.tokenizer.split_mode is None
with make_tempdir() as d:
file_path = d / "tokenizer"
@@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
nlp = Japanese()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
assert nlp.tokenizer.split_mode == None
assert nlp.tokenizer.split_mode is None
# split mode is (de)serialized correctly
nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
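
The substantive change in this test file is comparing split_mode with "is None" instead of "== None" (identity comparison, per PEP 8); the rest is context. A minimal round-trip sketch of the split-mode config shown above (assumes sudachipy and sudachidict_core are installed; not part of the diff):

from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
nlp_reloaded = Japanese()
nlp_reloaded.tokenizer.from_bytes(nlp.tokenizer.to_bytes())
assert nlp_reloaded.tokenizer.split_mode == "B"  # split mode survives serialization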


@@ -29,9 +29,8 @@ POS_TESTS = [
]
SENTENCE_TESTS = [
('あれ。これ。', ['あれ。', 'これ。']),
('「伝染るんです。」という漫画があります。',
['「伝染るんです。」という漫画があります。']),
("あれ。これ。", ["あれ。", "これ。"]),
("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
]
# fmt: on
@@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
sents = [str(sent) for sent in ja_tokenizer(text).sents]
assert sents == expected_sents
@@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
assert tokens.text_with_ws == text
@pytest.mark.parametrize("text,len_a,len_b,len_c",
@pytest.mark.parametrize(
"text,len_a,len_b,len_c",
[
("選挙管理委員会", 4, 3, 1),
("客室乗務員", 3, 2, 1),
("労働者協同組合", 4, 3, 1),
("機能性食品", 3, 2, 1),
]
],
)
def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
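
The parametrize reflow above does not change the expected values. For reference (again assuming the SudachiPy dictionaries are available; not part of the diff), the three Sudachi split modes segment the first test case into progressively coarser tokens:

from spacy.lang.ja import Japanese

for mode, expected_len in [("A", 4), ("B", 3), ("C", 1)]:
    nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": mode}}})
    assert len(nlp("選挙管理委員会")) == expected_len  # lengths from the test table above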


@@ -10,7 +10,13 @@ def test_build_dependencies():
"mock",
"flake8",
]
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
libs_ignore_setup = [
"fugashi",
"natto-py",
"pythainlp",
"sudachipy",
"sudachidict_core",
]
# check requirements.txt
req_dict = {}


@@ -1,7 +1,6 @@
import pytest
import os
import ctypes
import srsly
from pathlib import Path
from spacy.about import __version__ as spacy_version
from spacy import util
@@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from .util import make_tempdir
@pytest.fixture
def is_admin():