Mirror of https://github.com/explosion/spaCy.git

Commit: ef5f548fb0 ("Tidy up and auto-format")
Parent: f77e0bc028
@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:

 # W-words, relative pronouns, prepositions etc.

-for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]:
+for word in [
+    "who",
+    "what",
+    "when",
+    "where",
+    "why",
+    "how",
+    "there",
+    "that",
+    "this",
+    "these",
+    "those",
+]:
     for orth in [word, word.title()]:
         _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
@@ -396,14 +408,8 @@ _other_exc = {
         {ORTH: "Let", LEMMA: "let", NORM: "let"},
         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
     ],
-    "c'mon": [
-        {ORTH: "c'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ],
-    "C'mon": [
-        {ORTH: "C'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ]
+    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
+    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 }

 _exc.update(_other_exc)
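The entries above follow spaCy's tokenizer-exception pattern: an exact surface form maps to a list of per-token attribute dicts. A minimal, self-contained sketch of that pattern follows (the ORTH/LEMMA/NORM string stand-ins and the final print are illustrative only, not spaCy's actual symbols or module):

# Stand-ins for spacy.symbols.ORTH / LEMMA / NORM, used only in this sketch.
ORTH, LEMMA, NORM = "orth", "lemma", "norm"

_exc = {}
for word in ["who", "what", "when"]:
    for orth in [word, word.title()]:
        # Split e.g. "Who's" into "Who" + "'s", keeping the lowercase lemma/norm.
        _exc[orth + "'s"] = [
            {ORTH: orth, LEMMA: word, NORM: word},
            {ORTH: "'s"},
        ]

print(_exc["Who's"])
# [{'orth': 'Who', 'lemma': 'who', 'norm': 'who'}, {'orth': "'s"}]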
@@ -24,7 +24,7 @@ def noun_chunks(obj):
     doc = obj.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
     for i, word in enumerate(obj):
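In the hunk above the assignment to conj is dropped, presumably because the variable is unused in this particular iterator, while the bare doc.vocab.strings.add("conj") call still interns the string in the vocab's StringStore. A short sketch of that behaviour, assuming spaCy is installed (StringStore.add returns the string's 64-bit hash):

from spacy.strings import StringStore

strings = StringStore()
conj_hash = strings.add("conj")  # interns "conj" and returns its hash
strings.add("conj")              # same effect when the return value is unused
assert strings["conj"] == conj_hash
assert strings[conj_hash] == "conj"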
@@ -1,21 +1,15 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
+from ...symbols import ADJ, AUX, NOUN, PART, VERB

 # mapping from tag bi-gram to pos of previous token
 TAG_BIGRAM_MAP = {
     # This covers only small part of AUX.
     ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
-
     ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
     # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
-
     # This covers acl, advcl, obl and root, but has side effect for compound.
     ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
     # This covers almost all of the deps
     ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
-
     ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
     ("副詞", "動詞-非自立可能"): (None, VERB),
     ("形容詞-一般", "動詞-非自立可能"): (None, VERB),
@@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = {
     ("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
     ("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
     ("補助記号-読点", "動詞-非自立可能"): (None, VERB),
-
     ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
-
     ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
     ("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
-
     ("動詞-一般", "助詞-副助詞"): (None, PART),
     ("動詞-非自立可能", "助詞-副助詞"): (None, PART),
     ("助動詞", "助詞-副助詞"): (None, PART),
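For context, a hedged, self-contained sketch of how a (tag, next_tag) bigram map like the one above can be consulted: the mapped pair optionally overrides the POS of the current and the following token. The resolve_pos helper and the string stand-ins are illustrative, not spaCy's implementation:

# Stand-ins for spacy.symbols; the real map uses symbol IDs.
AUX, VERB = "AUX", "VERB"

TAG_BIGRAM_MAP = {
    # verbal noun followed by a light verb: treat the pair as VERB + AUX
    ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
}

def resolve_pos(tag, next_tag, default):
    current_override, _next_override = TAG_BIGRAM_MAP.get((tag, next_tag), (None, None))
    return current_override if current_override is not None else default

print(resolve_pos("名詞-普通名詞-サ変可能", "動詞-非自立可能", "NOUN"))  # VERB
print(resolve_pos("副詞", "名詞-一般", "ADV"))  # ADV (no override in this sketch)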
@@ -1,17 +1,9 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X
+from ...symbols import DET, PART, PRON, SPACE, X

 # mapping from tag bi-gram to pos of previous token
 TAG_ORTH_MAP = {
-    "空白": {
-        " ": SPACE,
-        "　": X,
-    },
-    "助詞-副助詞": {
-        "たり": PART,
-    },
+    "空白": {" ": SPACE, "　": X},
+    "助詞-副助詞": {"たり": PART},
     "連体詞": {
         "あの": DET,
         "かの": DET,
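Similarly, a hedged sketch of an orth-keyed override: the outer key is the fine-grained tag, the inner key the exact surface form. The lookup helper below is illustrative rather than spaCy's code:

DET, PART, SPACE, X = "DET", "PART", "SPACE", "X"  # stand-ins for spacy.symbols

TAG_ORTH_MAP = {
    "空白": {" ": SPACE, "　": X},
    "助詞-副助詞": {"たり": PART},
    "連体詞": {"あの": DET},
}

def resolve_pos(orth, tag, default):
    return TAG_ORTH_MAP.get(tag, {}).get(orth, default)

print(resolve_pos("あの", "連体詞", "ADJ"))       # DET
print(resolve_pos("たり", "助詞-副助詞", "ADP"))  # PART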
@@ -18,5 +18,5 @@ sentences = [
     "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
     "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
     "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
-    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்."
+    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
 ]
@@ -1,6 +1,6 @@
 import re

-from .char_classes import ALPHA_LOWER, ALPHA
+from .char_classes import ALPHA_LOWER
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
@@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
     nlp = Japanese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-    assert nlp.tokenizer.split_mode == None
+    assert nlp.tokenizer.split_mode is None

     with make_tempdir() as d:
         file_path = d / "tokenizer"
@@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp = Japanese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-        assert nlp.tokenizer.split_mode == None
+        assert nlp.tokenizer.split_mode is None

     # split mode is (de)serialized correctly
     nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
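The two "== None" to "is None" changes above match flake8's E711 check. A small hedged illustration of why identity comparison is preferred: "==" dispatches to the left operand's __eq__, which a class can override, while "is" always checks object identity:

class AlwaysEqual:
    def __eq__(self, other):
        return True  # pathological __eq__: claims equality with everything

obj = AlwaysEqual()
print(obj == None)  # True  -- misleading answer from the overridden __eq__
print(obj is None)  # False -- the identity check we actually want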
@@ -29,10 +29,9 @@ POS_TESTS = [
 ]

 SENTENCE_TESTS = [
-    ('あれ。これ。', ['あれ。', 'これ。']),
-    ('「伝染るんです。」という漫画があります。',
-     ['「伝染るんです。」という漫画があります。']),
-]
+    ("あれ。これ。", ["あれ。", "これ。"]),
+    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
+]
 # fmt: on
@@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags


-#XXX This isn't working? Always passes
+# XXX This isn't working? Always passes
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
@@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):

 @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
-def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
     assert sents == expected_sents
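The rename above matters because the module previously defined two functions named test_ja_tokenizer_pos; in Python a later definition silently replaces the earlier one (flake8 flags this as F811), so pytest would only ever collect the second. A minimal illustration:

def greet():
    return "first"

def greet():  # F811: redefinition, the first greet() is no longer reachable
    return "second"

print(greet())  # "second"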
@@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
     assert tokens.text_with_ws == text


-@pytest.mark.parametrize("text,len_a,len_b,len_c",
+@pytest.mark.parametrize(
+    "text,len_a,len_b,len_c",
     [
         ("選挙管理委員会", 4, 3, 1),
         ("客室乗務員", 3, 2, 1),
         ("労働者協同組合", 4, 3, 1),
         ("機能性食品", 3, 2, 1),
-    ]
+    ],
 )
 def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
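The reformatted decorator above follows the shape black produces for long pytest.mark.parametrize calls: the arguments are exploded onto their own lines, and the trailing comma keeps them exploded on future formatting runs. A hedged, generic sketch of the same shape (the test itself is a placeholder, not the spaCy test):

import pytest

@pytest.mark.parametrize(
    "text,expected_len",
    [
        ("ab", 2),
        ("hello", 5),
    ],
)
def test_text_length(text, expected_len):
    assert len(text) == expected_len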
@@ -10,7 +10,13 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
-    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
+    libs_ignore_setup = [
+        "fugashi",
+        "natto-py",
+        "pythainlp",
+        "sudachipy",
+        "sudachidict_core",
+    ]

     # check requirements.txt
     req_dict = {}
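For context, a hedged sketch (not the actual spaCy test) of the kind of bookkeeping a dependency check like this does: build req_dict from requirement lines while skipping the ignored libraries. The sample requirement strings are illustrative only:

libs_ignore = {"fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"}

req_dict = {}
for line in ["spacy>=2.3.0", "fugashi>=0.1.3", "pytest>=4.6.5"]:
    lib, _, version = line.partition(">=")
    if lib not in libs_ignore:
        req_dict[lib] = version

print(req_dict)  # {'spacy': '2.3.0', 'pytest': '4.6.5'}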
@@ -1,7 +1,6 @@
 import pytest
 import os
 import ctypes
-import srsly
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
@@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding

-from .util import make_tempdir
-

 @pytest.fixture
 def is_admin():