Tidy up and auto-format
commit ef5f548fb0 (parent f77e0bc028)

@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:
 
 # W-words, relative pronouns, prepositions etc.
-for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]:
+for word in [
+    "who",
+    "what",
+    "when",
+    "where",
+    "why",
+    "how",
+    "there",
+    "that",
+    "this",
+    "these",
+    "those",
+]:
     for orth in [word, word.title()]:
         _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word, NORM: word},

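Context for this hunk (presumably spacy/lang/en/tokenizer_exceptions.py): the word list is only re-wrapped by the formatter, and the generated _exc entries still tell the rule-based tokenizer to split contractions like "who's" into "who" + "'s". A minimal sketch of the observable effect, assuming a blank English pipeline built from this language data:

# Sketch: tokenizer exceptions in action (assumes spaCy with the English data above).
from spacy.lang.en import English

nlp = English()                  # blank pipeline, rule-based tokenizer only
doc = nlp("Who's there?")
print([t.text for t in doc])     # expected: ['Who', "'s", 'there', '?']
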
@@ -396,14 +408,8 @@ _other_exc = {
         {ORTH: "Let", LEMMA: "let", NORM: "let"},
         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
     ],
-    "c'mon": [
-        {ORTH: "c'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ],
-    "C'mon": [
-        {ORTH: "C'm", NORM: "come", LEMMA: "come"},
-        {ORTH: "on"}
-    ]
+    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
+    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 }
 
 _exc.update(_other_exc)

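The "c'mon"/"C'mon" entries are just collapsed to one line each. For what it's worth, the same kind of exception can also be registered at runtime through the tokenizer's public API; a rough sketch (the "gimme" case is illustrative, not taken from this file):

# Sketch: registering a tokenizer special case at runtime.
from spacy.lang.en import English
from spacy.symbols import ORTH, NORM

nlp = English()
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])   # ['gim', 'me', 'that']
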
@@ -24,7 +24,7 @@ def noun_chunks(obj):
 
     doc = obj.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
     for i, word in enumerate(obj):

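The change in this noun_chunks syntax iterator removes an assignment flagged as unused (flake8 F841) while keeping the call itself, presumably for its side effect of interning "conj" in the vocab's string store. A small sketch of what StringStore.add does:

# Sketch: StringStore interning (spacy.strings.StringStore is public API).
from spacy.strings import StringStore

strings = StringStore()
key = strings.add("conj")       # interns the text, returns its 64-bit hash
assert strings[key] == "conj"   # the hash resolves back to the text
assert "conj" in strings
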
@@ -1,21 +1,15 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
+from ...symbols import ADJ, AUX, NOUN, PART, VERB
 
 # mapping from tag bi-gram to pos of previous token
 TAG_BIGRAM_MAP = {
     # This covers only small part of AUX.
     ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
-
     ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
     # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
-
     # This covers acl, advcl, obl and root, but has side effect for compound.
     ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
     # This covers almost all of the deps
     ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
-
     ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
     ("副詞", "動詞-非自立可能"): (None, VERB),
     ("形容詞-一般", "動詞-非自立可能"): (None, VERB),

@@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = {
     ("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
     ("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
     ("補助記号-読点", "動詞-非自立可能"): (None, VERB),
-
     ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
-
     ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
     ("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
-
     ("動詞-一般", "助詞-副助詞"): (None, PART),
     ("動詞-非自立可能", "助詞-副助詞"): (None, PART),
     ("助動詞", "助詞-副助詞"): (None, PART),

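These two hunks only drop the Python 2 header, the unused POS import, and the blank lines inside the dict (this appears to be spacy/lang/ja/tag_bigram_map.py). For readers skimming the diff: the table maps a (previous tag, current tag) bigram of UniDic-style tags to POS overrides for the previous and/or current token. A hypothetical lookup helper, only to illustrate the shape of the data; the function is not spaCy's actual implementation:

# Hypothetical helper showing how a bigram table of this shape could be used.
from spacy.symbols import AUX, VERB

TAG_BIGRAM_MAP = {
    ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
}

def apply_bigram(prev_tag, tag, prev_pos, pos):
    # Override the POS of the previous and/or current token when the bigram matches.
    prev_override, override = TAG_BIGRAM_MAP.get((prev_tag, tag), (None, None))
    return (prev_override or prev_pos, override or pos)

print(apply_bigram("名詞-普通名詞-サ変可能", "動詞-非自立可能", None, None))  # (VERB, AUX) symbol ids
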
@@ -1,17 +1,9 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X
+from ...symbols import DET, PART, PRON, SPACE, X
 
 # mapping from tag bi-gram to pos of previous token
 TAG_ORTH_MAP = {
-    "空白": {
-        " ": SPACE,
-        "　": X,
-    },
-    "助詞-副助詞": {
-        "たり": PART,
-    },
+    "空白": {" ": SPACE, "　": X},
+    "助詞-副助詞": {"たり": PART},
     "連体詞": {
         "あの": DET,
         "かの": DET,

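The same kind of tidy-up for what appears to be spacy/lang/ja/tag_orth_map.py: the nested one-item dicts are collapsed and unused imports dropped. The table resolves POS by a token's exact orth for a given tag; an equivalent lookup, purely for illustration:

# Illustrative lookup against a TAG_ORTH_MAP-shaped table (not spaCy's own code).
from spacy.symbols import DET, PART

TAG_ORTH_MAP = {"助詞-副助詞": {"たり": PART}, "連体詞": {"あの": DET, "かの": DET}}

def pos_by_orth(tag, orth, default=None):
    return TAG_ORTH_MAP.get(tag, {}).get(orth, default)

assert pos_by_orth("連体詞", "あの") == DET
assert pos_by_orth("名詞", "犬") is None
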
@@ -18,5 +18,5 @@ sentences = [
     "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
     "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
     "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
-    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்."
+    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
 ]

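Only a trailing comma is added to the last example sentence (this looks like spacy/lang/ta/examples.py). For reference, such example sentences can be imported for quick smoke tests, assuming the upstream module layout:

# Sketch: using a language's bundled example sentences (module path assumed).
from spacy.lang.ta import Tamil
from spacy.lang.ta.examples import sentences

nlp = Tamil()
doc = nlp(sentences[0])
print([token.text for token in doc])
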
@@ -1,6 +1,6 @@
 import re
 
-from .char_classes import ALPHA_LOWER, ALPHA
+from .char_classes import ALPHA_LOWER
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 
 

@@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
     nlp = Japanese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-    assert nlp.tokenizer.split_mode == None
+    assert nlp.tokenizer.split_mode is None
 
     with make_tempdir() as d:
         file_path = d / "tokenizer"

@@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp = Japanese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()
-        assert nlp.tokenizer.split_mode == None
+        assert nlp.tokenizer.split_mode is None
 
     # split mode is (de)serialized correctly
     nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})

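Both serialization tests switch from == None to is None, the PEP 8 (and flake8 E711) form: None is a singleton, so an identity check is idiomatic and cannot be fooled by a custom __eq__. A contrived example of the difference:

class AlwaysEqual:
    def __eq__(self, other):
        return True          # claims equality with everything, including None

obj = AlwaysEqual()
print(obj == None)           # True  -- misleading (and flagged as E711)
print(obj is None)           # False -- unambiguous identity check
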
@@ -29,10 +29,9 @@ POS_TESTS = [
 ]
 
 SENTENCE_TESTS = [
-    ('あれ。これ。', ['あれ。', 'これ。']),
-    ('「伝染るんです。」という漫画があります。',
-     ['「伝染るんです。」という漫画があります。']),
+    ("あれ。これ。", ["あれ。", "これ。"]),
+    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
 ]
 # fmt: on
 
 

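Here the single-quoted strings become double-quoted and the two-line tuple is collapsed, matching the double-quote style used elsewhere in the repo. Note the # fmt: on marker at the end of the hunk: black honors # fmt: off / # fmt: on pairs and leaves the enclosed region untouched, which is how hand-aligned test tables keep their layout. A small illustration of the markers:

# fmt: off
ALIGNED_TABLE = [
    ("short",        1),
    ("a bit longer", 2),   # hand-aligned columns: black leaves this region alone
]
# fmt: on

greeting = 'hello'   # outside the markers black would rewrite this to "hello"
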
@@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags
 
 
-#XXX This isn't working? Always passes
+# XXX This isn't working? Always passes
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]

@@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
 
 @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
-def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
     assert sents == expected_sents
 

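This rename fixes an actual bug in the test module: two functions shared the name test_ja_tokenizer_pos, and since def simply rebinds the module-level name, pytest could only ever collect the later one. Renaming the sentence test to test_ja_tokenizer_sents makes both tests visible again. A standalone reproduction of the shadowing effect:

# test_shadowing_demo.py -- illustrative only, not part of this diff
def test_example():
    assert 1 + 1 == 2        # silently skipped: the def below rebinds the name

def test_example():          # flake8 flags this redefinition as F811
    assert "a".upper() == "A"

# pytest collects exactly one test from this file, not two.
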
@@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
     assert tokens.text_with_ws == text
 
 
-@pytest.mark.parametrize("text,len_a,len_b,len_c",
+@pytest.mark.parametrize(
+    "text,len_a,len_b,len_c",
     [
         ("選挙管理委員会", 4, 3, 1),
         ("客室乗務員", 3, 2, 1),
         ("労働者協同組合", 4, 3, 1),
         ("機能性食品", 3, 2, 1),
-    ]
+    ],
 )
 def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})

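The parametrize call is merely re-wrapped. The test itself exercises SudachiPy's split modes, where A yields the shortest units and C the longest (roughly named-entity-sized), which is why the expected token counts fall from len_a to len_c. A sketch of the comparison the test presumably makes, using the same meta-based config seen earlier in this diff:

# Sketch (requires SudachiPy and its dictionary, as the Japanese tokenizer does).
from spacy.lang.ja import Japanese

nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})

text = "選挙管理委員会"
print(len(nlp_a(text)), len(nlp_c(text)))   # 4 and 1 for this example, per the test data
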
@@ -10,7 +10,13 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
-    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
+    libs_ignore_setup = [
+        "fugashi",
+        "natto-py",
+        "pythainlp",
+        "sudachipy",
+        "sudachidict_core",
+    ]
 
     # check requirements.txt
     req_dict = {}

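The one-line libs_ignore_setup (about 93 characters with its indentation) exceeds black's default 88-character limit, so the formatter splits the list one element per line and appends a trailing comma; the same treatment produced the long word list at the top of this diff. The general pattern, shown outside the test for brevity:

# Before formatting, the indented line inside test_build_dependencies() was roughly:
#     libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]
# Over the 88-character default, black explodes it, one element per line, trailing comma:
libs_ignore_setup = [
    "fugashi",
    "natto-py",
    "pythainlp",
    "sudachipy",
    "sudachidict_core",
]
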
@@ -1,7 +1,6 @@
 import pytest
 import os
 import ctypes
-import srsly
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util

@@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-
-from .util import make_tempdir
 
 
 @pytest.fixture
 def is_admin():

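The two removed imports (srsly and make_tempdir) were unused in this test module, which is exactly what flake8's F401 check reports; the unused-variable and == None fixes elsewhere in this commit correspond to F841 and E711. A tiny illustration of the F401 case (output format approximate):

# demo_unused_import.py -- illustrative only
# Running "flake8 demo_unused_import.py" reports something like:
#     demo_unused_import.py:4:1: F401 'os' imported but unused
import os

print("os is imported above but never used")
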