Tidy up and auto-format

Ines Montani 2020-10-10 19:14:48 +02:00
parent 74972744e5
commit 539b0c10da
9 changed files with 32 additions and 31 deletions

View File

@@ -62,6 +62,7 @@ _ordinal_words = [
 _ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
 
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -75,11 +76,11 @@ def like_num(text):
     text_lower = text.lower()
-    #Check cardinal number
+    # Check cardinal number
     if text_lower in _num_words:
         return True
-    #Check ordinal number
+    # Check ordinal number
     if text_lower in _ordinal_words:
         return True
     if text_lower.endswith(_ordinal_endings):
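
Note: `like_num` returns True for plain digit strings, for entries in the Turkish cardinal word list, and for ordinals given either as listed words or as digits plus one of the suffixes above. A quick usage sketch, assuming a spaCy install where `spacy.lang.tr.lex_attrs` is importable; the example words are the ones parametrized in the Turkish tests further down in this commit:

# Sketch: exercising the Turkish like_num helper directly.
from spacy.lang.tr.lex_attrs import like_num

for word in ["üçüncü", "beşinci", "100üncü", "8inci"]:
    # Listed ordinals and digit + ordinal-suffix forms should all return True.
    print(word, like_num(word))

# A word that is neither a digit, a cardinal nor an ordinal should return False.
print("elma", like_num("elma"))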

View File

@@ -49,11 +49,10 @@ def noun_chunks(doclike):
             prev_end = word.left_edge.i
             yield word.left_edge.i, extend_right(word), np_label
         elif word.dep == conj:
             cc_token = word.left_edge
             prev_end = cc_token.i
-            yield cc_token.right_edge.i + 1, extend_right(word), np_label # Shave off cc tokens from the NP
+            # Shave off cc tokens from the NP
+            yield cc_token.right_edge.i + 1, extend_right(word), np_label
 
 
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
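
Note: the change above only moves the "Shave off cc tokens from the NP" comment onto its own line; the yielded span boundaries are unchanged. As a rough sketch of how the registered iterator is consumed, here is a hand-annotated example modelled on `test_tr_noun_chunks_two_nouns_in_nmod2` from this commit (the POS tags are an assumption of mine; the test file supplies its own values):

# Sketch only: run a hand-annotated Doc through the Turkish noun-chunk iterator.
from spacy.lang.tr import Turkish
from spacy.lang.tr.syntax_iterators import noun_chunks
from spacy.tokens import Doc

nlp = Turkish()
doc = Doc(
    nlp.vocab,
    words=["tatlı", "ve", "gürbüz", "çocuklar"],
    heads=[3, 2, 0, 3],
    deps=["amod", "cc", "conj", "ROOT"],
    pos=["ADJ", "CCONJ", "ADJ", "NOUN"],  # assumed tags, not taken from the diff
)
# The iterator yields (start, end, label) triples; Doc.noun_chunks wraps the
# same triples as Span objects.
for start, end, label in noun_chunks(doc):
    print(doc[start:end].text, doc.vocab.strings[label])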

View File

@@ -1,5 +1,5 @@
 from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
-from typing import Tuple, Iterator
+from typing import Tuple
 from dataclasses import dataclass
 import random
 import itertools
@@ -1197,7 +1197,9 @@ class Language:
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
         if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
+            err = Errors.E930.format(
+                method="Language.initialize", obj=type(get_examples)
+            )
             raise TypeError(err)
         # Make sure the config is interpolated so we can resolve subsections
         config = self.config.interpolate()
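
Note: the reflowed `Errors.E930` call sits on the validation path of `Language.initialize`: `get_examples` must be a callable that returns `Example` objects, and anything else raises `TypeError`. A minimal sketch of both branches from the caller's side, assuming a blank English pipeline:

# Sketch: how the get_examples validation above behaves for a caller.
from spacy.lang.en import English

nlp = English()
nlp.initialize()  # no argument: the dummy get_examples above is created internally

try:
    nlp.initialize(get_examples=[])  # not callable, so the E930 TypeError is raised
except TypeError as err:
    print(err)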

View File

@@ -239,10 +239,12 @@ def th_tokenizer():
 def tr_tokenizer():
     return get_lang_class("tr")().tokenizer
 
+
 @pytest.fixture(scope="session")
 def tr_vocab():
     return get_lang_class("tr").Defaults.create_vocab()
 
+
 @pytest.fixture(scope="session")
 def tt_tokenizer():
     return get_lang_class("tt")().tokenizer

View File

@@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
     assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı "
 
 
-def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
+def test_tr_noun_chunks_acl_nmod2(tr_tokenizer):
     text = "bildiğim bir turizm şirketi"
     heads = [3, 3, 3, 3]
     deps = ["acl", "det", "nmod", "ROOT"]
@@ -308,7 +308,7 @@ def test_tr_noun_chunks_np_recursive_four_nouns(tr_tokenizer):
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "kızına piyano dersi verdiğim hanım "
 
 
 def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
     text = "içine birkaç çiçek konmuş olan bir vazo"
     heads = [3, 2, 3, 6, 3, 6, 6]
@@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
 def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo"
     heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
-    deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
+    deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
     pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
     tokens = tr_tokenizer(text)
     doc = Doc(
@@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 1
-    assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    assert (
+        chunks[0].text_with_ws
+        == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    )
 
 
 def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
@@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "kız ve erkek çocuklar "
 
-def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
+
+def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer):
     text = "tatlı ve gürbüz çocuklar"
     heads = [3, 2, 0, 3]
     deps = ["amod", "cc", "conj", "ROOT"]
@@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer):
     assert chunks[0].text_with_ws == "ben "
     assert chunks[1].text_with_ws == "Sen "
 
+
 def test_tr_noun_chunks_conj_three(tr_tokenizer):
     text = "sen, ben ve ondan"
     heads = [0, 2, 0, 4, 0]
@@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer):
     assert chunks[2].text_with_ws == "sen "
 
 
-def test_tr_noun_chunks_conj_three(tr_tokenizer):
+def test_tr_noun_chunks_conj_three2(tr_tokenizer):
     text = "ben ya da sen ya da onlar"
     heads = [0, 3, 1, 0, 6, 4, 3]
     deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"]
@@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
     assert chunks[0].text_with_ws == "Gazi Mustafa Kemal "
 
 
-def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
+def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer):
     text = "Ahmet Vefik Paşa"
     heads = [2, 0, 2]
     deps = ["nmod", "flat", "ROOT"]

View File

@@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num
         "üçüncü",
         "beşinci",
         "100üncü",
-        "8inci"
-    ]
+        "8inci",
+    ],
 )
 def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
     assert like_num(word)
@@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
 def test_tr_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())

View File

@@ -446,7 +446,7 @@ def test_overfitting_IO():
         return mykb
 
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True,)
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
     assert "Q2146908" in entity_linker.kb.vocab.strings

View File

@@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer):
     # Make spans
     s1 = doc[:4]
-    s2 = doc[3:6] # overlaps with s1
-    s3 = doc[5:7] # overlaps with s2, not s1
+    s2 = doc[3:6]  # overlaps with s1
+    s3 = doc[5:7]  # overlaps with s2, not s1
     result = filter_spans((s1, s2, s3))
     assert s1 in result

View File

@@ -1,10 +1,8 @@
 from typing import List
 import pytest
 from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors
@@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     assert_array_equal(get_all_params(model1), get_all_params(model2))
 
 
-@pytest.mark.parametrize(
-    "model_func,kwargs",
-    [
-        (StaticVectors, {"nO": 128, "nM": 300}),
-    ]
-)
+@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})])
 def test_empty_docs(model_func, kwargs):
     nlp = English()
     model = model_func(**kwargs).initialize()
@@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
     for n_docs in range(3):
         docs = [nlp("") for _ in range(n_docs)]
         # Test predict
-        _ = model.predict(docs)
+        model.predict(docs)
         # Test backprop
         output, backprop = model.begin_update(docs)
-        _ = backprop(output)
+        backprop(output)
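
Note: the condensed `@pytest.mark.parametrize` line is behaviourally identical to the multi-line form it replaces, and dropping the `_ =` assignments does not change what the test exercises. A stripped-down sketch of the same flow outside pytest, mirroring the test body above with the `StaticVectors` kwargs from the parametrization:

# Sketch of the flow test_empty_docs runs for the StaticVectors case.
from spacy.lang.en import English
from spacy.ml.staticvectors import StaticVectors

nlp = English()
model = StaticVectors(nO=128, nM=300).initialize()
docs = [nlp("") for _ in range(2)]  # a small batch of empty docs
model.predict(docs)
output, backprop = model.begin_update(docs)
backprop(output)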