Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format
commit 539b0c10da (parent 74972744e5)
@@ -62,6 +62,7 @@ _ordinal_words = [
 _ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")

+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]

@@ -75,11 +76,11 @@ def like_num(text):
     text_lower = text.lower()

-    #Check cardinal number
+    # Check cardinal number
     if text_lower in _num_words:
         return True

-    #Check ordinal number
+    # Check ordinal number
     if text_lower in _ordinal_words:
         return True
     if text_lower.endswith(_ordinal_endings):

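For quick reference, a usage sketch (not part of the commit) exercising the tidied `like_num` attribute; the ordinal forms are the ones parametrized in the test file further down:

    from spacy.lang.tr.lex_attrs import like_num

    assert like_num("11")        # plain digits
    assert like_num("+11")       # leading sign is stripped before the check
    assert like_num("üçüncü")    # ordinal word
    assert like_num("100üncü")   # digits + ordinal suffix
    assert like_num("8inci")     # digits + ordinal suffix
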
@@ -51,9 +51,8 @@ def noun_chunks(doclike):
         elif word.dep == conj:
             cc_token = word.left_edge
             prev_end = cc_token.i
-            yield cc_token.right_edge.i + 1, extend_right(word), np_label  # Shave off cc tokens from the NP
+            # Shave off cc tokens from the NP
+            yield cc_token.right_edge.i + 1, extend_right(word), np_label


 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

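For orientation, a sketch of how `SYNTAX_ITERATORS` is typically hooked into a language package so that `doc.noun_chunks` works (the pattern spaCy's lang packages use; treat the exact class layout as illustrative rather than a copy of `spacy/lang/tr/__init__.py`):

    from spacy.language import Language
    from .syntax_iterators import SYNTAX_ITERATORS


    class TurkishDefaults(Language.Defaults):
        # Registering the iterator is what exposes doc.noun_chunks for "tr".
        syntax_iterators = SYNTAX_ITERATORS


    class Turkish(Language):
        lang = "tr"
        Defaults = TurkishDefaults
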
@@ -1,5 +1,5 @@
 from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
-from typing import Tuple, Iterator
+from typing import Tuple
 from dataclasses import dataclass
 import random
 import itertools

@@ -1197,7 +1197,9 @@ class Language:
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
         if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
+            err = Errors.E930.format(
+                method="Language.initialize", obj=type(get_examples)
+            )
             raise TypeError(err)
         # Make sure the config is interpolated so we can resolve subsections
         config = self.config.interpolate()

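A usage sketch (not from the diff) of the code path being reformatted: `Language.initialize` expects `get_examples` to be a zero-argument callable returning `Example` objects and raises the E930 `TypeError` otherwise. The tiny pipeline below is illustrative only:

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("tr")
    nlp.add_pipe("tagger")
    doc = nlp.make_doc("Merhaba dünya")
    examples = [Example.from_dict(doc, {"tags": ["INTJ", "NOUN"]})]

    nlp.initialize(get_examples=lambda: examples)   # OK: a callable
    # nlp.initialize(get_examples=examples)         # not callable -> TypeError (E930)
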
@@ -239,10 +239,12 @@ def th_tokenizer():
 def tr_tokenizer():
     return get_lang_class("tr")().tokenizer

+
 @pytest.fixture(scope="session")
 def tr_vocab():
     return get_lang_class("tr").Defaults.create_vocab()

+
 @pytest.fixture(scope="session")
 def tt_tokenizer():
     return get_lang_class("tt")().tokenizer

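For context, a sketch (hypothetical test, not part of the commit) of how the session-scoped fixtures above are consumed; pytest injects them by argument name:

    def test_tr_tokenizer_handles_plain_text(tr_tokenizer):
        # tr_tokenizer comes from the session-scoped fixture in conftest.py
        tokens = tr_tokenizer("Merhaba dünya")
        assert len(tokens) == 2
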
@@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
     assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı "


-def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
+def test_tr_noun_chunks_acl_nmod2(tr_tokenizer):
     text = "bildiğim bir turizm şirketi"
     heads = [3, 3, 3, 3]
     deps = ["acl", "det", "nmod", "ROOT"]

@@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
 def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo"
     heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
-    deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
+    deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
     pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
     tokens = tr_tokenizer(text)
     doc = Doc(

@@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 1
-    assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    assert (
+        chunks[0].text_with_ws
+        == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    )


 def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):

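As a standalone sketch of the pattern these tests rely on (annotation values copied from the test above; outside of tests a trained parser would supply heads, deps and POS):

    from spacy.lang.tr import Turkish
    from spacy.tokens import Doc

    nlp = Turkish()
    words = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo".split()
    heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
    deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
    pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
    doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps, pos=pos)
    # The registered Turkish syntax iterator drives doc.noun_chunks.
    assert len(list(doc.noun_chunks)) == 1
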
@@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "kız ve erkek çocuklar "

-def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
+
+def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer):
     text = "tatlı ve gürbüz çocuklar"
     heads = [3, 2, 0, 3]
     deps = ["amod", "cc", "conj", "ROOT"]

@@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer):
     assert chunks[0].text_with_ws == "ben "
     assert chunks[1].text_with_ws == "Sen "

+
 def test_tr_noun_chunks_conj_three(tr_tokenizer):
     text = "sen, ben ve ondan"
     heads = [0, 2, 0, 4, 0]

@@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer):
     assert chunks[2].text_with_ws == "sen "


-def test_tr_noun_chunks_conj_three(tr_tokenizer):
+def test_tr_noun_chunks_conj_three2(tr_tokenizer):
     text = "ben ya da sen ya da onlar"
     heads = [0, 3, 1, 0, 6, 4, 3]
     deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"]

@@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
     assert chunks[0].text_with_ws == "Gazi Mustafa Kemal "


-def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
+def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer):
     text = "Ahmet Vefik Paşa"
     heads = [2, 0, 2]
     deps = ["nmod", "flat", "ROOT"]

@@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num
         "üçüncü",
         "beşinci",
         "100üncü",
-        "8inci"
-    ]
+        "8inci",
+    ],
 )
 def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
     assert like_num(word)

@@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
 def test_tr_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())
-

@@ -446,7 +446,7 @@ def test_overfitting_IO():
         return mykb

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True,)
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
     assert "Q2146908" in entity_linker.kb.vocab.strings

@@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer):

     # Make spans
     s1 = doc[:4]
-    s2 = doc[3:6] # overlaps with s1
-    s3 = doc[5:7] # overlaps with s2, not s1
+    s2 = doc[3:6]  # overlaps with s1
+    s3 = doc[5:7]  # overlaps with s2, not s1

     result = filter_spans((s1, s2, s3))
     assert s1 in result

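A quick sketch of what `filter_spans` does with these spans (it keeps the longest non-overlapping spans, so the shorter overlapping `s2` is dropped); the sentence text below is made up:

    import spacy
    from spacy.util import filter_spans

    nlp = spacy.blank("en")
    doc = nlp("one two three four five six seven")
    s1 = doc[:4]
    s2 = doc[3:6]   # overlaps with s1
    s3 = doc[5:7]   # overlaps with s2, not s1
    result = filter_spans((s1, s2, s3))
    # s1 is longer than s2, so s1 is kept and s2 is dropped; s3 never touched s1.
    assert s1 in result and s3 in result and s2 not in result
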
@@ -1,10 +1,8 @@
 from typing import List
-
 import pytest
 from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy
-
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors

@@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     assert_array_equal(get_all_params(model1), get_all_params(model2))


-@pytest.mark.parametrize(
-    "model_func,kwargs",
-    [
-        (StaticVectors, {"nO": 128, "nM": 300}),
-    ]
-)
+@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})])
 def test_empty_docs(model_func, kwargs):
     nlp = English()
     model = model_func(**kwargs).initialize()

@@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
     for n_docs in range(3):
         docs = [nlp("") for _ in range(n_docs)]
         # Test predict
-        _ = model.predict(docs)
+        model.predict(docs)
         # Test backprop
         output, backprop = model.begin_update(docs)
-        _ = backprop(output)
+        backprop(output)