💫 Tidy up and auto-format tests (#2967)
* Auto-format tests with black
* Add flake8 config
* Tidy up and remove unused imports
* Fix redefinitions of test functions
* Replace orths_and_spaces with words and spaces
* Fix compatibility with pytest 4.0
* xfail test for now (the test was previously overwritten by the following test due to a naming conflict, so its failure wasn't reported)
* Unfail passing test
* Only use fixture via arguments (fixes pytest 4.0 compatibility; see the sketch below)
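For context on that last point: pytest 4.0 turned direct calls to fixture functions into an error, so a fixture now has to be requested through a test's argument list. The snippet below is a minimal illustration of that pattern, not code from this diff; the fixture and test names are made up.

import pytest


@pytest.fixture
def sample_tokens():
    # Built once per test that requests this fixture by name.
    return ["Give", "it", "back", "!"]


def test_token_count(sample_tokens):
    # Requesting the fixture via the argument list is the pytest 4.0-safe way;
    # calling sample_tokens() directly would raise an error on pytest >= 4.0.
    assert len(sample_tokens) == 4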
parent 2c37e0ccf6
commit b6e991440c

.flake8 (new file, 4 lines)
@@ -0,0 +1,4 @@
+[flake8]
+ignore = E203, E266, E501, W503
+max-line-length = 80
+select = B,C,E,F,W,T4,B9

@@ -11,7 +11,7 @@ ujson>=1.35
 dill>=0.2,<0.3
 regex==2018.01.10
 requests>=2.13.0,<3.0.0
-pytest>=3.6.0,<4.0.0
+pytest>=4.0.0,<5.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 pathlib==1.0.1; python_version < "3.4"

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 import pytest
-from io import StringIO, BytesIO
 from spacy.util import get_lang_class


@ -11,126 +10,135 @@ def pytest_addoption(parser):
|
|||
|
||||
|
||||
def pytest_runtest_setup(item):
|
||||
for opt in ['slow']:
|
||||
for opt in ["slow"]:
|
||||
if opt in item.keywords and not item.config.getoption("--%s" % opt):
|
||||
pytest.skip("need --%s option to run" % opt)
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
@pytest.fixture(scope="module")
|
||||
def tokenizer():
|
||||
return get_lang_class('xx').Defaults.create_tokenizer()
|
||||
return get_lang_class("xx").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def en_tokenizer():
|
||||
return get_lang_class('en').Defaults.create_tokenizer()
|
||||
return get_lang_class("en").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def en_vocab():
|
||||
return get_lang_class('en').Defaults.create_vocab()
|
||||
return get_lang_class("en").Defaults.create_vocab()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def en_parser(en_vocab):
|
||||
nlp = get_lang_class('en')(en_vocab)
|
||||
return nlp.create_pipe('parser')
|
||||
nlp = get_lang_class("en")(en_vocab)
|
||||
return nlp.create_pipe("parser")
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def es_tokenizer():
|
||||
return get_lang_class('es').Defaults.create_tokenizer()
|
||||
return get_lang_class("es").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def de_tokenizer():
|
||||
return get_lang_class('de').Defaults.create_tokenizer()
|
||||
return get_lang_class("de").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def fr_tokenizer():
|
||||
return get_lang_class('fr').Defaults.create_tokenizer()
|
||||
return get_lang_class("fr").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def hu_tokenizer():
|
||||
return get_lang_class('hu').Defaults.create_tokenizer()
|
||||
return get_lang_class("hu").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def fi_tokenizer():
|
||||
return get_lang_class('fi').Defaults.create_tokenizer()
|
||||
return get_lang_class("fi").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def ro_tokenizer():
|
||||
return get_lang_class('ro').Defaults.create_tokenizer()
|
||||
return get_lang_class("ro").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def id_tokenizer():
|
||||
return get_lang_class('id').Defaults.create_tokenizer()
|
||||
return get_lang_class("id").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def sv_tokenizer():
|
||||
return get_lang_class('sv').Defaults.create_tokenizer()
|
||||
return get_lang_class("sv").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def bn_tokenizer():
|
||||
return get_lang_class('bn').Defaults.create_tokenizer()
|
||||
return get_lang_class("bn").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def ga_tokenizer():
|
||||
return get_lang_class('ga').Defaults.create_tokenizer()
|
||||
return get_lang_class("ga").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def he_tokenizer():
|
||||
return get_lang_class('he').Defaults.create_tokenizer()
|
||||
return get_lang_class("he").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@pytest.fixture(scope="session")
|
||||
def nb_tokenizer():
|
||||
return get_lang_class('nb').Defaults.create_tokenizer()
|
||||
return get_lang_class("nb").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def da_tokenizer():
|
||||
return get_lang_class('da').Defaults.create_tokenizer()
|
||||
return get_lang_class("da").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ja_tokenizer():
|
||||
mecab = pytest.importorskip("MeCab")
|
||||
return get_lang_class('ja').Defaults.create_tokenizer()
|
||||
return get_lang_class("ja").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def th_tokenizer():
|
||||
pythainlp = pytest.importorskip("pythainlp")
|
||||
return get_lang_class('th').Defaults.create_tokenizer()
|
||||
return get_lang_class("th").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tr_tokenizer():
|
||||
return get_lang_class('tr').Defaults.create_tokenizer()
|
||||
return get_lang_class("tr").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tt_tokenizer():
|
||||
return get_lang_class('tt').Defaults.create_tokenizer()
|
||||
return get_lang_class("tt").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def el_tokenizer():
|
||||
return get_lang_class('el').Defaults.create_tokenizer()
|
||||
return get_lang_class("el").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ar_tokenizer():
|
||||
return get_lang_class('ar').Defaults.create_tokenizer()
|
||||
return get_lang_class("ar").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ur_tokenizer():
|
||||
return get_lang_class('ur').Defaults.create_tokenizer()
|
||||
return get_lang_class("ur").Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ru_tokenizer():
|
||||
pymorphy = pytest.importorskip('pymorphy2')
|
||||
return get_lang_class('ru').Defaults.create_tokenizer()
|
||||
pymorphy = pytest.importorskip("pymorphy2")
|
||||
return get_lang_class("ru").Defaults.create_tokenizer()
|
||||
|
|
|
@@ -38,7 +38,7 @@ def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):

 def test_doc_array_tag(en_tokenizer):
     text = "A nice sentence."
-    pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
+    pos = ["DET", "ADJ", "NOUN", "PUNCT"]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos)
     assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos

@@ -51,7 +51,7 @@ def test_doc_array_tag(en_tokenizer):

 def test_doc_array_dep(en_tokenizer):
     text = "A nice sentence."
-    deps = ['det', 'amod', 'ROOT', 'punct']
+    deps = ["det", "amod", "ROOT", "punct"]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
     feats_array = doc.to_array((ORTH, DEP))

@@ -9,7 +9,7 @@ from spacy.lemmatizer import Lemmatizer

 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+    return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"})


 @pytest.fixture
@ -23,15 +23,15 @@ def test_empty_doc(vocab):
|
|||
|
||||
|
||||
def test_single_word(vocab):
|
||||
doc = Doc(vocab, words=['a'])
|
||||
assert doc.text == 'a '
|
||||
doc = Doc(vocab, words=['a'], spaces=[False])
|
||||
assert doc.text == 'a'
|
||||
doc = Doc(vocab, words=["a"])
|
||||
assert doc.text == "a "
|
||||
doc = Doc(vocab, words=["a"], spaces=[False])
|
||||
assert doc.text == "a"
|
||||
|
||||
|
||||
def test_lookup_lemmatization(vocab):
|
||||
doc = Doc(vocab, words=['dogs', 'dogses'])
|
||||
assert doc[0].text == 'dogs'
|
||||
assert doc[0].lemma_ == 'dog'
|
||||
assert doc[1].text == 'dogses'
|
||||
assert doc[1].lemma_ == 'dogses'
|
||||
doc = Doc(vocab, words=["dogs", "dogses"])
|
||||
assert doc[0].text == "dogs"
|
||||
assert doc[0].lemma_ == "dog"
|
||||
assert doc[1].text == "dogses"
|
||||
assert doc[1].lemma_ == "dogses"
|
||||
|
|
|
@ -10,7 +10,7 @@ from spacy.attrs import LEMMA
|
|||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["one", "two", "three"]])
|
||||
@pytest.mark.parametrize("text", [["one", "two", "three"]])
|
||||
def test_doc_api_compare_by_string_position(en_vocab, text):
|
||||
doc = Doc(en_vocab, words=text)
|
||||
# Get the tokens in this order, so their ID ordering doesn't match the idx
|
||||
|
@ -28,80 +28,81 @@ def test_doc_api_compare_by_string_position(en_vocab, text):
|
|||
def test_doc_api_getitem(en_tokenizer):
|
||||
text = "Give it back! He pleaded."
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].text == 'Give'
|
||||
assert tokens[-1].text == '.'
|
||||
assert tokens[0].text == "Give"
|
||||
assert tokens[-1].text == "."
|
||||
with pytest.raises(IndexError):
|
||||
tokens[len(tokens)]
|
||||
|
||||
def to_str(span):
|
||||
return '/'.join(token.text for token in span)
|
||||
return "/".join(token.text for token in span)
|
||||
|
||||
span = tokens[1:1]
|
||||
assert not to_str(span)
|
||||
span = tokens[1:4]
|
||||
assert to_str(span) == 'it/back/!'
|
||||
assert to_str(span) == "it/back/!"
|
||||
span = tokens[1:4:1]
|
||||
assert to_str(span) == 'it/back/!'
|
||||
assert to_str(span) == "it/back/!"
|
||||
with pytest.raises(ValueError):
|
||||
tokens[1:4:2]
|
||||
with pytest.raises(ValueError):
|
||||
tokens[1:4:-1]
|
||||
|
||||
span = tokens[-3:6]
|
||||
assert to_str(span) == 'He/pleaded'
|
||||
assert to_str(span) == "He/pleaded"
|
||||
span = tokens[4:-1]
|
||||
assert to_str(span) == 'He/pleaded'
|
||||
assert to_str(span) == "He/pleaded"
|
||||
span = tokens[-5:-3]
|
||||
assert to_str(span) == 'back/!'
|
||||
assert to_str(span) == "back/!"
|
||||
span = tokens[5:4]
|
||||
assert span.start == span.end == 5 and not to_str(span)
|
||||
span = tokens[4:-3]
|
||||
assert span.start == span.end == 4 and not to_str(span)
|
||||
|
||||
span = tokens[:]
|
||||
assert to_str(span) == 'Give/it/back/!/He/pleaded/.'
|
||||
assert to_str(span) == "Give/it/back/!/He/pleaded/."
|
||||
span = tokens[4:]
|
||||
assert to_str(span) == 'He/pleaded/.'
|
||||
assert to_str(span) == "He/pleaded/."
|
||||
span = tokens[:4]
|
||||
assert to_str(span) == 'Give/it/back/!'
|
||||
assert to_str(span) == "Give/it/back/!"
|
||||
span = tokens[:-3]
|
||||
assert to_str(span) == 'Give/it/back/!'
|
||||
assert to_str(span) == "Give/it/back/!"
|
||||
span = tokens[-3:]
|
||||
assert to_str(span) == 'He/pleaded/.'
|
||||
assert to_str(span) == "He/pleaded/."
|
||||
|
||||
span = tokens[4:50]
|
||||
assert to_str(span) == 'He/pleaded/.'
|
||||
assert to_str(span) == "He/pleaded/."
|
||||
span = tokens[-50:4]
|
||||
assert to_str(span) == 'Give/it/back/!'
|
||||
assert to_str(span) == "Give/it/back/!"
|
||||
span = tokens[-50:-40]
|
||||
assert span.start == span.end == 0 and not to_str(span)
|
||||
span = tokens[40:50]
|
||||
assert span.start == span.end == 7 and not to_str(span)
|
||||
|
||||
span = tokens[1:4]
|
||||
assert span[0].orth_ == 'it'
|
||||
assert span[0].orth_ == "it"
|
||||
subspan = span[:]
|
||||
assert to_str(subspan) == 'it/back/!'
|
||||
assert to_str(subspan) == "it/back/!"
|
||||
subspan = span[:2]
|
||||
assert to_str(subspan) == 'it/back'
|
||||
assert to_str(subspan) == "it/back"
|
||||
subspan = span[1:]
|
||||
assert to_str(subspan) == 'back/!'
|
||||
assert to_str(subspan) == "back/!"
|
||||
subspan = span[:-1]
|
||||
assert to_str(subspan) == 'it/back'
|
||||
assert to_str(subspan) == "it/back"
|
||||
subspan = span[-2:]
|
||||
assert to_str(subspan) == 'back/!'
|
||||
assert to_str(subspan) == "back/!"
|
||||
subspan = span[1:2]
|
||||
assert to_str(subspan) == 'back'
|
||||
assert to_str(subspan) == "back"
|
||||
subspan = span[-2:-1]
|
||||
assert to_str(subspan) == 'back'
|
||||
assert to_str(subspan) == "back"
|
||||
subspan = span[-50:50]
|
||||
assert to_str(subspan) == 'it/back/!'
|
||||
assert to_str(subspan) == "it/back/!"
|
||||
subspan = span[50:-50]
|
||||
assert subspan.start == subspan.end == 4 and not to_str(subspan)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Give it back! He pleaded.",
|
||||
" Give it back! He pleaded. "])
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["Give it back! He pleaded.", " Give it back! He pleaded. "]
|
||||
)
|
||||
def test_doc_api_serialize(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
|
||||
|
@ -110,13 +111,15 @@ def test_doc_api_serialize(en_tokenizer, text):
|
|||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(
|
||||
tokens.to_bytes(tensor=False), tensor=False)
|
||||
tokens.to_bytes(tensor=False), tensor=False
|
||||
)
|
||||
assert tokens.text == new_tokens.text
|
||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(
|
||||
tokens.to_bytes(sentiment=False), sentiment=False)
|
||||
tokens.to_bytes(sentiment=False), sentiment=False
|
||||
)
|
||||
assert tokens.text == new_tokens.text
|
||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
@ -126,10 +129,10 @@ def test_doc_api_set_ents(en_tokenizer):
|
|||
text = "I use goggle chrone to surf the web"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens.ents) == 0
|
||||
tokens.ents = [(tokens.vocab.strings['PRODUCT'], 2, 4)]
|
||||
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
|
||||
assert len(list(tokens.ents)) == 1
|
||||
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
|
||||
assert tokens.ents[0].label_ == 'PRODUCT'
|
||||
assert tokens.ents[0].label_ == "PRODUCT"
|
||||
assert tokens.ents[0].start == 2
|
||||
assert tokens.ents[0].end == 4
|
||||
|
||||
|
@ -140,21 +143,31 @@ def test_doc_api_merge(en_tokenizer):
|
|||
# merge 'The Beach Boys'
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
|
||||
ent_type='TYPE')
|
||||
doc.merge(
|
||||
doc[4].idx,
|
||||
doc[6].idx + len(doc[6]),
|
||||
tag="NAMED",
|
||||
lemma="LEMMA",
|
||||
ent_type="TYPE",
|
||||
)
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == 'the beach boys'
|
||||
assert doc[4].text_with_ws == 'the beach boys '
|
||||
assert doc[4].tag_ == 'NAMED'
|
||||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].text_with_ws == "the beach boys "
|
||||
assert doc[4].tag_ == "NAMED"
|
||||
|
||||
# merge 'all night'
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), tag='NAMED', lemma='LEMMA',
|
||||
ent_type='TYPE')
|
||||
doc.merge(
|
||||
doc[7].idx,
|
||||
doc[8].idx + len(doc[8]),
|
||||
tag="NAMED",
|
||||
lemma="LEMMA",
|
||||
ent_type="TYPE",
|
||||
)
|
||||
assert len(doc) == 8
|
||||
assert doc[7].text == 'all night'
|
||||
assert doc[7].text_with_ws == 'all night'
|
||||
assert doc[7].text == "all night"
|
||||
assert doc[7].text_with_ws == "all night"
|
||||
|
||||
|
||||
def test_doc_api_merge_children(en_tokenizer):
|
||||
|
@ -162,8 +175,13 @@ def test_doc_api_merge_children(en_tokenizer):
|
|||
text = "WKRO played songs by the beach boys all night"
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
|
||||
ent_type='TYPE')
|
||||
doc.merge(
|
||||
doc[4].idx,
|
||||
doc[6].idx + len(doc[6]),
|
||||
tag="NAMED",
|
||||
lemma="LEMMA",
|
||||
ent_type="TYPE",
|
||||
)
|
||||
|
||||
for word in doc:
|
||||
if word.i < word.head.i:
|
||||
|
@ -175,8 +193,8 @@ def test_doc_api_merge_children(en_tokenizer):
|
|||
def test_doc_api_merge_hang(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
doc = en_tokenizer(text)
|
||||
doc.merge(18, 32, tag='', lemma='', ent_type='ORG')
|
||||
doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
|
||||
doc.merge(18, 32, tag="", lemma="", ent_type="ORG")
|
||||
doc.merge(8, 32, tag="", lemma="", ent_type="ORG")
|
||||
|
||||
|
||||
def test_doc_api_retokenizer(en_tokenizer):
|
||||
|
@ -184,19 +202,19 @@ def test_doc_api_retokenizer(en_tokenizer):
|
|||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7])
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == 'the beach boys'
|
||||
assert doc[4].text == "the beach boys"
|
||||
|
||||
|
||||
def test_doc_api_retokenizer_attrs(en_tokenizer):
|
||||
doc = en_tokenizer("WKRO played songs by the beach boys all night")
|
||||
# test both string and integer attributes and values
|
||||
attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
|
||||
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == 'the beach boys'
|
||||
assert doc[4].lemma_ == 'boys'
|
||||
assert doc[4].ent_type_ == 'ORG'
|
||||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].lemma_ == "boys"
|
||||
assert doc[4].ent_type_ == "ORG"
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
|
@ -205,11 +223,11 @@ def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
|
|||
doc = en_tokenizer("WKRO played beach boys songs")
|
||||
assert not any(token.is_stop for token in doc)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[2:4], attrs={'LEMMA': 'boys', 'IS_STOP': True})
|
||||
assert doc[2].text == 'beach boys'
|
||||
assert doc[2].lemma_ == 'boys'
|
||||
retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
|
||||
assert doc[2].text == "beach boys"
|
||||
assert doc[2].lemma_ == "boys"
|
||||
assert doc[2].is_stop
|
||||
new_doc = Doc(doc.vocab, words=['beach boys'])
|
||||
new_doc = Doc(doc.vocab, words=["beach boys"])
|
||||
assert new_doc[0].is_stop
|
||||
|
||||
|
||||
|
@ -222,21 +240,25 @@ def test_doc_api_sents_empty_string(en_tokenizer):
|
|||
|
||||
def test_doc_api_runtime_error(en_tokenizer):
|
||||
# Example that caused run-time error while parsing Reddit
|
||||
# fmt: off
|
||||
text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
|
||||
deps = ['nsubj', 'prep', 'amod', 'pobj', 'ROOT', 'amod', 'attr', '',
|
||||
'nummod', 'prep', 'det', 'amod', 'pobj', 'acl', 'prep', 'prep',
|
||||
'pobj', '', 'nummod', 'prep', 'det', 'amod', 'pobj', 'aux', 'neg',
|
||||
'ROOT', 'amod', 'dobj']
|
||||
deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
|
||||
"nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
|
||||
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
|
||||
"ROOT", "amod", "dobj"]
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
|
||||
|
||||
nps = []
|
||||
for np in doc.noun_chunks:
|
||||
while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
|
||||
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
|
||||
np = np[1:]
|
||||
if len(np) > 1:
|
||||
nps.append((np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_))
|
||||
nps.append(
|
||||
(np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)
|
||||
)
|
||||
for np in nps:
|
||||
start, end, tag, lemma, ent_type = np
|
||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
||||
|
@ -244,57 +266,76 @@ def test_doc_api_runtime_error(en_tokenizer):
|
|||
|
||||
def test_doc_api_right_edge(en_tokenizer):
|
||||
"""Test for bug occurring from Unshift action, causing incorrect right edge"""
|
||||
# fmt: off
|
||||
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
|
||||
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
|
||||
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
assert doc[6].text == 'for'
|
||||
assert doc[6].text == "for"
|
||||
subtree = [w.text for w in doc[6].subtree]
|
||||
assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
|
||||
'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
|
||||
assert doc[6].right_edge.text == ','
|
||||
assert subtree == [
|
||||
"for",
|
||||
"the",
|
||||
"sake",
|
||||
"of",
|
||||
"such",
|
||||
"as",
|
||||
"live",
|
||||
"under",
|
||||
"the",
|
||||
"government",
|
||||
"of",
|
||||
"the",
|
||||
"Romans",
|
||||
",",
|
||||
]
|
||||
assert doc[6].right_edge.text == ","
|
||||
|
||||
|
||||
def test_doc_api_has_vector():
|
||||
vocab = Vocab()
|
||||
vocab.reset_vectors(width=2)
|
||||
vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
doc = Doc(vocab, words=['kitten'])
|
||||
vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
|
||||
doc = Doc(vocab, words=["kitten"])
|
||||
assert doc.has_vector
|
||||
|
||||
|
||||
def test_doc_api_similarity_match():
|
||||
doc = Doc(Vocab(), words=['a'])
|
||||
doc = Doc(Vocab(), words=["a"])
|
||||
with pytest.warns(None):
|
||||
assert doc.similarity(doc[0]) == 1.0
|
||||
assert doc.similarity(doc.vocab['a']) == 1.0
|
||||
doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
|
||||
assert doc.similarity(doc.vocab["a"]) == 1.0
|
||||
doc2 = Doc(doc.vocab, words=["a", "b", "c"])
|
||||
with pytest.warns(None):
|
||||
assert doc.similarity(doc2[:1]) == 1.0
|
||||
assert doc.similarity(doc2) == 0.0
|
||||
|
||||
|
||||
def test_lowest_common_ancestor(en_tokenizer):
|
||||
tokens = en_tokenizer('the lazy dog slept')
|
||||
tokens = en_tokenizer("the lazy dog slept")
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||
lca = doc.get_lca_matrix()
|
||||
assert(lca[1, 1] == 1)
|
||||
assert(lca[0, 1] == 2)
|
||||
assert(lca[1, 2] == 2)
|
||||
assert lca[1, 1] == 1
|
||||
assert lca[0, 1] == 2
|
||||
assert lca[1, 2] == 2
|
||||
|
||||
|
||||
def test_parse_tree(en_tokenizer):
|
||||
"""Tests doc.print_tree() method."""
|
||||
text = 'I like New York in Autumn.'
|
||||
text = "I like New York in Autumn."
|
||||
heads = [1, 0, 1, -2, -3, -1, -5]
|
||||
tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
|
||||
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags)
|
||||
# full method parse_tree(text) is a trivial composition
|
||||
trees = doc.print_tree()
|
||||
assert len(trees) > 0
|
||||
tree = trees[0]
|
||||
assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
|
||||
assert tree['word'] == 'like' # check root is correct
|
||||
assert all(
|
||||
k in list(tree.keys())
|
||||
for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
|
||||
)
|
||||
assert tree["word"] == "like" # check root is correct
|
||||
|
|
|
@ -7,37 +7,38 @@ from spacy.compat import pickle, unicode_
|
|||
|
||||
def test_pickle_single_doc():
|
||||
nlp = Language()
|
||||
doc = nlp('pickle roundtrip')
|
||||
doc = nlp("pickle roundtrip")
|
||||
data = pickle.dumps(doc, 1)
|
||||
doc2 = pickle.loads(data)
|
||||
assert doc2.text == 'pickle roundtrip'
|
||||
assert doc2.text == "pickle roundtrip"
|
||||
|
||||
|
||||
def test_list_of_docs_pickles_efficiently():
|
||||
nlp = Language()
|
||||
for i in range(10000):
|
||||
_ = nlp.vocab[unicode_(i)]
|
||||
one_pickled = pickle.dumps(nlp('0'), -1)
|
||||
one_pickled = pickle.dumps(nlp("0"), -1)
|
||||
docs = list(nlp.pipe(unicode_(i) for i in range(100)))
|
||||
many_pickled = pickle.dumps(docs, -1)
|
||||
assert len(many_pickled) < (len(one_pickled) * 2)
|
||||
many_unpickled = pickle.loads(many_pickled)
|
||||
assert many_unpickled[0].text == '0'
|
||||
assert many_unpickled[-1].text == '99'
|
||||
assert many_unpickled[0].text == "0"
|
||||
assert many_unpickled[-1].text == "99"
|
||||
assert len(many_unpickled) == 100
|
||||
|
||||
|
||||
def test_user_data_from_disk():
|
||||
nlp = Language()
|
||||
doc = nlp('Hello')
|
||||
doc = nlp("Hello")
|
||||
doc.user_data[(0, 1)] = False
|
||||
b = doc.to_bytes()
|
||||
doc2 = doc.__class__(doc.vocab).from_bytes(b)
|
||||
assert doc2.user_data[(0, 1)] == False
|
||||
|
||||
|
||||
def test_user_data_unpickles():
|
||||
nlp = Language()
|
||||
doc = nlp('Hello')
|
||||
doc = nlp("Hello")
|
||||
doc.user_data[(0, 1)] = False
|
||||
b = pickle.dumps(doc)
|
||||
doc2 = pickle.loads(b)
|
||||
|
@ -46,10 +47,11 @@ def test_user_data_unpickles():
|
|||
|
||||
def test_hooks_unpickle():
|
||||
def inner_func(d1, d2):
|
||||
return 'hello!'
|
||||
return "hello!"
|
||||
|
||||
nlp = Language()
|
||||
doc = nlp('Hello')
|
||||
doc.user_hooks['similarity'] = inner_func
|
||||
doc = nlp("Hello")
|
||||
doc.user_hooks["similarity"] = inner_func
|
||||
b = pickle.dumps(doc)
|
||||
doc2 = pickle.loads(b)
|
||||
assert doc2.similarity(None) == 'hello!'
|
||||
assert doc2.similarity(None) == "hello!"
|
||||
|
|
|
@ -11,10 +11,12 @@ from ..util import get_doc
|
|||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "This is a sentence. This is another sentence. And a third."
|
||||
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
|
||||
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
|
||||
'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
|
||||
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
|
||||
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
|
@ -39,17 +41,17 @@ def test_spans_sent_spans(doc):
|
|||
def test_spans_root(doc):
|
||||
span = doc[2:4]
|
||||
assert len(span) == 2
|
||||
assert span.text == 'a sentence'
|
||||
assert span.root.text == 'sentence'
|
||||
assert span.root.head.text == 'is'
|
||||
assert span.text == "a sentence"
|
||||
assert span.root.text == "sentence"
|
||||
assert span.root.head.text == "is"
|
||||
|
||||
|
||||
def test_spans_string_fn(doc):
|
||||
span = doc[0:4]
|
||||
assert len(span) == 4
|
||||
assert span.text == 'This is a sentence'
|
||||
assert span.upper_ == 'THIS IS A SENTENCE'
|
||||
assert span.lower_ == 'this is a sentence'
|
||||
assert span.text == "This is a sentence"
|
||||
assert span.upper_ == "THIS IS A SENTENCE"
|
||||
assert span.lower_ == "this is a sentence"
|
||||
|
||||
|
||||
def test_spans_root2(en_tokenizer):
|
||||
|
@ -57,15 +59,15 @@ def test_spans_root2(en_tokenizer):
|
|||
heads = [0, 3, -1, -2, -4]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
assert doc[-2:].root.text == 'Carolina'
|
||||
assert doc[-2:].root.text == "Carolina"
|
||||
|
||||
|
||||
def test_spans_span_sent(doc, doc_not_parsed):
|
||||
"""Test span.sent property"""
|
||||
assert len(list(doc.sents))
|
||||
assert doc[:2].sent.root.text == 'is'
|
||||
assert doc[:2].sent.text == 'This is a sentence .'
|
||||
assert doc[6:7].sent.root.left_edge.text == 'This'
|
||||
assert doc[:2].sent.root.text == "is"
|
||||
assert doc[:2].sent.text == "This is a sentence ."
|
||||
assert doc[6:7].sent.root.left_edge.text == "This"
|
||||
# test on manual sbd
|
||||
doc_not_parsed[0].is_sent_start = True
|
||||
doc_not_parsed[5].is_sent_start = True
|
||||
|
@ -75,23 +77,23 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
|
||||
def test_spans_lca_matrix(en_tokenizer):
|
||||
"""Test span's lca matrix generation"""
|
||||
tokens = en_tokenizer('the lazy dog slept')
|
||||
tokens = en_tokenizer("the lazy dog slept")
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||
lca = doc[:2].get_lca_matrix()
|
||||
assert(lca[0, 0] == 0)
|
||||
assert(lca[0, 1] == -1)
|
||||
assert(lca[1, 0] == -1)
|
||||
assert(lca[1, 1] == 1)
|
||||
assert lca[0, 0] == 0
|
||||
assert lca[0, 1] == -1
|
||||
assert lca[1, 0] == -1
|
||||
assert lca[1, 1] == 1
|
||||
|
||||
|
||||
def test_span_similarity_match():
|
||||
doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
|
||||
doc = Doc(Vocab(), words=["a", "b", "a", "b"])
|
||||
span1 = doc[:2]
|
||||
span2 = doc[2:]
|
||||
with pytest.warns(None):
|
||||
assert span1.similarity(span2) == 1.0
|
||||
assert span1.similarity(doc) == 0.0
|
||||
assert span1[:1].similarity(doc.vocab['a']) == 1.0
|
||||
assert span1[:1].similarity(doc.vocab["a"]) == 1.0
|
||||
|
||||
|
||||
def test_spans_default_sentiment(en_tokenizer):
|
||||
|
@ -102,8 +104,8 @@ def test_spans_default_sentiment(en_tokenizer):
|
|||
tokens.vocab[tokens[2].text].sentiment = -2.0
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
|
||||
assert doc[:2].sentiment == 3.0 / 2
|
||||
assert doc[-2:].sentiment == -2. / 2
|
||||
assert doc[:-1].sentiment == (3.+-2) / 3.
|
||||
assert doc[-2:].sentiment == -2.0 / 2
|
||||
assert doc[:-1].sentiment == (3.0 + -2) / 3.0
|
||||
|
||||
|
||||
def test_spans_override_sentiment(en_tokenizer):
|
||||
|
@ -113,7 +115,7 @@ def test_spans_override_sentiment(en_tokenizer):
|
|||
tokens.vocab[tokens[0].text].sentiment = 3.0
|
||||
tokens.vocab[tokens[2].text].sentiment = -2.0
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
|
||||
doc.user_span_hooks['sentiment'] = lambda span: 10.0
|
||||
doc.user_span_hooks["sentiment"] = lambda span: 10.0
|
||||
assert doc[:2].sentiment == 10.0
|
||||
assert doc[-2:].sentiment == 10.0
|
||||
assert doc[:-1].sentiment == 10.0
|
||||
|
@ -132,10 +134,10 @@ def test_spans_are_hashable(en_tokenizer):
|
|||
|
||||
def test_spans_by_character(doc):
|
||||
span1 = doc[1:-2]
|
||||
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
|
||||
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == 'GPE'
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
|
@ -151,12 +153,13 @@ def test_span_as_doc(doc):
|
|||
span_doc = span.as_doc()
|
||||
assert span.text == span_doc.text.strip()
|
||||
|
||||
|
||||
def test_span_ents_property(doc):
|
||||
"""Test span.ents for the """
|
||||
doc.ents = [
|
||||
(doc.vocab.strings['PRODUCT'], 0, 1),
|
||||
(doc.vocab.strings['PRODUCT'], 7, 8),
|
||||
(doc.vocab.strings['PRODUCT'], 11, 14)
|
||||
(doc.vocab.strings["PRODUCT"], 0, 1),
|
||||
(doc.vocab.strings["PRODUCT"], 7, 8),
|
||||
(doc.vocab.strings["PRODUCT"], 11, 14),
|
||||
]
|
||||
assert len(list(doc.ents)) == 3
|
||||
sentences = list(doc.sents)
|
||||
|
|
|
@ -13,22 +13,23 @@ def test_spans_merge_tokens(en_tokenizer):
|
|||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].head.text == 'Angeles'
|
||||
assert doc[1].head.text == 'start'
|
||||
doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', ent_type='GPE')
|
||||
assert doc[0].head.text == "Angeles"
|
||||
assert doc[1].head.text == "start"
|
||||
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE")
|
||||
assert len(doc) == 3
|
||||
assert doc[0].text == 'Los Angeles'
|
||||
assert doc[0].head.text == 'start'
|
||||
assert doc[0].text == "Los Angeles"
|
||||
assert doc[0].head.text == "start"
|
||||
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].head.text == 'Angeles'
|
||||
assert doc[1].head.text == 'start'
|
||||
doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE')
|
||||
assert doc[0].head.text == "Angeles"
|
||||
assert doc[1].head.text == "start"
|
||||
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
|
||||
assert len(doc) == 3
|
||||
assert doc[0].text == 'Los Angeles'
|
||||
assert doc[0].head.text == 'start'
|
||||
assert doc[0].ent_type_ == 'GPE'
|
||||
assert doc[0].text == "Los Angeles"
|
||||
assert doc[0].head.text == "start"
|
||||
assert doc[0].ent_type_ == "GPE"
|
||||
|
||||
|
||||
def test_spans_merge_heads(en_tokenizer):
|
||||
text = "I found a pilates class near work."
|
||||
|
@ -37,8 +38,13 @@ def test_spans_merge_heads(en_tokenizer):
|
|||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
|
||||
assert len(doc) == 8
|
||||
doc.merge(doc[3].idx, doc[4].idx + len(doc[4]), tag=doc[4].tag_,
|
||||
lemma='pilates class', ent_type='O')
|
||||
doc.merge(
|
||||
doc[3].idx,
|
||||
doc[4].idx + len(doc[4]),
|
||||
tag=doc[4].tag_,
|
||||
lemma="pilates class",
|
||||
ent_type="O",
|
||||
)
|
||||
assert len(doc) == 7
|
||||
assert doc[0].head.i == 1
|
||||
assert doc[1].head.i == 1
|
||||
|
@ -55,8 +61,9 @@ def test_span_np_merges(en_tokenizer):
|
|||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
|
||||
assert doc[4].head.i == 1
|
||||
doc.merge(doc[2].idx, doc[4].idx + len(doc[4]), tag='NP', lemma='tool',
|
||||
ent_type='O')
|
||||
doc.merge(
|
||||
doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O"
|
||||
)
|
||||
assert doc[2].head.i == 1
|
||||
|
||||
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
|
||||
|
@ -69,7 +76,6 @@ def test_span_np_merges(en_tokenizer):
|
|||
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
|
||||
assert merged != None, (start, end, label, lemma)
|
||||
|
||||
|
||||
text = "One test with entities like New York City so the ents list is not void"
|
||||
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
|
||||
tokens = en_tokenizer(text)
|
||||
|
@ -80,15 +86,23 @@ def test_span_np_merges(en_tokenizer):
|
|||
|
||||
|
||||
def test_spans_entity_merge(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
|
||||
tags = ['NNP', 'NNP', 'VBZ', 'DT', 'VB', 'RP', 'NN', 'WP', 'VBZ', 'IN', 'NNP', 'CC', 'VBZ', 'NNP', 'NNP', '.', 'SP']
|
||||
ents = [(0, 2, 'PERSON'), (10, 11, 'GPE'), (13, 15, 'PERSON')]
|
||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||
ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
|
||||
)
|
||||
assert len(doc) == 17
|
||||
for ent in doc.ents:
|
||||
label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
|
||||
label, lemma, type_ = (
|
||||
ent.root.tag_,
|
||||
ent.root.lemma_,
|
||||
max(w.ent_type_ for w in ent),
|
||||
)
|
||||
ent.merge(label=label, lemma=lemma, ent_type=type_)
|
||||
# check looping is ok
|
||||
assert len(doc) == 15
|
||||
|
@ -98,8 +112,10 @@ def test_spans_entity_merge_iob():
|
|||
# Test entity IOB stays consistent after merging
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
doc.ents = [(doc.vocab.strings.add('ent-abc'), 0, 3),
|
||||
(doc.vocab.strings.add('ent-d'), 3, 4)]
|
||||
doc.ents = [
|
||||
(doc.vocab.strings.add("ent-abc"), 0, 3),
|
||||
(doc.vocab.strings.add("ent-d"), 3, 4),
|
||||
]
|
||||
assert doc[0].ent_iob_ == "B"
|
||||
assert doc[1].ent_iob_ == "I"
|
||||
assert doc[2].ent_iob_ == "I"
|
||||
|
@ -110,33 +126,37 @@ def test_spans_entity_merge_iob():
|
|||
|
||||
|
||||
def test_spans_sentence_update_after_merge(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
|
||||
deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
|
||||
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
|
||||
'compound', 'dobj', 'punct']
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
sent1, sent2 = list(doc.sents)
|
||||
init_len = len(sent1)
|
||||
init_len2 = len(sent2)
|
||||
doc[0:2].merge(label='none', lemma='none', ent_type='none')
|
||||
doc[-2:].merge(label='none', lemma='none', ent_type='none')
|
||||
doc[0:2].merge(label="none", lemma="none", ent_type="none")
|
||||
doc[-2:].merge(label="none", lemma="none", ent_type="none")
|
||||
assert len(sent1) == init_len - 1
|
||||
assert len(sent2) == init_len2 - 1
|
||||
|
||||
|
||||
def test_spans_subtree_size_check(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
|
||||
deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
|
||||
'nsubj', 'relcl', 'prep', 'pobj', 'cc', 'conj', 'compound',
|
||||
'dobj']
|
||||
deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
|
||||
"nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
|
||||
"dobj"]
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
sent1 = list(doc.sents)[0]
|
||||
init_len = len(list(sent1.root.subtree))
|
||||
doc[0:2].merge(label='none', lemma='none', ent_type='none')
|
||||
doc[0:2].merge(label="none", lemma="none", ent_type="none")
|
||||
assert len(list(sent1.root.subtree)) == init_len - 1
|
||||
|
|
|
@ -13,31 +13,35 @@ from ..util import get_doc
|
|||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "This is a sentence. This is another sentence. And a third."
|
||||
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
|
||||
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
|
||||
'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
|
||||
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
|
||||
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
|
||||
def test_doc_token_api_strings(en_tokenizer):
|
||||
text = "Give it back! He pleaded."
|
||||
pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
|
||||
pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"]
|
||||
heads = [0, -1, -2, -3, 1, 0, -1]
|
||||
deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
|
||||
deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"]
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps)
|
||||
assert doc[0].orth_ == 'Give'
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[0].text_with_ws == 'Give '
|
||||
assert doc[0].lower_ == 'give'
|
||||
assert doc[0].shape_ == 'Xxxx'
|
||||
assert doc[0].prefix_ == 'G'
|
||||
assert doc[0].suffix_ == 'ive'
|
||||
assert doc[0].pos_ == 'VERB'
|
||||
assert doc[0].dep_ == 'ROOT'
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
|
||||
)
|
||||
assert doc[0].orth_ == "Give"
|
||||
assert doc[0].text == "Give"
|
||||
assert doc[0].text_with_ws == "Give "
|
||||
assert doc[0].lower_ == "give"
|
||||
assert doc[0].shape_ == "Xxxx"
|
||||
assert doc[0].prefix_ == "G"
|
||||
assert doc[0].suffix_ == "ive"
|
||||
assert doc[0].pos_ == "VERB"
|
||||
assert doc[0].dep_ == "ROOT"
|
||||
|
||||
|
||||
def test_doc_token_api_flags(en_tokenizer):
|
||||
|
@ -53,7 +57,7 @@ def test_doc_token_api_flags(en_tokenizer):
|
|||
# TODO: Test more of these, esp. if a bug is found
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Give it back! He pleaded."])
|
||||
@pytest.mark.parametrize("text", ["Give it back! He pleaded."])
|
||||
def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text):
|
||||
word = text.split()[0]
|
||||
en_tokenizer.vocab[word].prob = -1
|
||||
|
@ -61,11 +65,11 @@ def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text):
|
|||
assert tokens[0].prob != 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["one two"])
|
||||
@pytest.mark.parametrize("text", ["one two"])
|
||||
def test_doc_token_api_str_builtin(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert str(tokens[0]) == text.split(' ')[0]
|
||||
assert str(tokens[1]) == text.split(' ')[1]
|
||||
assert str(tokens[0]) == text.split(" ")[0]
|
||||
assert str(tokens[1]) == text.split(" ")[1]
|
||||
|
||||
|
||||
def test_doc_token_api_is_properties(en_vocab):
|
||||
|
@ -83,16 +87,16 @@ def test_doc_token_api_is_properties(en_vocab):
|
|||
def test_doc_token_api_vectors():
|
||||
vocab = Vocab()
|
||||
vocab.reset_vectors(width=2)
|
||||
vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
|
||||
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
|
||||
vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
|
||||
vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
|
||||
doc = Doc(vocab, words=["apples", "oranges", "oov"])
|
||||
assert doc.has_vector
|
||||
assert doc[0].has_vector
|
||||
assert doc[1].has_vector
|
||||
assert not doc[2].has_vector
|
||||
apples_norm = (0*0 + 2*2) ** 0.5
|
||||
oranges_norm = (0*0 + 1*1) ** 0.5
|
||||
cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm)
|
||||
apples_norm = (0 * 0 + 2 * 2) ** 0.5
|
||||
oranges_norm = (0 * 0 + 1 * 1) ** 0.5
|
||||
cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
|
||||
assert doc[0].similarity(doc[1]) == cosine
|
||||
|
||||
|
||||
|
@ -165,7 +169,7 @@ def test_doc_token_api_head_setter(en_tokenizer):
|
|||
|
||||
|
||||
def test_is_sent_start(en_tokenizer):
|
||||
doc = en_tokenizer('This is a sentence. This is another.')
|
||||
doc = en_tokenizer("This is a sentence. This is another.")
|
||||
assert doc[5].is_sent_start is None
|
||||
doc[5].is_sent_start = True
|
||||
assert doc[5].is_sent_start is True
|
||||
|
@ -174,17 +178,17 @@ def test_is_sent_start(en_tokenizer):
|
|||
|
||||
|
||||
def test_set_pos():
|
||||
doc = Doc(Vocab(), words=['hello', 'world'])
|
||||
doc[0].pos_ = 'NOUN'
|
||||
assert doc[0].pos_ == 'NOUN'
|
||||
doc = Doc(Vocab(), words=["hello", "world"])
|
||||
doc[0].pos_ = "NOUN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
doc[1].pos = VERB
|
||||
assert doc[1].pos_ == 'VERB'
|
||||
assert doc[1].pos_ == "VERB"
|
||||
|
||||
|
||||
def test_tokens_sent(doc):
|
||||
"""Test token.sent property"""
|
||||
assert len(list(doc.sents)) == 3
|
||||
assert doc[1].sent.text == 'This is a sentence .'
|
||||
assert doc[7].sent.text == 'This is another sentence .'
|
||||
assert doc[1].sent.root.left_edge.text == 'This'
|
||||
assert doc[7].sent.root.left_edge.text == 'This'
|
||||
assert doc[1].sent.text == "This is a sentence ."
|
||||
assert doc[7].sent.text == "This is another sentence ."
|
||||
assert doc[1].sent.root.left_edge.text == "This"
|
||||
assert doc[7].sent.root.left_edge.text == "This"
|
||||
|
|
|
@ -20,7 +20,7 @@ def test_doc_underscore_getattr_setattr():
|
|||
doc = Mock()
|
||||
doc.doc = doc
|
||||
doc.user_data = {}
|
||||
Underscore.doc_extensions['hello'] = (False, None, None, None)
|
||||
Underscore.doc_extensions["hello"] = (False, None, None, None)
|
||||
doc._ = Underscore(Underscore.doc_extensions, doc)
|
||||
assert doc._.hello == False
|
||||
doc._.hello = True
|
||||
|
@ -29,8 +29,9 @@ def test_doc_underscore_getattr_setattr():
|
|||
|
||||
def test_create_span_underscore():
|
||||
span = Mock(doc=Mock(), start=0, end=2)
|
||||
uscore = Underscore(Underscore.span_extensions, span,
|
||||
start=span.start, end=span.end)
|
||||
uscore = Underscore(
|
||||
Underscore.span_extensions, span, start=span.start, end=span.end
|
||||
)
|
||||
assert uscore._doc is span.doc
|
||||
assert uscore._start is span.start
|
||||
assert uscore._end is span.end
|
||||
|
@ -38,60 +39,70 @@ def test_create_span_underscore():
|
|||
|
||||
def test_span_underscore_getter_setter():
|
||||
span = Mock(doc=Mock(), start=0, end=2)
|
||||
Underscore.span_extensions['hello'] = (None, None,
|
||||
lambda s: (s.start, 'hi'),
|
||||
lambda s, value: setattr(s, 'start',
|
||||
value))
|
||||
span._ = Underscore(Underscore.span_extensions, span,
|
||||
start=span.start, end=span.end)
|
||||
Underscore.span_extensions["hello"] = (
|
||||
None,
|
||||
None,
|
||||
lambda s: (s.start, "hi"),
|
||||
lambda s, value: setattr(s, "start", value),
|
||||
)
|
||||
span._ = Underscore(
|
||||
Underscore.span_extensions, span, start=span.start, end=span.end
|
||||
)
|
||||
|
||||
assert span._.hello == (0, 'hi')
|
||||
assert span._.hello == (0, "hi")
|
||||
span._.hello = 1
|
||||
assert span._.hello == (1, 'hi')
|
||||
assert span._.hello == (1, "hi")
|
||||
|
||||
|
||||
def test_token_underscore_method():
|
||||
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
|
||||
Underscore.token_extensions['hello'] = (None, token.say_cheese,
|
||||
None, None)
|
||||
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: "cheese")
|
||||
Underscore.token_extensions["hello"] = (None, token.say_cheese, None, None)
|
||||
token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
|
||||
assert token._.hello() == 'cheese'
|
||||
assert token._.hello() == "cheese"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('obj', [Doc, Span, Token])
|
||||
@pytest.mark.parametrize("obj", [Doc, Span, Token])
|
||||
def test_doc_underscore_remove_extension(obj):
|
||||
ext_name = 'to_be_removed'
|
||||
ext_name = "to_be_removed"
|
||||
obj.set_extension(ext_name, default=False)
|
||||
assert obj.has_extension(ext_name)
|
||||
obj.remove_extension(ext_name)
|
||||
assert not obj.has_extension(ext_name)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('obj', [Doc, Span, Token])
|
||||
@pytest.mark.parametrize("obj", [Doc, Span, Token])
|
||||
def test_underscore_raises_for_dup(obj):
|
||||
obj.set_extension('test', default=None)
|
||||
obj.set_extension("test", default=None)
|
||||
with pytest.raises(ValueError):
|
||||
obj.set_extension('test', default=None)
|
||||
obj.set_extension("test", default=None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('invalid_kwargs', [
|
||||
{'getter': None, 'setter': lambda: None},
|
||||
{'default': None, 'method': lambda: None, 'getter': lambda: None},
|
||||
{'setter': lambda: None},
|
||||
{'default': None, 'method': lambda: None},
|
||||
{'getter': True}])
|
||||
@pytest.mark.parametrize(
|
||||
"invalid_kwargs",
|
||||
[
|
||||
{"getter": None, "setter": lambda: None},
|
||||
{"default": None, "method": lambda: None, "getter": lambda: None},
|
||||
{"setter": lambda: None},
|
||||
{"default": None, "method": lambda: None},
|
||||
{"getter": True},
|
||||
],
|
||||
)
|
||||
def test_underscore_raises_for_invalid(invalid_kwargs):
|
||||
invalid_kwargs['force'] = True
|
||||
invalid_kwargs["force"] = True
|
||||
with pytest.raises(ValueError):
|
||||
Doc.set_extension('test', **invalid_kwargs)
|
||||
Doc.set_extension("test", **invalid_kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('valid_kwargs', [
|
||||
{'getter': lambda: None},
|
||||
{'getter': lambda: None, 'setter': lambda: None},
|
||||
{'default': 'hello'},
|
||||
{'default': None},
|
||||
{'method': lambda: None}])
|
||||
@pytest.mark.parametrize(
|
||||
"valid_kwargs",
|
||||
[
|
||||
{"getter": lambda: None},
|
||||
{"getter": lambda: None, "setter": lambda: None},
|
||||
{"default": "hello"},
|
||||
{"default": None},
|
||||
{"method": lambda: None},
|
||||
],
|
||||
)
|
||||
def test_underscore_accepts_valid(valid_kwargs):
|
||||
valid_kwargs['force'] = True
|
||||
Doc.set_extension('test', **valid_kwargs)
|
||||
valid_kwargs["force"] = True
|
||||
Doc.set_extension("test", **valid_kwargs)
|
||||
|
|
|
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.parametrize('text', ["ق.م", "إلخ", "ص.ب", "ت."])
+@pytest.mark.parametrize("text", ["ق.م", "إلخ", "ص.ب", "ت."])
 def test_ar_tokenizer_handles_abbr(ar_tokenizer, text):
     tokens = ar_tokenizer(text)
     assert len(tokens) == 1

@@ -18,7 +18,7 @@ def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
     assert tokens[6].lemma_ == "قبل الميلاد"


-def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
+def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer):
     text = "يبلغ طول مضيق طارق 14كم "
     tokens = ar_tokenizer(text)
     assert len(tokens) == 6
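The second hunk above is the redefinition fix mentioned in the commit message: when two test functions in the same module share a name, the later def statement silently replaces the earlier one, so pytest only collects and runs the second test and a failure in the first goes unnoticed. A minimal, made-up illustration of the problem (not code from this repository):

def test_example():
    assert 1 + 1 == 3  # never runs: the definition below replaces this one


def test_example():  # noqa: F811 -- flake8 reports exactly this redefinition
    assert 1 + 1 == 2  # this is the only test_example that pytest collects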
|
@ -6,16 +6,22 @@ import pytest
|
|||
|
||||
TESTCASES = [
|
||||
# punctuation tests
|
||||
('আমি বাংলায় গান গাই!', ['আমি', 'বাংলায়', 'গান', 'গাই', '!']),
|
||||
('আমি বাংলায় কথা কই।', ['আমি', 'বাংলায়', 'কথা', 'কই', '।']),
|
||||
('বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?', ['বসুন্ধরা', 'জনসম্মুখে', 'দোষ', 'স্বীকার', 'করলো', 'না', '?']),
|
||||
('টাকা থাকলে কি না হয়!', ['টাকা', 'থাকলে', 'কি', 'না', 'হয়', '!']),
|
||||
("আমি বাংলায় গান গাই!", ["আমি", "বাংলায়", "গান", "গাই", "!"]),
|
||||
("আমি বাংলায় কথা কই।", ["আমি", "বাংলায়", "কথা", "কই", "।"]),
|
||||
(
|
||||
"বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?",
|
||||
["বসুন্ধরা", "জনসম্মুখে", "দোষ", "স্বীকার", "করলো", "না", "?"],
|
||||
),
|
||||
("টাকা থাকলে কি না হয়!", ["টাকা", "থাকলে", "কি", "না", "হয়", "!"]),
|
||||
# abbreviations
|
||||
('ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।', ['ডঃ', 'খালেদ', 'বললেন', 'ঢাকায়', '৩৫', 'ডিগ্রি', 'সে.', '।'])
|
||||
(
|
||||
"ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।",
|
||||
["ডঃ", "খালেদ", "বললেন", "ঢাকায়", "৩৫", "ডিগ্রি", "সে.", "।"],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
|
||||
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
|
||||
tokens = bn_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
|
|
|
@ -4,19 +4,19 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."])
|
||||
@pytest.mark.parametrize("text", ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."])
|
||||
def test_da_tokenizer_handles_abbr(da_tokenizer, text):
|
||||
tokens = da_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."])
|
||||
@pytest.mark.parametrize("text", ["Jul.", "jul.", "Tor.", "Tors."])
|
||||
def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
|
||||
tokens = da_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["1.", "10.", "31."])
|
||||
@pytest.mark.parametrize("text", ["1.", "10.", "31."])
|
||||
def test_da_tokenizer_handles_dates(da_tokenizer, text):
|
||||
tokens = da_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
@ -37,8 +37,9 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
|
|||
assert tokens[7].text == "."
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norm', [
|
||||
("akvarium", "akvarie"), ("bedstemoder", "bedstemor")])
|
||||
@pytest.mark.parametrize(
|
||||
"text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
|
||||
)
|
||||
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
|
||||
tokens = da_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -4,11 +4,15 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string,lemma', [
|
||||
('affaldsgruppernes', 'affaldsgruppe'),
|
||||
('detailhandelsstrukturernes', 'detailhandelsstruktur'),
|
||||
('kolesterols', 'kolesterol'),
|
||||
('åsyns', 'åsyn')])
|
||||
@pytest.mark.parametrize(
|
||||
"string,lemma",
|
||||
[
|
||||
("affaldsgruppernes", "affaldsgruppe"),
|
||||
("detailhandelsstrukturernes", "detailhandelsstruktur"),
|
||||
("kolesterols", "kolesterol"),
|
||||
("åsyns", "åsyn"),
|
||||
],
|
||||
)
|
||||
def test_da_lemmatizer_lookup_assigns(da_tokenizer, string, lemma):
|
||||
tokens = da_tokenizer(string)
|
||||
assert tokens[0].lemma_ == lemma
|
||||
|
|
|
@ -4,19 +4,19 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize('text', ["(under)"])
@pytest.mark.parametrize("text", ["(under)"])
def test_da_tokenizer_splits_no_special(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["ta'r", "Søren's", "Lars'"])
@pytest.mark.parametrize("text", ["ta'r", "Søren's", "Lars'"])
def test_da_tokenizer_handles_no_punct(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize('text', ["(ta'r"])
@pytest.mark.parametrize("text", ["(ta'r"])
def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2

@ -24,7 +24,7 @@ def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text):
    assert tokens[1].text == "ta'r"


@pytest.mark.parametrize('text', ["ta'r)"])
@pytest.mark.parametrize("text", ["ta'r)"])
def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2

@ -32,15 +32,16 @@ def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text):
    assert tokens[1].text == ")"


@pytest.mark.parametrize('text,expected', [
    ("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])])
@pytest.mark.parametrize(
    "text,expected", [("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])]
)
def test_da_tokenizer_splits_even_wrap(da_tokenizer, text, expected):
    tokens = da_tokenizer(text)
    assert len(tokens) == len(expected)
    assert [t.text for t in tokens] == expected


@pytest.mark.parametrize('text', ["(ta'r?)"])
@pytest.mark.parametrize("text", ["(ta'r?)"])
def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 4

@ -50,15 +51,17 @@ def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text):
    assert tokens[3].text == ")"


@pytest.mark.parametrize('text,expected', [
    ("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])])
@pytest.mark.parametrize(
    "text,expected",
    [("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])],
)
def test_da_tokenizer_splits_prefix_interact(da_tokenizer, text, expected):
    tokens = da_tokenizer(text)
    assert len(tokens) == len(expected)
    assert [t.text for t in tokens] == expected


@pytest.mark.parametrize('text', ["f.eks.)"])
@pytest.mark.parametrize("text", ["f.eks.)"])
def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2

@ -66,7 +69,7 @@ def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text):
    assert tokens[1].text == ")"


@pytest.mark.parametrize('text', ["(f.eks.)"])
@pytest.mark.parametrize("text", ["(f.eks.)"])
def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3

@ -75,7 +78,7 @@ def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text):
    assert tokens[2].text == ")"


@pytest.mark.parametrize('text', ["(f.eks.?)"])
@pytest.mark.parametrize("text", ["(f.eks.?)"])
def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 4

@ -85,19 +88,19 @@ def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text):
    assert tokens[3].text == ")"


@pytest.mark.parametrize('text', ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"])
@pytest.mark.parametrize("text", ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"])
def test_da_tokenizer_handles_numeric_range(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize('text', ["sort.Gul", "Hej.Verden"])
@pytest.mark.parametrize("text", ["sort.Gul", "Hej.Verden"])
def test_da_tokenizer_splits_period_infix(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["Hej,Verden", "en,to"])
@pytest.mark.parametrize("text", ["Hej,Verden", "en,to"])
def test_da_tokenizer_splits_comma_infix(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3

@ -106,20 +109,25 @@ def test_da_tokenizer_splits_comma_infix(da_tokenizer, text):
    assert tokens[2].text == text.split(",")[1]


@pytest.mark.parametrize('text', ["sort...Gul", "sort...gul"])
@pytest.mark.parametrize("text", ["sort...Gul", "sort...gul"])
def test_da_tokenizer_splits_ellipsis_infix(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ['gå-på-mod', '4-hjulstræk', '100-Pfennig-frimærke', 'TV-2-spots', 'trofæ-vaeggen'])
@pytest.mark.parametrize(
    "text",
    ["gå-på-mod", "4-hjulstræk", "100-Pfennig-frimærke", "TV-2-spots", "trofæ-vaeggen"],
)
def test_da_tokenizer_keeps_hyphens(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer):
    tokens = da_tokenizer("Mange regler--eksempelvis bindestregs-reglerne--er komplicerede.")
    tokens = da_tokenizer(
        "Mange regler--eksempelvis bindestregs-reglerne--er komplicerede."
    )
    assert len(tokens) == 9
    assert tokens[0].text == "Mange"
    assert tokens[1].text == "regler"

@ -132,7 +140,9 @@ def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer):


def test_da_tokenizer_handles_posessives_and_contractions(da_tokenizer):
    tokens = da_tokenizer("'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun.")
    tokens = da_tokenizer(
        "'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun."
    )
    assert len(tokens) == 25
    assert tokens[0].text == "'"
    assert tokens[1].text == "DBA's"

@ -15,17 +15,29 @@ Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der
    assert len(tokens) == 84


@pytest.mark.parametrize('text,match', [
    ('10', True), ('1', True), ('10.000', True), ('10.00', True),
    ('999,0', True), ('en', True), ('treoghalvfemsindstyvende', True), ('hundrede', True),
    ('hund', False), (',', False), ('1/2', True)])
@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10.000", True),
        ("10.00", True),
        ("999,0", True),
        ("en", True),
        ("treoghalvfemsindstyvende", True),
        ("hundrede", True),
        ("hund", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(da_tokenizer, text, match):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match


@pytest.mark.parametrize('word', ['elleve', 'første'])
@pytest.mark.parametrize("word", ["elleve", "første"])
def test_da_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())

@ -4,13 +4,13 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
@pytest.mark.parametrize("text", ["auf'm", "du's", "über'm", "wir's"])
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 2


@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
@pytest.mark.parametrize("text", ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 1

@ -24,14 +24,16 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
    assert tokens[2].lemma_ == "zur Zeit"


@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
@pytest.mark.parametrize(
    "text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
)
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
    tokens = de_tokenizer(text)
    assert [token.norm_ for token in tokens] == norms


@pytest.mark.xfail
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
    tokens = de_tokenizer(text)
    assert tokens[0].norm_ == norm

@ -4,13 +4,17 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize('string,lemma', [
    ('Abgehängten', 'Abgehängte'),
    ('engagierte', 'engagieren'),
    ('schließt', 'schließen'),
    ('vorgebenden', 'vorgebend'),
    ('die', 'der'),
    ('Die', 'der')])
@pytest.mark.parametrize(
    "string,lemma",
    [
        ("Abgehängten", "Abgehängte"),
        ("engagierte", "engagieren"),
        ("schließt", "schließen"),
        ("vorgebenden", "vorgebend"),
        ("die", "der"),
        ("Die", "der"),
    ],
)
def test_de_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
    tokens = de_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@ -7,10 +7,12 @@ from ...util import get_doc
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
    text = "Eine Tasse steht auf dem Tisch."
    heads = [1, 1, 0, -1, 1, -2, -4]
    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
    tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "$."]
    deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"]
    tokens = de_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "Eine Tasse "

@ -20,10 +22,12 @@ def test_de_parser_noun_chunks_standard_de(de_tokenizer):
def test_de_extended_chunk(de_tokenizer):
    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
    tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "NN", "NN", "$."]
    deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"]
    tokens = de_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "Die Sängerin "

@ -4,79 +4,79 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize('text', ["(unter)"])
@pytest.mark.parametrize("text", ["(unter)"])
def test_de_tokenizer_splits_no_special(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["unter'm"])
@pytest.mark.parametrize("text", ["unter'm"])
def test_de_tokenizer_splits_no_punct(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 2


@pytest.mark.parametrize('text', ["(unter'm"])
@pytest.mark.parametrize("text", ["(unter'm"])
def test_de_tokenizer_splits_prefix_punct(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["unter'm)"])
@pytest.mark.parametrize("text", ["unter'm)"])
def test_de_tokenizer_splits_suffix_punct(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["(unter'm)"])
@pytest.mark.parametrize("text", ["(unter'm)"])
def test_de_tokenizer_splits_even_wrap(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 4


@pytest.mark.parametrize('text', ["(unter'm?)"])
@pytest.mark.parametrize("text", ["(unter'm?)"])
def test_de_tokenizer_splits_uneven_wrap(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 5


@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
@pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
def test_de_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
    tokens = de_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize('text', ["z.B.)"])
@pytest.mark.parametrize("text", ["z.B.)"])
def test_de_tokenizer_splits_suffix_interact(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 2


@pytest.mark.parametrize('text', ["(z.B.)"])
@pytest.mark.parametrize("text", ["(z.B.)"])
def test_de_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["(z.B.?)"])
@pytest.mark.parametrize("text", ["(z.B.?)"])
def test_de_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 4


@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_de_tokenizer_splits_numeric_range(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
@pytest.mark.parametrize("text", ["blau.Rot", "Hallo.Welt"])
def test_de_tokenizer_splits_period_infix(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
@pytest.mark.parametrize("text", ["Hallo,Welt", "eins,zwei"])
def test_de_tokenizer_splits_comma_infix(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3

@ -85,13 +85,13 @@ def test_de_tokenizer_splits_comma_infix(de_tokenizer, text):
    assert tokens[2].text == text.split(",")[1]


@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
@pytest.mark.parametrize("text", ["blau...Rot", "blau...rot"])
def test_de_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
@pytest.mark.parametrize("text", ["Islam-Konferenz", "Ost-West-Konflikt"])
def test_de_tokenizer_keeps_hyphens(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 1

@ -22,19 +22,27 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
    assert len(tokens) == 109


@pytest.mark.parametrize('text', [
    "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
    "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
    "Kraftfahrzeug-Haftpflichtversicherung",
    "Vakuum-Mittelfrequenz-Induktionsofen"])
@pytest.mark.parametrize(
    "text",
    [
        "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
        "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
        "Kraftfahrzeug-Haftpflichtversicherung",
        "Vakuum-Mittelfrequenz-Induktionsofen",
    ],
)
def test_de_tokenizer_handles_long_words(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize('text,length', [
    ("»Was ist mit mir geschehen?«, dachte er.", 12),
    ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)])
@pytest.mark.parametrize(
    "text,length",
    [
        ("»Was ist mit mir geschehen?«, dachte er.", 12),
        ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15),
    ],
)
def test_de_tokenizer_handles_examples(de_tokenizer, text, length):
    tokens = de_tokenizer(text)
    assert len(tokens) == length

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize('text', ["αριθ.", "τρισ.", "δισ.", "σελ."])
@pytest.mark.parametrize("text", ["αριθ.", "τρισ.", "δισ.", "σελ."])
def test_el_tokenizer_handles_abbr(el_tokenizer, text):
    tokens = el_tokenizer(text)
    assert len(tokens) == 1

@ -13,12 +13,22 @@ def test_el_tokenizer_handles_long_text(el_tokenizer):
    assert len(tokens) == 54


@pytest.mark.parametrize('text,length',[
    ("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8),
    ("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10),
    ("Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.", 19),
    ("Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.", 15),
    ("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9)])
def test_el_tokenizer_handles_cnts(el_tokenizer,text, length):
@pytest.mark.parametrize(
    "text,length",
    [
        ("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8),
        ("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10),
        (
            "Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.",
            19,
        ),
        (
            "Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.",
            15,
        ),
        ("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9),
    ],
)
def test_el_tokenizer_handles_cnts(el_tokenizer, text, length):
    tokens = el_tokenizer(text)
    assert len(tokens) == length

@ -12,29 +12,66 @@ from spacy.util import compile_infix_regex
|
|||
def custom_en_tokenizer(en_vocab):
|
||||
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
|
||||
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
|
||||
custom_infixes = ['\.\.\.+',
|
||||
'(?<=[0-9])-(?=[0-9])',
|
||||
# '(?<=[0-9]+),(?=[0-9]+)',
|
||||
'[0-9]+(,[0-9]+)+',
|
||||
'[\[\]!&:,()\*—–\/-]']
|
||||
custom_infixes = [
|
||||
"\.\.\.+",
|
||||
"(?<=[0-9])-(?=[0-9])",
|
||||
# '(?<=[0-9]+),(?=[0-9]+)',
|
||||
"[0-9]+(,[0-9]+)+",
|
||||
"[\[\]!&:,()\*—–\/-]",
|
||||
]
|
||||
infix_re = compile_infix_regex(custom_infixes)
|
||||
return Tokenizer(en_vocab,
|
||||
English.Defaults.tokenizer_exceptions,
|
||||
prefix_re.search,
|
||||
suffix_re.search,
|
||||
infix_re.finditer,
|
||||
token_match=None)
|
||||
return Tokenizer(
|
||||
en_vocab,
|
||||
English.Defaults.tokenizer_exceptions,
|
||||
prefix_re.search,
|
||||
suffix_re.search,
|
||||
infix_re.finditer,
|
||||
token_match=None,
|
||||
)
|
||||
|
||||
|
||||
def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
|
||||
sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion."
|
||||
context = [word.text for word in custom_en_tokenizer(sentence)]
|
||||
assert context == ['The', '8', 'and', '10', '-', 'county', 'definitions',
|
||||
'are', 'not', 'used', 'for', 'the', 'greater',
|
||||
'Southern', 'California', 'Megaregion', '.']
|
||||
assert context == [
|
||||
"The",
|
||||
"8",
|
||||
"and",
|
||||
"10",
|
||||
"-",
|
||||
"county",
|
||||
"definitions",
|
||||
"are",
|
||||
"not",
|
||||
"used",
|
||||
"for",
|
||||
"the",
|
||||
"greater",
|
||||
"Southern",
|
||||
"California",
|
||||
"Megaregion",
|
||||
".",
|
||||
]
|
||||
# the trailing '-' may cause Assertion Error
|
||||
sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion."
|
||||
context = [word.text for word in custom_en_tokenizer(sentence)]
|
||||
assert context == ['The', '8', '-', 'and', '10', '-', 'county',
|
||||
'definitions', 'are', 'not', 'used', 'for', 'the',
|
||||
'greater', 'Southern', 'California', 'Megaregion', '.']
|
||||
assert context == [
|
||||
"The",
|
||||
"8",
|
||||
"-",
|
||||
"and",
|
||||
"10",
|
||||
"-",
|
||||
"county",
|
||||
"definitions",
|
||||
"are",
|
||||
"not",
|
||||
"used",
|
||||
"for",
|
||||
"the",
|
||||
"greater",
|
||||
"Southern",
|
||||
"California",
|
||||
"Megaregion",
|
||||
".",
|
||||
]
|
||||
|
|
|
@ -15,13 +15,15 @@ def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
|
|||
assert tokens[4].text == "!"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
|
||||
@pytest.mark.parametrize("text", ["`ain't", """"isn't""", "can't!"])
|
||||
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
|
||||
@pytest.mark.parametrize(
|
||||
"text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")]
|
||||
)
|
||||
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
|
||||
tokens = en_tokenizer(text_poss)
|
||||
assert len(tokens) == 2
|
||||
|
@ -29,7 +31,7 @@ def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
|
|||
assert tokens[1].text == "'s"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
|
||||
@pytest.mark.parametrize("text", ["schools'", "Alexis'"])
|
||||
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
@ -37,14 +39,14 @@ def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
|
|||
assert tokens[1].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
|
||||
@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"])
|
||||
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
|
||||
@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"])
|
||||
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
@ -53,7 +55,9 @@ def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
|
|||
assert tokens[1].lemma_ == "will"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
|
||||
@pytest.mark.parametrize(
|
||||
"text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")]
|
||||
)
|
||||
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
|
||||
tokens_lower = en_tokenizer(text_lower)
|
||||
tokens_title = en_tokenizer(text_title)
|
||||
|
@ -62,21 +66,23 @@ def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_titl
|
|||
assert tokens_lower[1].text == tokens_title[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
|
||||
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
|
||||
@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"])
|
||||
@pytest.mark.parametrize("contraction", ["'ll", "'d"])
|
||||
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
|
||||
tokens = en_tokenizer(pron + contraction)
|
||||
assert tokens[0].text == pron
|
||||
assert tokens[1].text == contraction
|
||||
|
||||
|
||||
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
|
||||
@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"])
|
||||
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
|
||||
tokens = en_tokenizer(exc)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
|
||||
@pytest.mark.parametrize(
|
||||
"wo_punct,w_punct", [("We've", "``We've"), ("couldn't", "couldn't)")]
|
||||
)
|
||||
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
|
||||
tokens = en_tokenizer(wo_punct)
|
||||
assert len(tokens) == 2
|
||||
|
@ -84,7 +90,7 @@ def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
|
|||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
|
||||
@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
|
||||
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
@ -97,20 +103,24 @@ def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
|
|||
assert tokens[3].text == "i.e."
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
|
||||
@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"])
|
||||
def test_en_tokenizer_handles_times(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[1].lemma_ in ["a.m.", "p.m."]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]
|
||||
)
|
||||
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
||||
tokens = en_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
|
||||
@pytest.mark.parametrize(
|
||||
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
|
||||
)
|
||||
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -12,14 +12,25 @@ from ...util import get_doc
|
|||
def test_en_noun_chunks_not_nested(en_tokenizer):
|
||||
text = "Peter has chronic command and control issues"
|
||||
heads = [1, 0, 4, 3, -1, -2, -5]
|
||||
deps = ['nsubj', 'ROOT', 'amod', 'nmod', 'cc', 'conj', 'dobj']
|
||||
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
tokens.from_array(
|
||||
[HEAD, DEP],
|
||||
numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
|
||||
[-2, conj], [-5, dobj]], dtype='uint64'))
|
||||
tokens.noun_chunks_iterator = SYNTAX_ITERATORS['noun_chunks']
|
||||
numpy.asarray(
|
||||
[
|
||||
[1, nsubj],
|
||||
[0, root],
|
||||
[4, amod],
|
||||
[3, nmod],
|
||||
[-1, cc],
|
||||
[-2, conj],
|
||||
[-5, dobj],
|
||||
],
|
||||
dtype="uint64",
|
||||
),
|
||||
)
|
||||
tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
|
||||
word_occurred = {}
|
||||
for chunk in tokens.noun_chunks:
|
||||
for word in chunk:
|
||||
|
|
|
@ -7,22 +7,28 @@ from ...util import get_doc
|
|||
def test_en_parser_noun_chunks_standard(en_tokenizer):
|
||||
text = "A base phrase should be recognized."
|
||||
heads = [2, 1, 3, 2, 1, 0, -1]
|
||||
tags = ['DT', 'JJ', 'NN', 'MD', 'VB', 'VBN', '.']
|
||||
deps = ['det', 'amod', 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'punct']
|
||||
tags = ["DT", "JJ", "NN", "MD", "VB", "VBN", "."]
|
||||
deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
|
||||
)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 1
|
||||
assert chunks[0].text_with_ws == "A base phrase "
|
||||
|
||||
|
||||
def test_en_parser_noun_chunks_coordinated(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "A base phrase and a good phrase are often the same."
|
||||
heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
|
||||
tags = ['DT', 'NN', 'NN', 'CC', 'DT', 'JJ', 'NN', 'VBP', 'RB', 'DT', 'JJ', '.']
|
||||
deps = ['det', 'compound', 'nsubj', 'cc', 'det', 'amod', 'conj', 'ROOT', 'advmod', 'det', 'attr', 'punct']
|
||||
tags = ["DT", "NN", "NN", "CC", "DT", "JJ", "NN", "VBP", "RB", "DT", "JJ", "."]
|
||||
deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
|
||||
)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 2
|
||||
assert chunks[0].text_with_ws == "A base phrase "
|
||||
|
@ -32,10 +38,12 @@ def test_en_parser_noun_chunks_coordinated(en_tokenizer):
|
|||
def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
|
||||
text = "A phrase with another phrase occurs."
|
||||
heads = [1, 4, -1, 1, -2, 0, -1]
|
||||
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', '.']
|
||||
deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT', 'punct']
|
||||
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ", "."]
|
||||
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
|
||||
)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 2
|
||||
assert chunks[0].text_with_ws == "A phrase "
|
||||
|
@ -43,12 +51,16 @@ def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
|
|||
|
||||
|
||||
def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Sam, my brother, arrived to the house."
|
||||
heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
|
||||
tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.']
|
||||
deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct']
|
||||
tags = ["NNP", ",", "PRP$", "NN", ",", "VBD", "IN", "DT", "NN", "."]
|
||||
deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
|
||||
)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 3
|
||||
assert chunks[0].text_with_ws == "Sam "
|
||||
|
@ -59,10 +71,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
|
|||
def test_en_parser_noun_chunks_dative(en_tokenizer):
|
||||
text = "She gave Bob a raise."
|
||||
heads = [1, 0, -1, 1, -3, -4]
|
||||
tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.']
|
||||
deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct']
|
||||
tags = ["PRP", "VBD", "NNP", "DT", "NN", "."]
|
||||
deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
|
||||
)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 3
|
||||
assert chunks[0].text_with_ws == "She "
|
||||
|
|
|
@ -4,85 +4,85 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can)"])
|
||||
@pytest.mark.parametrize("text", ["(can)"])
|
||||
def test_en_tokenizer_splits_no_special(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["can't"])
|
||||
@pytest.mark.parametrize("text", ["can't"])
|
||||
def test_en_tokenizer_splits_no_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can't"])
|
||||
@pytest.mark.parametrize("text", ["(can't"])
|
||||
def test_en_tokenizer_splits_prefix_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["can't)"])
|
||||
@pytest.mark.parametrize("text", ["can't)"])
|
||||
def test_en_tokenizer_splits_suffix_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can't)"])
|
||||
@pytest.mark.parametrize("text", ["(can't)"])
|
||||
def test_en_tokenizer_splits_even_wrap(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can't?)"])
|
||||
@pytest.mark.parametrize("text", ["(can't?)"])
|
||||
def test_en_tokenizer_splits_uneven_wrap(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
|
||||
@pytest.mark.parametrize("text,length", [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
|
||||
def test_en_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["U.S.)"])
|
||||
@pytest.mark.parametrize("text", ["U.S.)"])
|
||||
def test_en_tokenizer_splits_suffix_interact(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(U.S.)"])
|
||||
@pytest.mark.parametrize("text", ["(U.S.)"])
|
||||
def test_en_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(U.S.?)"])
|
||||
@pytest.mark.parametrize("text", ["(U.S.?)"])
|
||||
def test_en_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["best-known"])
|
||||
@pytest.mark.parametrize("text", ["best-known"])
|
||||
def test_en_tokenizer_splits_hyphens(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
def test_en_tokenizer_splits_numeric_range(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
|
||||
@pytest.mark.parametrize("text", ["best.Known", "Hello.World"])
|
||||
def test_en_tokenizer_splits_period_infix(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
|
||||
@pytest.mark.parametrize("text", ["Hello,world", "one,two"])
|
||||
def test_en_tokenizer_splits_comma_infix(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
@ -91,7 +91,7 @@ def test_en_tokenizer_splits_comma_infix(en_tokenizer, text):
|
|||
assert tokens[2].text == text.split(",")[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["best...Known", "best...known"])
|
||||
@pytest.mark.parametrize("text", ["best...Known", "best...known"])
|
||||
def test_en_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
@ -126,8 +126,10 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
|
|||
@pytest.mark.xfail
|
||||
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
|
||||
# Re Issue #225
|
||||
tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """
|
||||
"""you'll have to walk there.\u2014Ariel.""")
|
||||
tokens = en_tokenizer(
|
||||
"""Will this road take me to Puddleton?\u2014No, """
|
||||
"""you'll have to walk there.\u2014Ariel."""
|
||||
)
|
||||
assert tokens[6].text == "Puddleton"
|
||||
assert tokens[7].text == "?"
|
||||
assert tokens[8].text == "\u2014"
|
||||
|
|
|
@ -6,19 +6,19 @@ from spacy.util import compile_prefix_regex
|
|||
from spacy.lang.punctuation import TOKENIZER_PREFIXES
|
||||
|
||||
|
||||
PUNCT_OPEN = ['(', '[', '{', '*']
|
||||
PUNCT_CLOSE = [')', ']', '}', '*']
|
||||
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
||||
PUNCT_OPEN = ["(", "[", "{", "*"]
|
||||
PUNCT_CLOSE = [")", "]", "}", "*"]
|
||||
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(", "((", "<"])
|
||||
@pytest.mark.parametrize("text", ["(", "((", "<"])
|
||||
def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == len(text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(punct + text)
|
||||
assert len(tokens) == 2
|
||||
|
@ -26,8 +26,8 @@ def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
|||
assert tokens[1].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(text + punct)
|
||||
assert len(tokens) == 2
|
||||
|
@ -35,9 +35,9 @@ def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
|||
assert tokens[1].text == punct
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('punct_add', ["`"])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||
@pytest.mark.parametrize("punct_add", ["`"])
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
|
||||
tokens = en_tokenizer(punct + punct_add + text)
|
||||
assert len(tokens) == 3
|
||||
|
@ -46,9 +46,9 @@ def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add,
|
|||
assert tokens[2].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('punct_add', ["'"])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize("punct_add", ["'"])
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
|
||||
tokens = en_tokenizer(text + punct + punct_add)
|
||||
assert len(tokens) == 3
|
||||
|
@ -57,8 +57,8 @@ def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add
|
|||
assert tokens[2].text == punct_add
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(punct + punct + punct + text)
|
||||
assert len(tokens) == 4
|
||||
|
@ -66,8 +66,8 @@ def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
|||
assert tokens[3].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(text + punct + punct + punct)
|
||||
assert len(tokens) == 4
|
||||
|
@ -75,14 +75,14 @@ def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
|||
assert tokens[1].text == punct
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'The"])
|
||||
@pytest.mark.parametrize("text", ["'The"])
|
||||
def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Hello''"])
|
||||
@pytest.mark.parametrize("text", ["Hello''"])
|
||||
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
@ -90,10 +90,11 @@ def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
|||
assert len(tokens_punct) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
||||
punct_close, text):
|
||||
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_splits_open_close_punct(
|
||||
en_tokenizer, punct_open, punct_close, text
|
||||
):
|
||||
tokens = en_tokenizer(punct_open + text + punct_close)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == punct_open
|
||||
|
@ -101,11 +102,12 @@ def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
|||
assert tokens[2].text == punct_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
||||
punct_open2, punct_close2, text):
|
||||
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_en_tokenizer_two_diff_punct(
|
||||
en_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
|
||||
):
|
||||
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
||||
assert len(tokens) == 5
|
||||
assert tokens[0].text == punct_open2
|
||||
|
@ -115,7 +117,7 @@ def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
|||
assert tokens[4].text == punct_close2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
|
||||
@pytest.mark.parametrize("text,punct", [("(can't", "(")])
|
||||
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
|
||||
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||
match = en_search_prefixes(text)
|
||||
|
|
|
@ -6,8 +6,8 @@ import pytest
|
|||
from ...util import get_doc, apply_transition_sequence
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["A test sentence"])
|
||||
@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
|
||||
@pytest.mark.parametrize("text", ["A test sentence"])
|
||||
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
|
||||
def test_en_sbd_single_punct(en_tokenizer, text, punct):
|
||||
heads = [2, 1, 0, -1] if punct else [2, 1, 0]
|
||||
tokens = en_tokenizer(text + punct)
|
||||
|
@ -19,16 +19,18 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
|
|||
|
||||
@pytest.mark.xfail
|
||||
def test_en_sentence_breaks(en_tokenizer, en_parser):
|
||||
# fmt: off
|
||||
text = "This is a sentence . This is another one ."
|
||||
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
|
||||
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
|
||||
'attr', 'punct']
|
||||
transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
|
||||
'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
|
||||
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
|
||||
"attr", "punct"]
|
||||
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT",
|
||||
"L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
apply_transition_sequence(en_parser, doc, transition)
|
||||
assert len(list(doc.sents)) == 2
|
||||
for token in doc:
|
||||
assert token.dep != 0 or token.is_space
|
||||
assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
|
||||
assert [token.head.i for token in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
|
||||
|
|
|
@ -6,10 +6,10 @@ from ...util import get_doc
|
|||
|
||||
def test_en_tagger_load_morph_exc(en_tokenizer):
|
||||
text = "I like his style."
|
||||
tags = ['PRP', 'VBP', 'PRP$', 'NN', '.']
|
||||
morph_exc = {'VBP': {'like': {'lemma': 'luck'}}}
|
||||
tags = ["PRP", "VBP", "PRP$", "NN", "."]
|
||||
morph_exc = {"VBP": {"like": {"lemma": "luck"}}}
|
||||
en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags)
|
||||
assert doc[1].tag_ == 'VBP'
|
||||
assert doc[1].lemma_ == 'luck'
|
||||
assert doc[1].tag_ == "VBP"
|
||||
assert doc[1].lemma_ == "luck"
|
||||
|
|
|
@ -20,30 +20,48 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
|||
assert len(tokens) == 76
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
("The U.S. Army likes Shock and Awe.", 8),
|
||||
("U.N. regulations are not a part of their concern.", 10),
|
||||
("“Isn't it?”", 6),
|
||||
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
|
||||
("""'Me too!', Mr. P. Delaware cried. """, 11),
|
||||
("They ran about 10km.", 6),
|
||||
pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
|
||||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
("The U.S. Army likes Shock and Awe.", 8),
|
||||
("U.N. regulations are not a part of their concern.", 10),
|
||||
("“Isn't it?”", 6),
|
||||
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
|
||||
("""'Me too!', Mr. P. Delaware cried. """, 11),
|
||||
("They ran about 10km.", 6),
|
||||
pytest.param(
|
||||
"But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [
|
||||
('10', True), ('1', True), ('10,000', True), ('10,00', True),
|
||||
('999.0', True), ('one', True), ('two', True), ('billion', True),
|
||||
('dog', False), (',', False), ('1/2', True)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("10,000", True),
|
||||
("10,00", True),
|
||||
("999.0", True),
|
||||
("one", True),
|
||||
("two", True),
|
||||
("billion", True),
|
||||
("dog", False),
|
||||
(",", False),
|
||||
("1/2", True),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_number(en_tokenizer, text, match):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['eleven'])
|
||||
@pytest.mark.parametrize("word", ["eleven"])
|
||||
def test_en_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
||||
|
|
|
@ -4,11 +4,15 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,lemma', [
|
||||
("aprox.", "aproximadamente"),
|
||||
("esq.", "esquina"),
|
||||
("pág.", "página"),
|
||||
("p.ej.", "por ejemplo")])
|
||||
@pytest.mark.parametrize(
|
||||
"text,lemma",
|
||||
[
|
||||
("aprox.", "aproximadamente"),
|
||||
("esq.", "esquina"),
|
||||
("pág.", "página"),
|
||||
("p.ej.", "por ejemplo"),
|
||||
],
|
||||
)
|
||||
def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma):
|
||||
tokens = es_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -20,12 +20,16 @@ en Montevideo y que pregona las bondades de la vida austera."""
|
|||
assert len(tokens) == 90
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
("¿Por qué José Mujica?", 6),
|
||||
("“¿Oh no?”", 6),
|
||||
("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
|
||||
("Corrieron aprox. 10km.", 5),
|
||||
("Y entonces por qué...", 5)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
("¿Por qué José Mujica?", 6),
|
||||
("“¿Oh no?”", 6),
|
||||
("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
|
||||
("Corrieron aprox. 10km.", 5),
|
||||
("Y entonces por qué...", 5),
|
||||
],
|
||||
)
|
||||
def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
|
||||
tokens = es_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
|
@ -5,12 +5,15 @@ import pytest
|
|||
|
||||
|
||||
ABBREVIATION_TESTS = [
|
||||
('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
|
||||
('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
|
||||
(
|
||||
"Hyvää uutta vuotta t. siht. Niemelä!",
|
||||
["Hyvää", "uutta", "vuotta", "t.", "siht.", "Niemelä", "!"],
|
||||
),
|
||||
("Paino on n. 2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', ABBREVIATION_TESTS)
|
||||
@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
|
||||
def test_fi_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
|
||||
tokens = fi_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
|
|
|
@ -2,26 +2,26 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from .... import util
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def fr_tokenizer():
|
||||
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [
|
||||
"aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"])
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"]
|
||||
)
|
||||
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||
tokens = fr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,lemma', [
|
||||
("janv.", "janvier"),
|
||||
("juill.", "juillet"),
|
||||
("Dr.", "docteur"),
|
||||
("av.", "avant"),
|
||||
("sept.", "septembre")])
|
||||
@pytest.mark.parametrize(
|
||||
"text,lemma",
|
||||
[
|
||||
("janv.", "janvier"),
|
||||
("juill.", "juillet"),
|
||||
("Dr.", "docteur"),
|
||||
("av.", "avant"),
|
||||
("sept.", "septembre"),
|
||||
],
|
||||
)
|
||||
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
|
||||
tokens = fr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
@ -57,6 +57,7 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
    assert tokens[2].lemma_ == "ce"


@pytest.mark.xfail
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
    text = "Est-ce pas génial?"
    tokens = fr_tokenizer(text)

@ -65,7 +66,7 @@ def test_fr_tokenizer_handles_title_2(fr_tokenizer):
    assert tokens[0].lemma_ == "être"


def test_fr_tokenizer_handles_title_2(fr_tokenizer):
def test_fr_tokenizer_handles_title_3(fr_tokenizer):
    text = "Qu'est-ce que tu fais?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 7

@ -16,7 +16,9 @@ def test_fr_lemmatizer_noun_verb_2(fr_tokenizer):
|
|||
assert tokens[4].lemma_ == "être"
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
|
||||
@pytest.mark.xfail(
|
||||
reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN"
|
||||
)
|
||||
def test_fr_lemmatizer_noun(fr_tokenizer):
|
||||
tokens = fr_tokenizer("il y a des Costaricienne.")
|
||||
assert tokens[4].lemma_ == "Costaricain"
|
||||
|
|
|
@ -7,11 +7,12 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES
|
|||
from spacy.lang.char_classes import ALPHA
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', [
|
||||
("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])])
|
||||
@pytest.mark.parametrize(
|
||||
"text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])]
|
||||
)
|
||||
def test_issue768(text, expected_tokens):
|
||||
"""Allow zero-width 'infix' token during the tokenization process."""
|
||||
SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA)
|
||||
SPLIT_INFIX = r"(?<=[{a}]\')(?=[{a}])".format(a=ALPHA)
|
||||
|
||||
class FrenchTest(Language):
|
||||
class Defaults(Language.Defaults):
|
||||
|
|
|
@ -1,13 +1,5 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
from .... import util
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def fr_tokenizer():
|
||||
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
import pytest
|
||||
from spacy.lang.fr.lex_attrs import like_num
|
||||
|
@ -27,7 +19,7 @@ ou avec un autre vrai humain."""
|
|||
assert len(tokens) == 113
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['onze', 'onzième'])
|
||||
@pytest.mark.parametrize("word", ["onze", "onzième"])
|
||||
def test_fr_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
||||
|
|
|
@ -4,13 +4,15 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
GA_TOKEN_EXCEPTION_TESTS = [
|
||||
('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']),
|
||||
('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
|
||||
("Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).", ["Niall", "Ó", "Domhnaill", ",", "Rialtas", "na", "hÉireann", "1977", "(", "lch.", "600", ")", "."]),
|
||||
("Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise", ["Daoine", "a", "bhfuil", "Gaeilge", "acu", ",", "m.sh.", "tusa", "agus", "mise"])
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS)
|
||||
@pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS)
|
||||
def test_ga_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
|
||||
tokens = ga_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
|
|
|
@@ -4,20 +4,41 @@ from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize('text,expected_tokens',
                         [('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])])
@pytest.mark.parametrize(
    "text,expected_tokens",
    [("פייתון היא שפת תכנות דינמית", ["פייתון", "היא", "שפת", "תכנות", "דינמית"])],
)
def test_he_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens):
    tokens = he_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list


@pytest.mark.parametrize('text,expected_tokens', [
    ('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.']),
    ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']),
    ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']),
    ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']),
    ('עקבת אחריו בכל רחבי המדינה...', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '...'])])
@pytest.mark.parametrize(
    "text,expected_tokens",
    [
        (
            "עקבת אחריו בכל רחבי המדינה.",
            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "."],
        ),
        (
            "עקבת אחריו בכל רחבי המדינה?",
            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "?"],
        ),
        (
            "עקבת אחריו בכל רחבי המדינה!",
            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "!"],
        ),
        (
            "עקבת אחריו בכל רחבי המדינה..",
            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", ".."],
        ),
        (
            "עקבת אחריו בכל רחבי המדינה...",
            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "..."],
        ),
    ],
)
def test_he_tokenizer_handles_punct(he_tokenizer, text, expected_tokens):
    tokens = he_tokenizer(text)
    assert expected_tokens == [token.text for token in tokens]
@@ -6,11 +6,11 @@ import pytest

DEFAULT_TESTS = [
    ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
    pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()),
    ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
    ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
    ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
    pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()),
    ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
    ('A pl.', ['A', 'pl.']),
    ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),

@@ -228,11 +228,11 @@ QUOTE_TESTS = [

DOT_TESTS = [
    ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
    pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()),
    ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
    ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
    ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
    pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()),
    ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
    ('A pl.', ['A', 'pl.']),
    ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
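
Note: the pytest.param(...) rewrite above is the pytest 4.0-compatible way to
xfail a single parametrize case; bare pytest.mark.xfail(...) entries inside the
list are no longer accepted. A minimal sketch (the hu_tokenizer fixture and the
expected tokens are assumed for illustration):

import pytest


@pytest.mark.parametrize(
    "text,expected",
    [
        ("A pl.", ["A", "pl."]),
        # Only this case is expected to fail; the rest of the table runs normally.
        pytest.param("A .hu.", ["A", ".hu", "."], marks=pytest.mark.xfail()),
    ],
)
def test_hu_tokenizer_example(hu_tokenizer, text, expected):
    assert [token.text for token in hu_tokenizer(text)] == expected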
@ -4,85 +4,87 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||
def test_id_tokenizer_splits_no_special(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Ma'arif"])
|
||||
@pytest.mark.parametrize("text", ["Ma'arif"])
|
||||
def test_id_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(Ma'arif"])
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif"])
|
||||
def test_id_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Ma'arif)"])
|
||||
@pytest.mark.parametrize("text", ["Ma'arif)"])
|
||||
def test_id_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||
def test_id_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif?)"])
|
||||
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||
@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||
def test_id_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["S.Kom.)"])
|
||||
@pytest.mark.parametrize("text", ["S.Kom.)"])
|
||||
def test_id_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(S.Kom.)"])
|
||||
@pytest.mark.parametrize("text", ["(S.Kom.)"])
|
||||
def test_id_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
|
||||
@pytest.mark.parametrize("text", ["(S.Kom.?)"])
|
||||
def test_id_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,length", [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)]
|
||||
)
|
||||
def test_id_tokenizer_splits_hyphens(id_tokenizer, text, length):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
def test_id_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
|
||||
@pytest.mark.parametrize("text", ["ini.Budi", "Halo.Bandung"])
|
||||
def test_id_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
|
||||
@pytest.mark.parametrize("text", ["Halo,Bandung", "satu,dua"])
|
||||
def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
@ -91,7 +93,7 @@ def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
|
|||
assert tokens[2].text == text.split(",")[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
|
||||
@pytest.mark.parametrize("text", ["halo...Bandung", "dia...pergi"])
|
||||
def test_id_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
|
|
@ -5,7 +5,7 @@ import pytest
|
|||
from spacy.lang.id.lex_attrs import like_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['sebelas'])
|
||||
@pytest.mark.parametrize("word", ["sebelas"])
|
||||
def test_id_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
||||
|
|
|
@ -4,12 +4,10 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word,lemma', [
|
||||
('新しく', '新しい'),
|
||||
('赤く', '赤い'),
|
||||
('すごく', '凄い'),
|
||||
('いただきました', '頂く'),
|
||||
('なった', '成る')])
|
||||
@pytest.mark.parametrize(
|
||||
"word,lemma",
|
||||
[("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")],
|
||||
)
|
||||
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
|
||||
test_lemma = ja_tokenizer(word)[0].lemma_
|
||||
assert test_lemma == lemma
|
||||
|
|
|
@@ -4,6 +4,7 @@ from __future__ import unicode_literals

import pytest


# fmt: off
TOKENIZER_TESTS = [
    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),

@@ -27,21 +28,22 @@ POS_TESTS = [
    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
]
# fmt: on


@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tokens):
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens


@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tags):
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
    tags = [token.tag_ for token in ja_tokenizer(text)]
    assert tags == expected_tags


@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_pos):
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
    pos = [token.pos_ for token in ja_tokenizer(text)]
    assert pos == expected_pos
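
Note: renaming the second and third functions to test_ja_tokenizer_tags and
test_ja_tokenizer_pos matters because Python silently rebinds duplicate names:
only the last definition of a test is collected, so failures in the earlier ones
go unreported. A minimal sketch of the failure mode (names are illustrative):

def test_example():
    assert False  # shadowed by the definition below, so it never runs


def test_example():  # noqa: F811 - redefinition; only this one is collected
    assert True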
@ -5,12 +5,18 @@ import pytest
|
|||
|
||||
|
||||
NB_TOKEN_EXCEPTION_TESTS = [
|
||||
('Smørsausen brukes bl.a. til fisk', ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']),
|
||||
('Jeg kommer først kl. 13 pga. diverse forsinkelser', ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse', 'forsinkelser'])
|
||||
(
|
||||
"Smørsausen brukes bl.a. til fisk",
|
||||
["Smørsausen", "brukes", "bl.a.", "til", "fisk"],
|
||||
),
|
||||
(
|
||||
"Jeg kommer først kl. 13 pga. diverse forsinkelser",
|
||||
["Jeg", "kommer", "først", "kl.", "13", "pga.", "diverse", "forsinkelser"],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS)
|
||||
@pytest.mark.parametrize("text,expected_tokens", NB_TOKEN_EXCEPTION_TESTS)
|
||||
def test_nb_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens):
|
||||
tokens = nb_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
|
|
|
@ -5,7 +5,7 @@ import pytest
|
|||
from spacy.lang.nl.lex_attrs import like_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['elf', 'elfde'])
|
||||
@pytest.mark.parametrize("word", ["elf", "elfde"])
|
||||
def test_nl_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
||||
|
|
|
@ -5,7 +5,7 @@ import pytest
|
|||
from spacy.lang.pt.lex_attrs import like_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['onze', 'quadragésimo'])
|
||||
@pytest.mark.parametrize("word", ["onze", "quadragésimo"])
|
||||
def test_pt_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
||||
|
|
|
@ -4,11 +4,15 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string,lemma', [
|
||||
('câini', 'câine'),
|
||||
('expedițiilor', 'expediție'),
|
||||
('pensete', 'pensetă'),
|
||||
('erau', 'fi')])
|
||||
@pytest.mark.parametrize(
|
||||
"string,lemma",
|
||||
[
|
||||
("câini", "câine"),
|
||||
("expedițiilor", "expediție"),
|
||||
("pensete", "pensetă"),
|
||||
("erau", "fi"),
|
||||
],
|
||||
)
|
||||
def test_ro_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
|
||||
tokens = ro_tokenizer(string)
|
||||
assert tokens[0].lemma_ == lemma
|
||||
|
|
|
@ -5,17 +5,20 @@ import pytest
|
|||
|
||||
|
||||
TEST_CASES = [
|
||||
('Adresa este str. Principală nr. 5.', ['Adresa', 'este', 'str.', 'Principală', 'nr.', '5', '.']),
|
||||
('Teste, etc.', ['Teste', ',', 'etc.']),
|
||||
('Lista, ș.a.m.d.', ['Lista', ',', 'ș.a.m.d.']),
|
||||
('Și d.p.d.v. al...', ['Și', 'd.p.d.v.', 'al', '...']),
|
||||
(
|
||||
"Adresa este str. Principală nr. 5.",
|
||||
["Adresa", "este", "str.", "Principală", "nr.", "5", "."],
|
||||
),
|
||||
("Teste, etc.", ["Teste", ",", "etc."]),
|
||||
("Lista, ș.a.m.d.", ["Lista", ",", "ș.a.m.d."]),
|
||||
("Și d.p.d.v. al...", ["Și", "d.p.d.v.", "al", "..."]),
|
||||
# number tests
|
||||
('Clasa a 4-a.', ['Clasa', 'a', '4-a', '.']),
|
||||
('Al 12-lea ceas.', ['Al', '12-lea', 'ceas', '.'])
|
||||
("Clasa a 4-a.", ["Clasa", "a", "4-a", "."]),
|
||||
("Al 12-lea ceas.", ["Al", "12-lea", "ceas", "."]),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', TEST_CASES)
|
||||
@pytest.mark.parametrize("text,expected_tokens", TEST_CASES)
|
||||
def test_ro_tokenizer_handles_testcases(ro_tokenizer, text, expected_tokens):
|
||||
tokens = ro_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
|
|
|
@ -4,10 +4,10 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [
|
||||
("пн.", ["понедельник"]),
|
||||
("пт.", ["пятница"]),
|
||||
("дек.", ["декабрь"])])
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms",
|
||||
[("пн.", ["понедельник"]), ("пт.", ["пятница"]), ("дек.", ["декабрь"])],
|
||||
)
|
||||
def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -9,55 +9,71 @@ from ...util import get_doc
|
|||
|
||||
@pytest.fixture
|
||||
def ru_lemmatizer():
|
||||
pymorphy = pytest.importorskip('pymorphy2')
|
||||
pymorphy = pytest.importorskip("pymorphy2")
|
||||
return Russian.Defaults.create_lemmatizer()
|
||||
|
||||
|
||||
def test_ru_doc_lemmatization(ru_tokenizer):
|
||||
words = ['мама', 'мыла', 'раму']
|
||||
tags = ['NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing',
|
||||
'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act',
|
||||
'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing']
|
||||
words = ["мама", "мыла", "раму"]
|
||||
tags = [
|
||||
"NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
|
||||
"VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
|
||||
"NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
|
||||
]
|
||||
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
|
||||
lemmas = [token.lemma_ for token in doc]
|
||||
assert lemmas == ['мама', 'мыть', 'рама']
|
||||
assert lemmas == ["мама", "мыть", "рама"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,lemmas', [
|
||||
('гвоздики', ['гвоздик', 'гвоздика']),
|
||||
('люди', ['человек']),
|
||||
('реки', ['река']),
|
||||
('кольцо', ['кольцо']),
|
||||
('пепперони', ['пепперони'])])
|
||||
@pytest.mark.parametrize(
|
||||
"text,lemmas",
|
||||
[
|
||||
("гвоздики", ["гвоздик", "гвоздика"]),
|
||||
("люди", ["человек"]),
|
||||
("реки", ["река"]),
|
||||
("кольцо", ["кольцо"]),
|
||||
("пепперони", ["пепперони"]),
|
||||
],
|
||||
)
|
||||
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
|
||||
assert sorted(ru_lemmatizer.noun(text)) == lemmas
|
||||
|
||||
|
||||
@pytest.mark.models('ru')
|
||||
@pytest.mark.parametrize('text,pos,morphology,lemma', [
|
||||
('рой', 'NOUN', None, 'рой'),
|
||||
('рой', 'VERB', None, 'рыть'),
|
||||
('клей', 'NOUN', None, 'клей'),
|
||||
('клей', 'VERB', None, 'клеить'),
|
||||
('три', 'NUM', None, 'три'),
|
||||
('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
|
||||
('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
|
||||
('кос', 'ADJ', None, 'косой'),
|
||||
('потом', 'NOUN', None, 'пот'),
|
||||
('потом', 'ADV', None, 'потом')])
|
||||
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
|
||||
@pytest.mark.models("ru")
|
||||
@pytest.mark.parametrize(
|
||||
"text,pos,morphology,lemma",
|
||||
[
|
||||
("рой", "NOUN", None, "рой"),
|
||||
("рой", "VERB", None, "рыть"),
|
||||
("клей", "NOUN", None, "клей"),
|
||||
("клей", "VERB", None, "клеить"),
|
||||
("три", "NUM", None, "три"),
|
||||
("кос", "NOUN", {"Number": "Sing"}, "кос"),
|
||||
("кос", "NOUN", {"Number": "Plur"}, "коса"),
|
||||
("кос", "ADJ", None, "косой"),
|
||||
("потом", "NOUN", None, "пот"),
|
||||
("потом", "ADV", None, "потом"),
|
||||
],
|
||||
)
|
||||
def test_ru_lemmatizer_works_with_different_pos_homonyms(
|
||||
ru_lemmatizer, text, pos, morphology, lemma
|
||||
):
|
||||
assert ru_lemmatizer(text, pos, morphology) == [lemma]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,morphology,lemma', [
|
||||
('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
|
||||
('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
|
||||
('вина', {'Gender': 'Fem'}, 'вина'),
|
||||
('вина', {'Gender': 'Neut'}, 'вино')])
|
||||
@pytest.mark.parametrize(
|
||||
"text,morphology,lemma",
|
||||
[
|
||||
("гвоздики", {"Gender": "Fem"}, "гвоздика"),
|
||||
("гвоздики", {"Gender": "Masc"}, "гвоздик"),
|
||||
("вина", {"Gender": "Fem"}, "вина"),
|
||||
("вина", {"Gender": "Neut"}, "вино"),
|
||||
],
|
||||
)
|
||||
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
|
||||
assert ru_lemmatizer.noun(text, morphology) == [lemma]
|
||||
|
||||
|
||||
def test_ru_lemmatizer_punct(ru_lemmatizer):
|
||||
assert ru_lemmatizer.punct('«') == ['"']
|
||||
assert ru_lemmatizer.punct('»') == ['"']
|
||||
assert ru_lemmatizer.punct("«") == ['"']
|
||||
assert ru_lemmatizer.punct("»") == ['"']
|
||||
|
|
|
@ -5,7 +5,7 @@ import pytest
|
|||
from spacy.lang.ru.lex_attrs import like_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['одиннадцать'])
|
||||
@pytest.mark.parametrize("word", ["одиннадцать"])
|
||||
def test_ru_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
||||
|
|
|
@ -4,19 +4,19 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
PUNCT_OPEN = ['(', '[', '{', '*']
|
||||
PUNCT_CLOSE = [')', ']', '}', '*']
|
||||
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
||||
PUNCT_OPEN = ["(", "[", "{", "*"]
|
||||
PUNCT_CLOSE = [")", "]", "}", "*"]
|
||||
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(", "((", "<"])
|
||||
@pytest.mark.parametrize("text", ["(", "((", "<"])
|
||||
def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert len(tokens) == len(text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('text', ["Привет"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||
@pytest.mark.parametrize("text", ["Привет"])
|
||||
def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
|
||||
tokens = ru_tokenizer(punct + text)
|
||||
assert len(tokens) == 2
|
||||
|
@ -24,8 +24,8 @@ def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
|
|||
assert tokens[1].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('text', ["Привет"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize("text", ["Привет"])
|
||||
def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
|
||||
tokens = ru_tokenizer(text + punct)
|
||||
assert len(tokens) == 2
|
||||
|
@ -33,9 +33,9 @@ def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
|
|||
assert tokens[1].text == punct
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('punct_add', ["`"])
|
||||
@pytest.mark.parametrize('text', ["Привет"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||
@pytest.mark.parametrize("punct_add", ["`"])
|
||||
@pytest.mark.parametrize("text", ["Привет"])
|
||||
def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
|
||||
tokens = ru_tokenizer(punct + punct_add + text)
|
||||
assert len(tokens) == 3
|
||||
|
@ -44,9 +44,9 @@ def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add,
|
|||
assert tokens[2].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('punct_add', ["'"])
|
||||
@pytest.mark.parametrize('text', ["Привет"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize("punct_add", ["'"])
|
||||
@pytest.mark.parametrize("text", ["Привет"])
|
||||
def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
|
||||
tokens = ru_tokenizer(text + punct + punct_add)
|
||||
assert len(tokens) == 3
|
||||
|
@ -55,8 +55,8 @@ def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add
|
|||
assert tokens[2].text == punct_add
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('text', ["Привет"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||
@pytest.mark.parametrize("text", ["Привет"])
|
||||
def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
|
||||
tokens = ru_tokenizer(punct + punct + punct + text)
|
||||
assert len(tokens) == 4
|
||||
|
@ -64,8 +64,8 @@ def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
|
|||
assert tokens[3].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('text', ["Привет"])
|
||||
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize("text", ["Привет"])
|
||||
def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
|
||||
tokens = ru_tokenizer(text + punct + punct + punct)
|
||||
assert len(tokens) == 4
|
||||
|
@ -73,14 +73,14 @@ def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
|
|||
assert tokens[1].text == punct
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'Тест"])
|
||||
@pytest.mark.parametrize("text", ["'Тест"])
|
||||
def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Тест''"])
|
||||
@pytest.mark.parametrize("text", ["Тест''"])
|
||||
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
@ -88,10 +88,11 @@ def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
|
|||
assert len(tokens_punct) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('text', ["Тест"])
|
||||
def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
|
||||
punct_close, text):
|
||||
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize("text", ["Тест"])
|
||||
def test_ru_tokenizer_splits_open_close_punct(
|
||||
ru_tokenizer, punct_open, punct_close, text
|
||||
):
|
||||
tokens = ru_tokenizer(punct_open + text + punct_close)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == punct_open
|
||||
|
@ -99,11 +100,12 @@ def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
|
|||
assert tokens[2].text == punct_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
|
||||
@pytest.mark.parametrize('text', ["Тест"])
|
||||
def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
|
||||
punct_open2, punct_close2, text):
|
||||
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
|
||||
@pytest.mark.parametrize("text", ["Тест"])
|
||||
def test_ru_tokenizer_two_diff_punct(
|
||||
ru_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
|
||||
):
|
||||
tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
||||
assert len(tokens) == 5
|
||||
assert tokens[0].text == punct_open2
|
||||
|
@ -113,7 +115,7 @@ def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
|
|||
assert tokens[4].text == punct_close2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Тест."])
|
||||
@pytest.mark.parametrize("text", ["Тест."])
|
||||
def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert tokens[1].text == "."
|
||||
|
|
|
@ -5,20 +5,29 @@ import pytest
|
|||
|
||||
|
||||
SV_TOKEN_EXCEPTION_TESTS = [
|
||||
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
|
||||
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
|
||||
(
|
||||
"Smörsåsen används bl.a. till fisk",
|
||||
["Smörsåsen", "används", "bl.a.", "till", "fisk"],
|
||||
),
|
||||
(
|
||||
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
|
||||
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
|
||||
),
|
||||
(
|
||||
"Anders I. tycker om ord med i i.",
|
||||
["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
|
||||
@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
|
||||
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
|
||||
tokens = sv_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"])
|
||||
@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
|
||||
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
|
||||
tokens = sv_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
|
|
@ -6,53 +6,85 @@ from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
|
|||
from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["dog"])
|
||||
@pytest.mark.parametrize("text", ["dog"])
|
||||
def test_attrs_key(text):
|
||||
assert intify_attrs({"ORTH": text}) == {ORTH: text}
|
||||
assert intify_attrs({"NORM": text}) == {NORM: text}
|
||||
assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10}
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["dog"])
|
||||
@pytest.mark.parametrize("text", ["dog"])
|
||||
def test_attrs_idempotence(text):
|
||||
int_attrs = intify_attrs({"lemma": text, 'is_alpha': True}, strings_map={text: 10})
|
||||
int_attrs = intify_attrs({"lemma": text, "is_alpha": True}, strings_map={text: 10})
|
||||
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["dog"])
|
||||
@pytest.mark.parametrize("text", ["dog"])
|
||||
def test_attrs_do_deprecated(text):
|
||||
int_attrs = intify_attrs({"F": text, 'is_alpha': True}, strings_map={text: 10},
|
||||
_do_deprecated=True)
|
||||
int_attrs = intify_attrs(
|
||||
{"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
|
||||
)
|
||||
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [(',', True), (' ', False), ('a', False)])
|
||||
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
|
||||
def test_lex_attrs_is_punct(text, match):
|
||||
assert is_punct(text) == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [(',', True), ('£', False), ('♥', False)])
|
||||
@pytest.mark.parametrize("text,match", [(",", True), ("£", False), ("♥", False)])
|
||||
def test_lex_attrs_is_ascii(text, match):
|
||||
assert is_ascii(text) == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [('$', True), ('£', True), ('♥', False),
|
||||
('€', True), ('¥', True), ('¢', True),
|
||||
('a', False), ('www.google.com', False), ('dog', False)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("$", True),
|
||||
("£", True),
|
||||
("♥", False),
|
||||
("€", True),
|
||||
("¥", True),
|
||||
("¢", True),
|
||||
("a", False),
|
||||
("www.google.com", False),
|
||||
("dog", False),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_is_currency(text, match):
|
||||
assert is_currency(text) == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [
|
||||
('www.google.com', True), ('google.com', True), ('sydney.com', True),
|
||||
('2girls1cup.org', True), ('http://stupid', True), ('www.hi', True),
|
||||
('dog', False), ('1.2', False), ('1.a', False), ('hello.There', False)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("www.google.com", True),
|
||||
("google.com", True),
|
||||
("sydney.com", True),
|
||||
("2girls1cup.org", True),
|
||||
("http://stupid", True),
|
||||
("www.hi", True),
|
||||
("dog", False),
|
||||
("1.2", False),
|
||||
("1.a", False),
|
||||
("hello.There", False),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_url(text, match):
|
||||
assert like_url(text) == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,shape', [
|
||||
('Nasa', 'Xxxx'), ('capitalized', 'xxxx'), ('999999999', 'dddd'),
|
||||
('C3P0', 'XdXd'), (',', ','), ('\n', '\n'), ('``,-', '``,-')])
|
||||
@pytest.mark.parametrize(
|
||||
"text,shape",
|
||||
[
|
||||
("Nasa", "Xxxx"),
|
||||
("capitalized", "xxxx"),
|
||||
("999999999", "dddd"),
|
||||
("C3P0", "XdXd"),
|
||||
(",", ","),
|
||||
("\n", "\n"),
|
||||
("``,-", "``,-"),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_word_shape(text, shape):
|
||||
assert word_shape(text) == shape
|
||||
|
|
|
@ -4,8 +4,9 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', [
|
||||
("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])])
|
||||
@pytest.mark.parametrize(
|
||||
"text,expected_tokens", [("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"])]
|
||||
)
|
||||
def test_th_tokenizer(th_tokenizer, text, expected_tokens):
|
||||
tokens = [token.text for token in th_tokenizer(text)]
|
||||
assert tokens == expected_tokens
|
||||
|
|
|
@ -4,14 +4,18 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string,lemma', [
|
||||
('evlerimizdeki', 'ev'),
|
||||
('işlerimizi', 'iş'),
|
||||
('biran', 'biran'),
|
||||
('bitirmeliyiz', 'bitir'),
|
||||
('isteklerimizi', 'istek'),
|
||||
('karşılaştırmamızın', 'karşılaştır'),
|
||||
('çoğulculuktan', 'çoğulcu')])
|
||||
@pytest.mark.parametrize(
|
||||
"string,lemma",
|
||||
[
|
||||
("evlerimizdeki", "ev"),
|
||||
("işlerimizi", "iş"),
|
||||
("biran", "biran"),
|
||||
("bitirmeliyiz", "bitir"),
|
||||
("isteklerimizi", "istek"),
|
||||
("karşılaştırmamızın", "karşılaştır"),
|
||||
("çoğulculuktan", "çoğulcu"),
|
||||
],
|
||||
)
|
||||
def test_tr_lemmatizer_lookup_assigns(tr_tokenizer, string, lemma):
|
||||
tokens = tr_tokenizer(string)
|
||||
assert tokens[0].lemma_ == lemma
|
||||
|
|
|
@ -6,14 +6,16 @@ import pytest
|
|||
|
||||
INFIX_HYPHEN_TESTS = [
|
||||
("Явым-төшем күләме.", "Явым-төшем күләме .".split()),
|
||||
("Хатын-кыз киеме.", "Хатын-кыз киеме .".split())
|
||||
("Хатын-кыз киеме.", "Хатын-кыз киеме .".split()),
|
||||
]
|
||||
|
||||
PUNC_INSIDE_WORDS_TESTS = [
|
||||
("Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
|
||||
"Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ,"
|
||||
" 783,9 млн. кеше / елда .".split()),
|
||||
("Ту\"кай", "Ту \" кай".split())
|
||||
(
|
||||
"Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
|
||||
"Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ,"
|
||||
" 783,9 млн. кеше / елда .".split(),
|
||||
),
|
||||
('Ту"кай', 'Ту " кай'.split()),
|
||||
]
|
||||
|
||||
MIXED_ORDINAL_NUMS_TESTS = [
|
||||
|
@ -22,14 +24,14 @@ MIXED_ORDINAL_NUMS_TESTS = [
|
|||
|
||||
ABBREV_TESTS = [
|
||||
("«3 елда (б.э.к.) туган", "« 3 елда ( б.э.к. ) туган".split()),
|
||||
("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split())
|
||||
("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split()),
|
||||
]
|
||||
|
||||
NAME_ABBREV_TESTS = [
|
||||
("Ә.Тукай", "Ә.Тукай".split()),
|
||||
("Ә.тукай", "Ә.тукай".split()),
|
||||
("ә.Тукай", "ә . Тукай".split()),
|
||||
("Миләүшә.", "Миләүшә .".split())
|
||||
("Миләүшә.", "Миләүшә .".split()),
|
||||
]
|
||||
|
||||
TYPOS_IN_PUNC_TESTS = [
|
||||
|
@ -37,30 +39,39 @@ TYPOS_IN_PUNC_TESTS = [
|
|||
("«3 елда,туган", "« 3 елда , туган".split()),
|
||||
("«3 елда,туган.", "« 3 елда , туган .".split()),
|
||||
("Ул эшли(кайчан?)", "Ул эшли ( кайчан ? )".split()),
|
||||
("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()) # "?)" => "?)" or "? )"
|
||||
("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()), # "?)" => "?)" or "? )"
|
||||
]
|
||||
|
||||
LONG_TEXTS_TESTS = [
|
||||
("Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
||||
"якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз"
|
||||
"меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең"
|
||||
"салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын"
|
||||
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.",
|
||||
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
||||
"якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз"
|
||||
"меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең"
|
||||
"салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын"
|
||||
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split()
|
||||
)
|
||||
(
|
||||
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
||||
"якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз"
|
||||
"меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең"
|
||||
"салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын"
|
||||
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.",
|
||||
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
||||
"якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз"
|
||||
"меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең"
|
||||
"салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын"
|
||||
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split(),
|
||||
)
|
||||
]
|
||||
|
||||
TESTCASES = (INFIX_HYPHEN_TESTS + PUNC_INSIDE_WORDS_TESTS +
|
||||
MIXED_ORDINAL_NUMS_TESTS + ABBREV_TESTS + NAME_ABBREV_TESTS +
|
||||
LONG_TEXTS_TESTS + TYPOS_IN_PUNC_TESTS)
|
||||
TESTCASES = (
|
||||
INFIX_HYPHEN_TESTS
|
||||
+ PUNC_INSIDE_WORDS_TESTS
|
||||
+ MIXED_ORDINAL_NUMS_TESTS
|
||||
+ ABBREV_TESTS
|
||||
+ NAME_ABBREV_TESTS
|
||||
+ LONG_TEXTS_TESTS
|
||||
+ TYPOS_IN_PUNC_TESTS
|
||||
)
|
||||
|
||||
NORM_TESTCASES = [
|
||||
("тукымадан һ.б.ш. тегелгән.",
|
||||
["тукымадан", "һәм башка шундыйлар", "тегелгән", "."])
|
||||
(
|
||||
"тукымадан һ.б.ш. тегелгән.",
|
||||
["тукымадан", "һәм башка шундыйлар", "тегелгән", "."],
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
|
@ -70,7 +81,7 @@ def test_tt_tokenizer_handles_testcases(tt_tokenizer, text, expected_tokens):
|
|||
assert expected_tokens == tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', NORM_TESTCASES)
|
||||
@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
|
||||
def test_tt_tokenizer_handles_norm_exceptions(tt_tokenizer, text, norms):
|
||||
tokens = tt_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
|
|
@ -13,9 +13,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
|
|||
assert len(tokens) == 77
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
("تحریر باسط حبیب", 3),
|
||||
("میرا پاکستان", 2)])
|
||||
@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
|
||||
def test_ur_tokenizer_handles_cnts(ur_tokenizer, text, length):
|
||||
tokens = ur_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
|
@@ -10,9 +10,11 @@ from ..util import get_doc

@pytest.fixture
def matcher(en_vocab):
    rules = {'JS': [[{'ORTH': 'JavaScript'}]],
             'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
             'Java': [[{'LOWER': 'java'}]]}
    rules = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, None, *patterns)
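
Note: as exercised by the fixture above, each Matcher rule maps a key to one or
more token-pattern lists, the second argument to add() is an optional on_match
callback, and matches come back as (match_id, start, end) tuples. A minimal
sketch mirroring the tests below (not an addition to the diff):

from spacy.matcher import Matcher
from spacy.tokens import Doc


def demo_js_match(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("JS", None, [{"ORTH": "JavaScript"}])
    doc = Doc(en_vocab, words=["JavaScript", "is", "good"])
    # One match: the "JS" rule over tokens 0..1.
    assert matcher(doc) == [(en_vocab.strings["JS"], 0, 1)]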
@ -21,44 +23,44 @@ def matcher(en_vocab):
|
|||
|
||||
def test_matcher_from_api_docs(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{'ORTH': 'test'}]
|
||||
pattern = [{"ORTH": "test"}]
|
||||
assert len(matcher) == 0
|
||||
matcher.add('Rule', None, pattern)
|
||||
matcher.add("Rule", None, pattern)
|
||||
assert len(matcher) == 1
|
||||
matcher.remove('Rule')
|
||||
assert 'Rule' not in matcher
|
||||
matcher.add('Rule', None, pattern)
|
||||
assert 'Rule' in matcher
|
||||
on_match, patterns = matcher.get('Rule')
|
||||
matcher.remove("Rule")
|
||||
assert "Rule" not in matcher
|
||||
matcher.add("Rule", None, pattern)
|
||||
assert "Rule" in matcher
|
||||
on_match, patterns = matcher.get("Rule")
|
||||
assert len(patterns[0])
|
||||
|
||||
|
||||
def test_matcher_from_usage_docs(en_vocab):
|
||||
text = "Wow 😀 This is really cool! 😂 😂"
|
||||
doc = Doc(en_vocab, words=text.split(' '))
|
||||
pos_emoji = ['😀', '😃', '😂', '🤣', '😊', '😍']
|
||||
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
|
||||
doc = Doc(en_vocab, words=text.split(" "))
|
||||
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
|
||||
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
|
||||
|
||||
def label_sentiment(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
if doc.vocab.strings[match_id] == 'HAPPY':
|
||||
if doc.vocab.strings[match_id] == "HAPPY":
|
||||
doc.sentiment += 0.1
|
||||
span = doc[start : end]
|
||||
span = doc[start:end]
|
||||
token = span.merge()
|
||||
token.vocab[token.text].norm_ = 'happy emoji'
|
||||
token.vocab[token.text].norm_ = "happy emoji"
|
||||
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add('HAPPY', label_sentiment, *pos_patterns)
|
||||
matcher.add("HAPPY", label_sentiment, *pos_patterns)
|
||||
matches = matcher(doc)
|
||||
assert doc.sentiment != 0
|
||||
assert doc[1].norm_ == 'happy emoji'
|
||||
assert doc[1].norm_ == "happy emoji"
|
||||
|
||||
|
||||
def test_matcher_len_contains(matcher):
|
||||
assert len(matcher) == 3
|
||||
matcher.add('TEST', None, [{'ORTH': 'test'}])
|
||||
assert 'TEST' in matcher
|
||||
assert 'TEST2' not in matcher
|
||||
matcher.add("TEST", None, [{"ORTH": "test"}])
|
||||
assert "TEST" in matcher
|
||||
assert "TEST2" not in matcher
|
||||
|
||||
|
||||
def test_matcher_no_match(matcher):
|
||||
|
@ -68,38 +70,40 @@ def test_matcher_no_match(matcher):
|
|||
|
||||
def test_matcher_match_start(matcher):
|
||||
doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
|
||||
assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
|
||||
assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)]
|
||||
|
||||
|
||||
def test_matcher_match_end(matcher):
|
||||
words = ["I", "like", "java"]
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
|
||||
assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)]
|
||||
|
||||
|
||||
def test_matcher_match_middle(matcher):
|
||||
words = ["I", "like", "Google", "Now", "best"]
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
|
||||
assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
|
||||
|
||||
|
||||
def test_matcher_match_multi(matcher):
|
||||
words = ["I", "like", "Google", "Now", "and", "java", "best"]
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
|
||||
(doc.vocab.strings['Java'], 5, 6)]
|
||||
assert matcher(doc) == [
|
||||
(doc.vocab.strings["GoogleNow"], 2, 4),
|
||||
(doc.vocab.strings["Java"], 5, 6),
|
||||
]
|
||||
|
||||
|
||||
def test_matcher_empty_dict(en_vocab):
|
||||
"""Test matcher allows empty token specs, meaning match on any token."""
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
||||
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
|
||||
matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
assert matches[0][1:] == (0, 3)
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
||||
matcher.add("A.", None, [{"ORTH": "a"}, {}])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][1:] == (0, 2)
|
||||
|
||||
|
@ -107,8 +111,8 @@ def test_matcher_empty_dict(en_vocab):
|
|||
def test_matcher_operator_shadow(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
||||
pattern = [{'ORTH': 'a'}, {"IS_ALPHA": True, "OP": "+"}, {'ORTH': 'c'}]
|
||||
matcher.add('A.C', None, pattern)
|
||||
pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
|
||||
matcher.add("A.C", None, pattern)
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
assert matches[0][1:] == (0, 3)
|
||||
|
@ -117,43 +121,48 @@ def test_matcher_operator_shadow(en_vocab):
|
|||
def test_matcher_match_zero(matcher):
|
||||
words1 = 'He said , " some words " ...'.split()
|
||||
words2 = 'He said , " some three words " ...'.split()
|
||||
pattern1 = [{'ORTH': '"'},
|
||||
{'OP': '!', 'IS_PUNCT': True},
|
||||
{'OP': '!', 'IS_PUNCT': True},
|
||||
{'ORTH': '"'}]
|
||||
pattern2 = [{'ORTH': '"'},
|
||||
{'IS_PUNCT': True},
|
||||
{'IS_PUNCT': True},
|
||||
{'IS_PUNCT': True},
|
||||
{'ORTH': '"'}]
|
||||
matcher.add('Quote', None, pattern1)
|
||||
pattern1 = [
|
||||
{"ORTH": '"'},
|
||||
{"OP": "!", "IS_PUNCT": True},
|
||||
{"OP": "!", "IS_PUNCT": True},
|
||||
{"ORTH": '"'},
|
||||
]
|
||||
pattern2 = [
|
||||
{"ORTH": '"'},
|
||||
{"IS_PUNCT": True},
|
||||
{"IS_PUNCT": True},
|
||||
{"IS_PUNCT": True},
|
||||
{"ORTH": '"'},
|
||||
]
|
||||
matcher.add("Quote", None, pattern1)
|
||||
doc = Doc(matcher.vocab, words=words1)
|
||||
assert len(matcher(doc)) == 1
|
||||
doc = Doc(matcher.vocab, words=words2)
|
||||
assert len(matcher(doc)) == 0
|
||||
matcher.add('Quote', None, pattern2)
|
||||
matcher.add("Quote", None, pattern2)
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
|
||||
def test_matcher_match_zero_plus(matcher):
|
||||
words = 'He said , " some words " ...'.split()
|
||||
pattern = [{'ORTH': '"'},
|
||||
{'OP': '*', 'IS_PUNCT': False},
|
||||
{'ORTH': '"'}]
|
||||
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
|
||||
matcher = Matcher(matcher.vocab)
|
||||
matcher.add('Quote', None, pattern)
|
||||
matcher.add("Quote", None, pattern)
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
|
||||
def test_matcher_match_one_plus(matcher):
|
||||
control = Matcher(matcher.vocab)
|
||||
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
|
||||
doc = Doc(control.vocab, words=['Philippe', 'Philippe'])
|
||||
control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}])
|
||||
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
|
||||
m = control(doc)
|
||||
assert len(m) == 2
|
||||
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
|
||||
{'ORTH': 'Philippe', 'OP': '+'}])
|
||||
matcher.add(
|
||||
"KleenePhilippe",
|
||||
None,
|
||||
[{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}],
|
||||
)
|
||||
m = matcher(doc)
|
||||
assert len(m) == 1
|
||||
|
||||
|
@ -161,54 +170,70 @@ def test_matcher_match_one_plus(matcher):
|
|||
def test_matcher_any_token_operator(en_vocab):
|
||||
"""Test that patterns with "any token" {} work with operators."""
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add('TEST', None, [{'ORTH': 'test'}, {'OP': '*'}])
|
||||
doc = Doc(en_vocab, words=['test', 'hello', 'world'])
|
||||
matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
|
||||
doc = Doc(en_vocab, words=["test", "hello", "world"])
|
||||
matches = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches) == 3
|
||||
assert matches[0] == 'test'
|
||||
assert matches[1] == 'test hello'
|
||||
assert matches[2] == 'test hello world'
|
||||
assert matches[0] == "test"
|
||||
assert matches[1] == "test hello"
|
||||
assert matches[2] == "test hello world"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def text():
|
||||
return u"The quick brown fox jumped over the lazy fox"
|
||||
return "The quick brown fox jumped over the lazy fox"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def heads():
|
||||
return [3,2,1,1,0,-1,2,1,-3]
|
||||
return [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def deps():
|
||||
return ['det', 'amod', 'amod', 'nsubj', 'prep', 'pobj', 'det', 'amod']
|
||||
return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dependency_tree_matcher(en_vocab):
|
||||
is_brown_yellow = lambda text: bool(re.compile(r'brown|yellow|over').match(text))
|
||||
def is_brown_yellow(text):
|
||||
return bool(re.compile(r"brown|yellow|over").match(text))
|
||||
|
||||
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
|
||||
pattern1 = [
|
||||
{'SPEC': {'NODE_NAME': 'fox'}, 'PATTERN': {'ORTH': 'fox'}},
|
||||
{'SPEC': {'NODE_NAME': 'q', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'},'PATTERN': {'LOWER': u'quick'}},
|
||||
{'SPEC': {'NODE_NAME': 'r', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}}
|
||||
{"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||
"PATTERN": {"LOWER": "quick"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||
"PATTERN": {IS_BROWN_YELLOW: True},
|
||||
},
|
||||
]
|
||||
|
||||
pattern2 = [
|
||||
{'SPEC': {'NODE_NAME': 'jumped'}, 'PATTERN': {'ORTH': 'jumped'}},
|
||||
{'SPEC': {'NODE_NAME': 'fox', 'NBOR_RELOP': '>', 'NBOR_NAME': 'jumped'},'PATTERN': {'LOWER': u'fox'}},
|
||||
{'SPEC': {'NODE_NAME': 'over', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}}
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"LOWER": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "over", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||
"PATTERN": {IS_BROWN_YELLOW: True},
|
||||
},
|
||||
]
|
||||
matcher = DependencyTreeMatcher(en_vocab)
|
||||
matcher.add('pattern1', None, pattern1)
|
||||
matcher.add('pattern2', None, pattern2)
|
||||
matcher.add("pattern1", None, pattern1)
|
||||
matcher.add("pattern2", None, pattern2)
|
||||
return matcher
|
||||
|
||||
|
||||
|
||||
def test_dependency_tree_matcher_compile(dependency_tree_matcher):
|
||||
assert len(dependency_tree_matcher) == 2
|
||||
|
||||
def test_dependency_tree_matcher(dependency_tree_matcher,text,heads,deps):
|
||||
doc = get_doc(dependency_tree_matcher.vocab,text.split(),heads=heads,deps=deps)
|
||||
|
||||
def test_dependency_tree_matcher(dependency_tree_matcher, text, heads, deps):
|
||||
doc = get_doc(dependency_tree_matcher.vocab, text.split(), heads=heads, deps=deps)
|
||||
matches = dependency_tree_matcher(doc)
|
||||
assert len(matches) == 2
|
||||
|
||||
|
|
|
@ -7,17 +7,25 @@ from spacy.matcher import Matcher
|
|||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
pattern1 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}]
|
||||
pattern2 = [{'ORTH':'A', 'OP':'*'}, {'ORTH':'A', 'OP':'1'}]
|
||||
pattern3 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'1'}]
|
||||
pattern4 = [{'ORTH':'B', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]
|
||||
pattern5 = [{'ORTH':'B', 'OP':'*'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]
|
||||
pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]
|
||||
pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}]
|
||||
pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}]
|
||||
pattern4 = [
|
||||
{"ORTH": "B", "OP": "1"},
|
||||
{"ORTH": "A", "OP": "*"},
|
||||
{"ORTH": "B", "OP": "1"},
|
||||
]
|
||||
pattern5 = [
|
||||
{"ORTH": "B", "OP": "*"},
|
||||
{"ORTH": "A", "OP": "*"},
|
||||
{"ORTH": "B", "OP": "1"},
|
||||
]
|
||||
|
||||
re_pattern1 = 'AA*'
|
||||
re_pattern2 = 'A*A'
|
||||
re_pattern3 = 'AA'
|
||||
re_pattern4 = 'BA*B'
|
||||
re_pattern5 = 'B*A*B'
|
||||
re_pattern1 = "AA*"
|
||||
re_pattern2 = "A*A"
|
||||
re_pattern3 = "AA"
|
||||
re_pattern4 = "BA*B"
|
||||
re_pattern5 = "B*A*B"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -27,17 +35,20 @@ def text():
|
|||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer, text):
|
||||
doc = en_tokenizer(' '.join(text))
|
||||
doc = en_tokenizer(" ".join(text))
|
||||
return doc
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('pattern,re_pattern', [
|
||||
(pattern1, re_pattern1),
|
||||
(pattern2, re_pattern2),
|
||||
(pattern3, re_pattern3),
|
||||
(pattern4, re_pattern4),
|
||||
(pattern5, re_pattern5)])
|
||||
@pytest.mark.parametrize(
|
||||
"pattern,re_pattern",
|
||||
[
|
||||
pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()),
|
||||
pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()),
|
||||
pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()),
|
||||
(pattern4, re_pattern4),
|
||||
pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()),
|
||||
],
|
||||
)
|
||||
def test_greedy_matching(doc, text, pattern, re_pattern):
|
||||
"""Test that the greedy matching behavior of the * op is consistant with
|
||||
other re implementations."""
|
||||
|
@ -50,12 +61,16 @@ def test_greedy_matching(doc, text, pattern, re_pattern):
|
|||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('pattern,re_pattern', [
|
||||
(pattern1, re_pattern1),
|
||||
(pattern2, re_pattern2),
|
||||
(pattern3, re_pattern3),
|
||||
(pattern4, re_pattern4),
|
||||
(pattern5, re_pattern5)])
|
||||
@pytest.mark.parametrize(
|
||||
"pattern,re_pattern",
|
||||
[
|
||||
(pattern1, re_pattern1),
|
||||
(pattern2, re_pattern2),
|
||||
(pattern3, re_pattern3),
|
||||
(pattern4, re_pattern4),
|
||||
(pattern5, re_pattern5),
|
||||
],
|
||||
)
|
||||
def test_match_consuming(doc, text, pattern, re_pattern):
|
||||
"""Test that matcher.__call__ consumes tokens on a match similar to
|
||||
re.findall."""
|
||||
|
@ -68,33 +83,33 @@ def test_match_consuming(doc, text, pattern, re_pattern):
|
|||
|
||||
def test_operator_combos(en_vocab):
|
||||
cases = [
|
||||
('aaab', 'a a a b', True),
|
||||
('aaab', 'a+ b', True),
|
||||
('aaab', 'a+ a+ b', True),
|
||||
('aaab', 'a+ a+ a b', True),
|
||||
('aaab', 'a+ a+ a+ b', True),
|
||||
('aaab', 'a+ a a b', True),
|
||||
('aaab', 'a+ a a', True),
|
||||
('aaab', 'a+', True),
|
||||
('aaa', 'a+ b', False),
|
||||
('aaa', 'a+ a+ b', False),
|
||||
('aaa', 'a+ a+ a+ b', False),
|
||||
('aaa', 'a+ a b', False),
|
||||
('aaa', 'a+ a a b', False),
|
||||
('aaab', 'a+ a a', True),
|
||||
('aaab', 'a+', True),
|
||||
('aaab', 'a+ a b', True)
|
||||
("aaab", "a a a b", True),
|
||||
("aaab", "a+ b", True),
|
||||
("aaab", "a+ a+ b", True),
|
||||
("aaab", "a+ a+ a b", True),
|
||||
("aaab", "a+ a+ a+ b", True),
|
||||
("aaab", "a+ a a b", True),
|
||||
("aaab", "a+ a a", True),
|
||||
("aaab", "a+", True),
|
||||
("aaa", "a+ b", False),
|
||||
("aaa", "a+ a+ b", False),
|
||||
("aaa", "a+ a+ a+ b", False),
|
||||
("aaa", "a+ a b", False),
|
||||
("aaa", "a+ a a b", False),
|
||||
("aaab", "a+ a a", True),
|
||||
("aaab", "a+", True),
|
||||
("aaab", "a+ a b", True),
|
||||
]
|
||||
for string, pattern_str, result in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(matcher.vocab, words=list(string))
|
||||
pattern = []
|
||||
for part in pattern_str.split():
|
||||
if part.endswith('+'):
|
||||
pattern.append({'ORTH': part[0], 'OP': '+'})
|
||||
if part.endswith("+"):
|
||||
pattern.append({"ORTH": part[0], "OP": "+"})
|
||||
else:
|
||||
pattern.append({'ORTH': part})
|
||||
matcher.add('PATTERN', None, pattern)
|
||||
pattern.append({"ORTH": part})
|
||||
matcher.add("PATTERN", None, pattern)
|
||||
matches = matcher(doc)
|
||||
if result:
|
||||
assert matches, (string, pattern_str)
|
||||
|
@ -105,12 +120,12 @@ def test_operator_combos(en_vocab):
|
|||
def test_matcher_end_zero_plus(en_vocab):
|
||||
"""Test matcher works when patterns end with * operator. (issue 1450)"""
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
|
||||
matcher.add('TSTEND', None, pattern)
|
||||
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
|
||||
matcher.add("TSTEND", None, pattern)
|
||||
nlp = lambda string: Doc(matcher.vocab, words=string.split())
|
||||
assert len(matcher(nlp('a'))) == 1
|
||||
assert len(matcher(nlp('a b'))) == 2
|
||||
assert len(matcher(nlp('a c'))) == 1
|
||||
    assert len(matcher(nlp('a b c'))) == 2
    assert len(matcher(nlp('a b b c'))) == 3
    assert len(matcher(nlp('a b b'))) == 3
    assert len(matcher(nlp("a"))) == 1
    assert len(matcher(nlp("a b"))) == 2
    assert len(matcher(nlp("a c"))) == 1
    assert len(matcher(nlp("a b c"))) == 2
    assert len(matcher(nlp("a b b c"))) == 3
    assert len(matcher(nlp("a b b"))) == 3

@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc

@@ -11,7 +10,7 @@ from ..util import get_doc
def test_matcher_phrase_matcher(en_vocab):
    doc = Doc(en_vocab, words=["Google", "Now"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add('COMPANY', None, doc)
    matcher.add("COMPANY", None, doc)
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert len(matcher(doc)) == 1

@@ -19,63 +18,63 @@ def test_matcher_phrase_matcher(en_vocab):
def test_phrase_matcher_length(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    assert len(matcher) == 0
    matcher.add('TEST', None, Doc(en_vocab, words=['test']))
    matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
    assert len(matcher) == 1
    matcher.add('TEST2', None, Doc(en_vocab, words=['test2']))
    matcher.add("TEST2", None, Doc(en_vocab, words=["test2"]))
    assert len(matcher) == 2


def test_phrase_matcher_contains(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add('TEST', None, Doc(en_vocab, words=['test']))
    assert 'TEST' in matcher
    assert 'TEST2' not in matcher
    matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
    assert "TEST" in matcher
    assert "TEST2" not in matcher


def test_phrase_matcher_string_attrs(en_vocab):
    words1 = ['I', 'like', 'cats']
    pos1 = ['PRON', 'VERB', 'NOUN']
    words2 = ['Yes', ',', 'you', 'hate', 'dogs', 'very', 'much']
    pos2 = ['INTJ', 'PUNCT', 'PRON', 'VERB', 'NOUN', 'ADV', 'ADV']
    words1 = ["I", "like", "cats"]
    pos1 = ["PRON", "VERB", "NOUN"]
    words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
    pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
    pattern = get_doc(en_vocab, words=words1, pos=pos1)
    matcher = PhraseMatcher(en_vocab, attr='POS')
    matcher.add('TEST', None, pattern)
    matcher = PhraseMatcher(en_vocab, attr="POS")
    matcher.add("TEST", None, pattern)
    doc = get_doc(en_vocab, words=words2, pos=pos2)
    matches = matcher(doc)
    assert len(matches) == 1
    match_id, start, end = matches[0]
    assert match_id == en_vocab.strings['TEST']
    assert match_id == en_vocab.strings["TEST"]
    assert start == 2
    assert end == 5


def test_phrase_matcher_string_attrs_negative(en_vocab):
    """Test that token with the control codes as ORTH are *not* matched."""
    words1 = ['I', 'like', 'cats']
    pos1 = ['PRON', 'VERB', 'NOUN']
    words2 = ['matcher:POS-PRON', 'matcher:POS-VERB', 'matcher:POS-NOUN']
    pos2 = ['X', 'X', 'X']
    words1 = ["I", "like", "cats"]
    pos1 = ["PRON", "VERB", "NOUN"]
    words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
    pos2 = ["X", "X", "X"]
    pattern = get_doc(en_vocab, words=words1, pos=pos1)
    matcher = PhraseMatcher(en_vocab, attr='POS')
    matcher.add('TEST', None, pattern)
    matcher = PhraseMatcher(en_vocab, attr="POS")
    matcher.add("TEST", None, pattern)
    doc = get_doc(en_vocab, words=words2, pos=pos2)
    matches = matcher(doc)
    assert len(matches) == 0


def test_phrase_matcher_bool_attrs(en_vocab):
    words1 = ['Hello', 'world', '!']
    words2 = ['No', 'problem', ',', 'he', 'said', '.']
    words1 = ["Hello", "world", "!"]
    words2 = ["No", "problem", ",", "he", "said", "."]
    pattern = Doc(en_vocab, words=words1)
    matcher = PhraseMatcher(en_vocab, attr='IS_PUNCT')
    matcher.add('TEST', None, pattern)
    matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=words2)
    matches = matcher(doc)
    assert len(matches) == 2
    match_id1, start1, end1 = matches[0]
    match_id2, start2, end2 = matches[1]
    assert match_id1 == en_vocab.strings['TEST']
    assert match_id2 == en_vocab.strings['TEST']
    assert match_id1 == en_vocab.strings["TEST"]
    assert match_id2 == en_vocab.strings["TEST"]
    assert start1 == 0
    assert end1 == 3
    assert start2 == 3
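
As a quick reference for the attr keyword exercised by the phrase-matcher tests above, a minimal usage sketch follows. It assumes a spaCy build that already supports PhraseMatcher(..., attr=...) and a pretrained English pipeline named en_core_web_sm to supply POS tags; both are assumptions, not part of this changeset, and the exact matches depend on the tagger's output.

# Sketch only: attr="POS" matches on coarse POS sequences instead of token text.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")              # assumed pretrained pipeline
matcher = PhraseMatcher(nlp.vocab, attr="POS")
matcher.add("TEST", None, nlp("I like cats"))   # contributes the pattern PRON VERB NOUN
doc = nlp("You hate dogs")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)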
@ -2,7 +2,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
import numpy.random
|
||||
from thinc.neural.optimizers import Adam
|
||||
from thinc.neural.ops import NumpyOps
|
||||
from spacy.attrs import NORM
|
||||
|
@ -20,18 +19,17 @@ def vocab():
|
|||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
parser = DependencyParser(vocab)
|
||||
parser.cfg['token_vector_width'] = 8
|
||||
parser.cfg['hidden_width'] = 30
|
||||
parser.cfg['hist_size'] = 0
|
||||
parser.add_label('left')
|
||||
parser.cfg["token_vector_width"] = 8
|
||||
parser.cfg["hidden_width"] = 30
|
||||
parser.cfg["hist_size"] = 0
|
||||
parser.add_label("left")
|
||||
parser.begin_training([], **parser.cfg)
|
||||
sgd = Adam(NumpyOps(), 0.001)
|
||||
|
||||
for i in range(10):
|
||||
losses = {}
|
||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
||||
deps=['left', 'ROOT', 'left', 'ROOT'])
|
||||
doc = Doc(vocab, words=["a", "b", "c", "d"])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
|
||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||
return parser
|
||||
|
||||
|
@ -44,29 +42,30 @@ def test_init_parser(parser):
|
|||
# TODO: This now seems to be implicated in segfaults. Not sure what's up!
|
||||
@pytest.mark.skip
|
||||
def test_add_label(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc = parser(doc)
|
||||
assert doc[0].head.i == 1
|
||||
assert doc[0].dep_ == 'left'
|
||||
assert doc[0].dep_ == "left"
|
||||
assert doc[1].head.i == 1
|
||||
assert doc[2].head.i == 3
|
||||
assert doc[2].head.i == 3
|
||||
parser.add_label('right')
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
parser.add_label("right")
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc = parser(doc)
|
||||
assert doc[0].head.i == 1
|
||||
assert doc[0].dep_ == 'left'
|
||||
assert doc[0].dep_ == "left"
|
||||
assert doc[1].head.i == 1
|
||||
assert doc[2].head.i == 3
|
||||
assert doc[2].head.i == 3
|
||||
sgd = Adam(NumpyOps(), 0.001)
|
||||
for i in range(10):
|
||||
losses = {}
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
||||
deps=['right', 'ROOT', 'left', 'ROOT'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
gold = GoldParse(
|
||||
doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
|
||||
)
|
||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc = parser(doc)
|
||||
assert doc[0].dep_ == 'right'
|
||||
assert doc[2].dep_ == 'left'
|
||||
assert doc[0].dep_ == "right"
|
||||
assert doc[2].dep_ == "left"
|
||||
@ -31,16 +31,19 @@ def get_sequence_costs(M, words, heads, deps, transitions):
|
|||
def vocab():
|
||||
return Vocab()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def arc_eager(vocab):
|
||||
moves = ArcEager(vocab.strings, ArcEager.get_actions())
|
||||
moves.add_action(2, 'left')
|
||||
moves.add_action(3, 'right')
|
||||
moves.add_action(2, "left")
|
||||
moves.add_action(3, "right")
|
||||
return moves
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def words():
|
||||
return ['a', 'b']
|
||||
return ["a", "b"]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc(words, vocab):
|
||||
|
@ -48,19 +51,21 @@ def doc(words, vocab):
|
|||
vocab = Vocab()
|
||||
return Doc(vocab, words=list(words))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gold(doc, words):
|
||||
if len(words) == 2:
|
||||
return GoldParse(doc, words=['a', 'b'], heads=[0, 0], deps=['ROOT', 'right'])
|
||||
return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"])
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_oracle_four_words(arc_eager, vocab):
|
||||
words = ['a', 'b', 'c', 'd']
|
||||
words = ["a", "b", "c", "d"]
|
||||
heads = [1, 1, 3, 3]
|
||||
deps = ['left', 'ROOT', 'left', 'ROOT']
|
||||
actions = ['L-left', 'B-ROOT', 'L-left']
|
||||
deps = ["left", "ROOT", "left", "ROOT"]
|
||||
actions = ["L-left", "B-ROOT", "L-left"]
|
||||
state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
|
||||
assert state.is_final()
|
||||
for i, state_costs in enumerate(cost_history):
|
||||
|
@ -72,63 +77,65 @@ def test_oracle_four_words(arc_eager, vocab):
|
|||
|
||||
|
||||
annot_tuples = [
|
||||
(0, 'When', 'WRB', 11, 'advmod', 'O'),
|
||||
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
|
||||
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
|
||||
(3, ',', ',', 2, 'punct', 'O'),
|
||||
(4, 'our', 'PRP$', 6, 'poss', 'O'),
|
||||
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
|
||||
(6, 'reporter', 'NN', 2, 'appos', 'O'),
|
||||
(7, 'with', 'IN', 6, 'prep', 'O'),
|
||||
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
|
||||
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
|
||||
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
|
||||
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
|
||||
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
|
||||
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
|
||||
(14, 'of', 'IN', 13, 'prep', 'O'),
|
||||
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
|
||||
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
|
||||
(17, 'on', 'IN', 16, 'prep', 'O'),
|
||||
(18, 'the', 'DT', 19, 'det', 'O'),
|
||||
(19, 'ground', 'NN', 17, 'pobj', 'O'),
|
||||
(20, ',', ',', 17, 'punct', 'O'),
|
||||
(21, 'inside', 'IN', 17, 'prep', 'O'),
|
||||
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
|
||||
(23, 'itself', 'PRP', 22, 'appos', 'O'),
|
||||
(24, ',', ',', 16, 'punct', 'O'),
|
||||
(25, 'have', 'VBP', 26, 'aux', 'O'),
|
||||
(26, 'taken', 'VBN', 16, 'dep', 'O'),
|
||||
(27, 'up', 'RP', 26, 'prt', 'O'),
|
||||
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
|
||||
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
|
||||
(30, "'re", 'VBP', 31, 'aux', 'O'),
|
||||
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
|
||||
(32, 'to', 'TO', 33, 'aux', 'O'),
|
||||
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
|
||||
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
|
||||
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
|
||||
(36, 'there', 'RB', 33, 'advmod', 'O'),
|
||||
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
|
||||
(38, ',', ',', 44, 'punct', 'O'),
|
||||
(39, 'how', 'WRB', 40, 'advmod', 'O'),
|
||||
(40, 'many', 'JJ', 41, 'amod', 'O'),
|
||||
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
|
||||
(42, 'are', 'VBP', 44, 'aux', 'O'),
|
||||
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
|
||||
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
|
||||
(45, 'about', 'IN', 44, 'prep', 'O'),
|
||||
(46, 'right', 'RB', 47, 'advmod', 'O'),
|
||||
(47, 'now', 'RB', 44, 'advmod', 'O'),
|
||||
(48, '?', '.', 44, 'punct', 'O')]
|
||||
(0, "When", "WRB", 11, "advmod", "O"),
|
||||
(1, "Walter", "NNP", 2, "compound", "B-PERSON"),
|
||||
(2, "Rodgers", "NNP", 11, "nsubj", "L-PERSON"),
|
||||
(3, ",", ",", 2, "punct", "O"),
|
||||
(4, "our", "PRP$", 6, "poss", "O"),
|
||||
(5, "embedded", "VBN", 6, "amod", "O"),
|
||||
(6, "reporter", "NN", 2, "appos", "O"),
|
||||
(7, "with", "IN", 6, "prep", "O"),
|
||||
(8, "the", "DT", 10, "det", "B-ORG"),
|
||||
(9, "3rd", "NNP", 10, "compound", "I-ORG"),
|
||||
(10, "Cavalry", "NNP", 7, "pobj", "L-ORG"),
|
||||
(11, "says", "VBZ", 44, "advcl", "O"),
|
||||
(12, "three", "CD", 13, "nummod", "U-CARDINAL"),
|
||||
(13, "battalions", "NNS", 16, "nsubj", "O"),
|
||||
(14, "of", "IN", 13, "prep", "O"),
|
||||
(15, "troops", "NNS", 14, "pobj", "O"),
|
||||
(16, "are", "VBP", 11, "ccomp", "O"),
|
||||
(17, "on", "IN", 16, "prep", "O"),
|
||||
(18, "the", "DT", 19, "det", "O"),
|
||||
(19, "ground", "NN", 17, "pobj", "O"),
|
||||
(20, ",", ",", 17, "punct", "O"),
|
||||
(21, "inside", "IN", 17, "prep", "O"),
|
||||
(22, "Baghdad", "NNP", 21, "pobj", "U-GPE"),
|
||||
(23, "itself", "PRP", 22, "appos", "O"),
|
||||
(24, ",", ",", 16, "punct", "O"),
|
||||
(25, "have", "VBP", 26, "aux", "O"),
|
||||
(26, "taken", "VBN", 16, "dep", "O"),
|
||||
(27, "up", "RP", 26, "prt", "O"),
|
||||
(28, "positions", "NNS", 26, "dobj", "O"),
|
||||
(29, "they", "PRP", 31, "nsubj", "O"),
|
||||
(30, "'re", "VBP", 31, "aux", "O"),
|
||||
(31, "going", "VBG", 26, "parataxis", "O"),
|
||||
(32, "to", "TO", 33, "aux", "O"),
|
||||
(33, "spend", "VB", 31, "xcomp", "O"),
|
||||
(34, "the", "DT", 35, "det", "B-TIME"),
|
||||
(35, "night", "NN", 33, "dobj", "L-TIME"),
|
||||
(36, "there", "RB", 33, "advmod", "O"),
|
||||
(37, "presumably", "RB", 33, "advmod", "O"),
|
||||
(38, ",", ",", 44, "punct", "O"),
|
||||
(39, "how", "WRB", 40, "advmod", "O"),
|
||||
(40, "many", "JJ", 41, "amod", "O"),
|
||||
(41, "soldiers", "NNS", 44, "pobj", "O"),
|
||||
(42, "are", "VBP", 44, "aux", "O"),
|
||||
(43, "we", "PRP", 44, "nsubj", "O"),
|
||||
(44, "talking", "VBG", 44, "ROOT", "O"),
|
||||
(45, "about", "IN", 44, "prep", "O"),
|
||||
(46, "right", "RB", 47, "advmod", "O"),
|
||||
(47, "now", "RB", 44, "advmod", "O"),
|
||||
(48, "?", ".", 44, "punct", "O"),
|
||||
]
|
||||
|
||||
|
||||
def test_get_oracle_actions():
|
||||
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
||||
parser = DependencyParser(doc.vocab)
|
||||
parser.moves.add_action(0, '')
|
||||
parser.moves.add_action(1, '')
|
||||
parser.moves.add_action(1, '')
|
||||
parser.moves.add_action(4, 'ROOT')
|
||||
parser.moves.add_action(0, "")
|
||||
parser.moves.add_action(1, "")
|
||||
parser.moves.add_action(1, "")
|
||||
parser.moves.add_action(4, "ROOT")
|
||||
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
|
||||
if head > i:
|
||||
parser.moves.add_action(2, dep)
|
||||
@ -16,15 +16,17 @@ def vocab():
|
|||
|
||||
@pytest.fixture
|
||||
def doc(vocab):
|
||||
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
|
||||
return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def entity_annots(doc):
|
||||
casey = doc[0:1]
|
||||
ny = doc[3:5]
|
||||
return [(casey.start_char, casey.end_char, 'PERSON'),
|
||||
(ny.start_char, ny.end_char, 'GPE')]
|
||||
return [
|
||||
(casey.start_char, casey.end_char, "PERSON"),
|
||||
(ny.start_char, ny.end_char, "GPE"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -43,32 +45,33 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
|
|||
tsys.preprocess_gold(gold)
|
||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||
names = [tsys.get_class_name(act) for act in act_classes]
|
||||
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
|
||||
assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
|
||||
|
||||
|
||||
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
|
||||
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
|
||||
entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
|
||||
gold = GoldParse(doc, entities=entity_annots)
|
||||
for i, tag in enumerate(gold.ner):
|
||||
if tag == 'L-!GPE':
|
||||
gold.ner[i] = '-'
|
||||
if tag == "L-!GPE":
|
||||
gold.ner[i] = "-"
|
||||
tsys.preprocess_gold(gold)
|
||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||
names = [tsys.get_class_name(act) for act in act_classes]
|
||||
|
||||
|
||||
def test_get_oracle_moves_negative_entities2(tsys, vocab):
|
||||
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
||||
doc = Doc(vocab, words=["A", "B", "C", "D"])
|
||||
gold = GoldParse(doc, entities=[])
|
||||
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
|
||||
gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
|
||||
tsys.preprocess_gold(gold)
|
||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||
names = [tsys.get_class_name(act) for act in act_classes]
|
||||
|
||||
|
||||
def test_get_oracle_moves_negative_O(tsys, vocab):
|
||||
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
||||
doc = Doc(vocab, words=["A", "B", "C", "D"])
|
||||
gold = GoldParse(doc, entities=[])
|
||||
gold.ner = ['O', '!O', 'O', '!O']
|
||||
gold.ner = ["O", "!O", "O", "!O"]
|
||||
tsys.preprocess_gold(gold)
|
||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||
names = [tsys.get_class_name(act) for act in act_classes]
|
||||
|
@ -80,8 +83,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
ner.begin_training([])
|
||||
ner(doc)
|
||||
assert len(list(doc.ents)) == 0
|
||||
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))
|
||||
doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
|
||||
assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
|
||||
doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
|
||||
assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']
|
||||
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
|
||||
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
|
||||
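
The hunk above sets doc.ents directly from (label, start, end) tuples and checks the resulting IOB values. A minimal standalone sketch of that pattern, using a blank Vocab rather than the test fixtures:

# Sketch only: assign entities as (label_hash, start_token, end_token) tuples.
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
doc = Doc(vocab, words=["I", "saw", "a", "cat"])
animal = vocab.strings.add("ANIMAL")   # register the label string, get its hash
doc.ents = [(animal, 3, 4)]            # token span [3, 4) becomes an ANIMAL entity
print([(w.text, w.ent_iob_, w.ent_type_) for w in doc])
# tokens outside the assigned span keep an empty IOB value, as the assertions above expect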
@ -17,7 +17,7 @@ def vocab():
|
|||
|
||||
@pytest.fixture
|
||||
def arc_eager(vocab):
|
||||
actions = ArcEager.get_actions(left_labels=['L'], right_labels=['R'])
|
||||
actions = ArcEager.get_actions(left_labels=["L"], right_labels=["R"])
|
||||
return ArcEager(vocab.strings, actions)
|
||||
|
||||
|
||||
|
@ -30,6 +30,7 @@ def tok2vec():
|
|||
def parser(vocab, arc_eager):
|
||||
return Parser(vocab, moves=arc_eager, model=None)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def model(arc_eager, tok2vec):
|
||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
|
||||
|
@ -37,12 +38,12 @@ def model(arc_eager, tok2vec):
|
|||
|
||||
@pytest.fixture
|
||||
def doc(vocab):
|
||||
return Doc(vocab, words=['a', 'b', 'c'])
|
||||
return Doc(vocab, words=["a", "b", "c"])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gold(doc):
|
||||
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
|
||||
return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"])
|
||||
|
||||
|
||||
def test_can_init_nn_parser(parser):
|
||||
|
@ -62,8 +63,10 @@ def test_predict_doc(parser, tok2vec, model, doc):
|
|||
|
||||
def test_update_doc(parser, model, doc, gold):
|
||||
parser.model = model
|
||||
|
||||
def optimize(weights, gradient, key=None):
|
||||
weights -= 0.001 * gradient
|
||||
|
||||
parser.update([doc], [gold], sgd=optimize)
|
||||
|
||||
|
||||
|
@ -76,6 +79,8 @@ def test_predict_doc_beam(parser, model, doc):
|
|||
@pytest.mark.xfail
|
||||
def test_update_doc_beam(parser, model, doc, gold):
|
||||
parser.model = model
|
||||
|
||||
def optimize(weights, gradient, key=None):
|
||||
weights -= 0.001 * gradient
|
||||
|
||||
parser.update_beam([doc], [gold], sgd=optimize)
|
||||
@ -21,20 +21,22 @@ def vocab():
|
|||
@pytest.fixture
|
||||
def moves(vocab):
|
||||
aeager = ArcEager(vocab.strings, {})
|
||||
aeager.add_action(2, 'nsubj')
|
||||
aeager.add_action(3, 'dobj')
|
||||
aeager.add_action(2, 'aux')
|
||||
aeager.add_action(2, "nsubj")
|
||||
aeager.add_action(3, "dobj")
|
||||
aeager.add_action(2, "aux")
|
||||
return aeager
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def docs(vocab):
|
||||
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
|
||||
return [Doc(vocab, words=["Rats", "bite", "things"])]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def states(docs):
|
||||
return [StateClass(doc) for doc in docs]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tokvecs(docs, vector_size):
|
||||
output = []
|
||||
|
@ -73,9 +75,10 @@ def beam(moves, states, golds, beam_width):
|
|||
def scores(moves, batch_size, beam_width):
|
||||
return [
|
||||
numpy.asarray(
|
||||
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
|
||||
dtype='f')
|
||||
for _ in range(batch_size)]
|
||||
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f"
|
||||
)
|
||||
for _ in range(batch_size)
|
||||
]
|
||||
|
||||
|
||||
def test_create_beam(beam):
|
||||
|
@ -93,8 +96,8 @@ def test_beam_advance_too_few_scores(beam, scores):
|
|||
|
||||
def test_beam_parse():
|
||||
nlp = Language()
|
||||
nlp.add_pipe(DependencyParser(nlp.vocab), name='parser')
|
||||
nlp.parser.add_label('nsubj')
|
||||
nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
|
||||
nlp.parser.add_label("nsubj")
|
||||
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
||||
doc = nlp.make_doc('Australia is a country')
|
||||
doc = nlp.make_doc("Australia is a country")
|
||||
nlp.parser(doc, beam_width=2)
|
||||
@ -40,106 +40,116 @@ def multirooted_tree():
|
|||
|
||||
|
||||
def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||
assert([a for a in ancestors(3, tree)] == [4, 5, 2])
|
||||
assert([a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4])
|
||||
assert([a for a in ancestors(3, partial_tree)] == [4, 5, None])
|
||||
assert([a for a in ancestors(17, multirooted_tree)] == [])
|
||||
assert [a for a in ancestors(3, tree)] == [4, 5, 2]
|
||||
assert [a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4]
|
||||
assert [a for a in ancestors(3, partial_tree)] == [4, 5, None]
|
||||
assert [a for a in ancestors(17, multirooted_tree)] == []
|
||||
|
||||
|
||||
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||
assert(contains_cycle(tree) == None)
|
||||
assert(contains_cycle(cyclic_tree) == set([3, 4, 5]))
|
||||
assert(contains_cycle(partial_tree) == None)
|
||||
assert(contains_cycle(multirooted_tree) == None)
|
||||
assert contains_cycle(tree) == None
|
||||
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
|
||||
assert contains_cycle(partial_tree) == None
|
||||
assert contains_cycle(multirooted_tree) == None
|
||||
|
||||
|
||||
def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
|
||||
assert(is_nonproj_arc(0, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(1, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(2, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(3, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(4, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(5, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(6, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(7, nonproj_tree) == True)
|
||||
assert(is_nonproj_arc(8, nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(7, partial_tree) == False)
|
||||
assert(is_nonproj_arc(17, multirooted_tree) == False)
|
||||
assert(is_nonproj_arc(16, multirooted_tree) == True)
|
||||
assert is_nonproj_arc(0, nonproj_tree) == False
|
||||
assert is_nonproj_arc(1, nonproj_tree) == False
|
||||
assert is_nonproj_arc(2, nonproj_tree) == False
|
||||
assert is_nonproj_arc(3, nonproj_tree) == False
|
||||
assert is_nonproj_arc(4, nonproj_tree) == False
|
||||
assert is_nonproj_arc(5, nonproj_tree) == False
|
||||
assert is_nonproj_arc(6, nonproj_tree) == False
|
||||
assert is_nonproj_arc(7, nonproj_tree) == True
|
||||
assert is_nonproj_arc(8, nonproj_tree) == False
|
||||
assert is_nonproj_arc(7, partial_tree) == False
|
||||
assert is_nonproj_arc(17, multirooted_tree) == False
|
||||
assert is_nonproj_arc(16, multirooted_tree) == True
|
||||
|
||||
|
||||
def test_parser_is_nonproj_tree(proj_tree, nonproj_tree, partial_tree, multirooted_tree):
|
||||
assert(is_nonproj_tree(proj_tree) == False)
|
||||
assert(is_nonproj_tree(nonproj_tree) == True)
|
||||
assert(is_nonproj_tree(partial_tree) == False)
|
||||
assert(is_nonproj_tree(multirooted_tree) == True)
|
||||
def test_parser_is_nonproj_tree(
|
||||
proj_tree, nonproj_tree, partial_tree, multirooted_tree
|
||||
):
|
||||
assert is_nonproj_tree(proj_tree) == False
|
||||
assert is_nonproj_tree(nonproj_tree) == True
|
||||
assert is_nonproj_tree(partial_tree) == False
|
||||
assert is_nonproj_tree(multirooted_tree) == True
|
||||
|
||||
|
||||
def test_parser_pseudoprojectivity(en_tokenizer):
|
||||
def deprojectivize(proj_heads, deco_labels):
|
||||
tokens = en_tokenizer('whatever ' * len(proj_heads))
|
||||
rel_proj_heads = [head-i for i, head in enumerate(proj_heads)]
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens],
|
||||
deps=deco_labels, heads=rel_proj_heads)
|
||||
tokens = en_tokenizer("whatever " * len(proj_heads))
|
||||
rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
deps=deco_labels,
|
||||
heads=rel_proj_heads,
|
||||
)
|
||||
nonproj.deprojectivize(doc)
|
||||
return [t.head.i for t in doc], [token.dep_ for token in doc]
|
||||
|
||||
# fmt: off
|
||||
tree = [1, 2, 2]
|
||||
nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
|
||||
nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
|
||||
labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl', 'punct']
|
||||
labels2 = ['advmod', 'root', 'det', 'nsubj', 'advmod', 'det', 'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod', 'det', 'amod', 'punct']
|
||||
labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
|
||||
labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
|
||||
# fmt: on
|
||||
|
||||
assert(nonproj.decompose('X||Y') == ('X','Y'))
|
||||
assert(nonproj.decompose('X') == ('X',''))
|
||||
assert(nonproj.is_decorated('X||Y') == True)
|
||||
assert(nonproj.is_decorated('X') == False)
|
||||
assert nonproj.decompose("X||Y") == ("X", "Y")
|
||||
assert nonproj.decompose("X") == ("X", "")
|
||||
assert nonproj.is_decorated("X||Y") == True
|
||||
assert nonproj.is_decorated("X") == False
|
||||
|
||||
nonproj._lift(0, tree)
|
||||
assert(tree == [2, 2, 2])
|
||||
assert tree == [2, 2, 2]
|
||||
|
||||
assert(nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7)
|
||||
assert(nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10)
|
||||
assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
|
||||
assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
|
||||
|
||||
# fmt: off
|
||||
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
|
||||
assert(proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2])
|
||||
assert(deco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux',
|
||||
'nsubj', 'acl||dobj', 'punct'])
|
||||
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
||||
assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
|
||||
"nsubj", "acl||dobj", "punct"]
|
||||
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||
assert(deproj_heads == nonproj_tree)
|
||||
assert(undeco_labels == labels)
|
||||
assert deproj_heads == nonproj_tree
|
||||
assert undeco_labels == labels
|
||||
|
||||
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
|
||||
assert(proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1])
|
||||
assert(deco_labels == ['advmod||aux', 'root', 'det', 'nsubj', 'advmod',
|
||||
'det', 'dobj', 'det', 'nmod', 'aux', 'nmod||dobj',
|
||||
'advmod', 'det', 'amod', 'punct'])
|
||||
assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
|
||||
assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
|
||||
"det", "dobj", "det", "nmod", "aux", "nmod||dobj",
|
||||
"advmod", "det", "amod", "punct"]
|
||||
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||
assert(deproj_heads == nonproj_tree2)
|
||||
assert(undeco_labels == labels2)
|
||||
assert deproj_heads == nonproj_tree2
|
||||
assert undeco_labels == labels2
|
||||
|
||||
# if decoration is wrong such that there is no head with the desired label
|
||||
# the structure is kept and the label is undecorated
|
||||
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
||||
deco_labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj',
|
||||
'acl||iobj', 'punct']
|
||||
deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
|
||||
"acl||iobj", "punct"]
|
||||
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||
assert(deproj_heads == proj_heads)
|
||||
assert(undeco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux',
|
||||
'nsubj', 'acl', 'punct'])
|
||||
assert deproj_heads == proj_heads
|
||||
assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
|
||||
"nsubj", "acl", "punct"]
|
||||
|
||||
# if there are two potential new heads, the first one is chosen even if
|
||||
# it's wrong
|
||||
# it"s wrong
|
||||
proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
|
||||
deco_labels = ['advmod||aux', 'root', 'det', 'aux', 'advmod', 'det',
|
||||
'dobj', 'det', 'nmod', 'aux', 'nmod||dobj', 'advmod',
|
||||
'det', 'amod', 'punct']
|
||||
deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
|
||||
"dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
|
||||
"det", "amod", "punct"]
|
||||
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||
assert(deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1])
|
||||
assert(undeco_labels == ['advmod', 'root', 'det', 'aux', 'advmod', 'det',
|
||||
'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod',
|
||||
'det', 'amod', 'punct'])
|
||||
assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
|
||||
assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det",
|
||||
"dobj", "det", "nmod", "aux", "nmod", "advmod",
|
||||
"det", "amod", "punct"]
|
||||
# fmt: on
|
||||
@ -9,7 +9,7 @@ from ..util import get_doc, apply_transition_sequence
|
|||
def test_parser_root(en_tokenizer):
|
||||
text = "i don't have other assistance"
|
||||
heads = [3, 2, 1, 0, 1, -2]
|
||||
deps = ['nsubj', 'aux', 'neg', 'ROOT', 'amod', 'dobj']
|
||||
deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
for t in doc:
|
||||
|
@ -17,10 +17,12 @@ def test_parser_root(en_tokenizer):
|
|||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
@pytest.mark.parametrize("text", ["Hello"])
|
||||
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
|
||||
)
|
||||
|
||||
assert len(doc) == 1
|
||||
with en_parser.step_through(doc) as _:
|
||||
|
@ -32,7 +34,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
|||
def test_parser_initial(en_tokenizer, en_parser):
|
||||
text = "I ate the pizza with anchovies."
|
||||
heads = [1, 0, 1, -2, -3, -1, -5]
|
||||
transition = ['L-nsubj', 'S', 'L-det']
|
||||
transition = ["L-nsubj", "S", "L-det"]
|
||||
tokens = en_tokenizer(text)
|
||||
apply_transition_sequence(en_parser, tokens, transition)
|
||||
assert tokens[0].head.i == 1
|
||||
|
@ -58,17 +60,19 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser):
|
|||
def test_parser_merge_pp(en_tokenizer):
|
||||
text = "A phrase with another phrase occurs"
|
||||
heads = [1, 4, -1, 1, -2, 0]
|
||||
deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT']
|
||||
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ']
|
||||
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
|
||||
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
|
||||
)
|
||||
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
|
||||
for start, end, lemma in nps:
|
||||
doc.merge(start, end, label='NP', lemma=lemma)
|
||||
assert doc[0].text == 'A phrase'
|
||||
assert doc[1].text == 'with'
|
||||
assert doc[2].text == 'another phrase'
|
||||
assert doc[3].text == 'occurs'
|
||||
doc.merge(start, end, label="NP", lemma=lemma)
|
||||
assert doc[0].text == "A phrase"
|
||||
assert doc[1].text == "with"
|
||||
assert doc[2].text == "another phrase"
|
||||
assert doc[3].text == "occurs"
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
|
@ -76,7 +80,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
|||
text = "a b c d e"
|
||||
|
||||
# right branching
|
||||
transition = ['R-nsubj', 'D', 'R-nsubj', 'R-nsubj', 'D', 'R-ROOT']
|
||||
transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"]
|
||||
tokens = en_tokenizer(text)
|
||||
apply_transition_sequence(en_parser, tokens, transition)
|
||||
|
||||
|
@ -111,7 +115,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
|||
assert tokens[4].head.i == 2
|
||||
|
||||
# left branching
|
||||
transition = ['S', 'S', 'S', 'L-nsubj','L-nsubj','L-nsubj', 'L-nsubj']
|
||||
transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"]
|
||||
tokens = en_tokenizer(text)
|
||||
apply_transition_sequence(en_parser, tokens, transition)
|
||||
|
||||
@ -33,6 +33,7 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
|
|||
|
||||
@pytest.fixture
|
||||
def heads():
|
||||
# fmt: off
|
||||
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
|
||||
-1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
|
||||
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
|
||||
|
@ -50,6 +51,7 @@ def heads():
|
|||
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
|
||||
1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
|
||||
-1, -8, -9, -1]
|
||||
# fmt: on
|
||||
|
||||
|
||||
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
|
||||
|
@ -100,7 +102,14 @@ def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
|
|||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
for token in doc:
|
||||
subtree = list(token.subtree)
|
||||
debug = '\t'.join((token.text, token.left_edge.text, subtree[0].text))
|
||||
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
|
||||
assert token.left_edge == subtree[0], debug
|
||||
debug = '\t'.join((token.text, token.right_edge.text, subtree[-1].text, token.right_edge.head.text))
|
||||
debug = "\t".join(
|
||||
(
|
||||
token.text,
|
||||
token.right_edge.text,
|
||||
subtree[-1].text,
|
||||
token.right_edge.head.text,
|
||||
)
|
||||
)
|
||||
assert token.right_edge == subtree[-1], debug
|
||||
@ -19,34 +19,33 @@ def vocab():
|
|||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
parser = DependencyParser(vocab)
|
||||
parser.cfg['token_vector_width'] = 4
|
||||
parser.cfg['hidden_width'] = 32
|
||||
#parser.add_label('right')
|
||||
parser.add_label('left')
|
||||
parser.cfg["token_vector_width"] = 4
|
||||
parser.cfg["hidden_width"] = 32
|
||||
# parser.add_label('right')
|
||||
parser.add_label("left")
|
||||
parser.begin_training([], **parser.cfg)
|
||||
sgd = Adam(NumpyOps(), 0.001)
|
||||
|
||||
for i in range(10):
|
||||
losses = {}
|
||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
||||
deps=['left', 'ROOT', 'left', 'ROOT'])
|
||||
doc = Doc(vocab, words=["a", "b", "c", "d"])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
|
||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||
return parser
|
||||
|
||||
|
||||
def test_no_sentences(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) >= 1
|
||||
|
||||
|
||||
def test_sents_1(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc[2].sent_start = True
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) >= 2
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc[1].sent_start = False
|
||||
doc[2].sent_start = True
|
||||
doc[3].sent_start = False
|
||||
|
@ -55,7 +54,7 @@ def test_sents_1(parser):
|
|||
|
||||
|
||||
def test_sents_1_2(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc[1].sent_start = True
|
||||
doc[2].sent_start = True
|
||||
doc = parser(doc)
|
||||
|
@ -63,12 +62,12 @@ def test_sents_1_2(parser):
|
|||
|
||||
|
||||
def test_sents_1_3(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc[1].sent_start = True
|
||||
doc[3].sent_start = True
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) >= 3
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc[1].sent_start = True
|
||||
doc[2].sent_start = False
|
||||
doc[3].sent_start = True
|
||||
@ -19,11 +19,13 @@ def test_parser_space_attachment(en_tokenizer):
|
|||
|
||||
|
||||
def test_parser_sentence_space(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
||||
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
|
||||
deps = ['nsubj', 'ROOT', 'advmod', 'prep', 'pcomp', 'dobj', 'punct', '',
|
||||
'nsubjpass', 'aux', 'auxpass', 'ROOT', 'nsubj', 'aux', 'ccomp',
|
||||
'poss', 'nsubj', 'ccomp', 'punct']
|
||||
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
||||
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
||||
"poss", "nsubj", "ccomp", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
@ -34,10 +36,10 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
|
|||
text = "\t \n This is a sentence ."
|
||||
heads = [1, 1, 0, 1, -2, -3]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
|
||||
assert doc[0].is_space
|
||||
assert doc[1].is_space
|
||||
assert doc[2].text == 'This'
|
||||
assert doc[2].text == "This"
|
||||
with en_parser.step_through(doc) as stepwise:
|
||||
pass
|
||||
assert doc[0].head.i == 2
|
||||
|
@ -49,9 +51,9 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
|
|||
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
||||
text = "This is \t a \t\n \n sentence . \n\n \n"
|
||||
heads = [1, 0, -1, 2, -1, -4, -5, -1]
|
||||
transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct']
|
||||
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
|
||||
assert doc[2].is_space
|
||||
assert doc[4].is_space
|
||||
assert doc[5].is_space
|
||||
|
@ -64,8 +66,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
|||
assert [token.head.i for token in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [(['\n'], 1),
|
||||
(['\n', '\t', '\n\n', '\t'], 4)])
|
||||
@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
|
||||
@pytest.mark.xfail
|
||||
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
|
||||
doc = Doc(en_parser.vocab, words=text)
|
||||
|
@ -74,4 +75,4 @@ def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
|
|||
pass
|
||||
assert doc[0].is_space
|
||||
for token in doc:
|
||||
assert token.head.i == length-1
|
||||
assert token.head.i == length - 1
|
||||
@@ -18,14 +18,16 @@ def patterns():
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
    ]


@pytest.fixture
def add_ent():
    def add_ent_component(doc):
        doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings['ORG'])]
        doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
        return doc

    return add_ent_component


@@ -33,13 +35,13 @@ def test_entity_ruler_init(nlp, patterns):
    ruler = EntityRuler(nlp, patterns=patterns)
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 3
    assert 'HELLO' in ruler
    assert 'BYE' in ruler
    assert "HELLO" in ruler
    assert "BYE" in ruler
    nlp.add_pipe(ruler)
    doc = nlp("hello world bye bye")
    assert len(doc.ents) == 2
    assert doc.ents[0].label_ == 'HELLO'
    assert doc.ents[1].label_ == 'BYE'
    assert doc.ents[0].label_ == "HELLO"
    assert doc.ents[1].label_ == "BYE"


def test_entity_ruler_existing(nlp, patterns, add_ent):

@@ -48,8 +50,8 @@ def test_entity_ruler_existing(nlp, patterns, add_ent):
    nlp.add_pipe(ruler)
    doc = nlp("OH HELLO WORLD bye bye")
    assert len(doc.ents) == 2
    assert doc.ents[0].label_ == 'ORG'
    assert doc.ents[1].label_ == 'BYE'
    assert doc.ents[0].label_ == "ORG"
    assert doc.ents[1].label_ == "BYE"


def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):

@@ -58,9 +60,9 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):
    nlp.add_pipe(ruler)
    doc = nlp("OH HELLO WORLD bye bye")
    assert len(doc.ents) == 2
    assert doc.ents[0].label_ == 'HELLO'
    assert doc.ents[0].text == 'HELLO'
    assert doc.ents[1].label_ == 'BYE'
    assert doc.ents[0].label_ == "HELLO"
    assert doc.ents[0].text == "HELLO"
    assert doc.ents[1].label_ == "BYE"


def test_entity_ruler_existing_complex(nlp, patterns, add_ent):

@@ -69,8 +71,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, add_ent):
    nlp.add_pipe(ruler)
    doc = nlp("foo foo bye bye")
    assert len(doc.ents) == 2
    assert doc.ents[0].label_ == 'COMPLEX'
    assert doc.ents[1].label_ == 'BYE'
    assert doc.ents[0].label_ == "COMPLEX"
    assert doc.ents[1].label_ == "BYE"
    assert len(doc.ents[0]) == 2
    assert len(doc.ents[1]) == 2
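
For context on the EntityRuler tests above, a minimal sketch of the same pattern format on a blank English pipeline; it assumes a spaCy version in which EntityRuler is already available, which is new around the time of this change.

# Sketch only: EntityRuler accepts phrase patterns (strings) and token patterns (dicts).
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
patterns = [
    {"label": "HELLO", "pattern": "hello world"},                       # phrase pattern
    {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},  # token pattern
]
ruler = EntityRuler(nlp, patterns=patterns)
nlp.add_pipe(ruler)
doc = nlp("hello world bye bye")
print([(ent.text, ent.label_) for ent in doc.ents])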
@ -10,15 +10,21 @@ from ..util import get_doc
|
|||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
text = 'I like New York in Autumn.'
|
||||
text = "I like New York in Autumn."
|
||||
heads = [1, 0, 1, -2, -3, -1, -5]
|
||||
tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
|
||||
pos = ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT']
|
||||
deps = ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
|
||||
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
|
||||
pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
|
||||
deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads,
|
||||
tags=tags, pos=pos, deps=deps)
|
||||
doc.ents = [Span(doc, 2, 4, doc.vocab.strings['GPE'])]
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
heads=heads,
|
||||
tags=tags,
|
||||
pos=pos,
|
||||
deps=deps,
|
||||
)
|
||||
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
|
||||
doc.is_parsed = True
|
||||
doc.is_tagged = True
|
||||
return doc
|
||||
|
@ -27,18 +33,18 @@ def doc(en_tokenizer):
|
|||
def test_factories_merge_noun_chunks(doc):
|
||||
assert len(doc) == 7
|
||||
nlp = Language()
|
||||
merge_noun_chunks = nlp.create_pipe('merge_noun_chunks')
|
||||
merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
|
||||
merge_noun_chunks(doc)
|
||||
assert len(doc) == 6
|
||||
assert doc[2].text == 'New York'
|
||||
assert doc[2].text == "New York"
|
||||
|
||||
|
||||
def test_factories_merge_ents(doc):
|
||||
assert len(doc) == 7
|
||||
assert len(list(doc.ents)) == 1
|
||||
nlp = Language()
|
||||
merge_entities = nlp.create_pipe('merge_entities')
|
||||
merge_entities = nlp.create_pipe("merge_entities")
|
||||
merge_entities(doc)
|
||||
assert len(doc) == 6
|
||||
assert len(list(doc.ents)) == 1
|
||||
assert doc[2].text == 'New York'
|
||||
assert doc[2].text == "New York"
|
||||
@@ -16,22 +16,22 @@ def new_pipe(doc):
def test_add_pipe_no_name(nlp):
    nlp.add_pipe(new_pipe)
    assert 'new_pipe' in nlp.pipe_names
    assert "new_pipe" in nlp.pipe_names


def test_add_pipe_duplicate_name(nlp):
    nlp.add_pipe(new_pipe, name='duplicate_name')
    nlp.add_pipe(new_pipe, name="duplicate_name")
    with pytest.raises(ValueError):
        nlp.add_pipe(new_pipe, name='duplicate_name')
        nlp.add_pipe(new_pipe, name="duplicate_name")


@pytest.mark.parametrize('name', ['parser'])
@pytest.mark.parametrize("name", ["parser"])
def test_add_pipe_first(nlp, name):
    nlp.add_pipe(new_pipe, name=name, first=True)
    assert nlp.pipeline[0][0] == name


@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")])
def test_add_pipe_last(nlp, name1, name2):
    nlp.add_pipe(lambda doc: doc, name=name2)
    nlp.add_pipe(new_pipe, name=name1, last=True)

@@ -44,7 +44,7 @@ def test_cant_add_pipe_first_and_last(nlp):
        nlp.add_pipe(new_pipe, first=True, last=True)


@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_get_pipe(nlp, name):
    with pytest.raises(KeyError):
        nlp.get_pipe(name)

@@ -52,7 +52,7 @@ def test_get_pipe(nlp, name):
    assert nlp.get_pipe(name) == new_pipe


@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
@pytest.mark.parametrize("name,replacement", [("my_component", lambda doc: doc)])
def test_replace_pipe(nlp, name, replacement):
    with pytest.raises(ValueError):
        nlp.replace_pipe(name, new_pipe)

@@ -62,7 +62,7 @@ def test_replace_pipe(nlp, name, replacement):
    assert nlp.get_pipe(name) == replacement


@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
def test_rename_pipe(nlp, old_name, new_name):
    with pytest.raises(ValueError):
        nlp.rename_pipe(old_name, new_name)

@@ -71,7 +71,7 @@ def test_rename_pipe(nlp, old_name, new_name):
    assert nlp.pipeline[0][0] == new_name


@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_remove_pipe(nlp, name):
    with pytest.raises(ValueError):
        nlp.remove_pipe(name)

@@ -83,7 +83,7 @@ def test_remove_pipe(nlp, name):
    assert removed_component == new_pipe


@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_disable_pipes_method(nlp, name):
    nlp.add_pipe(new_pipe, name=name)
    assert nlp.has_pipe(name)

@@ -92,7 +92,7 @@ def test_disable_pipes_method(nlp, name):
    disabled.restore()


@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_disable_pipes_context(nlp, name):
    nlp.add_pipe(new_pipe, name=name)
    assert nlp.has_pipe(name)

@@ -101,14 +101,14 @@ def test_disable_pipes_context(nlp, name):
    assert nlp.has_pipe(name)


@pytest.mark.parametrize('n_pipes', [100])
@pytest.mark.parametrize("n_pipes", [100])
def test_add_lots_of_pipes(nlp, n_pipes):
    for i in range(n_pipes):
        nlp.add_pipe(lambda doc: doc, name='pipe_%d' % i)
        nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i)
    assert len(nlp.pipe_names) == n_pipes


@pytest.mark.parametrize('component', ['ner', {'hello': 'world'}])
@pytest.mark.parametrize("component", ["ner", {"hello": "world"}])
def test_raise_for_invalid_components(nlp, component):
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
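
The pipeline-management tests above all revolve around the same small API surface. A minimal sketch on a blank Language object, using a throwaway no-op component whose name is illustrative only:

# Sketch only: any callable mapping Doc -> Doc can be used as a pipeline component.
from spacy.language import Language

def my_component(doc):
    return doc

nlp = Language()
nlp.add_pipe(my_component, name="my_component", first=True)
assert "my_component" in nlp.pipe_names
nlp.replace_pipe("my_component", lambda doc: doc)   # swap in a different callable, same name
removed = nlp.remove_pipe("my_component")           # returns the (name, component) pair
assert "my_component" not in nlp.pipe_names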
@ -13,16 +13,21 @@ from spacy.gold import GoldParse
|
|||
@pytest.mark.skip(reason="Test is flakey when run with others")
|
||||
def test_simple_train():
|
||||
nlp = Language()
|
||||
nlp.add_pipe(nlp.create_pipe('textcat'))
|
||||
nlp.get_pipe('textcat').add_label('answer')
|
||||
nlp.add_pipe(nlp.create_pipe("textcat"))
|
||||
nlp.get_pipe("textcat").add_label("answer")
|
||||
nlp.begin_training()
|
||||
for i in range(5):
|
||||
for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.),
|
||||
('bbbbbbbbb', 0.), ('aaaaaa', 1)]:
|
||||
nlp.update([text], [{'cats': {'answer': answer}}])
|
||||
doc = nlp('aaa')
|
||||
assert 'answer' in doc.cats
|
||||
assert doc.cats['answer'] >= 0.5
|
||||
for text, answer in [
|
||||
("aaaa", 1.0),
|
||||
("bbbb", 0),
|
||||
("aa", 1.0),
|
||||
("bbbbbbbbb", 0.0),
|
||||
("aaaaaa", 1),
|
||||
]:
|
||||
nlp.update([text], [{"cats": {"answer": answer}}])
|
||||
doc = nlp("aaa")
|
||||
assert "answer" in doc.cats
|
||||
assert doc.cats["answer"] >= 0.5
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Test is flakey when run with others")
|
||||
|
@ -31,11 +36,11 @@ def test_textcat_learns_multilabel():
|
|||
numpy.random.seed(5)
|
||||
docs = []
|
||||
nlp = Language()
|
||||
letters = ['a', 'b', 'c']
|
||||
letters = ["a", "b", "c"]
|
||||
for w1 in letters:
|
||||
for w2 in letters:
|
||||
cats = {letter: float(w2==letter) for letter in letters}
|
||||
docs.append((Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
|
||||
cats = {letter: float(w2 == letter) for letter in letters}
|
||||
docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
|
||||
random.shuffle(docs)
|
||||
model = TextCategorizer(nlp.vocab, width=8)
|
||||
for letter in letters:
|
||||
|
@ -49,8 +54,8 @@ def test_textcat_learns_multilabel():
|
|||
random.shuffle(docs)
|
||||
for w1 in letters:
|
||||
for w2 in letters:
|
||||
doc = Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3)
|
||||
truth = {letter: w2==letter for letter in letters}
|
||||
doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
|
||||
truth = {letter: w2 == letter for letter in letters}
|
||||
model(doc)
|
||||
for cat, score in doc.cats.items():
|
||||
if not truth[cat]:
|
||||
@ -14,14 +14,20 @@ from spacy.tokens import Doc
|
|||
from ..util import get_doc, make_tempdir
|
||||
|
||||
|
||||
@pytest.mark.parametrize('patterns', [
|
||||
[[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
|
||||
[[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]])
|
||||
@pytest.mark.parametrize(
|
||||
"patterns",
|
||||
[
|
||||
[[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
|
||||
[[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]],
|
||||
],
|
||||
)
|
||||
def test_issue118(en_tokenizer, patterns):
|
||||
"""Test a bug that arose from having overlapping matches"""
|
||||
text = "how many points did lebron james score against the boston celtics last night"
|
||||
text = (
|
||||
"how many points did lebron james score against the boston celtics last night"
|
||||
)
|
||||
doc = en_tokenizer(text)
|
||||
ORG = doc.vocab.strings['ORG']
|
||||
ORG = doc.vocab.strings["ORG"]
|
||||
matcher = Matcher(doc.vocab)
|
||||
matcher.add("BostonCeltics", None, *patterns)
|
||||
assert len(list(doc.ents)) == 0
|
||||
|
@ -35,16 +41,22 @@ def test_issue118(en_tokenizer, patterns):
|
|||
assert ents[0].end == 11
|
||||
|
||||
|
||||
@pytest.mark.parametrize('patterns', [
|
||||
[[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
|
||||
[[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]])
|
||||
@pytest.mark.parametrize(
|
||||
"patterns",
|
||||
[
|
||||
[[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
|
||||
[[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]],
|
||||
],
|
||||
)
|
||||
def test_issue118_prefix_reorder(en_tokenizer, patterns):
|
||||
"""Test a bug that arose from having overlapping matches"""
|
||||
text = "how many points did lebron james score against the boston celtics last night"
|
||||
text = (
|
||||
"how many points did lebron james score against the boston celtics last night"
|
||||
)
|
||||
doc = en_tokenizer(text)
|
||||
ORG = doc.vocab.strings['ORG']
|
||||
ORG = doc.vocab.strings["ORG"]
|
||||
matcher = Matcher(doc.vocab)
|
||||
matcher.add('BostonCeltics', None, *patterns)
|
||||
matcher.add("BostonCeltics", None, *patterns)
|
||||
assert len(list(doc.ents)) == 0
|
||||
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
|
||||
doc.ents += tuple(matches)[1:]
|
||||
|
@ -59,11 +71,13 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
|
|||
def test_issue242(en_tokenizer):
|
||||
"""Test overlapping multi-word phrases."""
|
||||
text = "There are different food safety standards in different countries."
|
||||
patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
|
||||
[{'LOWER': 'safety'}, {'LOWER': 'standards'}]]
|
||||
patterns = [
|
||||
[{"LOWER": "food"}, {"LOWER": "safety"}],
|
||||
[{"LOWER": "safety"}, {"LOWER": "standards"}],
|
||||
]
|
||||
doc = en_tokenizer(text)
|
||||
matcher = Matcher(doc.vocab)
|
||||
matcher.add('FOOD', None, *patterns)
|
||||
matcher.add("FOOD", None, *patterns)
|
||||
|
||||
matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
|
||||
doc.ents += tuple(matches)
|
||||
|
@ -77,7 +91,9 @@ def test_issue242(en_tokenizer):
|
|||
def test_issue309(en_tokenizer):
|
||||
"""Test Issue #309: SBD fails on empty string"""
|
||||
tokens = en_tokenizer(" ")
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
|
||||
)
|
||||
doc.is_parsed = True
|
||||
assert len(doc) == 1
|
||||
sents = list(doc.sents)
|
||||
|
@ -93,11 +109,11 @@ def test_issue351(en_tokenizer):
|
|||
|
||||
def test_issue360(en_tokenizer):
|
||||
"""Test tokenization of big ellipsis"""
|
||||
tokens = en_tokenizer('$45...............Asking')
|
||||
tokens = en_tokenizer("$45...............Asking")
|
||||
assert len(tokens) > 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text1,text2', [("cat", "dog")])
|
||||
@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
|
||||
def test_issue361(en_vocab, text1, text2):
|
||||
"""Test Issue #361: Equality of lexemes"""
|
||||
assert en_vocab[text1] == en_vocab[text1]
|
||||
|
@ -106,15 +122,19 @@ def test_issue361(en_vocab, text1, text2):
|
|||
|
||||
def test_issue587(en_tokenizer):
|
||||
"""Test that Matcher doesn't segfault on particular input"""
|
||||
doc = en_tokenizer('a b; c')
|
||||
doc = en_tokenizer("a b; c")
|
||||
matcher = Matcher(doc.vocab)
|
||||
matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
|
||||
matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
|
||||
matcher.add(
|
||||
"TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]
|
||||
)
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 2
|
||||
matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
|
||||
matcher.add(
|
||||
"TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]
|
||||
)
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 2
|
||||
|
||||
|
@ -122,22 +142,26 @@ def test_issue587(en_tokenizer):
|
|||
def test_issue588(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
with pytest.raises(ValueError):
|
||||
matcher.add('TEST', None, [])
|
||||
matcher.add("TEST", None, [])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue589():
|
||||
vocab = Vocab()
|
||||
vocab.strings.set_frozen(True)
|
||||
doc = Doc(vocab, words=['whata'])
|
||||
doc = Doc(vocab, words=["whata"])
|
||||
|
||||
|
||||
def test_issue590(en_vocab):
|
||||
"""Test overlapping matches"""
|
||||
doc = Doc(en_vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
|
||||
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
|
||||
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
|
||||
matcher.add(
|
||||
"ab",
|
||||
None,
|
||||
[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
|
||||
)
|
||||
matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 2
|
||||
|
||||
|
@ -145,14 +169,14 @@ def test_issue590(en_vocab):
|
|||
def test_issue595():
|
||||
"""Test lemmatization of base forms"""
|
||||
words = ["Do", "n't", "feed", "the", "dog"]
|
||||
tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
|
||||
tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
|
||||
rules = {"verb": [["ed", "e"]]}
|
||||
lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
|
||||
lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
|
||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||
doc = Doc(vocab, words=words)
|
||||
doc[2].tag_ = 'VB'
|
||||
assert doc[2].text == 'feed'
|
||||
assert doc[2].lemma_ == 'feed'
|
||||
doc[2].tag_ = "VB"
|
||||
assert doc[2].text == "feed"
|
||||
assert doc[2].lemma_ == "feed"
|
||||
|
||||
|
||||
def test_issue599(en_vocab):
|
||||
|
@ -165,9 +189,9 @@ def test_issue599(en_vocab):
|
|||
|
||||
|
||||
def test_issue600():
|
||||
vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
|
||||
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
|
||||
doc = Doc(vocab, words=["hello"])
|
||||
doc[0].tag_ = 'NN'
|
||||
doc[0].tag_ = "NN"
|
||||
|
||||
|
||||
def test_issue615(en_tokenizer):
|
||||
|
@ -175,16 +199,17 @@ def test_issue615(en_tokenizer):
|
|||
"""Merge a phrase. We have to be careful here because we'll change the
|
||||
token indices. To avoid problems, merge all the phrases once we're called
|
||||
on the last match."""
|
||||
if i != len(matches)-1:
|
||||
if i != len(matches) - 1:
|
||||
return None
|
||||
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
|
||||
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
|
||||
for ent_id, label, span in spans:
|
||||
span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
|
||||
label=label)
|
||||
span.merge(
|
||||
tag="NNP" if label else span.root.tag_, lemma=span.text, label=label
|
||||
)
|
||||
doc.ents = doc.ents + ((label, span.start, span.end),)
|
||||
|
||||
text = "The golf club is broken"
|
||||
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
|
||||
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
|
||||
label = "Sport_Equipment"
|
||||
doc = en_tokenizer(text)
|
||||
matcher = Matcher(doc.vocab)
|
||||
|
@ -195,7 +220,7 @@ def test_issue615(en_tokenizer):
|
|||
assert entities[0].label != 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")])
|
||||
@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
|
||||
def test_issue736(en_tokenizer, text, number):
|
||||
"""Test that times like "7am" are tokenized correctly and that numbers are
|
||||
converted to string."""
|
||||
|
@ -204,7 +229,7 @@ def test_issue736(en_tokenizer, text, number):
|
|||
assert tokens[0].text == number
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"])
|
||||
@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
|
||||
def test_issue740(en_tokenizer, text):
|
||||
"""Test that dates are not split and kept as one token. This behaviour is
|
||||
currently inconsistent, since dates separated by hyphens are still split.
|
||||
|
@ -214,14 +239,14 @@ def test_issue740(en_tokenizer, text):
|
|||
|
||||
|
||||
def test_issue743():
|
||||
doc = Doc(Vocab(), ['hello', 'world'])
|
||||
doc = Doc(Vocab(), ["hello", "world"])
|
||||
token = doc[0]
|
||||
s = set([token])
|
||||
items = list(s)
|
||||
assert items[0] is token
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
|
||||
@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
|
||||
def test_issue744(en_tokenizer, text):
|
||||
"""Test that 'were' and 'Were' are excluded from the contractions
|
||||
generated by the English tokenizer exceptions."""
|
||||
|
@ -230,14 +255,15 @@ def test_issue744(en_tokenizer, text):
|
|||
assert tokens[1].text.lower() == "were"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
|
||||
("teneleven", False)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
|
||||
)
|
||||
def test_issue759(en_tokenizer, text, is_num):
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].like_num == is_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"])
|
||||
@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
|
||||
def test_issue775(en_tokenizer, text):
|
||||
"""Test that 'Shell' and 'shell' are excluded from the contractions
|
||||
generated by the English tokenizer exceptions."""
|
||||
|
@ -246,28 +272,32 @@ def test_issue775(en_tokenizer, text):
|
|||
assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
||||
@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
|
||||
def test_issue792(en_tokenizer, text):
|
||||
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
|
||||
doc = en_tokenizer(text)
|
||||
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||
assert "".join([token.text_with_ws for token in doc]) == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
|
||||
@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
|
||||
def test_control_issue792(en_tokenizer, text):
|
||||
"""Test base case for Issue #792: Non-trailing whitespace"""
|
||||
doc = en_tokenizer(text)
|
||||
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||
assert "".join([token.text_with_ws for token in doc]) == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,tokens', [
|
||||
('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
|
||||
("exception;--exclusive", ["exception", ";--", "exclusive"]),
|
||||
("day.--Is", ["day", ".--", "Is"]),
|
||||
("refinement:--just", ["refinement", ":--", "just"]),
|
||||
("memories?--To", ["memories", "?--", "To"]),
|
||||
("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
|
||||
("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])])
|
||||
@pytest.mark.parametrize(
|
||||
"text,tokens",
|
||||
[
|
||||
('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
|
||||
("exception;--exclusive", ["exception", ";--", "exclusive"]),
|
||||
("day.--Is", ["day", ".--", "Is"]),
|
||||
("refinement:--just", ["refinement", ":--", "just"]),
|
||||
("memories?--To", ["memories", "?--", "To"]),
|
||||
("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
|
||||
("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
|
||||
],
|
||||
)
|
||||
def test_issue801(en_tokenizer, text, tokens):
|
||||
"""Test that special characters + hyphens are split correctly."""
|
||||
doc = en_tokenizer(text)
|
||||
|
@ -275,10 +305,19 @@ def test_issue801(en_tokenizer, text, tokens):
|
|||
assert [t.text for t in doc] == tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', [
|
||||
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"text,expected_tokens",
|
||||
[
|
||||
(
|
||||
"Smörsåsen används bl.a. till fisk",
|
||||
["Smörsåsen", "används", "bl.a.", "till", "fisk"],
|
||||
),
|
||||
(
|
||||
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
|
||||
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_issue805(sv_tokenizer, text, expected_tokens):
|
||||
tokens = sv_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
|
@ -291,9 +330,9 @@ def test_issue850():
|
|||
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
|
||||
matcher = Matcher(vocab)
|
||||
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
|
||||
pattern = [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}]
|
||||
matcher.add('FarAway', None, pattern)
|
||||
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
|
||||
pattern = [{"LOWER": "bob"}, {"OP": "*", "IS_ANY_TOKEN": True}, {"LOWER": "frank"}]
|
||||
matcher.add("FarAway", None, pattern)
|
||||
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
|
||||
match = matcher(doc)
|
||||
assert len(match) == 1
|
||||
ent_id, start, end = match[0]
|
||||
|
@ -306,9 +345,9 @@ def test_issue850_basic():
|
|||
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
|
||||
matcher = Matcher(vocab)
|
||||
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
|
||||
pattern = [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}]
|
||||
matcher.add('FarAway', None, pattern)
|
||||
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
|
||||
pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
|
||||
matcher.add("FarAway", None, pattern)
|
||||
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
|
||||
match = matcher(doc)
|
||||
assert len(match) == 1
|
||||
ent_id, start, end = match[0]
|
||||
|
@ -316,23 +355,25 @@ def test_issue850_basic():
|
|||
assert end == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes",
|
||||
"terra-formées", "σ-compacts"])
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
|
||||
)
|
||||
def test_issue852(fr_tokenizer, text):
|
||||
"""Test that French tokenizer exceptions are imported correctly."""
|
||||
tokens = fr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
|
||||
"aaabbb@ccc.com \nThank you!"])
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
|
||||
)
|
||||
def test_issue859(en_tokenizer, text):
|
||||
"""Test that no extra space is added in doc.text method."""
|
||||
doc = en_tokenizer(text)
|
||||
assert doc.text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
|
||||
@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
|
||||
def test_issue886(en_tokenizer, text):
|
||||
"""Test that token.idx matches the original text index for texts with newlines."""
|
||||
doc = en_tokenizer(text)
|
||||
|
@ -341,7 +382,7 @@ def test_issue886(en_tokenizer, text):
|
|||
assert text[token.idx] == token.text[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["want/need"])
|
||||
@pytest.mark.parametrize("text", ["want/need"])
|
||||
def test_issue891(en_tokenizer, text):
|
||||
"""Test that / infixes are split correctly."""
|
||||
tokens = en_tokenizer(text)
|
||||
|
@ -349,11 +390,10 @@ def test_issue891(en_tokenizer, text):
|
|||
assert tokens[1].text == "/"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,tag,lemma', [
|
||||
("anus", "NN", "anus"),
|
||||
("princess", "NN", "princess"),
|
||||
("inner", "JJ", "inner")
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"text,tag,lemma",
|
||||
[("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],
|
||||
)
|
||||
def test_issue912(en_vocab, text, tag, lemma):
|
||||
"""Test base-forms are preserved."""
|
||||
doc = Doc(en_vocab, words=[text])
|
||||
|
@ -364,10 +404,10 @@ def test_issue912(en_vocab, text, tag, lemma):
|
|||
def test_issue957(en_tokenizer):
|
||||
"""Test that spaCy doesn't hang on many periods."""
|
||||
# skip test if pytest-timeout is not installed
|
||||
timeout = pytest.importorskip('pytest-timeout')
|
||||
string = '0'
|
||||
timeout = pytest.importorskip("pytest-timeout")
|
||||
string = "0"
|
||||
for i in range(1, 100):
|
||||
string += '.%d' % i
|
||||
string += ".%d" % i
|
||||
doc = en_tokenizer(string)
|
||||
|
||||
|
||||
|
@ -386,13 +426,13 @@ def test_issue999(train_data):
|
|||
["hello", []],
|
||||
["hi", []],
|
||||
["i'm looking for a place to eat", []],
|
||||
["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]],
|
||||
["show me chinese restaurants", [[8,15,"CUISINE"]]],
|
||||
["show me chines restaurants", [[8,14,"CUISINE"]]],
|
||||
["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
|
||||
["show me chinese restaurants", [[8, 15, "CUISINE"]]],
|
||||
["show me chines restaurants", [[8, 14, "CUISINE"]]],
|
||||
]
|
||||
|
||||
nlp = Language()
|
||||
ner = nlp.create_pipe('ner')
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
for _, offsets in TRAIN_DATA:
|
||||
for start, end, label in offsets:
|
||||
|
@ -402,7 +442,7 @@ def test_issue999(train_data):
|
|||
for itn in range(100):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
for raw_text, entity_offsets in TRAIN_DATA:
|
||||
nlp.update([raw_text], [{'entities': entity_offsets}])
|
||||
nlp.update([raw_text], [{"entities": entity_offsets}])
|
||||
|
||||
with make_tempdir() as model_dir:
|
||||
nlp.to_disk(model_dir)
|
||||
|
|
|
@ -15,76 +15,84 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
|
|||
|
||||
def test_issue1242():
|
||||
nlp = English()
|
||||
doc = nlp('')
|
||||
doc = nlp("")
|
||||
assert len(doc) == 0
|
||||
docs = list(nlp.pipe(['', 'hello']))
|
||||
docs = list(nlp.pipe(["", "hello"]))
|
||||
assert len(docs[0]) == 0
|
||||
assert len(docs[1]) == 1
|
||||
|
||||
|
||||
def test_issue1250():
|
||||
"""Test cached special cases."""
|
||||
special_case = [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}]
|
||||
special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
|
||||
nlp = English()
|
||||
nlp.tokenizer.add_special_case('reimbur', special_case)
|
||||
lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
|
||||
assert lemmas == ['reimburse', ',', 'reimburse', '...']
|
||||
lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
|
||||
assert lemmas == ['reimburse', ',', 'reimburse', '...']
|
||||
nlp.tokenizer.add_special_case("reimbur", special_case)
|
||||
lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
|
||||
assert lemmas == ["reimburse", ",", "reimburse", "..."]
|
||||
lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
|
||||
assert lemmas == ["reimburse", ",", "reimburse", "..."]
|
||||
|
||||
|
||||
def test_issue1257():
|
||||
"""Test that tokens compare correctly."""
|
||||
doc1 = Doc(Vocab(), words=['a', 'b', 'c'])
|
||||
doc2 = Doc(Vocab(), words=['a', 'c', 'e'])
|
||||
doc1 = Doc(Vocab(), words=["a", "b", "c"])
|
||||
doc2 = Doc(Vocab(), words=["a", "c", "e"])
|
||||
assert doc1[0] != doc2[0]
|
||||
assert not doc1[0] == doc2[0]
|
||||
|
||||
|
||||
def test_issue1375():
|
||||
"""Test that token.nbor() raises IndexError for out-of-bounds access."""
|
||||
doc = Doc(Vocab(), words=['0', '1', '2'])
|
||||
doc = Doc(Vocab(), words=["0", "1", "2"])
|
||||
with pytest.raises(IndexError):
|
||||
assert doc[0].nbor(-1)
|
||||
assert doc[1].nbor(-1).text == '0'
|
||||
assert doc[1].nbor(-1).text == "0"
|
||||
with pytest.raises(IndexError):
|
||||
assert doc[2].nbor(1)
|
||||
assert doc[1].nbor(1).text == '2'
|
||||
assert doc[1].nbor(1).text == "2"
|
||||
|
||||
|
||||
def test_issue1387():
|
||||
tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
|
||||
index = {"verb": ("cope","cop")}
|
||||
tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
|
||||
index = {"verb": ("cope", "cop")}
|
||||
exc = {"verb": {"coping": ("cope",)}}
|
||||
rules = {"verb": [["ing", ""]]}
|
||||
lemmatizer = Lemmatizer(index, exc, rules)
|
||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||
doc = Doc(vocab, words=["coping"])
|
||||
doc[0].tag_ = 'VBG'
|
||||
doc[0].tag_ = "VBG"
|
||||
assert doc[0].text == "coping"
|
||||
assert doc[0].lemma_ == "cope"
|
||||
|
||||
|
||||
def test_issue1434():
|
||||
"""Test matches occur when optional element at end of short doc."""
|
||||
pattern = [{'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'}]
|
||||
pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
|
||||
vocab = Vocab(lex_attr_getters=LEX_ATTRS)
|
||||
hello_world = Doc(vocab, words=['Hello', 'World'])
|
||||
hello = Doc(vocab, words=['Hello'])
|
||||
hello_world = Doc(vocab, words=["Hello", "World"])
|
||||
hello = Doc(vocab, words=["Hello"])
|
||||
matcher = Matcher(vocab)
|
||||
matcher.add('MyMatcher', None, pattern)
|
||||
matcher.add("MyMatcher", None, pattern)
|
||||
matches = matcher(hello_world)
|
||||
assert matches
|
||||
matches = matcher(hello)
|
||||
assert matches
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string,start,end', [
|
||||
('a', 0, 1), ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2),
|
||||
('a b b c', 0, 3), ('a b b', 0, 3),])
|
||||
@pytest.mark.parametrize(
|
||||
"string,start,end",
|
||||
[
|
||||
("a", 0, 1),
|
||||
("a b", 0, 2),
|
||||
("a c", 0, 1),
|
||||
("a b c", 0, 2),
|
||||
("a b b c", 0, 3),
|
||||
("a b b", 0, 3),
|
||||
],
|
||||
)
|
||||
def test_issue1450(string, start, end):
|
||||
"""Test matcher works when patterns end with * operator."""
|
||||
pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
|
||||
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
|
||||
matcher = Matcher(Vocab())
|
||||
matcher.add("TSTEND", None, pattern)
|
||||
doc = Doc(Vocab(), words=string.split())
|
||||
|
@ -96,17 +104,20 @@ def test_issue1450(string, start, end):
|
|||
|
||||
|
||||
def test_issue1488():
|
||||
prefix_re = re.compile(r'''[\[\("']''')
|
||||
suffix_re = re.compile(r'''[\]\)"']''')
|
||||
infix_re = re.compile(r'''[-~\.]''')
|
||||
simple_url_re = re.compile(r'''^https?://''')
|
||||
prefix_re = re.compile(r"""[\[\("']""")
|
||||
suffix_re = re.compile(r"""[\]\)"']""")
|
||||
infix_re = re.compile(r"""[-~\.]""")
|
||||
simple_url_re = re.compile(r"""^https?://""")
|
||||
|
||||
def my_tokenizer(nlp):
|
||||
return Tokenizer(nlp.vocab, {},
|
||||
prefix_search=prefix_re.search,
|
||||
suffix_search=suffix_re.search,
|
||||
infix_finditer=infix_re.finditer,
|
||||
token_match=simple_url_re.match)
|
||||
return Tokenizer(
|
||||
nlp.vocab,
|
||||
{},
|
||||
prefix_search=prefix_re.search,
|
||||
suffix_search=suffix_re.search,
|
||||
infix_finditer=infix_re.finditer,
|
||||
token_match=simple_url_re.match,
|
||||
)
|
||||
|
||||
nlp = English()
|
||||
nlp.tokenizer = my_tokenizer(nlp)
|
||||
|
@ -116,11 +127,16 @@ def test_issue1488():
|
|||
|
||||
|
||||
def test_issue1494():
|
||||
infix_re = re.compile(r'''[^a-z]''')
|
||||
test_cases = [('token 123test', ['token', '1', '2', '3', 'test']),
|
||||
('token 1test', ['token', '1test']),
|
||||
('hello...test', ['hello', '.', '.', '.', 'test'])]
|
||||
new_tokenizer = lambda nlp: Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
|
||||
infix_re = re.compile(r"""[^a-z]""")
|
||||
test_cases = [
|
||||
("token 123test", ["token", "1", "2", "3", "test"]),
|
||||
("token 1test", ["token", "1test"]),
|
||||
("hello...test", ["hello", ".", ".", ".", "test"]),
|
||||
]
|
||||
|
||||
def new_tokenizer(nlp):
|
||||
return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
|
||||
|
||||
nlp = English()
|
||||
nlp.tokenizer = new_tokenizer(nlp)
|
||||
for text, expected in test_cases:
|
||||
|
|
|
@ -45,17 +45,17 @@ def test_issue1506():
|
|||
def test_issue1518():
|
||||
"""Test vectors.resize() works."""
|
||||
vectors = Vectors(shape=(10, 10))
|
||||
vectors.add('hello', row=2)
|
||||
vectors.add("hello", row=2)
|
||||
vectors.resize((5, 9))
|
||||
|
||||
|
||||
def test_issue1537():
|
||||
"""Test that Span.as_doc() doesn't segfault."""
|
||||
string = 'The sky is blue . The man is pink . The dog is purple .'
|
||||
string = "The sky is blue . The man is pink . The dog is purple ."
|
||||
doc = Doc(Vocab(), words=string.split())
|
||||
doc[0].sent_start = True
|
||||
for word in doc[1:]:
|
||||
if word.nbor(-1).text == '.':
|
||||
if word.nbor(-1).text == ".":
|
||||
word.sent_start = True
|
||||
else:
|
||||
word.sent_start = False
|
||||
|
@ -67,7 +67,7 @@ def test_issue1537():
|
|||
# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
#def test_issue1537_model():
# def test_issue1537_model():
# nlp = load_spacy('en')
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
# sents = [s.as_doc() for s in doc.sents]

@ -77,41 +77,41 @@ def test_issue1537():
|
|||
|
||||
def test_issue1539():
|
||||
"""Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
|
||||
v = Vectors(shape=(10, 10), keys=[5,3,98,100])
|
||||
v.resize((100,100))
|
||||
v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
|
||||
v.resize((100, 100))
|
||||
|
||||
|
||||
def test_issue1547():
|
||||
"""Test that entity labels still match after merging tokens."""
|
||||
words = ['\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n']
|
||||
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings['PRODUCT'])]
|
||||
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
|
||||
doc[5:7].merge()
|
||||
assert [ent.text for ent in doc.ents]
|
||||
|
||||
|
||||
def test_issue1612(en_tokenizer):
|
||||
doc = en_tokenizer('The black cat purrs.')
|
||||
span = doc[1: 3]
|
||||
doc = en_tokenizer("The black cat purrs.")
|
||||
span = doc[1:3]
|
||||
assert span.orth_ == span.text
|
||||
|
||||
|
||||
def test_issue1654():
|
||||
nlp = Language(Vocab())
|
||||
assert not nlp.pipeline
|
||||
nlp.add_pipe(lambda doc: doc, name='1')
|
||||
nlp.add_pipe(lambda doc: doc, name='2', after='1')
|
||||
nlp.add_pipe(lambda doc: doc, name='3', after='2')
|
||||
assert nlp.pipe_names == ['1', '2', '3']
|
||||
nlp.add_pipe(lambda doc: doc, name="1")
|
||||
nlp.add_pipe(lambda doc: doc, name="2", after="1")
|
||||
nlp.add_pipe(lambda doc: doc, name="3", after="2")
|
||||
assert nlp.pipe_names == ["1", "2", "3"]
|
||||
nlp2 = Language(Vocab())
|
||||
assert not nlp2.pipeline
|
||||
nlp2.add_pipe(lambda doc: doc, name='3')
|
||||
nlp2.add_pipe(lambda doc: doc, name='2', before='3')
|
||||
nlp2.add_pipe(lambda doc: doc, name='1', before='2')
|
||||
assert nlp2.pipe_names == ['1', '2', '3']
|
||||
nlp2.add_pipe(lambda doc: doc, name="3")
|
||||
nlp2.add_pipe(lambda doc: doc, name="2", before="3")
|
||||
nlp2.add_pipe(lambda doc: doc, name="1", before="2")
|
||||
assert nlp2.pipe_names == ["1", "2", "3"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ['test@example.com', 'john.doe@example.co.uk'])
|
||||
@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
|
||||
def test_issue1698(en_tokenizer, text):
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 1
|
||||
|
@ -121,30 +121,30 @@ def test_issue1698(en_tokenizer, text):
|
|||
def test_issue1727():
|
||||
"""Test that models with no pretrained vectors can be deserialized
|
||||
correctly after vectors are added."""
|
||||
data = numpy.ones((3, 300), dtype='f')
|
||||
vectors = Vectors(data=data, keys=['I', 'am', 'Matt'])
|
||||
data = numpy.ones((3, 300), dtype="f")
|
||||
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
|
||||
tagger = Tagger(Vocab())
|
||||
tagger.add_label('PRP')
|
||||
tagger.add_label("PRP")
|
||||
tagger.begin_training()
|
||||
assert tagger.cfg.get('pretrained_dims', 0) == 0
|
||||
assert tagger.cfg.get("pretrained_dims", 0) == 0
|
||||
tagger.vocab.vectors = vectors
|
||||
with make_tempdir() as path:
|
||||
tagger.to_disk(path)
|
||||
tagger = Tagger(Vocab()).from_disk(path)
|
||||
assert tagger.cfg.get('pretrained_dims', 0) == 0
|
||||
assert tagger.cfg.get("pretrained_dims", 0) == 0
|
||||
|
||||
|
||||
def test_issue1757():
|
||||
"""Test comparison against None doesn't cause segfault."""
|
||||
doc = Doc(Vocab(), words=['a', 'b', 'c'])
|
||||
doc = Doc(Vocab(), words=["a", "b", "c"])
|
||||
assert not doc[0] < None
|
||||
assert not doc[0] == None
|
||||
assert doc[0] >= None
|
||||
assert not doc[:2] < None
|
||||
assert not doc[:2] == None
|
||||
assert doc[:2] >= None
|
||||
assert not doc.vocab['a'] == None
|
||||
assert not doc.vocab['a'] < None
|
||||
assert not doc.vocab["a"] == None
|
||||
assert not doc.vocab["a"] < None
|
||||
|
||||
|
||||
def test_issue1758(en_tokenizer):
|
||||
|
@ -158,11 +158,20 @@ def test_issue1758(en_tokenizer):
|
|||
def test_issue1799():
|
||||
"""Test sentence boundaries are deserialized correctly, even for
|
||||
non-projective sentences."""
|
||||
heads_deps = numpy.asarray([[1, 397], [4, 436], [2, 426], [1, 402],
|
||||
[0, 8206900633647566924], [18446744073709551615, 440],
|
||||
[18446744073709551614, 442]], dtype='uint64')
|
||||
doc = Doc(Vocab(), words='Just what I was looking for .'.split())
|
||||
doc.vocab.strings.add('ROOT')
|
||||
heads_deps = numpy.asarray(
|
||||
[
|
||||
[1, 397],
|
||||
[4, 436],
|
||||
[2, 426],
|
||||
[1, 402],
|
||||
[0, 8206900633647566924],
|
||||
[18446744073709551615, 440],
|
||||
[18446744073709551614, 442],
|
||||
],
|
||||
dtype="uint64",
|
||||
)
|
||||
doc = Doc(Vocab(), words="Just what I was looking for .".split())
|
||||
doc.vocab.strings.add("ROOT")
|
||||
doc = doc.from_array([HEAD, DEP], heads_deps)
|
||||
assert len(list(doc.sents)) == 1
|
||||
|
||||
|
@ -170,9 +179,9 @@ def test_issue1799():
|
|||
def test_issue1807():
|
||||
"""Test vocab.set_vector also adds the word to the vocab."""
|
||||
vocab = Vocab()
|
||||
assert 'hello' not in vocab
|
||||
vocab.set_vector('hello', numpy.ones((50,), dtype='f'))
|
||||
assert 'hello' in vocab
|
||||
assert "hello" not in vocab
|
||||
vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
|
||||
assert "hello" in vocab
|
||||
|
||||
|
||||
def test_issue1834():
|
||||
|
@ -195,34 +204,34 @@ def test_issue1834():
|
|||
def test_issue1868():
|
||||
"""Test Vocab.__contains__ works with int keys."""
|
||||
vocab = Vocab()
|
||||
lex = vocab['hello']
|
||||
lex = vocab["hello"]
|
||||
assert lex.orth in vocab
|
||||
assert lex.orth_ in vocab
|
||||
assert 'some string' not in vocab
|
||||
int_id = vocab.strings.add('some string')
|
||||
assert "some string" not in vocab
|
||||
int_id = vocab.strings.add("some string")
|
||||
assert int_id not in vocab
|
||||
|
||||
|
||||
def test_issue1883():
|
||||
matcher = Matcher(Vocab())
|
||||
matcher.add('pat1', None, [{'orth': 'hello'}])
|
||||
doc = Doc(matcher.vocab, words=['hello'])
|
||||
matcher.add("pat1", None, [{"orth": "hello"}])
|
||||
doc = Doc(matcher.vocab, words=["hello"])
|
||||
assert len(matcher(doc)) == 1
|
||||
new_matcher = copy.deepcopy(matcher)
|
||||
new_doc = Doc(new_matcher.vocab, words=['hello'])
|
||||
new_doc = Doc(new_matcher.vocab, words=["hello"])
|
||||
assert len(new_matcher(new_doc)) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('word', ['the'])
|
||||
@pytest.mark.parametrize("word", ["the"])
|
||||
def test_issue1889(word):
|
||||
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
|
||||
|
||||
|
||||
def test_issue1915():
|
||||
cfg = {'hidden_depth': 2} # should error out
|
||||
cfg = {"hidden_depth": 2} # should error out
|
||||
nlp = Language()
|
||||
nlp.add_pipe(nlp.create_pipe('ner'))
|
||||
nlp.get_pipe('ner').add_label('answer')
|
||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||
nlp.get_pipe("ner").add_label("answer")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(**cfg)
|
||||
|
||||
|
@ -230,17 +239,17 @@ def test_issue1915():
|
|||
def test_issue1945():
|
||||
"""Test regression in Matcher introduced in v2.0.6."""
|
||||
matcher = Matcher(Vocab())
|
||||
matcher.add('MWE', None, [{'orth': 'a'}, {'orth': 'a'}])
|
||||
doc = Doc(matcher.vocab, words=['a', 'a', 'a'])
|
||||
matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
|
||||
doc = Doc(matcher.vocab, words=["a", "a", "a"])
|
||||
matches = matcher(doc) # we should see two overlapping matches here
|
||||
assert len(matches) == 2
|
||||
assert matches[0][1:] == (0, 2)
|
||||
assert matches[1][1:] == (1, 3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('label', ['U-JOB-NAME'])
|
||||
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
||||
def test_issue1967(label):
|
||||
ner = EntityRecognizer(Vocab())
|
||||
entry = ([0], ['word'], ['tag'], [0], ['dep'], [label])
|
||||
entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
|
||||
gold_parses = [(None, [(entry, None)])]
|
||||
ner.moves.get_actions(gold_parses=gold_parses)
|
||||
|
|
|
@ -14,15 +14,15 @@ from ..util import add_vecs_to_vocab
|
|||
def test_issue2179():
|
||||
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
||||
nlp = Italian()
|
||||
ner = nlp.create_pipe('ner')
|
||||
ner.add_label('CITIZENSHIP')
|
||||
ner = nlp.create_pipe("ner")
|
||||
ner.add_label("CITIZENSHIP")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
nlp2 = Italian()
|
||||
nlp2.add_pipe(nlp2.create_pipe('ner'))
|
||||
nlp2.add_pipe(nlp2.create_pipe("ner"))
|
||||
nlp2.from_bytes(nlp.to_bytes())
|
||||
assert 'extra_labels' not in nlp2.get_pipe('ner').cfg
|
||||
assert nlp2.get_pipe('ner').labels == ['CITIZENSHIP']
|
||||
assert "extra_labels" not in nlp2.get_pipe("ner").cfg
|
||||
assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"]
|
||||
|
||||
|
||||
def test_issue2219(en_vocab):
|
||||
|
@ -34,7 +34,7 @@ def test_issue2219(en_vocab):
|
|||
|
||||
|
||||
def test_issue2361(de_tokenizer):
chars = ('&lt;', '&gt;', '&amp;', '&quot;')
chars = ("&lt;", "&gt;", "&amp;", "&quot;")
doc = de_tokenizer('< > & " ')
doc.is_parsed = True
doc.is_tagged = True

@ -46,25 +46,32 @@ def test_issue2361(de_tokenizer):
|
|||
def test_issue2385():
|
||||
"""Test that IOB tags are correctly converted to BILUO tags."""
|
||||
# fix bug in labels with a 'b' character
|
||||
tags1 = ('B-BRAWLER', 'I-BRAWLER', 'I-BRAWLER')
|
||||
assert iob_to_biluo(tags1) == ['B-BRAWLER', 'I-BRAWLER', 'L-BRAWLER']
|
||||
tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
|
||||
assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
|
||||
# maintain support for iob1 format
|
||||
tags2 = ('I-ORG', 'I-ORG', 'B-ORG')
|
||||
assert iob_to_biluo(tags2) == ['B-ORG', 'L-ORG', 'U-ORG']
|
||||
tags2 = ("I-ORG", "I-ORG", "B-ORG")
|
||||
assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
|
||||
# maintain support for iob2 format
|
||||
tags3 = ('B-PERSON', 'I-PERSON', 'B-PERSON')
|
||||
assert iob_to_biluo(tags3) ==['B-PERSON', 'L-PERSON', 'U-PERSON']
|
||||
tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
|
||||
assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('tags', [
|
||||
('B-ORG', 'L-ORG'), ('B-PERSON', 'I-PERSON', 'L-PERSON'), ('U-BRAWLER', 'U-BRAWLER')])
|
||||
@pytest.mark.parametrize(
|
||||
"tags",
|
||||
[
|
||||
("B-ORG", "L-ORG"),
|
||||
("B-PERSON", "I-PERSON", "L-PERSON"),
|
||||
("U-BRAWLER", "U-BRAWLER"),
|
||||
],
|
||||
)
|
||||
def test_issue2385_biluo(tags):
|
||||
"""Test that BILUO-compatible tags aren't modified."""
|
||||
assert iob_to_biluo(tags) == list(tags)
|
||||
|
||||
|
||||
def test_issue2482():
|
||||
'''Test we can serialize and deserialize a blank NER or parser model.'''
|
||||
"""Test we can serialize and deserialize a blank NER or parser model."""
|
||||
nlp = Italian()
|
||||
nlp.add_pipe(nlp.create_pipe('ner'))
|
||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||
b = nlp.to_bytes()
|
||||
nlp2 = Italian().from_bytes(b)
|
||||
|
|
|
@ -7,11 +7,11 @@ from spacy.language import Language
|
|||
def test_issue2564():
|
||||
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
|
||||
nlp = Language()
|
||||
tagger = nlp.create_pipe('tagger')
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
tagger.begin_training() # initialise weights
|
||||
nlp.add_pipe(tagger)
|
||||
doc = nlp('hello world')
|
||||
doc = nlp("hello world")
|
||||
assert doc.is_tagged
|
||||
docs = nlp.pipe(['hello', 'world'])
|
||||
docs = nlp.pipe(["hello", "world"])
|
||||
piped_doc = next(docs)
|
||||
assert piped_doc.is_tagged
|
||||
|
|
|
@ -7,11 +7,11 @@ from spacy.tokens import Span
|
|||
|
||||
def test_issue2569(en_tokenizer):
|
||||
doc = en_tokenizer("It is May 15, 1993.")
|
||||
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings['DATE'])]
|
||||
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
|
||||
matcher = Matcher(doc.vocab)
|
||||
matcher.add("RULE", None, [{'ENT_TYPE':'DATE', 'OP':'+'}])
|
||||
matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
|
||||
matched = [doc[start:end] for _, start, end in matcher(doc)]
|
||||
matched = sorted(matched, key=len, reverse=True)
|
||||
assert len(matched) == 10
|
||||
assert len(matched[0]) == 4
|
||||
assert matched[0].text == 'May 15, 1993'
|
||||
assert matched[0].text == "May 15, 1993"
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
|
@ -10,6 +9,7 @@ def test_issue2671():
|
|||
"""Ensure the correct entity ID is returned for matches with quantifiers.
|
||||
See also #2675
|
||||
"""
|
||||
|
||||
def get_rule_id(nlp, matcher, doc):
|
||||
matches = matcher(doc)
|
||||
for match_id, start, end in matches:
|
||||
|
@ -19,10 +19,12 @@ def test_issue2671():
|
|||
|
||||
nlp = English()
|
||||
matcher = Matcher(nlp.vocab)
|
||||
pattern_id = 'test_pattern'
|
||||
pattern = [{'LOWER': 'high'},
|
||||
{'IS_PUNCT': True, 'OP': '?'},
|
||||
{'LOWER': 'adrenaline'}]
|
||||
pattern_id = "test_pattern"
|
||||
pattern = [
|
||||
{"LOWER": "high"},
|
||||
{"IS_PUNCT": True, "OP": "?"},
|
||||
{"LOWER": "adrenaline"},
|
||||
]
|
||||
matcher.add(pattern_id, None, pattern)
|
||||
doc1 = nlp("This is a high-adrenaline situation.")
|
||||
doc2 = nlp("This is a high adrenaline situation.")
|
||||
|
|
|
@ -1,17 +1,15 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_issue2772(en_vocab):
|
||||
"""Test that deprojectivization doesn't mess up sentence boundaries."""
|
||||
words = 'When we write or communicate virtually , we can hide our true feelings .'.split()
|
||||
words = "When we write or communicate virtually , we can hide our true feelings .".split()
|
||||
# A tree with a non-projective (i.e. crossing) arc
|
||||
# The arcs (0, 4) and (2, 9) cross.
|
||||
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
|
||||
deps = ['dep'] * len(heads)
|
||||
deps = ["dep"] * len(heads)
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
assert doc[1].is_sent_start is None
|
||||
|
|
|
@ -5,8 +5,8 @@ from spacy.util import get_lang_class
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
|
||||
@pytest.mark.parametrize('lang', ['en', 'xx'])
|
||||
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
|
||||
@pytest.mark.parametrize("lang", ["en", "xx"])
|
||||
def test_issue2782(text, lang):
|
||||
"""Check that like_num handles + and - before number."""
|
||||
cls = get_lang_class(lang)
|
||||
|
|
|
@ -18,25 +18,25 @@ def test_serialize_empty_doc(en_vocab):
|
|||
|
||||
|
||||
def test_serialize_doc_roundtrip_bytes(en_vocab):
|
||||
doc = Doc(en_vocab, words=['hello', 'world'])
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
doc_b = doc.to_bytes()
|
||||
new_doc = Doc(en_vocab).from_bytes(doc_b)
|
||||
assert new_doc.to_bytes() == doc_b
|
||||
|
||||
|
||||
def test_serialize_doc_roundtrip_disk(en_vocab):
|
||||
doc = Doc(en_vocab, words=['hello', 'world'])
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'doc'
|
||||
file_path = d / "doc"
|
||||
doc.to_disk(file_path)
|
||||
doc_d = Doc(en_vocab).from_disk(file_path)
|
||||
assert doc.to_bytes() == doc_d.to_bytes()
|
||||
|
||||
|
||||
def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
|
||||
doc = Doc(en_vocab, words=['hello', 'world'])
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'doc'
|
||||
file_path = d / "doc"
|
||||
file_path = path2str(file_path)
|
||||
doc.to_disk(file_path)
|
||||
doc_d = Doc(en_vocab).from_disk(file_path)
|
||||
|
|
|
@ -8,19 +8,20 @@ from spacy.vocab import Vocab
|
|||
|
||||
@pytest.fixture
|
||||
def doc_w_attrs(en_tokenizer):
|
||||
Doc.set_extension('_test_attr', default=False)
|
||||
Doc.set_extension('_test_prop', getter=lambda doc: len(doc.text))
|
||||
Doc.set_extension('_test_method', method=lambda doc, arg: "{}{}".format(len(doc.text), arg))
|
||||
Doc.set_extension("_test_attr", default=False)
|
||||
Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text))
|
||||
Doc.set_extension(
|
||||
"_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg)
|
||||
)
|
||||
doc = en_tokenizer("This is a test.")
|
||||
doc._._test_attr = 'test'
|
||||
doc._._test_attr = "test"
|
||||
return doc
|
||||
|
||||
|
||||
|
||||
def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
|
||||
doc_b = doc_w_attrs.to_bytes()
|
||||
doc = Doc(Vocab()).from_bytes(doc_b)
|
||||
assert doc._.has('_test_attr')
|
||||
assert doc._._test_attr == 'test'
|
||||
assert doc._.has("_test_attr")
|
||||
assert doc._._test_attr == "test"
|
||||
assert doc._._test_prop == len(doc.text)
|
||||
assert doc._._test_method('test') == '{}{}'.format(len(doc.text), 'test')
|
||||
assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
|
||||
|
|
|
@ -12,14 +12,14 @@ from ..util import make_tempdir
|
|||
@pytest.fixture
|
||||
def meta_data():
|
||||
return {
|
||||
'name': 'name-in-fixture',
|
||||
'version': 'version-in-fixture',
|
||||
'description': 'description-in-fixture',
|
||||
'author': 'author-in-fixture',
|
||||
'email': 'email-in-fixture',
|
||||
'url': 'url-in-fixture',
|
||||
'license': 'license-in-fixture',
|
||||
'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}
|
||||
"name": "name-in-fixture",
|
||||
"version": "version-in-fixture",
|
||||
"description": "description-in-fixture",
|
||||
"author": "author-in-fixture",
|
||||
"email": "email-in-fixture",
|
||||
"url": "url-in-fixture",
|
||||
"license": "license-in-fixture",
|
||||
"vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
|
||||
}
|
||||
|
||||
|
||||
|
@ -35,16 +35,18 @@ def test_serialize_with_custom_tokenizer():
|
|||
"""Test that serialization with custom tokenizer works without token_match.
|
||||
See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
|
||||
"""
|
||||
prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
|
||||
suffix_re = re.compile(r'''''')
|
||||
infix_re = re.compile(r'''[~]''')
|
||||
prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
|
||||
suffix_re = re.compile(r"""""")
|
||||
infix_re = re.compile(r"""[~]""")
|
||||
|
||||
def custom_tokenizer(nlp):
|
||||
return Tokenizer(nlp.vocab,
|
||||
{},
|
||||
prefix_search=prefix_re.search,
|
||||
suffix_search=suffix_re.search,
|
||||
infix_finditer=infix_re.finditer)
|
||||
return Tokenizer(
|
||||
nlp.vocab,
|
||||
{},
|
||||
prefix_search=prefix_re.search,
|
||||
suffix_search=suffix_re.search,
|
||||
infix_finditer=infix_re.finditer,
|
||||
)
|
||||
|
||||
nlp = Language()
|
||||
nlp.tokenizer = custom_tokenizer(nlp)
|
||||
|
|
|
@ -2,7 +2,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer, Tensorizer, TextCategorizer
|
||||
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
||||
from spacy.pipeline import Tensorizer, TextCategorizer
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
@ -13,7 +14,7 @@ test_parsers = [DependencyParser, EntityRecognizer]
|
|||
@pytest.fixture
|
||||
def parser(en_vocab):
|
||||
parser = DependencyParser(en_vocab)
|
||||
parser.add_label('nsubj')
|
||||
parser.add_label("nsubj")
|
||||
parser.model, cfg = parser.Model(parser.moves.n_moves)
|
||||
parser.cfg.update(cfg)
|
||||
return parser
|
||||
|
@ -34,7 +35,7 @@ def taggers(en_vocab):
|
|||
return (tagger1, tagger2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Parser', test_parsers)
|
||||
@pytest.mark.parametrize("Parser", test_parsers)
|
||||
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
|
||||
parser = Parser(en_vocab)
|
||||
parser.model, _ = parser.Model(10)
|
||||
|
@ -44,12 +45,12 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
|
|||
assert new_parser.to_bytes() == parser.to_bytes()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Parser', test_parsers)
|
||||
@pytest.mark.parametrize("Parser", test_parsers)
|
||||
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
|
||||
parser = Parser(en_vocab)
|
||||
parser.model, _ = parser.Model(0)
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'parser'
|
||||
file_path = d / "parser"
|
||||
parser.to_disk(file_path)
|
||||
parser_d = Parser(en_vocab)
|
||||
parser_d.model, _ = parser_d.Model(0)
|
||||
|
@ -67,7 +68,9 @@ def test_to_from_bytes(parser, blank_parser):
|
|||
assert blank_parser.moves.n_moves == parser.moves.n_moves
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms.")
|
||||
@pytest.mark.skip(
|
||||
reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms."
|
||||
)
|
||||
def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
||||
tagger1, tagger2 = taggers
|
||||
tagger1_b = tagger1.to_bytes()
|
||||
|
@ -81,8 +84,8 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
|||
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
||||
tagger1, tagger2 = taggers
|
||||
with make_tempdir() as d:
|
||||
file_path1 = d / 'tagger1'
|
||||
file_path2 = d / 'tagger2'
|
||||
file_path1 = d / "tagger1"
|
||||
file_path2 = d / "tagger2"
|
||||
tagger1.to_disk(file_path1)
|
||||
tagger2.to_disk(file_path2)
|
||||
tagger1_d = Tagger(en_vocab).from_disk(file_path1)
|
||||
|
@ -102,7 +105,7 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
|
|||
tensorizer = Tensorizer(en_vocab)
|
||||
tensorizer.model = tensorizer.Model()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'tensorizer'
|
||||
file_path = d / "tensorizer"
|
||||
tensorizer.to_disk(file_path)
|
||||
tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
|
||||
assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
|
||||
|
@ -110,5 +113,5 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
|
|||
|
||||
def test_serialize_textcat_empty(en_vocab):
|
||||
# See issue #1105
|
||||
textcat = TextCategorizer(en_vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
|
||||
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
||||
textcat_bytes = textcat.to_bytes()
|
||||
|
|
|
@ -9,7 +9,7 @@ from ..util import make_tempdir, assert_packed_msg_equal
|
|||
|
||||
|
||||
def load_tokenizer(b):
|
||||
tok = get_lang_class('en').Defaults.create_tokenizer()
|
||||
tok = get_lang_class("en").Defaults.create_tokenizer()
|
||||
tok.from_bytes(b)
|
||||
return tok
|
||||
|
||||
|
@ -23,7 +23,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
|||
|
||||
|
||||
@pytest.mark.skip(reason="Currently unreliable across platforms")
|
||||
@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
|
||||
@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
|
||||
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
||||
tokenizer = en_tokenizer
|
||||
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
|
||||
|
@ -38,7 +38,7 @@ def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
|||
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
|
||||
tokenizer = en_tokenizer
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'tokenizer'
|
||||
file_path = d / "tokenizer"
|
||||
tokenizer.to_disk(file_path)
|
||||
tokenizer_d = en_tokenizer.from_disk(file_path)
|
||||
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|
||||
|
|
|
@ -8,12 +8,12 @@ from spacy.strings import StringStore
|
|||
from ..util import make_tempdir
|
||||
|
||||
|
||||
test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
|
||||
test_strings_attrs = [(['rats', 'are', 'cute'], 'Hello')]
|
||||
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
|
||||
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ['rat'])
|
||||
@pytest.mark.parametrize("text", ["rat"])
|
||||
def test_serialize_vocab(en_vocab, text):
|
||||
text_hash = en_vocab.strings.add(text)
|
||||
vocab_bytes = en_vocab.to_bytes()
|
||||
|
@ -21,7 +21,7 @@ def test_serialize_vocab(en_vocab, text):
|
|||
assert new_vocab.strings(text_hash) == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strings1,strings2', test_strings)
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
||||
vocab1 = Vocab(strings=strings1)
|
||||
vocab2 = Vocab(strings=strings2)
|
||||
|
@ -39,13 +39,13 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
|||
assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strings1,strings2', test_strings)
|
||||
def test_serialize_vocab_roundtrip_disk(strings1,strings2):
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
||||
vocab1 = Vocab(strings=strings1)
|
||||
vocab2 = Vocab(strings=strings2)
|
||||
with make_tempdir() as d:
|
||||
file_path1 = d / 'vocab1'
|
||||
file_path2 = d / 'vocab2'
|
||||
file_path1 = d / "vocab1"
|
||||
file_path2 = d / "vocab2"
|
||||
vocab1.to_disk(file_path1)
|
||||
vocab2.to_disk(file_path2)
|
||||
vocab1_d = Vocab().from_disk(file_path1)
|
||||
|
@ -58,7 +58,7 @@ def test_serialize_vocab_roundtrip_disk(strings1,strings2):
|
|||
assert list(vocab1_d) != list(vocab2_d)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
|
||||
vocab1 = Vocab(strings=strings)
|
||||
vocab2 = Vocab()
|
||||
|
@ -69,7 +69,7 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
|
|||
assert vocab2[strings[0]].norm_ == lex_attr
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
|
||||
vocab1 = Vocab(strings=strings)
|
||||
vocab2 = Vocab()
|
||||
|
@ -77,13 +77,13 @@ def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
|
|||
assert vocab1[strings[0]].norm_ == lex_attr
|
||||
assert vocab2[strings[0]].norm_ != lex_attr
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'vocab'
|
||||
file_path = d / "vocab"
|
||||
vocab1.to_disk(file_path)
|
||||
vocab2 = vocab2.from_disk(file_path)
|
||||
assert vocab2[strings[0]].norm_ == lex_attr
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strings1,strings2', test_strings)
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
|
||||
sstore1 = StringStore(strings=strings1)
|
||||
sstore2 = StringStore(strings=strings2)
|
||||
|
@ -100,13 +100,13 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
|
|||
assert list(new_sstore1) == strings1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strings1,strings2', test_strings)
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
|
||||
sstore1 = StringStore(strings=strings1)
|
||||
sstore2 = StringStore(strings=strings2)
|
||||
with make_tempdir() as d:
|
||||
file_path1 = d / 'strings1'
|
||||
file_path2 = d / 'strings2'
|
||||
file_path1 = d / "strings1"
|
||||
file_path2 = d / "strings2"
|
||||
sstore1.to_disk(file_path1)
|
||||
sstore2.to_disk(file_path2)
|
||||
sstore1_d = StringStore().from_disk(file_path1)
|
||||
|
|
|
@ -5,52 +5,63 @@ import pytest
|
|||
from spacy._align import align, multi_align
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string1,string2,cost', [
|
||||
('hello', 'hell', 1),
|
||||
('rat', 'cat', 1),
|
||||
('rat', 'rat', 0),
|
||||
('rat', 'catsie', 4),
|
||||
('t', 'catsie', 5),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"string1,string2,cost",
|
||||
[
|
||||
("hello", "hell", 1),
|
||||
("rat", "cat", 1),
|
||||
("rat", "rat", 0),
|
||||
("rat", "catsie", 4),
|
||||
("t", "catsie", 5),
|
||||
],
|
||||
)
|
||||
def test_align_costs(string1, string2, cost):
|
||||
output_cost, i2j, j2i, matrix = align(string1, string2)
|
||||
assert output_cost == cost
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string1,string2,i2j', [
|
||||
('hello', 'hell', [0,1,2,3,-1]),
|
||||
('rat', 'cat', [0,1,2]),
|
||||
('rat', 'rat', [0,1,2]),
|
||||
('rat', 'catsie', [0,1,2]),
|
||||
('t', 'catsie', [2]),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"string1,string2,i2j",
|
||||
[
|
||||
("hello", "hell", [0, 1, 2, 3, -1]),
|
||||
("rat", "cat", [0, 1, 2]),
|
||||
("rat", "rat", [0, 1, 2]),
|
||||
("rat", "catsie", [0, 1, 2]),
|
||||
("t", "catsie", [2]),
|
||||
],
|
||||
)
|
||||
def test_align_i2j(string1, string2, i2j):
|
||||
output_cost, output_i2j, j2i, matrix = align(string1, string2)
|
||||
assert list(output_i2j) == i2j
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string1,string2,j2i', [
|
||||
('hello', 'hell', [0,1,2,3]),
|
||||
('rat', 'cat', [0,1,2]),
|
||||
('rat', 'rat', [0,1,2]),
|
||||
('rat', 'catsie', [0,1,2, -1, -1, -1]),
|
||||
('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"string1,string2,j2i",
|
||||
[
|
||||
("hello", "hell", [0, 1, 2, 3]),
|
||||
("rat", "cat", [0, 1, 2]),
|
||||
("rat", "rat", [0, 1, 2]),
|
||||
("rat", "catsie", [0, 1, 2, -1, -1, -1]),
|
||||
("t", "catsie", [-1, -1, 0, -1, -1, -1]),
|
||||
],
|
||||
)
|
||||
def test_align_i2j(string1, string2, j2i):
|
||||
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
|
||||
assert list(output_j2i) == j2i
|
||||
|
||||
|
||||
def test_align_strings():
|
||||
words1 = ['hello', 'this', 'is', 'test!']
|
||||
words2 = ['hellothis', 'is', 'test', '!']
|
||||
words1 = ["hello", "this", "is", "test!"]
|
||||
words2 = ["hellothis", "is", "test", "!"]
|
||||
cost, i2j, j2i, matrix = align(words1, words2)
|
||||
assert cost == 4
|
||||
assert list(i2j) == [-1, -1, 1, -1]
|
||||
assert list(j2i) == [-1, 2, -1, -1]
|
||||
|
||||
|
||||
def test_align_many_to_one():
|
||||
words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
|
||||
words2 = ['ab', 'bc', 'e', 'fg', 'h']
|
||||
words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
|
||||
words2 = ["ab", "bc", "e", "fg", "h"]
|
||||
cost, i2j, j2i, matrix = align(words1, words2)
|
||||
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
|
||||
lengths1 = [len(w) for w in words1]
|
||||
|
|
|
@ -8,75 +8,78 @@ from .util import get_doc
|
|||
|
||||
|
||||
def test_gold_biluo_U(en_vocab):
|
||||
orths_and_spaces = [('I', True), ('flew', True), ('to', True),
|
||||
('London', False), ('.', True)]
|
||||
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
|
||||
entities = [(len("I flew to "), len("I flew to London"), 'LOC')]
|
||||
words = ["I", "flew", "to", "London", "."]
|
||||
spaces = [True, True, True, False, True]
|
||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||
entities = [(len("I flew to "), len("I flew to London"), "LOC")]
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ['O', 'O', 'O', 'U-LOC', 'O']
|
||||
assert tags == ["O", "O", "O", "U-LOC", "O"]
|
||||
|
||||
|
||||
def test_gold_biluo_BL(en_vocab):
|
||||
orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
|
||||
('Francisco', False), ('.', True)]
|
||||
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco"), 'LOC')]
|
||||
words = ["I", "flew", "to", "San", "Francisco", "."]
|
||||
spaces = [True, True, True, True, False, True]
|
||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O']
|
||||
assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
|
||||
|
||||
|
||||
def test_gold_biluo_BIL(en_vocab):
|
||||
orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
|
||||
('Francisco', True), ('Valley', False), ('.', True)]
|
||||
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
|
||||
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||
spaces = [True, True, True, True, True, False, True]
|
||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
|
||||
assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||
|
||||
|
||||
def test_gold_biluo_misalign(en_vocab):
|
||||
orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
|
||||
('Francisco', True), ('Valley.', False)]
|
||||
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
|
||||
words = ["I", "flew", "to", "San", "Francisco", "Valley."]
|
||||
spaces = [True, True, True, True, True, False]
|
||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ['O', 'O', 'O', '-', '-', '-']
|
||||
assert tags == ["O", "O", "O", "-", "-", "-"]
|
||||
|
||||
|
||||
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
|
||||
text = "I flew to Silicon Valley via London."
|
||||
biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
|
||||
offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
|
||||
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||
offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
|
||||
doc = en_tokenizer(text)
|
||||
biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
|
||||
assert biluo_tags_converted == biluo_tags
|
||||
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
|
||||
assert offsets_converted == offsets
|
||||
|
||||
|
||||
def test_docs_to_json(en_vocab):
|
||||
'''Test we can convert a list of Doc objects into the JSON-serializable
|
||||
"""Test we can convert a list of Doc objects into the JSON-serializable
|
||||
format we use for training.
|
||||
'''
|
||||
"""
|
||||
docs = [
|
||||
get_doc(
|
||||
en_vocab,
|
||||
words=['a', 'b'],
|
||||
pos=['VBP', 'NN'],
|
||||
words=["a", "b"],
|
||||
pos=["VBP", "NN"],
|
||||
heads=[0, -1],
|
||||
deps=['ROOT', 'dobj'],
|
||||
ents=[]),
|
||||
deps=["ROOT", "dobj"],
|
||||
ents=[],
|
||||
),
|
||||
get_doc(
|
||||
en_vocab,
|
||||
words=['c', 'd', 'e'],
|
||||
pos=['VBP', 'NN', 'NN'],
|
||||
words=["c", "d", "e"],
|
||||
pos=["VBP", "NN", "NN"],
|
||||
heads=[0, -1, -2],
|
||||
deps=['ROOT', 'dobj', 'dobj'],
|
||||
ents=[(1, 2, 'ORG')]),
|
||||
deps=["ROOT", "dobj", "dobj"],
|
||||
ents=[(1, 2, "ORG")],
|
||||
),
|
||||
]
|
||||
json_doc = docs_to_json(0, docs)
|
||||
assert json_doc['id'] == 0
|
||||
assert len(json_doc['paragraphs']) == 2
|
||||
assert len(json_doc['paragraphs'][0]['sentences']) == 1
|
||||
assert len(json_doc['paragraphs'][1]['sentences']) == 1
|
||||
assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2
|
||||
assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3
|
||||
assert json_doc["id"] == 0
|
||||
assert len(json_doc["paragraphs"]) == 2
|
||||
assert len(json_doc["paragraphs"][0]["sentences"]) == 1
|
||||
assert len(json_doc["paragraphs"][1]["sentences"]) == 1
|
||||
assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
|
||||
assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3
|
||||
|
|
|
@ -11,19 +11,19 @@ from spacy._ml import PrecomputableAffine
|
|||
from .util import get_doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ['hello/world', 'hello world'])
|
||||
@pytest.mark.parametrize("text", ["hello/world", "hello world"])
|
||||
def test_util_ensure_path_succeeds(text):
|
||||
path = util.ensure_path(text)
|
||||
assert isinstance(path, Path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('package', ['numpy'])
|
||||
@pytest.mark.parametrize("package", ["numpy"])
|
||||
def test_util_is_package(package):
|
||||
"""Test that an installed package via pip is recognised by util.is_package."""
|
||||
assert util.is_package(package)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('package', ['thinc'])
|
||||
@pytest.mark.parametrize("package", ["thinc"])
|
||||
def test_util_get_package_path(package):
|
||||
"""Test that a Path object is returned for a package name."""
|
||||
path = util.get_package_path(package)
|
||||
|
@@ -33,44 +33,47 @@ def test_util_get_package_path(package):
 def test_displacy_parse_ents(en_vocab):
 """Test that named entities on a Doc are converted into displaCy's format."""
 doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])]
+doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 ents = displacy.parse_ents(doc)
 assert isinstance(ents, dict)
-assert ents['text'] == 'But Google is starting from behind '
-assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}]
+assert ents["text"] == "But Google is starting from behind "
+assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}]


 def test_displacy_parse_deps(en_vocab):
 """Test that deps and tags on a Doc are converted into displaCy's format."""
 words = ["This", "is", "a", "sentence"]
 heads = [1, 0, 1, -2]
-pos = ['DET', 'VERB', 'DET', 'NOUN']
-tags = ['DT', 'VBZ', 'DT', 'NN']
-deps = ['nsubj', 'ROOT', 'det', 'attr']
-doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags,
-deps=deps)
+pos = ["DET", "VERB", "DET", "NOUN"]
+tags = ["DT", "VBZ", "DT", "NN"]
+deps = ["nsubj", "ROOT", "det", "attr"]
+doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
 deps = displacy.parse_deps(doc)
 assert isinstance(deps, dict)
-assert deps['words'] == [{'text': 'This', 'tag': 'DET'},
-{'text': 'is', 'tag': 'VERB'},
-{'text': 'a', 'tag': 'DET'},
-{'text': 'sentence', 'tag': 'NOUN'}]
-assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
-{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
-{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
+assert deps["words"] == [
+{"text": "This", "tag": "DET"},
+{"text": "is", "tag": "VERB"},
+{"text": "a", "tag": "DET"},
+{"text": "sentence", "tag": "NOUN"},
+]
+assert deps["arcs"] == [
+{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+{"start": 2, "end": 3, "label": "det", "dir": "left"},
+{"start": 1, "end": 3, "label": "attr", "dir": "right"},
+]


 def test_displacy_spans(en_vocab):
 """Test that displaCy can render Spans."""
 doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])]
-html = displacy.render(doc[1:4], style='ent')
-assert html.startswith('<div')
+doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
+html = displacy.render(doc[1:4], style="ent")
+assert html.startswith("<div")


 def test_displacy_raises_for_wrong_type(en_vocab):
 with pytest.raises(ValueError):
-html = displacy.render('hello world')
+html = displacy.render("hello world")


 def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
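A short usage sketch of the displaCy calls exercised above. The pipeline name is hypothetical; any loaded model with an entity recognizer would do, and render returns the markup as a string when not running in a notebook.

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")  # hypothetical model name
    doc = nlp("But Google is starting from behind")
    ents = displacy.parse_ents(doc)  # dict with "text" and "ents" keys
    html = displacy.render(doc, style="ent")  # HTML markup as a string
    assert html.startswith("<div")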
@@ -78,22 +81,22 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 assert model.W.shape == (nF, nO, nP, nI)
 tensor = model.ops.allocate((10, nI))
 Y, get_dX = model.begin_update(tensor)
-assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
+assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
 assert model.d_pad.shape == (1, nF, nO, nP)
 dY = model.ops.allocate((15, nO, nP))
 ids = model.ops.allocate((15, nF))
-ids[1,2] = -1
+ids[1, 2] = -1
 dY[1] = 1
-assert model.d_pad[0, 2, 0, 0] == 0.
+assert model.d_pad[0, 2, 0, 0] == 0.0
 model._backprop_padding(dY, ids)
-assert model.d_pad[0, 2, 0, 0] == 1.
-model.d_pad.fill(0.)
-ids.fill(0.)
-dY.fill(0.)
-ids[1,2] = -1
-ids[1,1] = -1
-ids[1,0] = -1
+assert model.d_pad[0, 2, 0, 0] == 1.0
+model.d_pad.fill(0.0)
+ids.fill(0.0)
+dY.fill(0.0)
+ids[1, 2] = -1
+ids[1, 1] = -1
+ids[1, 0] = -1
 dY[1] = 1
-assert model.d_pad[0, 2, 0, 0] == 0.
+assert model.d_pad[0, 2, 0, 0] == 0.0
 model._backprop_padding(dY, ids)
-assert model.d_pad[0, 2, 0, 0] == 3.
+assert model.d_pad[0, 2, 0, 0] == 3.0
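The padding assertions above are easier to follow with a toy re-implementation of the bookkeeping being checked (an illustrative NumPy sketch, not the Thinc code): every position whose feature id is -1 routes its gradient into the shared pad gradient.

    import numpy

    nO, nP, nF = 4, 2, 3
    dY = numpy.zeros((15, nO, nP))
    ids = numpy.zeros((15, nF), dtype="i")
    d_pad = numpy.zeros((1, nF, nO, nP))
    ids[1, 2] = -1
    dY[1] = 1
    # Accumulate gradients of padded positions, mirroring what _backprop_padding is asserted to do
    for row, feats in enumerate(ids):
        for f, idx in enumerate(feats):
            if idx == -1:
                d_pad[0, f] += dY[row]
    assert d_pad[0, 2, 0, 0] == 1.0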
@@ -9,7 +9,7 @@ from spacy.vocab import Vocab
 from spacy.attrs import NORM


-@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
+@pytest.mark.parametrize("text1,text2", [("hello", "bye")])
 def test_pickle_string_store(text1, text2):
 stringstore = StringStore()
 store1 = stringstore[text1]
@@ -21,10 +21,10 @@ def test_pickle_string_store(text1, text2):
 assert len(stringstore) == len(unpickled)


-@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
+@pytest.mark.parametrize("text1,text2", [("dog", "cat")])
 def test_pickle_vocab(text1, text2):
 vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
-vocab.set_vector('dog', numpy.ones((5,), dtype='f'))
+vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
 lex1 = vocab[text1]
 lex2 = vocab[text2]
 assert lex1.norm_ == text1[:-1]
@@ -37,4 +37,4 @@ def test_pickle_vocab(text1, text2):
 assert unpickled[text2].norm == lex2.norm
 assert unpickled[text1].norm != unpickled[text2].norm
 assert unpickled.vectors is not None
-assert list(vocab['dog'].vector) == [1.,1.,1.,1.,1.]
+assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
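For orientation, a minimal round-trip in the spirit of the pickle tests above, assuming StringStore supports the standard pickle protocol (which is what these tests verify):

    import pickle
    from spacy.strings import StringStore

    stringstore = StringStore()
    key = stringstore["hello"]
    unpickled = pickle.loads(pickle.dumps(stringstore))
    # The string-to-hash mapping survives serialization
    assert unpickled["hello"] == key
    assert len(unpickled) == len(stringstore)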
@@ -29,17 +29,19 @@ def test_tokenizer_handles_emoticons(tokenizer):
 assert tokens[17].text == ":D"
 assert tokens[18].text == "=|"
 assert tokens[19].text == '")'
-assert tokens[20].text == ':>'
-assert tokens[21].text == '....'
+assert tokens[20].text == ":>"
+assert tokens[21].text == "...."


-@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
+@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)])
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
 tokens = tokenizer(text)
 assert len(tokens) == length

-@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
-('i💙you', 3), ('🤘🤘yay!', 4)])
+
+@pytest.mark.parametrize(
+"text,length", [("can you still dunk?🍕🍔😵LOL", 8), ("i💙you", 3), ("🤘🤘yay!", 4)]
+)
 def test_tokenizer_handles_emoji(tokenizer, text, length):
 # These break on narrow unicode builds, e.g. Windows
 if sys.maxunicode >= 1114111:
@@ -12,11 +12,9 @@ NAUGHTY_STRINGS = [
 ",./;'[]\-=",
 '<>?:"{}|_+',
 '!@#$%^&*()`~"',
-
 # Unicode additional control characters, byte order marks
 "",
 "",
-
 # Unicode Symbols
 "Ω≈ç√∫˜µ≤≥÷",
 "åß∂ƒ©˙∆˚¬…æ",
@@ -29,13 +27,11 @@ NAUGHTY_STRINGS = [
 "⅛⅜⅝⅞",
 "ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
 "٠١٢٣٤٥٦٧٨٩",
-
 # Unicode Subscript/Superscript/Accents
 "⁰⁴⁵",
 "₀₁₂",
 "⁰⁴⁵₀₁₂",
 "ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
-
 # Two-Byte Characters
 "田中さんにあげて下さい",
 "パーティーへ行かないか",
@@ -46,7 +42,6 @@ NAUGHTY_STRINGS = [
 "社會科學院語學研究所",
 "울란바토르",
 "𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
-
 # Japanese Emoticons
 "ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
 "(。◕ ∀ ◕。)",
@@ -55,11 +50,9 @@ NAUGHTY_STRINGS = [
 "・( ̄∀ ̄)・:*:",
 "゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
 ",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
-"(╯°□°)╯︵ ┻━┻)"
-"(ノಥ益ಥ)ノ ┻━┻",
+"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻",
 "┬─┬ノ( º _ ºノ)",
 "( ͡° ͜ʖ ͡°)",
-
 # Emoji
 "😍",
 "👩🏽",
@@ -69,18 +62,14 @@ NAUGHTY_STRINGS = [
 "✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
 "🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
 "0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟",
-
 # Regional Indicator Symbols
 "🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
 "🇺🇸🇷🇺🇸🇦🇫🇦🇲",
 "🇺🇸🇷🇺🇸🇦",
-
 # Unicode Numbers
 "123",
 "١٢٣",
-
 # Right-To-Left Strings
-
 "ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
 "إيو.",
 "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
@@ -88,34 +77,21 @@ NAUGHTY_STRINGS = [
 "﷽",
 "ﷺ",
 "مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
-
 # Trick Unicode
 "test",
 "test",
 "
test
",
 "testtest",
 "test",
-
 # Zalgo Text
 "Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
-
-
 "̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
-
-
 "̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
-
-
 "̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
-
-
 "Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
-
-
 # Unicode Upsidedown
 "˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
 "00˙Ɩ$-",
-
 # Unicode font
 "The quick brown fox jumps over the lazy dog",
 "𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
@@ -125,19 +101,17 @@ NAUGHTY_STRINGS = [
 "𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
 "𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
 "⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
-
 # File paths
 "../../../../../../../../../../../etc/passwd%00",
 "../../../../../../../../../../../etc/hosts",
-
 # iOS Vulnerabilities
 "Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
-"🏳0🌈️"
+"🏳0🌈️",
 ]


 @pytest.mark.slow
-@pytest.mark.parametrize('text', NAUGHTY_STRINGS)
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
 def test_tokenizer_naughty_strings(tokenizer, text):
 tokens = tokenizer(text)
 assert tokens.text_with_ws == text
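The property every one of these inputs checks is that tokenization is non-destructive: joining token texts with their trailing whitespace reproduces the raw input exactly. A minimal sketch of that invariant, assuming spacy.blank is available to build a bare pipeline:

    import spacy

    nlp = spacy.blank("en")  # tokenizer-only pipeline (assumed available here)
    text = "Ω≈ç√∫˜µ≤≥÷"
    doc = nlp(text)
    # No characters are added or dropped by the tokenizer
    assert doc.text_with_ws == text
    assert "".join(token.text_with_ws for token in doc) == text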
Some files were not shown because too many files have changed in this diff.